[ADD] add default rules
This commit is contained in: parent b60afa8da9, commit 425bd5198a
@@ -41,7 +41,7 @@ func main() {
     c.CompleteConfig()

     // start prometheus daemon and watching tis status in all time, exit monitor process if start failed
-    a := prometheus.NewRulesManager()
+    a := prometheus.NewRulesManager(c)
     p := prometheus.NewManager(c, a)
     controllerManager := controller.NewControllerManager(a,p)

@@ -38,6 +38,7 @@ type Config struct {

     StartArgs         []string
     ConfigFile        string
+    AlertingRulesFile string
     LocalStoragePath  string
     Web               Web
     Tsdb              Tsdb
@@ -96,6 +97,7 @@ func NewConfig() *Config {
         LogLevel:            "info",

         ConfigFile:          "/etc/prometheus/prometheus.yml",
+        AlertingRulesFile:   "/etc/prometheus/rules.yml",
         LocalStoragePath:    "/prometheusdata",
         WebTimeout:          "5m",
         RemoteFlushDeadline: "1m",
@@ -128,6 +130,8 @@ func (c *Config) AddFlag(cmd *pflag.FlagSet) {
 func (c *Config) AddPrometheusFlag(cmd *pflag.FlagSet) {
     cmd.StringVar(&c.ConfigFile, "config.file", c.ConfigFile, "Prometheus configuration file path.")

+    cmd.StringVar(&c.AlertingRulesFile, "rules-config.file", c.AlertingRulesFile, "Prometheus alerting rules config file path.")
+
     cmd.StringVar(&c.Web.ListenAddress, "web.listen-address", c.Web.ListenAddress, "Address to listen on for UI, API, and telemetry.")

     cmd.StringVar(&c.WebTimeout, "web.read-timeout", c.WebTimeout, "Maximum duration before timing out read of the request, and closing idle connections.")
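
For context, the new flag is registered with the same spf13/pflag pattern used by the options around it. Below is a minimal, self-contained sketch of that pattern; the config struct is a simplified stand-in for option.Config, not the real type:

package main

import (
	"fmt"

	"github.com/spf13/pflag"
)

// config is a simplified stand-in for option.Config, holding only the new field.
type config struct {
	AlertingRulesFile string
}

func main() {
	// Default mirrors the value the commit adds in NewConfig.
	c := &config{AlertingRulesFile: "/etc/prometheus/rules.yml"}

	fs := pflag.NewFlagSet("monitor", pflag.ExitOnError)
	// Same registration pattern the commit adds in AddPrometheusFlag.
	fs.StringVar(&c.AlertingRulesFile, "rules-config.file", c.AlertingRulesFile,
		"Prometheus alerting rules config file path.")

	_ = fs.Parse([]string{"--rules-config.file=/tmp/rules.yml"})
	fmt.Println(c.AlertingRulesFile) // prints: /tmp/rules.yml
}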
@@ -47,7 +47,7 @@ func Run(s *option.WebCliServer) error {
         return err
     }
     defer ap.Exit()
-    keepalive, err := discover.CreateKeepAlive(s.EtcdEndPoints, "acp_webcli", s.HostName, s.HostIP, s.Port)
+    keepalive, err := discover.CreateKeepAlive(s.EtcdEndPoints, "acp_webcli", s.HostName, s.HostIP, 6301)
     if err != nil {
         return err
     }
@@ -6,7 +6,6 @@ import (
     "github.com/goodrain/rainbond/grctl/clients"
     "fmt"
     "github.com/ghodss/yaml"
-    "encoding/json"
     "github.com/goodrain/rainbond/node/api/model"
     "errors"
 )
@@ -56,10 +55,9 @@ func NewCmdAlerting() cli.Command {
                 logrus.Errorf("need args")
                 return nil
             }
-            v, err := clients.RegionClient.Monitor().DelRule(name)
+            _, err := clients.RegionClient.Monitor().DelRule(name)
             handleErr(err)
-            result, _ := json.Marshal(v.Bean)
-            fmt.Println(string(result))
+            fmt.Println("Delete rule succeeded")
             return nil
         },
     },
@@ -78,13 +76,11 @@ func NewCmdAlerting() cli.Command {
             if c.IsSet("Rules") {
                 rules := c.String("Rules")

-                println("====>", rules)
                 var rulesConfig model.AlertingNameConfig
                 yaml.Unmarshal([]byte(rules), &rulesConfig)
-                v, err := clients.RegionClient.Monitor().AddRule(&rulesConfig)
+                _, err := clients.RegionClient.Monitor().AddRule(&rulesConfig)
                 handleErr(err)
-                result, _ := json.Marshal(v.Bean)
-                fmt.Println(string(result))
+                fmt.Println("Add rule successfully")
                 return nil
             }
             return errors.New("rules not null")
@@ -110,13 +106,11 @@ func NewCmdAlerting() cli.Command {
             if c.IsSet("RulesName") && c.IsSet("Rules") {
                 rules := c.String("Rules")
                 ruleName := c.String("RulesName")
-                println("====>", rules)
                 var rulesConfig model.AlertingNameConfig
                 yaml.Unmarshal([]byte(rules), &rulesConfig)
-                v, err := clients.RegionClient.Monitor().RegRule(ruleName, &rulesConfig)
+                _, err := clients.RegionClient.Monitor().RegRule(ruleName, &rulesConfig)
                 handleErr(err)
-                result, _ := json.Marshal(v.Bean)
-                fmt.Println(string(result))
+                fmt.Println("Modify rule successfully")
                 return nil
             }
             return errors.New("rule name or rules not null")
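
For a rough illustration of the document these commands expect in the Rules flag, the sketch below unmarshals an example payload the same way the command does with yaml.Unmarshal. The local structs only mirror the fields visible in this diff and stand in for model.AlertingNameConfig; the exact yaml tags of that type are an assumption here.

package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// Simplified stand-ins for model.AlertingNameConfig and its rule entries.
type rule struct {
	Alert       string            `yaml:"alert"`
	Expr        string            `yaml:"expr"`
	For         string            `yaml:"for"`
	Labels      map[string]string `yaml:"labels"`
	Annotations map[string]string `yaml:"annotations"`
}

type alertingNameConfig struct {
	Name  string  `yaml:"name"`
	Rules []*rule `yaml:"rules"`
}

func main() {
	// Example value for the Rules flag, shaped like one alerting group.
	payload := `
name: InstanceHealth
rules:
- alert: InstanceDown
  expr: up == 0
  for: 3m
  annotations:
    summary: instance down
`
	var cfg alertingNameConfig
	if err := yaml.Unmarshal([]byte(payload), &cfg); err != nil {
		panic(err)
	}
	fmt.Println(cfg.Name, cfg.Rules[0].Alert) // InstanceHealth InstanceDown
}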
@@ -35,32 +35,13 @@ func (c *ControllerManager) AddRules(w http.ResponseWriter, r *http.Request) {
     var RulesConfig prometheus.AlertingNameConfig

     unmarshalErr := json.Unmarshal(in, &RulesConfig)
-    if unmarshalErr != nil{
-        logrus.Info("反序列化错误",unmarshalErr)
+    if unmarshalErr != nil {
+        logrus.Info(unmarshalErr)
         httputil.ReturnError(r, w, 400, err.Error())
         return
     }

-
-    //err = ioutil.WriteFile("/etc/prometheus/cache_rule.yml", in, 0644)
-    //if err != nil {
-    //	logrus.Error(err.Error())
-    //}
-    //
-    //content, err := ioutil.ReadFile("/etc/prometheus/cache_rule.yml")
-    //if err != nil {
-    //	logrus.Error( err)
-    //
-    //}
-    //
-    //if err := yaml.Unmarshal(content, &RulesConfig); err != nil {
-    //	logrus.Error("Unmarshal prometheus alerting rules config string to object error.", err.Error())
-    //	httputil.ReturnError(r, w, 400, err.Error())
-    //	return
-    //}
-    println("======01")
     c.Rules.RulesConfig.LoadAlertingRulesConfig()
-    println("======02")

     group := c.Rules.RulesConfig.Groups
     for _, v := range group {
@@ -69,14 +50,9 @@ func (c *ControllerManager) AddRules(w http.ResponseWriter, r *http.Request) {
             return
         }
     }
-    println("======03")
-
-    println("=====>", RulesConfig.Name)
     group = append(group, &RulesConfig)
     c.Rules.RulesConfig.Groups = group
-    println("======04")
     c.Rules.RulesConfig.SaveAlertingRulesConfig()
-    println("======05")
     c.Manager.RestartDaemon()
     httputil.ReturnSuccess(r, w, "Add rule successfully")

@@ -105,7 +81,9 @@ func (c *ControllerManager) DelRules(w http.ResponseWriter, r *http.Request) {
     for i, v := range groupsList {
         if v.Name == rulesName {
             groupsList = append(groupsList[:i], groupsList[i+1:]...)
             c.Rules.RulesConfig.Groups = groupsList
+            c.Rules.RulesConfig.SaveAlertingRulesConfig()
+            c.Manager.RestartDaemon()
             httputil.ReturnSuccess(r, w, "successfully deleted")
             return
         }
@@ -124,34 +102,19 @@ func (c *ControllerManager) RegRules(w http.ResponseWriter, r *http.Request) {
     var RulesConfig prometheus.AlertingNameConfig

     unmarshalErr := json.Unmarshal(in, &RulesConfig)
-    if unmarshalErr != nil{
-        logrus.Info("反序列化错误",unmarshalErr)
+    if unmarshalErr != nil {
+        logrus.Info(unmarshalErr)
         httputil.ReturnError(r, w, 400, err.Error())
         return
     }

-    //err = ioutil.WriteFile("/etc/prometheus/cache_rule.yml", in, 0644)
-    //if err != nil {
-    //	logrus.Error(err.Error())
-    //}
-    //
-    //content, err := ioutil.ReadFile("/etc/prometheus/cache_rule.yml")
-    //if err != nil {
-    //	logrus.Error(err)
-    //
-    //}
-    //
-    //if err := yaml.Unmarshal(content, &RulesConfig); err != nil {
-    //	logrus.Error("Unmarshal prometheus alerting rules config string to object error.", err.Error())
-    //	httputil.ReturnError(r, w, 400, err.Error())
-    //	return
-    //}
     c.Rules.RulesConfig.LoadAlertingRulesConfig()

     group := c.Rules.RulesConfig.Groups
     for i, v := range group {
         if v.Name == rulesName {
             group[i] = &RulesConfig
+            c.Manager.RestartDaemon()
+            httputil.ReturnSuccess(r, w, "Update rule succeeded")
             c.Rules.RulesConfig.SaveAlertingRulesConfig()
             return
@@ -74,7 +74,7 @@ func NewManager(config *option.Config, a *AlertingRulesManager) *Manager {
             ScrapeInterval:     model.Duration(time.Second * 5),
             EvaluationInterval: model.Duration(time.Second * 30),
         },
-        RuleFiles: []string{"/etc/prometheus/rules.yml"},
+        RuleFiles: []string{config.AlertingRulesFile},
         },
         Registry:   reg,
         httpClient: client,
@@ -82,7 +82,7 @@ func NewManager(config *option.Config, a *AlertingRulesManager) *Manager {
         a:          a,
     }
     m.LoadConfig()
-    m.a.RulesConfig.InitRulesConfig()
+    m.a.InitRulesConfig()

     return m
 }
@@ -5,6 +5,8 @@ import (
     "io/ioutil"
     "gopkg.in/yaml.v2"
     "os"
+    "github.com/goodrain/rainbond/cmd/monitor/option"
+
 )

 type AlertingRulesConfig struct {
@@ -17,57 +19,138 @@ type AlertingNameConfig struct {
 }

 type RulesConfig struct {
-    Alert  string            `yaml:"alert" json:"alert"`
-    Expr   string            `yaml:"expr" json:"expr"`
-    For    string            `yaml:"for" json:"for"`
-    Labels map[string]string `yaml:"labels" json:"labels"`
+    Alert       string            `yaml:"alert" json:"alert"`
+    Expr        string            `yaml:"expr" json:"expr"`
+    For         string            `yaml:"for" json:"for"`
+    Labels      map[string]string `yaml:"labels" json:"labels"`
     Annotations map[string]string `yaml:"annotations" json:"annotations"`
 }

 type AlertingRulesManager struct {
     RulesConfig *AlertingRulesConfig

+    config *option.Config
 }

-func NewRulesManager() *AlertingRulesManager {
-    a:= &AlertingRulesManager{
+func NewRulesManager(config *option.Config) *AlertingRulesManager {
+    a := &AlertingRulesManager{
         RulesConfig: &AlertingRulesConfig{
-            Groups:[]*AlertingNameConfig{
+            Groups: []*AlertingNameConfig{
                 &AlertingNameConfig{

-                    Name: "test",
+                    Name: "InstanceHealth",
                     Rules: []*RulesConfig{
                         &RulesConfig{
-                            Alert:       "MqHealth",
-                            Expr:        "acp_mq_exporter_health_status{job='mq'} < 1",
-                            For:         "2m",
-                            Labels:      map[string]string{"service_name": "mq"},
-                            Annotations: map[string]string{"summary": "unhealthy"},
+                            Alert:       "InstanceDown",
+                            Expr:        "up == 0",
+                            For:         "3m",
+                            Labels:      map[string]string{},
+                            Annotations: map[string]string{"summary": "builder {{$labels.instance}} down", "description":"{{$labels.instance}} of job {{$labels.job}} has been down for more than 3 minutes"},
                         },
                     },
                 },
                 &AlertingNameConfig{

-                    Name: "test2",
+                    Name: "BuilderHealth",
                     Rules: []*RulesConfig{
                         &RulesConfig{
-                            Alert:       "builderHealth",
-                            Expr:        "acp_mq_exporter_health_status{job='mq'} < 1",
-                            For:         "5m",
-                            Labels:      map[string]string{"service_name": "builder"},
-                            Annotations: map[string]string{"summary": "unhealthy"},
+                            Alert:       "BuilderUnhealthy",
+                            Expr:        "builder_exporter_health_status == 0",
+                            For:         "3m",
+                            Labels:      map[string]string{},
+                            Annotations: map[string]string{"summary": "builder unhealthy"},
                         },
+                        &RulesConfig{
+                            Alert:       "BuilderTaskError",
+                            Expr:        "builder_exporter_builder_task_error > 30",
+                            For:         "3m",
+                            Labels:      map[string]string{},
+                            Annotations: map[string]string{"summary": "Builder execution task error number is greater than 30"},
+                        },
                     },
                 },
+                &AlertingNameConfig{
+
+                    Name: "WorkerHealth",
+                    Rules: []*RulesConfig{
+                        &RulesConfig{
+                            Alert:       "WorkerUnhealthy",
+                            Expr:        "app_resource_exporter_health_status == 0",
+                            For:         "3m",
+                            Labels:      map[string]string{},
+                            Annotations: map[string]string{"summary": "worker unhealthy"},
+                        },
+                        &RulesConfig{
+                            Alert:       "WorkerTaskError",
+                            Expr:        "app_resource_exporter_worker_task_error > 50",
+                            For:         "3m",
+                            Labels:      map[string]string{},
+                            Annotations: map[string]string{"summary": "worker execution task error number is greater than 50"},
+                        },
+                    },
+                },
+                &AlertingNameConfig{
+
+                    Name: "EntranceHealth",
+                    Rules: []*RulesConfig{
+                        &RulesConfig{
+                            Alert:       "EntranceUnHealthy",
+                            Expr:        "acp_entrance_exporter_health_status == 0",
+                            For:         "3m",
+                            Labels:      map[string]string{},
+                            Annotations: map[string]string{"summary": "entrance unhealthy"},
+                        },
+                    },
+                },
+                &AlertingNameConfig{
+
+                    Name: "MqHealth",
+                    Rules: []*RulesConfig{
+                        &RulesConfig{
+                            Alert:       "MqUnhealthy",
+                            Expr:        "acp_mq_exporter_health_status == 0",
+                            For:         "3m",
+                            Labels:      map[string]string{},
+                            Annotations: map[string]string{"summary": "mq unhealthy"},
+                        },
+                        &RulesConfig{
+                            Alert:       "TeamTaskMany",
+                            Expr:        "acp_mq_dequeue_number-acp_mq_enqueue_number > 200",
+                            For:         "3m",
+                            Labels:      map[string]string{},
+                            Annotations: map[string]string{"summary": "The number of tasks in the queue is greater than 200"},
+                        },
+                    },
+                },
+                &AlertingNameConfig{
+
+                    Name: "EventlogHealth",
+                    Rules: []*RulesConfig{
+                        &RulesConfig{
+                            Alert:       "EventLogUnhealthy",
+                            Expr:        "event_log_exporter_health_status == 0",
+                            For:         "3m",
+                            Labels:      map[string]string{},
+                            Annotations: map[string]string{"summary": "eventlog unhealthy"},
+                        },
+                        &RulesConfig{
+                            Alert:       "EventLogDown",
+                            Expr:        "event_log_exporter_instanse_up == 0",
+                            For:         "3m",
+                            Labels:      map[string]string{},
+                            Annotations: map[string]string{"summary": "eventlog service down"},
+                        },
+                    },
+                },
             },
         },
+        config: config,
     }
     return a
 }

-func (a *AlertingRulesConfig)LoadAlertingRulesConfig() error {
+func (a *AlertingRulesManager) LoadAlertingRulesConfig() error {
     logrus.Info("Load AlertingRules config file.")
-    content, err := ioutil.ReadFile("/etc/prometheus/rules.yml")
+    content, err := ioutil.ReadFile(a.config.AlertingRulesFile)
     if err != nil {
         logrus.Error("Failed to read AlertingRules config file: ", err)
         logrus.Info("Init config file by default values.")
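
To see roughly what these defaults look like once SaveAlertingRulesConfig serializes them, the sketch below marshals a one-group config with gopkg.in/yaml.v2. The groups/name/rules tag names on the outer stand-in structs are assumptions (only the RulesConfig tags are visible in this diff), so treat the output shape as approximate.

package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// Local stand-ins for AlertingRulesConfig / AlertingNameConfig / RulesConfig.
type rulesConfig struct {
	Alert       string            `yaml:"alert"`
	Expr        string            `yaml:"expr"`
	For         string            `yaml:"for"`
	Labels      map[string]string `yaml:"labels,omitempty"`
	Annotations map[string]string `yaml:"annotations"`
}

type alertingNameConfig struct {
	Name  string         `yaml:"name"`
	Rules []*rulesConfig `yaml:"rules"`
}

type alertingRulesConfig struct {
	Groups []*alertingNameConfig `yaml:"groups"`
}

func main() {
	cfg := alertingRulesConfig{
		Groups: []*alertingNameConfig{{
			Name: "BuilderHealth",
			Rules: []*rulesConfig{{
				Alert:       "BuilderUnhealthy",
				Expr:        "builder_exporter_health_status == 0",
				For:         "3m",
				Annotations: map[string]string{"summary": "builder unhealthy"},
			}},
		}},
	}
	data, err := yaml.Marshal(&cfg)
	if err != nil {
		panic(err)
	}
	// Roughly the document written to the configured AlertingRulesFile.
	fmt.Print(string(data))
}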
@@ -82,8 +165,7 @@ func (a *AlertingRulesConfig)LoadAlertingRulesConfig() error {
     return nil
 }

-
-func (a *AlertingRulesConfig)SaveAlertingRulesConfig() error {
+func (a *AlertingRulesManager) SaveAlertingRulesConfig() error {
     logrus.Debug("Save alerting rules config file.")

     data, err := yaml.Marshal(a)
@@ -92,7 +174,7 @@ func (a *AlertingRulesConfig)SaveAlertingRulesConfig() error {
         return err
     }

-    err = ioutil.WriteFile("/etc/prometheus/rules.yml", data, 0644)
+    err = ioutil.WriteFile(a.config.AlertingRulesFile, data, 0644)
     if err != nil {
         logrus.Error("Write alerting rules config file error.", err.Error())
         return err
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
func (a *AlertingRulesConfig) AddRules(val AlertingNameConfig) error {
|
||||
group := a.Groups
|
||||
func (a *AlertingRulesManager) AddRules(val AlertingNameConfig) error {
|
||||
group := a.RulesConfig.Groups
|
||||
group = append(group, &val)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (a *AlertingRulesConfig) InitRulesConfig() {
|
||||
_, err := os.Stat("/etc/prometheus/rules.yml") //os.Stat获取文件信息
|
||||
func (a *AlertingRulesManager) InitRulesConfig() {
|
||||
_, err := os.Stat(a.config.AlertingRulesFile) //os.Stat获取文件信息
|
||||
if err != nil {
|
||||
if os.IsExist(err) {
|
||||
return
|
||||
@ -119,4 +200,4 @@ func (a *AlertingRulesConfig) InitRulesConfig() {
|
||||
}
|
||||
return
|
||||
|
||||
}
|
||||
}
|
||||
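
For reference, here is a minimal sketch of the "write the default rules file only if it does not exist yet" pattern that InitRulesConfig is aiming for. This is a generic illustration using os.IsNotExist under assumed file paths, not a copy of the repository code:

package main

import (
	"io/ioutil"
	"log"
	"os"
)

func initRulesFile(path string, defaults []byte) error {
	// Only write defaults when the file is missing; leave an existing file alone.
	if _, err := os.Stat(path); err == nil {
		return nil // file already exists
	} else if !os.IsNotExist(err) {
		return err // some other stat error (permissions, etc.)
	}
	return ioutil.WriteFile(path, defaults, 0644)
}

func main() {
	if err := initRulesFile("/tmp/rules.yml", []byte("groups: []\n")); err != nil {
		log.Fatal(err)
	}
}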