diff --git a/cmd/monitor/main.go b/cmd/monitor/main.go
index 0e5ba622e..27a67c03a 100644
--- a/cmd/monitor/main.go
+++ b/cmd/monitor/main.go
@@ -41,7 +41,7 @@ func main() {
 	c.CompleteConfig()
 
 	// start prometheus daemon and watching tis status in all time, exit monitor process if start failed
-	a := prometheus.NewRulesManager()
+	a := prometheus.NewRulesManager(c)
 	p := prometheus.NewManager(c, a)
 	controllerManager := controller.NewControllerManager(a,p)
diff --git a/cmd/monitor/option/option.go b/cmd/monitor/option/option.go
index b6872bf07..1b07f5c03 100644
--- a/cmd/monitor/option/option.go
+++ b/cmd/monitor/option/option.go
@@ -38,6 +38,7 @@ type Config struct {
 	StartArgs         []string
 	ConfigFile        string
+	AlertingRulesFile string
 	LocalStoragePath  string
 	Web               Web
 	Tsdb              Tsdb
@@ -96,6 +97,7 @@ func NewConfig() *Config {
 		LogLevel:            "info",
 		ConfigFile:          "/etc/prometheus/prometheus.yml",
+		AlertingRulesFile:   "/etc/prometheus/rules.yml",
 		LocalStoragePath:    "/prometheusdata",
 		WebTimeout:          "5m",
 		RemoteFlushDeadline: "1m",
@@ -128,6 +130,8 @@ func (c *Config) AddFlag(cmd *pflag.FlagSet) {
 func (c *Config) AddPrometheusFlag(cmd *pflag.FlagSet) {
 	cmd.StringVar(&c.ConfigFile, "config.file", c.ConfigFile, "Prometheus configuration file path.")
+	cmd.StringVar(&c.AlertingRulesFile, "rules-config.file", c.AlertingRulesFile, "Prometheus alerting rules config file path.")
+
 	cmd.StringVar(&c.Web.ListenAddress, "web.listen-address", c.Web.ListenAddress, "Address to listen on for UI, API, and telemetry.")
 
 	cmd.StringVar(&c.WebTimeout, "web.read-timeout", c.WebTimeout, "Maximum duration before timing out read of the request, and closing idle connections.")
diff --git a/cmd/webcli/server/server.go b/cmd/webcli/server/server.go
index fb2ace8bb..39e65ebb1 100644
--- a/cmd/webcli/server/server.go
+++ b/cmd/webcli/server/server.go
@@ -47,7 +47,7 @@ func Run(s *option.WebCliServer) error {
 		return err
 	}
 	defer ap.Exit()
-	keepalive, err := discover.CreateKeepAlive(s.EtcdEndPoints, "acp_webcli", s.HostName, s.HostIP, s.Port)
+	keepalive, err := discover.CreateKeepAlive(s.EtcdEndPoints, "acp_webcli", s.HostName, s.HostIP, 6301)
 	if err != nil {
 		return err
 	}
diff --git a/grctl/cmd/monitor.go b/grctl/cmd/monitor.go
index 4cdc32cb0..819c741f5 100644
--- a/grctl/cmd/monitor.go
+++ b/grctl/cmd/monitor.go
@@ -6,7 +6,6 @@ import (
 	"github.com/goodrain/rainbond/grctl/clients"
 	"fmt"
 	"github.com/ghodss/yaml"
-	"encoding/json"
 	"github.com/goodrain/rainbond/node/api/model"
 	"errors"
 )
@@ -56,10 +55,9 @@ func NewCmdAlerting() cli.Command {
 					logrus.Errorf("need args")
 					return nil
 				}
-				v, err := clients.RegionClient.Monitor().DelRule(name)
+				_, err := clients.RegionClient.Monitor().DelRule(name)
 				handleErr(err)
-				result, _ := json.Marshal(v.Bean)
-				fmt.Println(string(result))
+				fmt.Println("Delete rule succeeded")
 				return nil
 			},
 		},
@@ -78,13 +76,11 @@ func NewCmdAlerting() cli.Command {
 				if c.IsSet("Rules") {
 					rules := c.String("Rules")
-					println("====>", rules)
 					var rulesConfig model.AlertingNameConfig
 					yaml.Unmarshal([]byte(rules), &rulesConfig)
-					v, err := clients.RegionClient.Monitor().AddRule(&rulesConfig)
+					_, err := clients.RegionClient.Monitor().AddRule(&rulesConfig)
 					handleErr(err)
-					result, _ := json.Marshal(v.Bean)
-					fmt.Println(string(result))
+					fmt.Println("Add rule succeeded")
 					return nil
 				}
 				return errors.New("rules not null")
@@ -110,13 +106,11 @@ func NewCmdAlerting() cli.Command {
 				if c.IsSet("RulesName") && c.IsSet("Rules") {
 					rules := c.String("Rules")
 					ruleName := c.String("RulesName")
-					println("====>", rules)
 					var rulesConfig model.AlertingNameConfig
 					yaml.Unmarshal([]byte(rules), &rulesConfig)
-					v, err := clients.RegionClient.Monitor().RegRule(ruleName, &rulesConfig)
+					_, err := clients.RegionClient.Monitor().RegRule(ruleName, &rulesConfig)
 					handleErr(err)
-					result, _ := json.Marshal(v.Bean)
-					fmt.Println(string(result))
+					fmt.Println("Modify rule succeeded")
 					return nil
 				}
 				return errors.New("rule name or rules not null")
diff --git a/monitor/api/controller/rules.go b/monitor/api/controller/rules.go
index cc03d63f7..07dd77ff5 100644
--- a/monitor/api/controller/rules.go
+++ b/monitor/api/controller/rules.go
@@ -35,32 +35,13 @@ func (c *ControllerManager) AddRules(w http.ResponseWriter, r *http.Request) {
 	var RulesConfig prometheus.AlertingNameConfig
 
 	unmarshalErr := json.Unmarshal(in, &RulesConfig)
-	if unmarshalErr != nil{
-		logrus.Info("反序列化错误",unmarshalErr)
+	if unmarshalErr != nil {
+		logrus.Info(unmarshalErr)
 		httputil.ReturnError(r, w, 400, err.Error())
 		return
 	}
-
-	//err = ioutil.WriteFile("/etc/prometheus/cache_rule.yml", in, 0644)
-	//if err != nil {
-	//	logrus.Error(err.Error())
-	//}
-	//
-	//content, err := ioutil.ReadFile("/etc/prometheus/cache_rule.yml")
-	//if err != nil {
-	//	logrus.Error( err)
-	//
-	//}
-	//
-	//if err := yaml.Unmarshal(content, &RulesConfig); err != nil {
-	//	logrus.Error("Unmarshal prometheus alerting rules config string to object error.", err.Error())
-	//	httputil.ReturnError(r, w, 400, err.Error())
-	//	return
-	//}
-	println("======01")
 	c.Rules.RulesConfig.LoadAlertingRulesConfig()
-	println("======02")
 	group := c.Rules.RulesConfig.Groups
 
 	for _, v := range group {
@@ -69,14 +50,9 @@
 			return
 		}
 	}
-	println("======03")
-
-	println("=====>", RulesConfig.Name)
 	group = append(group, &RulesConfig)
 	c.Rules.RulesConfig.Groups = group
-	println("======04")
 	c.Rules.RulesConfig.SaveAlertingRulesConfig()
-	println("======05")
 	c.Manager.RestartDaemon()
 
 	httputil.ReturnSuccess(r, w, "Add rule successfully")
@@ -105,7 +81,9 @@ func (c *ControllerManager) DelRules(w http.ResponseWriter, r *http.Request) {
 	for i, v := range groupsList {
 		if v.Name == rulesName {
 			groupsList = append(groupsList[:i], groupsList[i+1:]...)
+			c.Rules.RulesConfig.Groups = groupsList
 			c.Rules.RulesConfig.SaveAlertingRulesConfig()
+			c.Manager.RestartDaemon()
 			httputil.ReturnSuccess(r, w, "successfully deleted")
 			return
 		}
 	}
@@ -124,34 +102,19 @@ func (c *ControllerManager) RegRules(w http.ResponseWriter, r *http.Request) {
 	var RulesConfig prometheus.AlertingNameConfig
 
 	unmarshalErr := json.Unmarshal(in, &RulesConfig)
-	if unmarshalErr != nil{
-		logrus.Info("反序列化错误",unmarshalErr)
+	if unmarshalErr != nil {
+		logrus.Info(unmarshalErr)
 		httputil.ReturnError(r, w, 400, err.Error())
 		return
 	}
-	//err = ioutil.WriteFile("/etc/prometheus/cache_rule.yml", in, 0644)
-	//if err != nil {
-	//	logrus.Error(err.Error())
-	//}
-	//
-	//content, err := ioutil.ReadFile("/etc/prometheus/cache_rule.yml")
-	//if err != nil {
-	//	logrus.Error(err)
-	//
-	//}
-	//
-	//if err := yaml.Unmarshal(content, &RulesConfig); err != nil {
-	//	logrus.Error("Unmarshal prometheus alerting rules config string to object error.", err.Error())
-	//	httputil.ReturnError(r, w, 400, err.Error())
-	//	return
-	//}
 	c.Rules.RulesConfig.LoadAlertingRulesConfig()
 	group := c.Rules.RulesConfig.Groups
 	for i, v := range group {
 		if v.Name == rulesName {
 			group[i] = &RulesConfig
+			c.Manager.RestartDaemon()
 			httputil.ReturnSuccess(r, w, "Update rule succeeded")
 			c.Rules.RulesConfig.SaveAlertingRulesConfig()
 			return
diff --git a/monitor/prometheus/manager.go b/monitor/prometheus/manager.go
index 929df5cda..5c39bf451 100644
--- a/monitor/prometheus/manager.go
+++ b/monitor/prometheus/manager.go
@@ -74,7 +74,7 @@ func NewManager(config *option.Config, a *AlertingRulesManager) *Manager {
 				ScrapeInterval:     model.Duration(time.Second * 5),
 				EvaluationInterval: model.Duration(time.Second * 30),
 			},
-			RuleFiles: []string{"/etc/prometheus/rules.yml"},
+			RuleFiles: []string{config.AlertingRulesFile},
 		},
 		Registry:   reg,
 		httpClient: client,
@@ -82,7 +82,7 @@ func NewManager(config *option.Config, a *AlertingRulesManager) *Manager {
 		a:          a,
 	}
 	m.LoadConfig()
-	m.a.RulesConfig.InitRulesConfig()
+	m.a.InitRulesConfig()
 	return m
 }
diff --git a/monitor/prometheus/rules_manager.go b/monitor/prometheus/rules_manager.go
index 5dc4c32bf..3b04076ff 100644
--- a/monitor/prometheus/rules_manager.go
+++ b/monitor/prometheus/rules_manager.go
@@ -5,6 +5,8 @@ import (
 	"io/ioutil"
 	"gopkg.in/yaml.v2"
 	"os"
+	"github.com/goodrain/rainbond/cmd/monitor/option"
+
 )
 
 type AlertingRulesConfig struct {
@@ -17,57 +19,138 @@ type AlertingNameConfig struct {
 }
 
 type RulesConfig struct {
-	Alert string `yaml:"alert" json:"alert"`
-	Expr string `yaml:"expr" json:"expr"`
-	For string `yaml:"for" json:"for"`
-	Labels map[string]string `yaml:"labels" json:"labels"`
+	Alert       string            `yaml:"alert" json:"alert"`
+	Expr        string            `yaml:"expr" json:"expr"`
+	For         string            `yaml:"for" json:"for"`
+	Labels      map[string]string `yaml:"labels" json:"labels"`
 	Annotations map[string]string `yaml:"annotations" json:"annotations"`
 }
 
 type AlertingRulesManager struct {
 	RulesConfig *AlertingRulesConfig
-
+	config      *option.Config
 }
 
-func NewRulesManager() *AlertingRulesManager {
-	a:= &AlertingRulesManager{
+func NewRulesManager(config *option.Config) *AlertingRulesManager {
+	a := &AlertingRulesManager{
 		RulesConfig: &AlertingRulesConfig{
-			Groups:[]*AlertingNameConfig{
+			Groups: []*AlertingNameConfig{
 				&AlertingNameConfig{
-					Name: "test",
+					Name: "InstanceHealth",
 					Rules: []*RulesConfig{
 						&RulesConfig{
-							Alert: "MqHealth",
-							Expr: "acp_mq_exporter_health_status{job='mq'} < 1",
-							For: "2m",
-							Labels: map[string]string{"service_name": "mq"},
-							Annotations: map[string]string{"summary": "unhealthy"},
+							Alert:       "InstanceDown",
+							Expr:        "up == 0",
+							For:         "3m",
+							Labels:      map[string]string{},
+							Annotations: map[string]string{"summary": "instance {{$labels.instance}} down", "description": "{{$labels.instance}} of job {{$labels.job}} has been down for more than 3 minutes"},
 						},
 					},
 				},
 				&AlertingNameConfig{
-					Name: "test2",
+					Name: "BuilderHealth",
 					Rules: []*RulesConfig{
 						&RulesConfig{
-							Alert: "builderHealth",
-							Expr: "acp_mq_exporter_health_status{job='mq'} < 1",
-							For: "5m",
-							Labels: map[string]string{"service_name": "builder"},
-							Annotations: map[string]string{"summary": "unhealthy"},
+							Alert:       "BuilderUnhealthy",
+							Expr:        "builder_exporter_health_status == 0",
+							For:         "3m",
+							Labels:      map[string]string{},
+							Annotations: map[string]string{"summary": "builder unhealthy"},
+						},
+						&RulesConfig{
+							Alert:       "BuilderTaskError",
+							Expr:        "builder_exporter_builder_task_error > 30",
+							For:         "3m",
+							Labels:      map[string]string{},
+							Annotations: map[string]string{"summary": "Builder execution task error number is greater than 30"},
+						},
+					},
+				},
+				&AlertingNameConfig{
+
+					Name: "WorkerHealth",
+					Rules: []*RulesConfig{
+						&RulesConfig{
+							Alert:       "WorkerUnhealthy",
+							Expr:        "app_resource_exporter_health_status == 0",
+							For:         "3m",
+							Labels:      map[string]string{},
+							Annotations: map[string]string{"summary": "worker unhealthy"},
+						},
+						&RulesConfig{
+							Alert:       "WorkerTaskError",
+							Expr:        "app_resource_exporter_worker_task_error > 50",
+							For:         "3m",
+							Labels:      map[string]string{},
+							Annotations: map[string]string{"summary": "worker execution task error number is greater than 50"},
+						},
+					},
+				},
+				&AlertingNameConfig{
+
+					Name: "EntranceHealth",
+					Rules: []*RulesConfig{
+						&RulesConfig{
+							Alert:       "EntranceUnHealthy",
+							Expr:        "acp_entrance_exporter_health_status == 0",
+							For:         "3m",
+							Labels:      map[string]string{},
+							Annotations: map[string]string{"summary": "entrance unhealthy"},
+						},
+					},
+				},
+				&AlertingNameConfig{
+
+					Name: "MqHealth",
+					Rules: []*RulesConfig{
+						&RulesConfig{
+							Alert:       "MqUnhealthy",
+							Expr:        "acp_mq_exporter_health_status == 0",
+							For:         "3m",
+							Labels:      map[string]string{},
+							Annotations: map[string]string{"summary": "mq unhealthy"},
+						},
+						&RulesConfig{
+							Alert:       "TeamTaskMany",
+							Expr:        "acp_mq_dequeue_number-acp_mq_enqueue_number > 200",
+							For:         "3m",
+							Labels:      map[string]string{},
+							Annotations: map[string]string{"summary": "The number of tasks in the queue is greater than 200"},
+						},
+					},
+				},
+				&AlertingNameConfig{
+
+					Name: "EventlogHealth",
+					Rules: []*RulesConfig{
+						&RulesConfig{
+							Alert:       "EventLogUnhealthy",
+							Expr:        "event_log_exporter_health_status == 0",
+							For:         "3m",
+							Labels:      map[string]string{},
+							Annotations: map[string]string{"summary": "eventlog unhealthy"},
+						},
+						&RulesConfig{
+							Alert:       "EventLogDown",
+							Expr:        "event_log_exporter_instanse_up == 0",
+							For:         "3m",
+							Labels:      map[string]string{},
+							Annotations: map[string]string{"summary": "eventlog service down"},
 						},
 					},
 				},
 			},
 		},
+		config: config,
 	}
 	return a
 }
 
-func (a *AlertingRulesConfig)LoadAlertingRulesConfig() error {
+func (a *AlertingRulesManager) LoadAlertingRulesConfig() error {
 	logrus.Info("Load AlertingRules config file.")
-	content, err := ioutil.ReadFile("/etc/prometheus/rules.yml")
+	content, err := ioutil.ReadFile(a.config.AlertingRulesFile)
 	if err != nil {
 		logrus.Error("Failed to read AlertingRules config file: ", err)
 		logrus.Info("Init config file by default values.")
@@ -82,8 +165,7 @@ func (a *AlertingRulesConfig)LoadAlertingRulesConfig() error {
 	return nil
 }
 
-
-func (a *AlertingRulesConfig)SaveAlertingRulesConfig() error {
+func (a *AlertingRulesManager) SaveAlertingRulesConfig() error {
 	logrus.Debug("Save alerting rules config file.")
 
 	data, err := yaml.Marshal(a)
@@ -92,7 +174,7 @@ func (a *AlertingRulesConfig)SaveAlertingRulesConfig() error {
 		return err
 	}
-	err = ioutil.WriteFile("/etc/prometheus/rules.yml", data, 0644)
+	err = ioutil.WriteFile(a.config.AlertingRulesFile, data, 0644)
 	if err != nil {
 		logrus.Error("Write alerting rules config file error.", err.Error())
 		return err
 	}
@@ -101,15 +183,14 @@ func (a *AlertingRulesConfig)SaveAlertingRulesConfig() error {
 	return nil
 }
 
-
-func (a *AlertingRulesConfig) AddRules(val AlertingNameConfig) error {
-	group := a.Groups
+func (a *AlertingRulesManager) AddRules(val AlertingNameConfig) error {
+	group := a.RulesConfig.Groups
 	group = append(group, &val)
 	return nil
 }
 
-func (a *AlertingRulesConfig) InitRulesConfig() {
-	_, err := os.Stat("/etc/prometheus/rules.yml") //os.Stat获取文件信息
+func (a *AlertingRulesManager) InitRulesConfig() {
+	_, err := os.Stat(a.config.AlertingRulesFile) // os.Stat gets the file info
 	if err != nil {
 		if os.IsExist(err) {
 			return
@@ -119,4 +200,4 @@ func (a *AlertingRulesConfig) InitRulesConfig() {
 	}
 	return
-}
\ No newline at end of file
+}
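
For orientation only, here is a minimal, hypothetical sketch (not part of the patch above) of how the reworked rules API fits together end to end. The import path github.com/goodrain/rainbond/monitor/prometheus and the "APIHealth" example group are assumptions for illustration; option.NewConfig, Config.AlertingRulesFile, NewRulesManager, InitRulesConfig, LoadAlertingRulesConfig, SaveAlertingRulesConfig and the rule structs are used as introduced in the diff.

package main

import (
	"github.com/goodrain/rainbond/cmd/monitor/option"
	"github.com/goodrain/rainbond/monitor/prometheus" // assumed import path, inferred from the file paths in the diff
)

func main() {
	// NewConfig defaults AlertingRulesFile to /etc/prometheus/rules.yml; the new
	// field (or the --rules-config.file flag) lets us point it somewhere writable.
	c := option.NewConfig()
	c.AlertingRulesFile = "/tmp/rules.yml"

	// The rules manager now receives the config, so every load/save below goes
	// through c.AlertingRulesFile instead of a hard-coded path.
	a := prometheus.NewRulesManager(c)
	a.InitRulesConfig()         // create the file with the default groups if it does not exist yet
	a.LoadAlertingRulesConfig() // read the file back into a.RulesConfig

	// Append one more group the same way the AddRules HTTP handler does, then persist it.
	// (The handlers additionally call Manager.RestartDaemon() so Prometheus reloads the file.)
	a.RulesConfig.Groups = append(a.RulesConfig.Groups, &prometheus.AlertingNameConfig{
		Name: "APIHealth", // hypothetical group name, for illustration only
		Rules: []*prometheus.RulesConfig{
			{
				Alert:       "APIDown",
				Expr:        "up{job=\"api\"} == 0",
				For:         "3m",
				Labels:      map[string]string{},
				Annotations: map[string]string{"summary": "api instance down"},
			},
		},
	})
	a.SaveAlertingRulesConfig()
}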