[ADD] add default rules

zhoujunhao 2018-08-01 18:49:31 +08:00
parent b60afa8da9
commit 425bd5198a
7 changed files with 133 additions and 91 deletions

View File

@ -41,7 +41,7 @@ func main() {
c.CompleteConfig()
// start prometheus daemon and watch its status at all times; exit the monitor process if startup fails
a := prometheus.NewRulesManager()
a := prometheus.NewRulesManager(c)
p := prometheus.NewManager(c, a)
controllerManager := controller.NewControllerManager(a,p)

View File

@ -38,6 +38,7 @@ type Config struct {
StartArgs []string
ConfigFile string
AlertingRulesFile string
LocalStoragePath string
Web Web
Tsdb Tsdb
@ -96,6 +97,7 @@ func NewConfig() *Config {
LogLevel: "info",
ConfigFile: "/etc/prometheus/prometheus.yml",
AlertingRulesFile: "/etc/prometheus/rules.yml",
LocalStoragePath: "/prometheusdata",
WebTimeout: "5m",
RemoteFlushDeadline: "1m",
@ -128,6 +130,8 @@ func (c *Config) AddFlag(cmd *pflag.FlagSet) {
func (c *Config) AddPrometheusFlag(cmd *pflag.FlagSet) {
cmd.StringVar(&c.ConfigFile, "config.file", c.ConfigFile, "Prometheus configuration file path.")
cmd.StringVar(&c.AlertingRulesFile, "rules-config.file", c.AlertingRulesFile, "Prometheus alerting rules config file path.")
cmd.StringVar(&c.Web.ListenAddress, "web.listen-address", c.Web.ListenAddress, "Address to listen on for UI, API, and telemetry.")
cmd.StringVar(&c.WebTimeout, "web.read-timeout", c.WebTimeout, "Maximum duration before timing out read of the request, and closing idle connections.")
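
The new fields and flags above would be consumed roughly as in the following sketch; the flag-set name and the surrounding wiring are illustrative assumptions, since only the Config methods themselves appear in this diff.

package main

import (
    "fmt"
    "os"

    "github.com/goodrain/rainbond/cmd/monitor/option"
    "github.com/spf13/pflag"
)

func main() {
    // NewConfig fills the defaults added above, e.g. AlertingRulesFile = "/etc/prometheus/rules.yml"
    // and LocalStoragePath = "/prometheusdata".
    c := option.NewConfig()

    // Register the prometheus flags so operators can override the defaults,
    // e.g. --rules-config.file=/tmp/rules.yml.
    fs := pflag.NewFlagSet("monitor", pflag.ExitOnError)
    c.AddPrometheusFlag(fs)
    _ = fs.Parse(os.Args[1:])

    fmt.Println(c.AlertingRulesFile, c.LocalStoragePath)
}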

View File

@ -47,7 +47,7 @@ func Run(s *option.WebCliServer) error {
return err
}
defer ap.Exit()
keepalive, err := discover.CreateKeepAlive(s.EtcdEndPoints, "acp_webcli", s.HostName, s.HostIP, s.Port)
keepalive, err := discover.CreateKeepAlive(s.EtcdEndPoints, "acp_webcli", s.HostName, s.HostIP, 6301)
if err != nil {
return err
}

View File

@ -6,7 +6,6 @@ import (
"github.com/goodrain/rainbond/grctl/clients"
"fmt"
"github.com/ghodss/yaml"
"encoding/json"
"github.com/goodrain/rainbond/node/api/model"
"errors"
)
@ -56,10 +55,9 @@ func NewCmdAlerting() cli.Command {
logrus.Errorf("need args")
return nil
}
v, err := clients.RegionClient.Monitor().DelRule(name)
_, err := clients.RegionClient.Monitor().DelRule(name)
handleErr(err)
result, _ := json.Marshal(v.Bean)
fmt.Println(string(result))
fmt.Println("Delete rule succeeded")
return nil
},
},
@ -78,13 +76,11 @@ func NewCmdAlerting() cli.Command {
if c.IsSet("Rules") {
rules := c.String("Rules")
println("====>", rules)
var rulesConfig model.AlertingNameConfig
yaml.Unmarshal([]byte(rules), &rulesConfig)
v, err := clients.RegionClient.Monitor().AddRule(&rulesConfig)
_, err := clients.RegionClient.Monitor().AddRule(&rulesConfig)
handleErr(err)
result, _ := json.Marshal(v.Bean)
fmt.Println(string(result))
fmt.Println("Add rule successfully")
return nil
}
return errors.New("rules not null")
@ -110,13 +106,11 @@ func NewCmdAlerting() cli.Command {
if c.IsSet("RulesName") && c.IsSet("Rules") {
rules := c.String("Rules")
ruleName := c.String("RulesName")
println("====>", rules)
var rulesConfig model.AlertingNameConfig
yaml.Unmarshal([]byte(rules), &rulesConfig)
v, err := clients.RegionClient.Monitor().RegRule(ruleName, &rulesConfig)
_, err := clients.RegionClient.Monitor().RegRule(ruleName, &rulesConfig)
handleErr(err)
result, _ := json.Marshal(v.Bean)
fmt.Println(string(result))
fmt.Println("Modify rule successfully")
return nil
}
return errors.New("rule name or rules not null")

View File

@ -35,32 +35,13 @@ func (c *ControllerManager) AddRules(w http.ResponseWriter, r *http.Request) {
var RulesConfig prometheus.AlertingNameConfig
unmarshalErr := json.Unmarshal(in, &RulesConfig)
if unmarshalErr != nil{
logrus.Info("反序列化错误",unmarshalErr)
if unmarshalErr != nil {
logrus.Info(unmarshalErr)
httputil.ReturnError(r, w, 400, err.Error())
return
}
//err = ioutil.WriteFile("/etc/prometheus/cache_rule.yml", in, 0644)
//if err != nil {
// logrus.Error(err.Error())
//}
//
//content, err := ioutil.ReadFile("/etc/prometheus/cache_rule.yml")
//if err != nil {
// logrus.Error( err)
//
//}
//
//if err := yaml.Unmarshal(content, &RulesConfig); err != nil {
// logrus.Error("Unmarshal prometheus alerting rules config string to object error.", err.Error())
// httputil.ReturnError(r, w, 400, err.Error())
// return
//}
println("======01")
c.Rules.RulesConfig.LoadAlertingRulesConfig()
println("======02")
group := c.Rules.RulesConfig.Groups
for _, v := range group {
@ -69,14 +50,9 @@ func (c *ControllerManager) AddRules(w http.ResponseWriter, r *http.Request) {
return
}
}
println("======03")
println("=====>", RulesConfig.Name)
group = append(group, &RulesConfig)
c.Rules.RulesConfig.Groups = group
println("======04")
c.Rules.RulesConfig.SaveAlertingRulesConfig()
println("======05")
c.Manager.RestartDaemon()
httputil.ReturnSuccess(r, w, "Add rule successfully")
@ -105,7 +81,9 @@ func (c *ControllerManager) DelRules(w http.ResponseWriter, r *http.Request) {
for i, v := range groupsList {
if v.Name == rulesName {
groupsList = append(groupsList[:i], groupsList[i+1:]...)
c.Rules.RulesConfig.Groups = groupsList
c.Rules.RulesConfig.SaveAlertingRulesConfig()
c.Manager.RestartDaemon()
httputil.ReturnSuccess(r, w, "successfully deleted")
return
}
@ -124,34 +102,19 @@ func (c *ControllerManager) RegRules(w http.ResponseWriter, r *http.Request) {
var RulesConfig prometheus.AlertingNameConfig
unmarshalErr := json.Unmarshal(in, &RulesConfig)
if unmarshalErr != nil{
logrus.Info("反序列化错误",unmarshalErr)
if unmarshalErr != nil {
logrus.Info(unmarshalErr)
httputil.ReturnError(r, w, 400, err.Error())
return
}
//err = ioutil.WriteFile("/etc/prometheus/cache_rule.yml", in, 0644)
//if err != nil {
// logrus.Error(err.Error())
//}
//
//content, err := ioutil.ReadFile("/etc/prometheus/cache_rule.yml")
//if err != nil {
// logrus.Error(err)
//
//}
//
//if err := yaml.Unmarshal(content, &RulesConfig); err != nil {
// logrus.Error("Unmarshal prometheus alerting rules config string to object error.", err.Error())
// httputil.ReturnError(r, w, 400, err.Error())
// return
//}
c.Rules.RulesConfig.LoadAlertingRulesConfig()
group := c.Rules.RulesConfig.Groups
for i, v := range group {
if v.Name == rulesName {
group[i] = &RulesConfig
c.Manager.RestartDaemon()
httputil.ReturnSuccess(r, w, "Update rule succeeded")
c.Rules.RulesConfig.SaveAlertingRulesConfig()
return
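
Both AddRules and RegRules decode the request body straight into prometheus.AlertingNameConfig before reloading, mutating and saving the rules file and restarting the daemon. A hedged sketch of a matching request body, built from the exported fields visible in this commit (the monitor/prometheus import path and the endpoint wiring are assumptions):

package main

import (
    "encoding/json"
    "fmt"

    "github.com/goodrain/rainbond/monitor/prometheus"
)

func main() {
    // The same structure the handlers json.Unmarshal the request body into.
    group := prometheus.AlertingNameConfig{
        Name: "CustomHealth",
        Rules: []*prometheus.RulesConfig{{
            Alert:       "CustomDown",
            Expr:        "custom_exporter_health_status == 0",
            For:         "3m",
            Labels:      map[string]string{},
            Annotations: map[string]string{"summary": "custom component unhealthy"},
        }},
    }
    body, err := json.Marshal(group)
    if err != nil {
        panic(err)
    }
    // body is the JSON payload a client would POST to the AddRules / RegRules handlers.
    fmt.Println(string(body))
}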

View File

@ -74,7 +74,7 @@ func NewManager(config *option.Config, a *AlertingRulesManager) *Manager {
ScrapeInterval: model.Duration(time.Second * 5),
EvaluationInterval: model.Duration(time.Second * 30),
},
RuleFiles: []string{"/etc/prometheus/rules.yml"},
RuleFiles: []string{config.AlertingRulesFile},
},
Registry: reg,
httpClient: client,
@ -82,7 +82,7 @@ func NewManager(config *option.Config, a *AlertingRulesManager) *Manager {
a: a,
}
m.LoadConfig()
m.a.RulesConfig.InitRulesConfig()
m.a.InitRulesConfig()
return m
}

View File

@ -5,6 +5,8 @@ import (
"io/ioutil"
"gopkg.in/yaml.v2"
"os"
"github.com/goodrain/rainbond/cmd/monitor/option"
)
type AlertingRulesConfig struct {
@ -17,57 +19,138 @@ type AlertingNameConfig struct {
}
type RulesConfig struct {
Alert string `yaml:"alert" json:"alert"`
Expr string `yaml:"expr" json:"expr"`
For string `yaml:"for" json:"for"`
Labels map[string]string `yaml:"labels" json:"labels"`
Alert string `yaml:"alert" json:"alert"`
Expr string `yaml:"expr" json:"expr"`
For string `yaml:"for" json:"for"`
Labels map[string]string `yaml:"labels" json:"labels"`
Annotations map[string]string `yaml:"annotations" json:"annotations"`
}
type AlertingRulesManager struct {
RulesConfig *AlertingRulesConfig
config *option.Config
}
func NewRulesManager() *AlertingRulesManager {
a:= &AlertingRulesManager{
func NewRulesManager(config *option.Config) *AlertingRulesManager {
a := &AlertingRulesManager{
RulesConfig: &AlertingRulesConfig{
Groups:[]*AlertingNameConfig{
Groups: []*AlertingNameConfig{
&AlertingNameConfig{
Name: "test",
Name: "InstanceHealth",
Rules: []*RulesConfig{
&RulesConfig{
Alert: "MqHealth",
Expr: "acp_mq_exporter_health_status{job='mq'} < 1",
For: "2m",
Labels: map[string]string{"service_name": "mq"},
Annotations: map[string]string{"summary": "unhealthy"},
Alert: "InstanceDown",
Expr: "up == 0",
For: "3m",
Labels: map[string]string{},
Annotations: map[string]string{"summary": "instance {{$labels.instance}} down", "description": "{{$labels.instance}} of job {{$labels.job}} has been down for more than 3 minutes"},
},
},
},
&AlertingNameConfig{
Name: "test2",
Name: "BuilderHealth",
Rules: []*RulesConfig{
&RulesConfig{
Alert: "builderHealth",
Expr: "acp_mq_exporter_health_status{job='mq'} < 1",
For: "5m",
Labels: map[string]string{"service_name": "builder"},
Annotations: map[string]string{"summary": "unhealthy"},
Alert: "BuilderUnhealthy",
Expr: "builder_exporter_health_status == 0",
For: "3m",
Labels: map[string]string{},
Annotations: map[string]string{"summary": "builder unhealthy"},
},
&RulesConfig{
Alert: "BuilderTaskError",
Expr: "builder_exporter_builder_task_error > 30",
For: "3m",
Labels: map[string]string{},
Annotations: map[string]string{"summary": "Builder execution task error number is greater than 30"},
},
},
},
&AlertingNameConfig{
Name: "WorkerHealth",
Rules: []*RulesConfig{
&RulesConfig{
Alert: "WorkerUnhealthy",
Expr: "app_resource_exporter_health_status == 0",
For: "3m",
Labels: map[string]string{},
Annotations: map[string]string{"summary": "worker unhealthy"},
},
&RulesConfig{
Alert: "WorkerTaskError",
Expr: "app_resource_exporter_worker_task_error > 50",
For: "3m",
Labels: map[string]string{},
Annotations: map[string]string{"summary": "worker execution task error number is greater than 50"},
},
},
},
&AlertingNameConfig{
Name: "EntranceHealth",
Rules: []*RulesConfig{
&RulesConfig{
Alert: "EntranceUnHealthy",
Expr: "acp_entrance_exporter_health_status == 0",
For: "3m",
Labels: map[string]string{},
Annotations: map[string]string{"summary": "entrance unhealthy"},
},
},
},
&AlertingNameConfig{
Name: "MqHealth",
Rules: []*RulesConfig{
&RulesConfig{
Alert: "MqUnhealthy",
Expr: "acp_mq_exporter_health_status == 0",
For: "3m",
Labels: map[string]string{},
Annotations: map[string]string{"summary": "mq unhealthy"},
},
&RulesConfig{
Alert: "TeamTaskMany",
Expr: "acp_mq_dequeue_number-acp_mq_enqueue_number > 200",
For: "3m",
Labels: map[string]string{},
Annotations: map[string]string{"summary": "The number of tasks in the queue is greater than 200"},
},
},
},
&AlertingNameConfig{
Name: "EventlogHealth",
Rules: []*RulesConfig{
&RulesConfig{
Alert: "EventLogUnhealthy",
Expr: "event_log_exporter_health_status == 0",
For: "3m",
Labels: map[string]string{},
Annotations: map[string]string{"summary": "eventlog unhealthy"},
},
&RulesConfig{
Alert: "EventLogDown",
Expr: "event_log_exporter_instanse_up == 0",
For: "3m",
Labels: map[string]string{},
Annotations: map[string]string{"summary": "eventlog service down"},
},
},
},
},
},
config: config,
}
return a
}
func (a *AlertingRulesConfig)LoadAlertingRulesConfig() error {
func (a *AlertingRulesManager) LoadAlertingRulesConfig() error {
logrus.Info("Load AlertingRules config file.")
content, err := ioutil.ReadFile("/etc/prometheus/rules.yml")
content, err := ioutil.ReadFile(a.config.AlertingRulesFile)
if err != nil {
logrus.Error("Failed to read AlertingRules config file: ", err)
logrus.Info("Init config file by default values.")
@ -82,8 +165,7 @@ func (a *AlertingRulesConfig)LoadAlertingRulesConfig() error {
return nil
}
func (a *AlertingRulesConfig)SaveAlertingRulesConfig() error {
func (a *AlertingRulesManager) SaveAlertingRulesConfig() error {
logrus.Debug("Save alerting rules config file.")
data, err := yaml.Marshal(a.RulesConfig)
@ -92,7 +174,7 @@ func (a *AlertingRulesConfig)SaveAlertingRulesConfig() error {
return err
}
err = ioutil.WriteFile("/etc/prometheus/rules.yml", data, 0644)
err = ioutil.WriteFile(a.config.AlertingRulesFile, data, 0644)
if err != nil {
logrus.Error("Write alerting rules config file error.", err.Error())
return err
@ -101,15 +183,14 @@ func (a *AlertingRulesConfig)SaveAlertingRulesConfig() error {
return nil
}
func (a *AlertingRulesConfig) AddRules(val AlertingNameConfig) error {
group := a.Groups
func (a *AlertingRulesManager) AddRules(val AlertingNameConfig) error {
group := a.RulesConfig.Groups
a.RulesConfig.Groups = append(group, &val)
return nil
}
func (a *AlertingRulesConfig) InitRulesConfig() {
_, err := os.Stat("/etc/prometheus/rules.yml") //os.Stat获取文件信息
func (a *AlertingRulesManager) InitRulesConfig() {
_, err := os.Stat(a.config.AlertingRulesFile) // os.Stat gets the file info
if err != nil {
if os.IsExist(err) {
return
@ -119,4 +200,4 @@ func (a *AlertingRulesConfig) InitRulesConfig() {
}
return
}
}
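
Putting the new pieces together, a minimal end-to-end sketch of the manager's lifecycle, using only the constructor and methods defined in this file (the monitor/prometheus import path is assumed; error handling is abbreviated):

package main

import (
    "log"

    "github.com/goodrain/rainbond/cmd/monitor/option"
    "github.com/goodrain/rainbond/monitor/prometheus"
)

func main() {
    c := option.NewConfig()

    // Construct the manager with the default rule groups defined above.
    m := prometheus.NewRulesManager(c)

    // Write the defaults to c.AlertingRulesFile when no rules file exists yet.
    m.InitRulesConfig()

    // Reload what is on disk, add a group in memory, and persist it again.
    if err := m.LoadAlertingRulesConfig(); err != nil {
        log.Fatal(err)
    }
    m.RulesConfig.Groups = append(m.RulesConfig.Groups, &prometheus.AlertingNameConfig{Name: "CustomHealth"})
    if err := m.SaveAlertingRulesConfig(); err != nil {
        log.Fatal(err)
    }
}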