Rainbond/monitor/prometheus/rules_manager.go

package prometheus

import (
	"io/ioutil"
	"os"

	"github.com/Sirupsen/logrus"
	"github.com/goodrain/rainbond/cmd/monitor/option"
	yaml "gopkg.in/yaml.v2"
)

//AlertingRulesConfig alerting rule config
type AlertingRulesConfig struct {
	Groups []*AlertingNameConfig `yaml:"groups" json:"groups"`
}

//AlertingNameConfig alerting config
type AlertingNameConfig struct {
	Name  string         `yaml:"name" json:"name"`
	Rules []*RulesConfig `yaml:"rules" json:"rules"`
}

//RulesConfig rule config
type RulesConfig struct {
	Alert       string            `yaml:"alert" json:"alert"`
	Expr        string            `yaml:"expr" json:"expr"`
	For         string            `yaml:"for" json:"for"`
	Labels      map[string]string `yaml:"labels" json:"labels"`
	Annotations map[string]string `yaml:"annotations" json:"annotations"`
}
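
// For reference, these structures map onto the standard Prometheus rule-file
// layout. An illustrative sketch of the YAML produced from the default rules
// defined below in NewRulesManager:
//
//	groups:
//	- name: BuilderHealth
//	  rules:
//	  - alert: BuilderUnhealthy
//	    expr: builder_exporter_health_status == 0
//	    for: 3m
//	    labels: {}
//	    annotations:
//	      summary: builder unhealthy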

//AlertingRulesManager alerting rule manager
type AlertingRulesManager struct {
	RulesConfig *AlertingRulesConfig
	config      *option.Config
}

//NewRulesManager new rule manager
func NewRulesManager(config *option.Config) *AlertingRulesManager {
	a := &AlertingRulesManager{
		RulesConfig: &AlertingRulesConfig{
			Groups: []*AlertingNameConfig{
				&AlertingNameConfig{
					Name: "BuilderHealth",
					Rules: []*RulesConfig{
						&RulesConfig{
							Alert:       "BuilderUnhealthy",
							Expr:        "builder_exporter_health_status == 0",
							For:         "3m",
							Labels:      map[string]string{},
							Annotations: map[string]string{"summary": "builder unhealthy"},
						},
						&RulesConfig{
							Alert:       "BuilderTaskError",
							Expr:        "builder_exporter_builder_task_error > 30",
							For:         "3m",
							Labels:      map[string]string{},
							Annotations: map[string]string{"summary": "Builder execution task error number is greater than 30"},
						},
					},
				},
				&AlertingNameConfig{
					Name: "WorkerHealth",
					Rules: []*RulesConfig{
						&RulesConfig{
							Alert:       "WorkerUnhealthy",
							Expr:        "app_resource_exporter_health_status == 0",
							For:         "3m",
							Labels:      map[string]string{},
							Annotations: map[string]string{"summary": "worker unhealthy"},
						},
						&RulesConfig{
							Alert:       "WorkerTaskError",
							Expr:        "app_resource_exporter_worker_task_error > 50",
							For:         "3m",
							Labels:      map[string]string{},
							Annotations: map[string]string{"summary": "worker execution task error number is greater than 50"},
						},
					},
				},
				&AlertingNameConfig{
					Name: "MqHealth",
					Rules: []*RulesConfig{
						&RulesConfig{
							Alert:       "MqUnhealthy",
							Expr:        "acp_mq_exporter_health_status == 0",
							For:         "3m",
							Labels:      map[string]string{},
							Annotations: map[string]string{"summary": "mq unhealthy"},
						},
						&RulesConfig{
							Alert:       "TeamTaskMany",
							Expr:        "acp_mq_dequeue_number-acp_mq_enqueue_number > 200",
							For:         "3m",
							Labels:      map[string]string{},
							Annotations: map[string]string{"summary": "The number of tasks in the queue is greater than 200"},
						},
					},
				},
				&AlertingNameConfig{
					Name: "EventlogHealth",
					Rules: []*RulesConfig{
						&RulesConfig{
							Alert:       "EventLogUnhealthy",
							Expr:        "event_log_exporter_health_status == 0",
							For:         "3m",
							Labels:      map[string]string{},
							Annotations: map[string]string{"summary": "eventlog unhealthy"},
						},
						&RulesConfig{
							Alert:       "EventLogDown",
							Expr:        "event_log_exporter_instance_up == 0",
							For:         "3m",
							Labels:      map[string]string{},
							Annotations: map[string]string{"summary": "eventlog service down"},
						},
					},
				},
				&AlertingNameConfig{
					Name: "WebcliHealth",
					Rules: []*RulesConfig{
						&RulesConfig{
							Alert:       "WebcliUnhealthy",
							Expr:        "webcli_exporter_health_status == 0",
							For:         "3m",
							Labels:      map[string]string{},
							Annotations: map[string]string{"summary": "webcli unhealthy"},
						},
						&RulesConfig{
							Alert:       "WebcliCommandFailed",
							Expr:        "webcli_exporter_execute_command_failed > 100",
							For:         "3m",
							Labels:      map[string]string{},
							Annotations: map[string]string{"summary": "The number of errors that occurred while executing commands is greater than 100"},
						},
					},
				},
				&AlertingNameConfig{
					Name: "NodeHealth",
					Rules: []*RulesConfig{
						&RulesConfig{
							Alert:       "high_cpu_usage_on_node",
							Expr:        "sum by(instance) (rate(process_cpu_seconds_total[5m])) * 100 > 70",
							For:         "5m",
							Labels:      map[string]string{"service": "node_cpu"},
							Annotations: map[string]string{"description": "{{ $labels.instance }} is using a LOT of CPU. CPU usage is {{ humanize $value}}%.", "summary": "HIGH CPU USAGE WARNING ON '{{ $labels.instance }}'"},
						},
						&RulesConfig{
							Alert:       "high_la_usage_on_node",
							Expr:        "node_load5 > 5",
							For:         "5m",
							Labels:      map[string]string{"service": "node_load5"},
							Annotations: map[string]string{"description": "{{ $labels.instance }} has a high load average. Load Average 5m is {{ humanize $value}}.", "summary": "HIGH LOAD AVERAGE WARNING ON '{{ $labels.instance }}'"},
						},
						&RulesConfig{
							Alert:       "node_running_out_of_disk_space",
							Expr:        "(node_filesystem_size{mountpoint='/'} - node_filesystem_free{mountpoint='/'}) * 100 / node_filesystem_size{mountpoint='/'} > 80",
							For:         "5m",
							Labels:      map[string]string{"service": "node_running_out_of_disk_space"},
							Annotations: map[string]string{"description": "More than 80% of disk used. Disk usage {{ humanize $value }}%.", "summary": "LOW DISK SPACE WARNING:NODE '{{ $labels.instance }}'"},
						},
						&RulesConfig{
							Alert:       "monitoring_service_down",
							Expr:        "up == 0",
							For:         "5m",
							Labels:      map[string]string{"service": "service_down"},
							Annotations: map[string]string{"description": "The monitoring service '{{ $labels.job }}' is down.", "summary": "MONITORING SERVICE DOWN WARNING:NODE '{{ $labels.instance }}'"},
						},
						&RulesConfig{
							Alert:       "high_memory_usage_on_node",
							Expr:        "((node_memory_MemTotal - node_memory_MemAvailable) / node_memory_MemTotal) * 100 > 80",
							For:         "5m",
							Labels:      map[string]string{"service": "node_memory"},
							Annotations: map[string]string{"description": "{{ $labels.instance }} is using a LOT of MEMORY. MEMORY usage is over {{ humanize $value}}%.", "summary": "HIGH MEMORY USAGE WARNING ON '{{ $labels.instance }}'"},
						},
					},
				},
				&AlertingNameConfig{
					Name: "ClusterHealth",
					Rules: []*RulesConfig{
						&RulesConfig{
							Alert:       "cluster_unhealth",
							Expr:        "rainbond_cluster_node_health != 0",
							For:         "3m",
							Labels:      map[string]string{"service": "cluster_health"},
							Annotations: map[string]string{"summary": "Dangerous: the current cluster is in an unhealthy state"},
						},
						&RulesConfig{
							Alert:       "monitoring_component_status_unhealth",
							Expr:        "rainbond_cluster_component_health != 0",
							For:         "3m",
							Labels:      map[string]string{"service": "component_unhealth"},
							Annotations: map[string]string{"description": "The monitoring component '{{ $labels.component }}' is down.", "summary": "MONITORING COMPONENT UNHEALTHY WARNING:NODE '{{ $labels.node_ip }}'"},
						},
						&RulesConfig{
							Alert:       "rainbond_cluster_collector_duration_seconds_timeout",
							Expr:        "rainbond_cluster_collector_duration_seconds > 10",
							For:         "3m",
							Labels:      map[string]string{"service": "cluster_collector"},
							Annotations: map[string]string{"summary": "Cluster collector '{{ $labels.instance }}' took more than 10s"},
						},
					},
				},
			},
		},
		config: config,
	}
	return a
}

//LoadAlertingRulesConfig load alerting rule config
func (a *AlertingRulesManager) LoadAlertingRulesConfig() error {
	logrus.Info("Load AlertingRules config file.")
	content, err := ioutil.ReadFile(a.config.AlertingRulesFile)
	if err != nil {
		logrus.Error("Failed to read AlertingRules config file: ", err)
		logrus.Info("Falling back to the default alerting rules.")
		return nil
	}
	if err := yaml.Unmarshal(content, a.RulesConfig); err != nil {
		logrus.Error("Unmarshal AlertingRulesConfig config string to object error.", err.Error())
		return err
	}
	logrus.Debugf("Loaded config file to memory: %+v", a)
	return nil
}

//SaveAlertingRulesConfig save alerting rule config
func (a *AlertingRulesManager) SaveAlertingRulesConfig() error {
	logrus.Debug("Save alerting rules config file.")
	data, err := yaml.Marshal(a.RulesConfig)
	if err != nil {
		logrus.Error("Marshal alerting rules config to yaml error.", err.Error())
		return err
	}
	err = ioutil.WriteFile(a.config.AlertingRulesFile, data, 0644)
	if err != nil {
		logrus.Error("Write alerting rules config file error.", err.Error())
		return err
	}
	return nil
}

//AddRules add rule
func (a *AlertingRulesManager) AddRules(val AlertingNameConfig) error {
	// append to the manager's slice directly; appending to a local copy
	// would silently drop the new group
	a.RulesConfig.Groups = append(a.RulesConfig.Groups, &val)
	return nil
}

//InitRulesConfig init rule config
func (a *AlertingRulesManager) InitRulesConfig() {
	// os.Stat reports whether the rules file already exists
	if _, err := os.Stat(a.config.AlertingRulesFile); os.IsNotExist(err) {
		// no rules file yet, write the built-in defaults
		if err := a.SaveAlertingRulesConfig(); err != nil {
			logrus.Error("Write default alerting rules config file error.", err.Error())
		}
	}
}