2018-07-31 14:26:47 +08:00
package prometheus
2018-07-31 16:09:46 +08:00
import (
2019-02-12 15:40:42 +08:00
"io/ioutil"
2018-07-31 18:43:31 +08:00
"os"
2019-02-12 16:41:14 +08:00
"github.com/Sirupsen/logrus"
"github.com/goodrain/rainbond/cmd/monitor/option"
yaml "gopkg.in/yaml.v2"
2018-07-31 16:09:46 +08:00
)
2018-07-31 14:26:47 +08:00
2019-02-12 16:41:14 +08:00
//AlertingRulesConfig alerting rule config
2018-07-31 14:26:47 +08:00
type AlertingRulesConfig struct {
2018-08-01 12:29:06 +08:00
Groups [ ] * AlertingNameConfig ` yaml:"groups" json:"groups" `
2018-07-31 14:26:47 +08:00
}
2019-02-12 16:41:14 +08:00
//AlertingNameConfig alerting config
2018-07-31 14:26:47 +08:00
type AlertingNameConfig struct {
2018-08-01 12:29:06 +08:00
Name string ` yaml:"name" json:"name" `
Rules [ ] * RulesConfig ` yaml:"rules" json:"rules" `
2018-07-31 14:26:47 +08:00
}
2019-02-12 16:41:14 +08:00
// RulesConfig describes a single alerting rule entry of a rule group.
// Every field is serialized under the lower-case form of its name.
type RulesConfig struct {
	// Alert is the name of the alert.
	Alert string ` yaml:"alert" json:"alert" `
	// Expr is the expression that is evaluated for the alert.
	Expr string ` yaml:"expr" json:"expr" `
	// For is the duration string attached to the rule (e.g. "3m").
	For string ` yaml:"for" json:"for" `
	// Labels are the key/value labels attached to the rule.
	Labels map[string]string ` yaml:"labels" json:"labels" `
	// Annotations are free-form key/value texts such as "summary".
	Annotations map[string]string ` yaml:"annotations" json:"annotations" `
}
2019-02-12 16:41:14 +08:00
//AlertingRulesManager alerting rule manage
2018-07-31 16:09:46 +08:00
type AlertingRulesManager struct {
RulesConfig * AlertingRulesConfig
2018-08-13 11:54:38 +08:00
config * option . Config
2018-07-31 16:09:46 +08:00
}
2019-02-12 16:41:14 +08:00
//NewRulesManager new rule manager
2018-08-01 18:49:31 +08:00
func NewRulesManager ( config * option . Config ) * AlertingRulesManager {
a := & AlertingRulesManager {
2018-07-31 16:09:46 +08:00
RulesConfig : & AlertingRulesConfig {
2018-08-01 18:49:31 +08:00
Groups : [ ] * AlertingNameConfig {
2018-07-31 16:09:46 +08:00
& AlertingNameConfig {
2018-08-01 18:49:31 +08:00
Name : "BuilderHealth" ,
2018-07-31 16:09:46 +08:00
Rules : [ ] * RulesConfig {
& RulesConfig {
2018-08-01 18:49:31 +08:00
Alert : "BuilderUnhealthy" ,
Expr : "builder_exporter_health_status == 0" ,
For : "3m" ,
Labels : map [ string ] string { } ,
Annotations : map [ string ] string { "summary" : "builder unhealthy" } ,
} ,
& RulesConfig {
Alert : "BuilderTaskError" ,
2019-07-12 14:42:24 +08:00
Expr : "builder_exporter_builder_current_concurrent_task == builder_exporter_builder_max_concurrent_task" ,
For : "20s" ,
2018-08-01 18:49:31 +08:00
Labels : map [ string ] string { } ,
2019-07-12 14:42:24 +08:00
Annotations : map [ string ] string { "summary" : "The build service is performing a maximum number of tasks" } ,
2018-08-01 18:49:31 +08:00
} ,
} ,
} ,
& AlertingNameConfig {
Name : "WorkerHealth" ,
Rules : [ ] * RulesConfig {
& RulesConfig {
Alert : "WorkerUnhealthy" ,
Expr : "app_resource_exporter_health_status == 0" ,
For : "3m" ,
Labels : map [ string ] string { } ,
Annotations : map [ string ] string { "summary" : "worker unhealthy" } ,
} ,
& RulesConfig {
Alert : "WorkerTaskError" ,
Expr : "app_resource_exporter_worker_task_error > 50" ,
For : "3m" ,
Labels : map [ string ] string { } ,
Annotations : map [ string ] string { "summary" : "worker execution task error number is greater than 50" } ,
} ,
} ,
} ,
& AlertingNameConfig {
Name : "MqHealth" ,
Rules : [ ] * RulesConfig {
& RulesConfig {
Alert : "MqUnhealthy" ,
Expr : "acp_mq_exporter_health_status == 0" ,
For : "3m" ,
Labels : map [ string ] string { } ,
Annotations : map [ string ] string { "summary" : "mq unhealthy" } ,
} ,
& RulesConfig {
Alert : "TeamTaskMany" ,
Expr : "acp_mq_dequeue_number-acp_mq_enqueue_number > 200" ,
For : "3m" ,
Labels : map [ string ] string { } ,
Annotations : map [ string ] string { "summary" : "The number of tasks in the queue is greater than 200" } ,
} ,
} ,
} ,
& AlertingNameConfig {
Name : "EventlogHealth" ,
Rules : [ ] * RulesConfig {
& RulesConfig {
Alert : "EventLogUnhealthy" ,
Expr : "event_log_exporter_health_status == 0" ,
For : "3m" ,
Labels : map [ string ] string { } ,
Annotations : map [ string ] string { "summary" : "eventlog unhealthy" } ,
} ,
& RulesConfig {
Alert : "EventLogDown" ,
2019-02-12 16:41:14 +08:00
Expr : "event_log_exporter_instance_up == 0" ,
2018-08-01 18:49:31 +08:00
For : "3m" ,
Labels : map [ string ] string { } ,
Annotations : map [ string ] string { "summary" : "eventlog service down" } ,
2018-07-31 16:09:46 +08:00
} ,
} ,
} ,
2018-08-01 22:35:51 +08:00
& AlertingNameConfig {
Name : "WebcliHealth" ,
Rules : [ ] * RulesConfig {
& RulesConfig {
Alert : "WebcliUnhealthy" ,
Expr : "webcli_exporter_health_status == 0" ,
For : "3m" ,
Labels : map [ string ] string { } ,
Annotations : map [ string ] string { "summary" : "webcli unhealthy" } ,
} ,
2018-08-02 17:11:01 +08:00
& RulesConfig {
Alert : "WebcliUnhealthy" ,
Expr : "webcli_exporter_execute_command_failed > 100" ,
For : "3m" ,
Labels : map [ string ] string { } ,
Annotations : map [ string ] string { "summary" : "The number of errors that occurred while executing the command was greater than 100." } ,
} ,
2018-08-01 22:35:51 +08:00
} ,
} ,
2018-08-13 11:54:38 +08:00
& AlertingNameConfig {
Name : "NodeHealth" ,
Rules : [ ] * RulesConfig {
& RulesConfig {
Alert : "high_cpu_usage_on_node" ,
Expr : "sum by(instance) (rate(process_cpu_seconds_total[5m])) * 100 > 70" ,
For : "5m" ,
Labels : map [ string ] string { "service" : "node_cpu" } ,
Annotations : map [ string ] string { "description" : "{{ $labels.instance }} is using a LOT of CPU. CPU usage is {{ humanize $value}}%." , "summary" : "HIGH CPU USAGE WARNING ON '{{ $labels.instance }}'" } ,
} ,
& RulesConfig {
Alert : "high_la_usage_on_node" ,
Expr : "node_load5 > 5" ,
For : "5m" ,
Labels : map [ string ] string { "service" : "node_load5" } ,
Annotations : map [ string ] string { "description" : "{{ $labels.instance }} has a high load average. Load Average 5m is {{ humanize $value}}." , "summary" : "HIGH LOAD AVERAGE WARNING ON '{{ $labels.instance }}'" } ,
} ,
& RulesConfig {
Alert : "node_running_out_of_disk_space" ,
Expr : "(node_filesystem_size{mountpoint='/'} - node_filesystem_free{mountpoint='/'}) * 100 / node_filesystem_size{mountpoint='/'} > 80" ,
For : "5m" ,
Labels : map [ string ] string { "service" : "node_running_out_of_disk_space" } ,
Annotations : map [ string ] string { "description" : "More than 80% of disk used. Disk usage {{ humanize $value }}%." , "summary" : "LOW DISK SPACE WARING:NODE '{{ $labels.instance }}" } ,
} ,
& RulesConfig {
Alert : "monitoring_service_down" ,
Expr : "up == 0" ,
For : "5m" ,
Labels : map [ string ] string { "service" : "service_down" } ,
Annotations : map [ string ] string { "description" : "The monitoring service '{{ $labels.job }}' is down." , "summary" : "MONITORING SERVICE DOWN WARNING:NODE '{{ $labels.instance }}'" } ,
} ,
& RulesConfig {
Alert : "high_memory_usage_on_node" ,
Expr : "((node_memory_MemTotal - node_memory_MemAvailable) / node_memory_MemTotal) * 100 > 80" ,
For : "5m" ,
Labels : map [ string ] string { "service" : "node_memory" } ,
Annotations : map [ string ] string { "description" : "{{ $labels.instance }} is using a LOT of MEMORY. MEMORY usage is over {{ humanize $value}}%." , "summary" : "HIGH MEMORY USAGE WARNING TASK ON '{{ $labels.instance }}'" } ,
} ,
} ,
} ,
2019-05-08 13:27:22 +08:00
& AlertingNameConfig {
Name : "ClusterHealth" ,
Rules : [ ] * RulesConfig {
& RulesConfig {
Alert : "cluster_unhealth" ,
Expr : "rainbond_cluster_node_health != 0" ,
For : "3m" ,
Labels : map [ string ] string { "service" : "cluster_health" } ,
Annotations : map [ string ] string { "summary" : "!!!Dangerous, the current cluster is in an unhealthy state." } ,
} ,
& RulesConfig {
Alert : "monitoring_component_status_unhealth" ,
Expr : "rainbond_cluster_component_health != 0" ,
For : "3m" ,
Labels : map [ string ] string { "service" : "component_unhealth" } ,
Annotations : map [ string ] string { "description" : "The monitoring component '{{ $labels.component }}' is down." , "summary" : "MONITORING COMPONENT UNHEALTHY WARNING:NODE '{{ $labels.node_ip }}'" } ,
} ,
& RulesConfig {
Alert : "rainbond_cluster_collector_duration_seconds_timeout" ,
Expr : "rainbond_cluster_collector_duration_seconds > 10" ,
For : "3m" ,
Labels : map [ string ] string { "service" : "cluster_collector" } ,
Annotations : map [ string ] string { "summary" : "Cluster collector '{{ $labels.instance }}' more than 10s" } ,
} ,
} ,
} ,
2018-07-31 16:09:46 +08:00
} ,
} ,
2018-08-01 18:49:31 +08:00
config : config ,
2018-07-31 16:09:46 +08:00
}
return a
}
2019-02-12 16:41:14 +08:00
//LoadAlertingRulesConfig load alerting rule config
2018-08-01 18:49:31 +08:00
func ( a * AlertingRulesManager ) LoadAlertingRulesConfig ( ) error {
2018-07-31 19:45:22 +08:00
logrus . Info ( "Load AlertingRules config file." )
2018-08-01 18:49:31 +08:00
content , err := ioutil . ReadFile ( a . config . AlertingRulesFile )
2018-07-31 16:09:46 +08:00
if err != nil {
logrus . Error ( "Failed to read AlertingRules config file: " , err )
logrus . Info ( "Init config file by default values." )
return nil
}
2018-08-01 21:49:05 +08:00
if err := yaml . Unmarshal ( content , a . RulesConfig ) ; err != nil {
2018-07-31 16:09:46 +08:00
logrus . Error ( "Unmarshal AlertingRulesConfig config string to object error." , err . Error ( ) )
return err
}
2018-07-31 19:45:22 +08:00
logrus . Debugf ( "Loaded config file to memory: %+v" , a )
2018-07-31 16:09:46 +08:00
return nil
}
2019-02-12 16:41:14 +08:00
//SaveAlertingRulesConfig save alerting rule config
2018-08-01 18:49:31 +08:00
func ( a * AlertingRulesManager ) SaveAlertingRulesConfig ( ) error {
2018-07-31 19:45:22 +08:00
logrus . Debug ( "Save alerting rules config file." )
2018-07-31 16:09:46 +08:00
2018-08-01 21:49:05 +08:00
data , err := yaml . Marshal ( a . RulesConfig )
2018-07-31 16:09:46 +08:00
if err != nil {
logrus . Error ( "Marshal alerting rules config to yaml error." , err . Error ( ) )
return err
}
2018-08-01 18:49:31 +08:00
err = ioutil . WriteFile ( a . config . AlertingRulesFile , data , 0644 )
2018-07-31 16:09:46 +08:00
if err != nil {
logrus . Error ( "Write alerting rules config file error." , err . Error ( ) )
return err
}
return nil
}
2019-02-12 16:41:14 +08:00
//AddRules add rule
2018-08-01 18:49:31 +08:00
func ( a * AlertingRulesManager ) AddRules ( val AlertingNameConfig ) error {
group := a . RulesConfig . Groups
2018-07-31 16:09:46 +08:00
group = append ( group , & val )
return nil
2018-07-31 18:43:31 +08:00
}
2019-02-12 16:41:14 +08:00
//InitRulesConfig init rule config
2018-08-01 18:49:31 +08:00
func ( a * AlertingRulesManager ) InitRulesConfig ( ) {
_ , err := os . Stat ( a . config . AlertingRulesFile ) //os.Stat获取文件信息
2018-07-31 18:43:31 +08:00
if err != nil {
if os . IsExist ( err ) {
return
}
a . SaveAlertingRulesConfig ( )
return
}
return
2018-08-01 18:49:31 +08:00
}