Support setting the alert rule region name

barnettZQG 2020-07-18 16:56:57 +08:00
parent 475d625347
commit d9da07274b


@@ -37,6 +37,21 @@ type AlertingRulesManager struct {
//NewRulesManager new rule manager
func NewRulesManager(config *option.Config) *AlertingRulesManager {
+ region := os.Getenv("REGION_NAME")
+ if region == "" {
+ region = "default"
+ }
+ commonLables := map[string]string{
+ "Alert": "Rainbond",
+ "Region": region,
+ }
+ getseverityLables := func(severity string) map[string]string {
+ return map[string]string{
+ "Alert": "Rainbond",
+ "severity": severity,
+ "Region": region,
+ }
+ }
a := &AlertingRulesManager{
RulesConfig: &AlertingRulesConfig{
Groups: []*AlertingNameConfig{
@@ -44,57 +59,47 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
Name: "GatewayHealth",
Rules: []*RulesConfig{
&RulesConfig{
Alert: "GatewayDown",
Expr: "absent(up{job=\"gateway\"})",
For: "10s",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "GatewayDown",
Expr: "absent(up{job=\"gateway\"})",
For: "10s",
Labels: commonLables,
Annotations: map[string]string{
"description": "gateway node {{ $labels.instance }} is down, ",
"summary": "gateway is down",
},
},
&RulesConfig{
Alert: "RequestSizeTooMuch",
Expr: "sum by (instance, host) (rate(gateway_request_size_sum[5m])) > 1024*1024*10",
For: "20s",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "RequestSizeTooMuch",
Expr: "sum by (instance, host) (rate(gateway_request_size_sum[5m])) > 1024*1024*10",
For: "20s",
Labels: commonLables,
Annotations: map[string]string{
"description": "http doamin {{ $labels.host }} per-second request size {{ humanize $value }}, more than 10M",
"summary": "Too much traffic",
},
},
&RulesConfig{
Alert: "ResponseSizeTooMuch",
Expr: "sum by (instance, host) (rate(gateway_response_size_sum[5m])) > 1024*1024*10",
For: "20s",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "ResponseSizeTooMuch",
Expr: "sum by (instance, host) (rate(gateway_response_size_sum[5m])) > 1024*1024*10",
For: "20s",
Labels: commonLables,
Annotations: map[string]string{
"description": "http doamin {{ $labels.host }} per-second response size {{ humanize $value }}, more than 10M",
"summary": "Too much traffic",
},
},
&RulesConfig{
Alert: "RequestMany",
Expr: "rate(gateway_requests[5m]) > 200",
For: "10s",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "RequestMany",
Expr: "rate(gateway_requests[5m]) > 200",
For: "10s",
Labels: commonLables,
Annotations: map[string]string{"description": "http doamin {{ $labels.host }} per-second requests {{ humanize $value }}, more than 200"},
},
&RulesConfig{
Alert: "FailureRequestMany",
Expr: "rate(gateway_requests{status=~\"5..\"}[5m]) > 5",
For: "10s",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "FailureRequestMany",
Expr: "rate(gateway_requests{status=~\"5..\"}[5m]) > 5",
For: "10s",
Labels: commonLables,
Annotations: map[string]string{"description": "http doamin {{ $labels.host }} per-second failure requests {{ humanize $value }}, more than 5"},
},
},
@@ -103,33 +108,27 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
Name: "BuilderHealth",
Rules: []*RulesConfig{
&RulesConfig{
Alert: "BuilderDown",
Expr: "absent(up{component=\"builder\"})",
For: "10s",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "BuilderDown",
Expr: "absent(up{component=\"builder\"})",
For: "10s",
Labels: commonLables,
Annotations: map[string]string{
"description": "builder(rbd-chaos) node {{ $labels.instance }} is down, ",
"summary": "builder(rbd-chaos) is down",
},
},
&RulesConfig{
Alert: "BuilderUnhealthy",
Expr: "builder_exporter_health_status == 0",
For: "3m",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "BuilderUnhealthy",
Expr: "builder_exporter_health_status == 0",
For: "3m",
Labels: commonLables,
Annotations: map[string]string{"description": "builder unhealthy"},
},
&RulesConfig{
Alert: "BuilderTaskError",
Expr: "builder_exporter_builder_current_concurrent_task == builder_exporter_builder_max_concurrent_task",
For: "20s",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "BuilderTaskError",
Expr: "builder_exporter_builder_current_concurrent_task == builder_exporter_builder_max_concurrent_task",
For: "20s",
Labels: commonLables,
Annotations: map[string]string{"summary": "The build service is performing a maximum number of tasks"},
},
},
@@ -138,36 +137,30 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
Name: "WorkerHealth",
Rules: []*RulesConfig{
&RulesConfig{
Alert: "WorkerDown",
Expr: "absent(up{component=\"worker\"})",
For: "10s",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "WorkerDown",
Expr: "absent(up{component=\"worker\"})",
For: "10s",
Labels: commonLables,
Annotations: map[string]string{
"description": "worker node {{ $labels.instance }} is down",
"summary": "worker is down",
},
},
&RulesConfig{
Alert: "WorkerUnhealthy",
Expr: "app_resource_exporter_health_status == 0",
For: "3m",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "WorkerUnhealthy",
Expr: "app_resource_exporter_health_status == 0",
For: "3m",
Labels: commonLables,
Annotations: map[string]string{
"summary": "worker unhealthy",
"description": "worker node {{ $labels.instance }} is unhealthy",
},
},
&RulesConfig{
Alert: "WorkerTaskError",
Expr: "app_resource_exporter_worker_task_error > 50",
For: "3m",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "WorkerTaskError",
Expr: "app_resource_exporter_worker_task_error > 50",
For: "3m",
Labels: commonLables,
Annotations: map[string]string{
"description": "worker node {{ $labels.instance }} execution task error number is greater than 50",
},
@@ -178,33 +171,27 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
Name: "MqHealth",
Rules: []*RulesConfig{
&RulesConfig{
Alert: "MqDown",
Expr: "absent(up{component=\"mq\"})",
For: "20s",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "MqDown",
Expr: "absent(up{component=\"mq\"})",
For: "20s",
Labels: commonLables,
Annotations: map[string]string{
"description": "mq node {{ $labels.instance }} is down",
"summary": "mq is down",
},
},
&RulesConfig{
Alert: "MqUnhealthy",
Expr: "acp_mq_exporter_health_status == 0",
For: "3m",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "MqUnhealthy",
Expr: "acp_mq_exporter_health_status == 0",
For: "3m",
Labels: commonLables,
Annotations: map[string]string{"summary": "mq unhealthy"},
},
&RulesConfig{
Alert: "TeamTaskMany",
Expr: "acp_mq_dequeue_number-acp_mq_enqueue_number > 200",
For: "3m",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "TeamTaskMany",
Expr: "acp_mq_dequeue_number-acp_mq_enqueue_number > 200",
For: "3m",
Labels: commonLables,
Annotations: map[string]string{"summary": "The number of tasks in the queue is greater than 200"},
},
},
@@ -213,21 +200,17 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
Name: "EventlogHealth",
Rules: []*RulesConfig{
&RulesConfig{
Alert: "EventLogUnhealthy",
Expr: "event_log_exporter_health_status == 0",
For: "3m",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "EventLogUnhealthy",
Expr: "event_log_exporter_health_status == 0",
For: "3m",
Labels: commonLables,
Annotations: map[string]string{"summary": "eventlog unhealthy"},
},
&RulesConfig{
Alert: "EventLogDown",
Expr: "absent(up{component=\"eventlog\"})",
For: "3m",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "EventLogDown",
Expr: "absent(up{component=\"eventlog\"})",
For: "3m",
Labels: commonLables,
Annotations: map[string]string{
"description": "worker node {{ $labels.instance }} is down",
"summary": "eventlog service down",
@@ -239,33 +222,27 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
Name: "WebcliHealth",
Rules: []*RulesConfig{
&RulesConfig{
Alert: "WebcliDown",
Expr: "absent(up{component=\"webcli\"})",
For: "20s",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "WebcliDown",
Expr: "absent(up{component=\"webcli\"})",
For: "20s",
Labels: commonLables,
Annotations: map[string]string{
"description": "webcli node {{ $labels.instance }} is down",
"summary": "webcli is down",
},
},
&RulesConfig{
Alert: "WebcliUnhealthy",
Expr: "webcli_exporter_health_status == 0",
For: "3m",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "WebcliUnhealthy",
Expr: "webcli_exporter_health_status == 0",
For: "3m",
Labels: commonLables,
Annotations: map[string]string{"summary": "webcli unhealthy"},
},
&RulesConfig{
Alert: "WebcliUnhealthy",
Expr: "rate(webcli_exporter_execute_command_failed[5m]) > 5",
For: "3m",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "WebcliUnhealthy",
Expr: "rate(webcli_exporter_execute_command_failed[5m]) > 5",
For: "3m",
Labels: commonLables,
Annotations: map[string]string{"summary": "The number of errors that occurred while executing the command was greater than 5 per-second."},
},
},
@@ -274,12 +251,10 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
Name: "NodeHealth",
Rules: []*RulesConfig{
&RulesConfig{
Alert: "NodeDown",
Expr: "absent(up{component=\"rbd_node\"})",
For: "30s",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "NodeDown",
Expr: "absent(up{component=\"rbd_node\"})",
For: "30s",
Labels: commonLables,
Annotations: map[string]string{
"description": "node {{ $labels.instance }} is down",
"summary": "rbd_node is down",
@@ -289,42 +264,42 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
Alert: "high_cpu_usage_on_node",
Expr: "sum by(instance) (rate(process_cpu_seconds_total[5m])) * 100 > 70",
For: "5m",
Labels: map[string]string{"Alert": "Rainbond"},
Labels: commonLables,
Annotations: map[string]string{"description": "{{ $labels.instance }} is using a LOT of CPU. CPU usage is {{ humanize $value}}%.", "summary": "HIGH CPU USAGE WARNING ON '{{ $labels.instance }}'"},
},
&RulesConfig{
Alert: "high_la_usage_on_node",
Expr: "count by (instance) (node_load5) > count by(instance)(count by(job, instance, cpu)(node_cpu))",
For: "5m",
Labels: map[string]string{"Alert": "Rainbond"},
Labels: commonLables,
Annotations: map[string]string{"description": "{{ $labels.instance }} has a high load average. Load Average 5m is {{ humanize $value}}.", "summary": "HIGH LOAD AVERAGE WARNING ON '{{ $labels.instance }}'"},
},
&RulesConfig{
Alert: "inode_freerate_low",
Expr: "node_filesystem_files_free{fstype=~\"ext4|xfs\"} / node_filesystem_files{fstype=~\"ext4|xfs\"} < 0.3",
For: "5m",
Labels: map[string]string{"service": "node_filesystem_files_free"},
Labels: commonLables,
Annotations: map[string]string{"description": "the inode free rate is low of node {{ $labels.instance }}, current value is {{ humanize $value}}."},
},
&RulesConfig{
Alert: "high_rootdisk_usage_on_node",
Expr: "(node_filesystem_size{mountpoint='/'} - node_filesystem_free{mountpoint='/'}) * 100 / node_filesystem_size{mountpoint='/'} > 75",
For: "5m",
Labels: map[string]string{"Alert": "Rainbond"},
Labels: commonLables,
Annotations: map[string]string{"description": "More than 75% of disk used. Disk usage {{ humanize $value }} mountpoint {{ $labels.mountpoint }}%.", "summary": "LOW DISK SPACE WARING:NODE '{{ $labels.instance }}"},
},
&RulesConfig{
Alert: "high_dockerdisk_usage_on_node",
Expr: "(node_filesystem_size{mountpoint='/var/lib/docker'} - node_filesystem_free{mountpoint='/var/lib/docker'}) * 100 / node_filesystem_size{mountpoint='/var/lib/docker'} > 75",
For: "5m",
Labels: map[string]string{"Alert": "Rainbond"},
Labels: commonLables,
Annotations: map[string]string{"description": "More than 75% of disk used. Disk usage {{ humanize $value }} mountpoint {{ $labels.mountpoint }}%.", "summary": "LOW DISK SPACE WARING:NODE '{{ $labels.instance }}"},
},
&RulesConfig{
Alert: "high_memory_usage_on_node",
Expr: "((node_memory_MemTotal - node_memory_MemAvailable) / node_memory_MemTotal) * 100 > 80",
For: "5m",
Labels: map[string]string{"Alert": "Rainbond"},
Labels: commonLables,
Annotations: map[string]string{"description": "{{ $labels.instance }} is using a LOT of MEMORY. MEMORY usage is over {{ humanize $value}}%.", "summary": "HIGH MEMORY USAGE WARNING TASK ON '{{ $labels.instance }}'"},
},
},
@@ -336,21 +311,21 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
Alert: "cluster_node_unhealth",
Expr: "rainbond_cluster_node_health != 0",
For: "3m",
Labels: map[string]string{"Alert": "Rainbond"},
Labels: commonLables,
Annotations: map[string]string{"description": "cluster node {{ $labels.node_ip }} is unhealth"},
},
&RulesConfig{
Alert: "cluster_kube_node_unhealth",
Expr: "rainbond_cluster_component_health{component=\"KubeNodeReady\"} != 0",
For: "3m",
Labels: map[string]string{"Alert": "Rainbond"},
Labels: commonLables,
Annotations: map[string]string{"description": "kubernetes cluster node {{ $labels.node_ip }} is unhealth"},
},
&RulesConfig{
Alert: "rainbond_cluster_collector_duration_seconds_timeout",
Expr: "rainbond_cluster_collector_duration_seconds > 10",
For: "3m",
Labels: map[string]string{"Alert": "Rainbond"},
Labels: commonLables,
Annotations: map[string]string{"description": "Cluster collector '{{ $labels.instance }}' more than 10s"},
},
},
@@ -359,128 +334,100 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
Name: "EtcdHealth",
Rules: []*RulesConfig{
&RulesConfig{
Alert: "EtcdDown",
Expr: "absent(up{component=\"etcd\"})",
For: "1m",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "EtcdDown",
Expr: "absent(up{component=\"etcd\"})",
For: "1m",
Labels: commonLables,
Annotations: map[string]string{
"description": "etcd node {{ $labels.instance }} is down, ",
"summary": "etcd node is down",
},
},
&RulesConfig{
Alert: "EtcdLoseLeader",
Expr: "etcd_server_has_leader == 0",
For: "1m",
Labels: map[string]string{
"Alert": "Rainbond",
},
Alert: "EtcdLoseLeader",
Expr: "etcd_server_has_leader == 0",
For: "1m",
Labels: commonLables,
Annotations: map[string]string{
"description": "etcd node {{ $labels.instance }} is lose leader",
"summary": "etcd lose leader",
},
},
&RulesConfig{
Alert: "InsufficientMembers",
Expr: "count(up{job=\"etcd\"} == 0) > (count(up{job=\"etcd\"}) / 2 - 1)",
For: "1m",
Labels: map[string]string{
"severity": "critical",
"Alert": "Rainbond",
},
Alert: "InsufficientMembers",
Expr: "count(up{job=\"etcd\"} == 0) > (count(up{job=\"etcd\"}) / 2 - 1)",
For: "1m",
Labels: getseverityLables("critical"),
Annotations: map[string]string{
"description": "If one more etcd member goes down the cluster will be unavailable",
"summary": "etcd cluster insufficient members",
},
},
&RulesConfig{
Alert: "HighNumberOfLeaderChanges",
Expr: "increase(etcd_server_leader_changes_seen_total{job=\"etcd\"}[1h]) > 3",
For: "1m",
Labels: map[string]string{
"severity": "warning",
"Alert": "Rainbond",
},
Alert: "HighNumberOfLeaderChanges",
Expr: "increase(etcd_server_leader_changes_seen_total{job=\"etcd\"}[1h]) > 3",
For: "1m",
Labels: getseverityLables("warning"),
Annotations: map[string]string{
"description": "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
"summary": "a high number of leader changes within the etcd cluster are happening",
},
},
&RulesConfig{
Alert: "HighNumberOfFailedGRPCRequests",
Expr: "sum(rate(etcd_grpc_requests_failed_total{job=\"etcd\"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job=\"etcd\"}[5m])) BY (grpc_method) > 0.05",
For: "5m",
Labels: map[string]string{
"severity": "critical",
"Alert": "Rainbond",
},
Alert: "HighNumberOfFailedGRPCRequests",
Expr: "sum(rate(etcd_grpc_requests_failed_total{job=\"etcd\"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job=\"etcd\"}[5m])) BY (grpc_method) > 0.05",
For: "5m",
Labels: getseverityLables("critical"),
Annotations: map[string]string{
"description": "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
"summary": "a high number of gRPC requests are failing",
},
},
&RulesConfig{
Alert: "HighNumberOfFailedHTTPRequests",
Expr: "sum(rate(etcd_http_failed_total{job=\"etcd\"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=\"etcd\"}[5m]))BY (method) > 0.05",
For: "1m",
Labels: map[string]string{
"severity": "critical",
"Alert": "Rainbond",
},
Alert: "HighNumberOfFailedHTTPRequests",
Expr: "sum(rate(etcd_http_failed_total{job=\"etcd\"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=\"etcd\"}[5m]))BY (method) > 0.05",
For: "1m",
Labels: getseverityLables("critical"),
Annotations: map[string]string{
"description": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
"summary": "a high number of HTTP requests are failing",
},
},
&RulesConfig{
Alert: "GRPCRequestsSlow",
Expr: "histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15",
For: "1m",
Labels: map[string]string{
"severity": "critical",
"Alert": "Rainbond",
},
Alert: "GRPCRequestsSlow",
Expr: "histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15",
For: "1m",
Labels: getseverityLables("critical"),
Annotations: map[string]string{
"description": "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method}} are slow",
"summary": "slow gRPC requests",
},
},
&RulesConfig{
Alert: "HighNumberOfFailedHTTPRequests",
Expr: "sum(rate(etcd_http_failed_total{job=\"etcd\"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=\"etcd\"}[5m]))BY (method) > 0.05",
For: "1m",
Labels: map[string]string{
"severity": "critical",
"Alert": "Rainbond",
},
Alert: "HighNumberOfFailedHTTPRequests",
Expr: "sum(rate(etcd_http_failed_total{job=\"etcd\"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=\"etcd\"}[5m]))BY (method) > 0.05",
For: "1m",
Labels: getseverityLables("critical"),
Annotations: map[string]string{
"description": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
"summary": "a high number of HTTP requests are failing",
},
},
&RulesConfig{
Alert: "HighNumberOfFailedHTTPRequests",
Expr: "sum(rate(etcd_http_failed_total{job=\"etcd\"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=\"etcd\"}[5m]))BY (method) > 0.05",
For: "1m",
Labels: map[string]string{
"severity": "critical",
"Alert": "Rainbond",
},
Alert: "HighNumberOfFailedHTTPRequests",
Expr: "sum(rate(etcd_http_failed_total{job=\"etcd\"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=\"etcd\"}[5m]))BY (method) > 0.05",
For: "1m",
Labels: getseverityLables("critical"),
Annotations: map[string]string{
"description": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
"summary": "a high number of HTTP requests are failing",
},
},
&RulesConfig{
Alert: "DatabaseSpaceExceeded",
Expr: "etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes > 0.80",
For: "1m",
Labels: map[string]string{
"severity": "critical",
"Alert": "Rainbond",
},
Alert: "DatabaseSpaceExceeded",
Expr: "etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes > 0.80",
For: "1m",
Labels: getseverityLables("critical"),
Annotations: map[string]string{
"description": "{{ $labels.instance }}, {{ $labels.job }} of etcd DB space uses more than 80%",
"summary": "Etcd DB space is overused",