From d9da07274bed989e0d4a8bb5f34f69745b064899 Mon Sep 17 00:00:00 2001
From: barnettZQG
Date: Sat, 18 Jul 2020 16:56:57 +0800
Subject: [PATCH] support set alert rule region name

---
 monitor/prometheus/rules_manager.go | 341 ++++++++++++----------------
 1 file changed, 144 insertions(+), 197 deletions(-)

diff --git a/monitor/prometheus/rules_manager.go b/monitor/prometheus/rules_manager.go
index 7828a4f08..cf28ae9c9 100644
--- a/monitor/prometheus/rules_manager.go
+++ b/monitor/prometheus/rules_manager.go
@@ -37,6 +37,21 @@ type AlertingRulesManager struct {
 
 //NewRulesManager new rule manager
 func NewRulesManager(config *option.Config) *AlertingRulesManager {
+	region := os.Getenv("REGION_NAME")
+	if region == "" {
+		region = "default"
+	}
+	commonLables := map[string]string{
+		"Alert":  "Rainbond",
+		"Region": region,
+	}
+	getseverityLables := func(severity string) map[string]string {
+		return map[string]string{
+			"Alert":    "Rainbond",
+			"severity": severity,
+			"Region":   region,
+		}
+	}
 	a := &AlertingRulesManager{
 		RulesConfig: &AlertingRulesConfig{
 			Groups: []*AlertingNameConfig{
@@ -44,57 +59,47 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
 					Name: "GatewayHealth",
 					Rules: []*RulesConfig{
 						&RulesConfig{
-							Alert: "GatewayDown",
-							Expr: "absent(up{job=\"gateway\"})",
-							For: "10s",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "GatewayDown",
+							Expr:   "absent(up{job=\"gateway\"})",
+							For:    "10s",
+							Labels: commonLables,
 							Annotations: map[string]string{
 								"description": "gateway node {{ $labels.instance }} is down, ",
 								"summary":     "gateway is down",
 							},
 						},
 						&RulesConfig{
-							Alert: "RequestSizeTooMuch",
-							Expr: "sum by (instance, host) (rate(gateway_request_size_sum[5m])) > 1024*1024*10",
-							For: "20s",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "RequestSizeTooMuch",
+							Expr:   "sum by (instance, host) (rate(gateway_request_size_sum[5m])) > 1024*1024*10",
+							For:    "20s",
+							Labels: commonLables,
 							Annotations: map[string]string{
 								"description": "http doamin {{ $labels.host }} per-second request size {{ humanize $value }}, more than 10M",
 								"summary":     "Too much traffic",
 							},
 						},
 						&RulesConfig{
-							Alert: "ResponseSizeTooMuch",
-							Expr: "sum by (instance, host) (rate(gateway_response_size_sum[5m])) > 1024*1024*10",
-							For: "20s",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "ResponseSizeTooMuch",
+							Expr:   "sum by (instance, host) (rate(gateway_response_size_sum[5m])) > 1024*1024*10",
+							For:    "20s",
+							Labels: commonLables,
 							Annotations: map[string]string{
 								"description": "http doamin {{ $labels.host }} per-second response size {{ humanize $value }}, more than 10M",
 								"summary":     "Too much traffic",
 							},
 						},
 						&RulesConfig{
-							Alert: "RequestMany",
-							Expr: "rate(gateway_requests[5m]) > 200",
-							For: "10s",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "RequestMany",
+							Expr:   "rate(gateway_requests[5m]) > 200",
+							For:    "10s",
+							Labels: commonLables,
 							Annotations: map[string]string{"description": "http doamin {{ $labels.host }} per-second requests {{ humanize $value }}, more than 200"},
 						},
 						&RulesConfig{
-							Alert: "FailureRequestMany",
-							Expr: "rate(gateway_requests{status=~\"5..\"}[5m]) > 5",
-							For: "10s",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "FailureRequestMany",
+							Expr:   "rate(gateway_requests{status=~\"5..\"}[5m]) > 5",
+							For:    "10s",
+							Labels: commonLables,
 							Annotations: map[string]string{"description": "http doamin {{ $labels.host }} per-second failure requests {{ humanize $value }}, more than 5"},
 						},
 					},
@@ -103,33 +108,27 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
 					Name: "BuilderHealth",
 					Rules: []*RulesConfig{
 						&RulesConfig{
-							Alert: "BuilderDown",
-							Expr: "absent(up{component=\"builder\"})",
-							For: "10s",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "BuilderDown",
+							Expr:   "absent(up{component=\"builder\"})",
+							For:    "10s",
+							Labels: commonLables,
 							Annotations: map[string]string{
 								"description": "builder(rbd-chaos) node {{ $labels.instance }} is down, ",
 								"summary":     "builder(rbd-chaos) is down",
 							},
 						},
 						&RulesConfig{
-							Alert: "BuilderUnhealthy",
-							Expr: "builder_exporter_health_status == 0",
-							For: "3m",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "BuilderUnhealthy",
+							Expr:   "builder_exporter_health_status == 0",
+							For:    "3m",
+							Labels: commonLables,
 							Annotations: map[string]string{"description": "builder unhealthy"},
 						},
 						&RulesConfig{
-							Alert: "BuilderTaskError",
-							Expr: "builder_exporter_builder_current_concurrent_task == builder_exporter_builder_max_concurrent_task",
-							For: "20s",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "BuilderTaskError",
+							Expr:   "builder_exporter_builder_current_concurrent_task == builder_exporter_builder_max_concurrent_task",
+							For:    "20s",
+							Labels: commonLables,
 							Annotations: map[string]string{"summary": "The build service is performing a maximum number of tasks"},
 						},
 					},
@@ -138,36 +137,30 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
 					Name: "WorkerHealth",
 					Rules: []*RulesConfig{
 						&RulesConfig{
-							Alert: "WorkerDown",
-							Expr: "absent(up{component=\"worker\"})",
-							For: "10s",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "WorkerDown",
+							Expr:   "absent(up{component=\"worker\"})",
+							For:    "10s",
+							Labels: commonLables,
 							Annotations: map[string]string{
 								"description": "worker node {{ $labels.instance }} is down",
 								"summary":     "worker is down",
 							},
 						},
 						&RulesConfig{
-							Alert: "WorkerUnhealthy",
-							Expr: "app_resource_exporter_health_status == 0",
-							For: "3m",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "WorkerUnhealthy",
+							Expr:   "app_resource_exporter_health_status == 0",
+							For:    "3m",
+							Labels: commonLables,
 							Annotations: map[string]string{
 								"summary":     "worker unhealthy",
 								"description": "worker node {{ $labels.instance }} is unhealthy",
 							},
 						},
 						&RulesConfig{
-							Alert: "WorkerTaskError",
-							Expr: "app_resource_exporter_worker_task_error > 50",
-							For: "3m",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "WorkerTaskError",
+							Expr:   "app_resource_exporter_worker_task_error > 50",
+							For:    "3m",
+							Labels: commonLables,
 							Annotations: map[string]string{
 								"description": "worker node {{ $labels.instance }} execution task error number is greater than 50",
 							},
@@ -178,33 +171,27 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
 					Name: "MqHealth",
 					Rules: []*RulesConfig{
 						&RulesConfig{
-							Alert: "MqDown",
-							Expr: "absent(up{component=\"mq\"})",
-							For: "20s",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "MqDown",
+							Expr:   "absent(up{component=\"mq\"})",
+							For:    "20s",
+							Labels: commonLables,
 							Annotations: map[string]string{
 								"description": "mq node {{ $labels.instance }} is down",
 								"summary":     "mq is down",
 							},
 						},
 						&RulesConfig{
-							Alert: "MqUnhealthy",
-							Expr: "acp_mq_exporter_health_status == 0",
-							For: "3m",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "MqUnhealthy",
+							Expr:   "acp_mq_exporter_health_status == 0",
+							For:    "3m",
+							Labels: commonLables,
 							Annotations: map[string]string{"summary": "mq unhealthy"},
 						},
 						&RulesConfig{
-							Alert: "TeamTaskMany",
-							Expr: "acp_mq_dequeue_number-acp_mq_enqueue_number > 200",
-							For: "3m",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "TeamTaskMany",
+							Expr:   "acp_mq_dequeue_number-acp_mq_enqueue_number > 200",
+							For:    "3m",
+							Labels: commonLables,
 							Annotations: map[string]string{"summary": "The number of tasks in the queue is greater than 200"},
 						},
 					},
@@ -213,21 +200,17 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
 					Name: "EventlogHealth",
 					Rules: []*RulesConfig{
 						&RulesConfig{
-							Alert: "EventLogUnhealthy",
-							Expr: "event_log_exporter_health_status == 0",
-							For: "3m",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "EventLogUnhealthy",
+							Expr:   "event_log_exporter_health_status == 0",
+							For:    "3m",
+							Labels: commonLables,
 							Annotations: map[string]string{"summary": "eventlog unhealthy"},
 						},
 						&RulesConfig{
-							Alert: "EventLogDown",
-							Expr: "absent(up{component=\"eventlog\"})",
-							For: "3m",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "EventLogDown",
+							Expr:   "absent(up{component=\"eventlog\"})",
+							For:    "3m",
+							Labels: commonLables,
 							Annotations: map[string]string{
 								"description": "worker node {{ $labels.instance }} is down",
 								"summary":     "eventlog service down",
@@ -239,33 +222,27 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
 					Name: "WebcliHealth",
 					Rules: []*RulesConfig{
 						&RulesConfig{
-							Alert: "WebcliDown",
-							Expr: "absent(up{component=\"webcli\"})",
-							For: "20s",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "WebcliDown",
+							Expr:   "absent(up{component=\"webcli\"})",
+							For:    "20s",
+							Labels: commonLables,
 							Annotations: map[string]string{
 								"description": "webcli node {{ $labels.instance }} is down",
 								"summary":     "webcli is down",
 							},
 						},
 						&RulesConfig{
-							Alert: "WebcliUnhealthy",
-							Expr: "webcli_exporter_health_status == 0",
-							For: "3m",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "WebcliUnhealthy",
+							Expr:   "webcli_exporter_health_status == 0",
+							For:    "3m",
+							Labels: commonLables,
 							Annotations: map[string]string{"summary": "webcli unhealthy"},
 						},
 						&RulesConfig{
-							Alert: "WebcliUnhealthy",
-							Expr: "rate(webcli_exporter_execute_command_failed[5m]) > 5",
-							For: "3m",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "WebcliUnhealthy",
+							Expr:   "rate(webcli_exporter_execute_command_failed[5m]) > 5",
+							For:    "3m",
+							Labels: commonLables,
 							Annotations: map[string]string{"summary": "The number of errors that occurred while executing the command was greater than 5 per-second."},
 						},
 					},
@@ -274,12 +251,10 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
 					Name: "NodeHealth",
 					Rules: []*RulesConfig{
 						&RulesConfig{
-							Alert: "NodeDown",
-							Expr: "absent(up{component=\"rbd_node\"})",
-							For: "30s",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "NodeDown",
+							Expr:   "absent(up{component=\"rbd_node\"})",
+							For:    "30s",
+							Labels: commonLables,
 							Annotations: map[string]string{
 								"description": "node {{ $labels.instance }} is down",
 								"summary":     "rbd_node is down",
@@ -289,42 +264,42 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
 							Alert: "high_cpu_usage_on_node",
 							Expr: "sum by(instance) (rate(process_cpu_seconds_total[5m])) * 100 > 70",
 							For: "5m",
-							Labels: map[string]string{"Alert": "Rainbond"},
+							Labels: commonLables,
 							Annotations: map[string]string{"description": "{{ $labels.instance }} is using a LOT of CPU. CPU usage is {{ humanize $value}}%.", "summary": "HIGH CPU USAGE WARNING ON '{{ $labels.instance }}'"},
 						},
 						&RulesConfig{
 							Alert: "high_la_usage_on_node",
 							Expr: "count by (instance) (node_load5) > count by(instance)(count by(job, instance, cpu)(node_cpu))",
 							For: "5m",
-							Labels: map[string]string{"Alert": "Rainbond"},
+							Labels: commonLables,
 							Annotations: map[string]string{"description": "{{ $labels.instance }} has a high load average. Load Average 5m is {{ humanize $value}}.", "summary": "HIGH LOAD AVERAGE WARNING ON '{{ $labels.instance }}'"},
 						},
 						&RulesConfig{
 							Alert: "inode_freerate_low",
 							Expr: "node_filesystem_files_free{fstype=~\"ext4|xfs\"} / node_filesystem_files{fstype=~\"ext4|xfs\"} < 0.3",
 							For: "5m",
-							Labels: map[string]string{"service": "node_filesystem_files_free"},
+							Labels: commonLables,
 							Annotations: map[string]string{"description": "the inode free rate is low of node {{ $labels.instance }}, current value is {{ humanize $value}}."},
 						},
 						&RulesConfig{
 							Alert: "high_rootdisk_usage_on_node",
 							Expr: "(node_filesystem_size{mountpoint='/'} - node_filesystem_free{mountpoint='/'}) * 100 / node_filesystem_size{mountpoint='/'} > 75",
 							For: "5m",
-							Labels: map[string]string{"Alert": "Rainbond"},
+							Labels: commonLables,
 							Annotations: map[string]string{"description": "More than 75% of disk used. Disk usage {{ humanize $value }} mountpoint {{ $labels.mountpoint }}%.", "summary": "LOW DISK SPACE WARING:NODE '{{ $labels.instance }}"},
 						},
 						&RulesConfig{
 							Alert: "high_dockerdisk_usage_on_node",
 							Expr: "(node_filesystem_size{mountpoint='/var/lib/docker'} - node_filesystem_free{mountpoint='/var/lib/docker'}) * 100 / node_filesystem_size{mountpoint='/var/lib/docker'} > 75",
 							For: "5m",
-							Labels: map[string]string{"Alert": "Rainbond"},
+							Labels: commonLables,
 							Annotations: map[string]string{"description": "More than 75% of disk used. Disk usage {{ humanize $value }} mountpoint {{ $labels.mountpoint }}%.", "summary": "LOW DISK SPACE WARING:NODE '{{ $labels.instance }}"},
 						},
 						&RulesConfig{
 							Alert: "high_memory_usage_on_node",
 							Expr: "((node_memory_MemTotal - node_memory_MemAvailable) / node_memory_MemTotal) * 100 > 80",
 							For: "5m",
-							Labels: map[string]string{"Alert": "Rainbond"},
+							Labels: commonLables,
 							Annotations: map[string]string{"description": "{{ $labels.instance }} is using a LOT of MEMORY. MEMORY usage is over {{ humanize $value}}%.", "summary": "HIGH MEMORY USAGE WARNING TASK ON '{{ $labels.instance }}'"},
 						},
 					},
@@ -336,21 +311,21 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
 							Alert: "cluster_node_unhealth",
 							Expr: "rainbond_cluster_node_health != 0",
 							For: "3m",
-							Labels: map[string]string{"Alert": "Rainbond"},
+							Labels: commonLables,
 							Annotations: map[string]string{"description": "cluster node {{ $labels.node_ip }} is unhealth"},
 						},
 						&RulesConfig{
 							Alert: "cluster_kube_node_unhealth",
 							Expr: "rainbond_cluster_component_health{component=\"KubeNodeReady\"} != 0",
 							For: "3m",
-							Labels: map[string]string{"Alert": "Rainbond"},
+							Labels: commonLables,
 							Annotations: map[string]string{"description": "kubernetes cluster node {{ $labels.node_ip }} is unhealth"},
 						},
 						&RulesConfig{
 							Alert: "rainbond_cluster_collector_duration_seconds_timeout",
 							Expr: "rainbond_cluster_collector_duration_seconds > 10",
 							For: "3m",
-							Labels: map[string]string{"Alert": "Rainbond"},
+							Labels: commonLables,
 							Annotations: map[string]string{"description": "Cluster collector '{{ $labels.instance }}' more than 10s"},
 						},
 					},
@@ -359,128 +334,100 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
 					Name: "EtcdHealth",
 					Rules: []*RulesConfig{
 						&RulesConfig{
-							Alert: "EtcdDown",
-							Expr: "absent(up{component=\"etcd\"})",
-							For: "1m",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "EtcdDown",
+							Expr:   "absent(up{component=\"etcd\"})",
+							For:    "1m",
+							Labels: commonLables,
 							Annotations: map[string]string{
 								"description": "etcd node {{ $labels.instance }} is down, ",
 								"summary":     "etcd node is down",
 							},
 						},
 						&RulesConfig{
-							Alert: "EtcdLoseLeader",
-							Expr: "etcd_server_has_leader == 0",
-							For: "1m",
-							Labels: map[string]string{
-								"Alert": "Rainbond",
-							},
+							Alert:  "EtcdLoseLeader",
+							Expr:   "etcd_server_has_leader == 0",
+							For:    "1m",
+							Labels: commonLables,
 							Annotations: map[string]string{
 								"description": "etcd node {{ $labels.instance }} is lose leader",
 								"summary":     "etcd lose leader",
 							},
 						},
 						&RulesConfig{
-							Alert: "InsufficientMembers",
-							Expr: "count(up{job=\"etcd\"} == 0) > (count(up{job=\"etcd\"}) / 2 - 1)",
-							For: "1m",
-							Labels: map[string]string{
-								"severity": "critical",
-								"Alert": "Rainbond",
-							},
+							Alert:  "InsufficientMembers",
+							Expr:   "count(up{job=\"etcd\"} == 0) > (count(up{job=\"etcd\"}) / 2 - 1)",
+							For:    "1m",
+							Labels: getseverityLables("critical"),
 							Annotations: map[string]string{
 								"description": "If one more etcd member goes down the cluster will be unavailable",
 								"summary":     "etcd cluster insufficient members",
 							},
 						},
 						&RulesConfig{
-							Alert: "HighNumberOfLeaderChanges",
-							Expr: "increase(etcd_server_leader_changes_seen_total{job=\"etcd\"}[1h]) > 3",
-							For: "1m",
-							Labels: map[string]string{
-								"severity": "warning",
-								"Alert": "Rainbond",
-							},
+							Alert:  "HighNumberOfLeaderChanges",
+							Expr:   "increase(etcd_server_leader_changes_seen_total{job=\"etcd\"}[1h]) > 3",
+							For:    "1m",
+							Labels: getseverityLables("warning"),
 							Annotations: map[string]string{
 								"description": "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
 								"summary":     "a high number of leader changes within the etcd cluster are happening",
 							},
 						},
 						&RulesConfig{
-							Alert: "HighNumberOfFailedGRPCRequests",
-							Expr: "sum(rate(etcd_grpc_requests_failed_total{job=\"etcd\"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job=\"etcd\"}[5m])) BY (grpc_method) > 0.05",
-							For: "5m",
-							Labels: map[string]string{
-								"severity": "critical",
-								"Alert": "Rainbond",
-							},
+							Alert:  "HighNumberOfFailedGRPCRequests",
+							Expr:   "sum(rate(etcd_grpc_requests_failed_total{job=\"etcd\"}[5m])) BY (grpc_method) / sum(rate(etcd_grpc_total{job=\"etcd\"}[5m])) BY (grpc_method) > 0.05",
+							For:    "5m",
+							Labels: getseverityLables("critical"),
 							Annotations: map[string]string{
 								"description": "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
 								"summary":     "a high number of gRPC requests are failing",
 							},
 						},
 						&RulesConfig{
-							Alert: "HighNumberOfFailedHTTPRequests",
-							Expr: "sum(rate(etcd_http_failed_total{job=\"etcd\"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=\"etcd\"}[5m]))BY (method) > 0.05",
-							For: "1m",
-							Labels: map[string]string{
-								"severity": "critical",
-								"Alert": "Rainbond",
-							},
+							Alert:  "HighNumberOfFailedHTTPRequests",
+							Expr:   "sum(rate(etcd_http_failed_total{job=\"etcd\"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=\"etcd\"}[5m]))BY (method) > 0.05",
+							For:    "1m",
+							Labels: getseverityLables("critical"),
 							Annotations: map[string]string{
 								"description": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
 								"summary":     "a high number of HTTP requests are failing",
 							},
 						},
 						&RulesConfig{
-							Alert: "GRPCRequestsSlow",
-							Expr: "histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15",
-							For: "1m",
-							Labels: map[string]string{
-								"severity": "critical",
-								"Alert": "Rainbond",
-							},
+							Alert:  "GRPCRequestsSlow",
+							Expr:   "histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) > 0.15",
+							For:    "1m",
+							Labels: getseverityLables("critical"),
 							Annotations: map[string]string{
 								"description": "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method}} are slow",
 								"summary":     "slow gRPC requests",
 							},
 						},
 						&RulesConfig{
-							Alert: "HighNumberOfFailedHTTPRequests",
-							Expr: "sum(rate(etcd_http_failed_total{job=\"etcd\"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=\"etcd\"}[5m]))BY (method) > 0.05",
-							For: "1m",
-							Labels: map[string]string{
-								"severity": "critical",
-								"Alert": "Rainbond",
-							},
+							Alert:  "HighNumberOfFailedHTTPRequests",
+							Expr:   "sum(rate(etcd_http_failed_total{job=\"etcd\"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=\"etcd\"}[5m]))BY (method) > 0.05",
+							For:    "1m",
+							Labels: getseverityLables("critical"),
 							Annotations: map[string]string{
 								"description": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
 								"summary":     "a high number of HTTP requests are failing",
 							},
 						},
 						&RulesConfig{
-							Alert: "HighNumberOfFailedHTTPRequests",
-							Expr: "sum(rate(etcd_http_failed_total{job=\"etcd\"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=\"etcd\"}[5m]))BY (method) > 0.05",
-							For: "1m",
-							Labels: map[string]string{
-								"severity": "critical",
-								"Alert": "Rainbond",
-							},
+							Alert:  "HighNumberOfFailedHTTPRequests",
+							Expr:   "sum(rate(etcd_http_failed_total{job=\"etcd\"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=\"etcd\"}[5m]))BY (method) > 0.05",
+							For:    "1m",
+							Labels: getseverityLables("critical"),
 							Annotations: map[string]string{
 								"description": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
 								"summary":     "a high number of HTTP requests are failing",
 							},
 						},
 						&RulesConfig{
-							Alert: "DatabaseSpaceExceeded",
-							Expr: "etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes > 0.80",
-							For: "1m",
-							Labels: map[string]string{
-								"severity": "critical",
-								"Alert": "Rainbond",
-							},
+							Alert:  "DatabaseSpaceExceeded",
+							Expr:   "etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes > 0.80",
+							For:    "1m",
+							Labels: getseverityLables("critical"),
 							Annotations: map[string]string{
 								"description": "{{ $labels.instance }}, {{ $labels.job }} of etcd DB space uses more than 80%",
 								"summary":     "Etcd DB space is overused",