mirror of
https://gitee.com/rainbond/Rainbond.git
synced 2024-11-30 18:58:02 +08:00
[ADD] add node default alerting rules
This commit is contained in:
parent
e8314d8fc2
commit
f4f5fd1963
@ -6,7 +6,6 @@ import (
|
|||||||
"gopkg.in/yaml.v2"
|
"gopkg.in/yaml.v2"
|
||||||
"os"
|
"os"
|
||||||
"github.com/goodrain/rainbond/cmd/monitor/option"
|
"github.com/goodrain/rainbond/cmd/monitor/option"
|
||||||
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type AlertingRulesConfig struct {
|
type AlertingRulesConfig struct {
|
||||||
@ -35,19 +34,6 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
|
|||||||
a := &AlertingRulesManager{
|
a := &AlertingRulesManager{
|
||||||
RulesConfig: &AlertingRulesConfig{
|
RulesConfig: &AlertingRulesConfig{
|
||||||
Groups: []*AlertingNameConfig{
|
Groups: []*AlertingNameConfig{
|
||||||
&AlertingNameConfig{
|
|
||||||
|
|
||||||
Name: "InstanceHealth",
|
|
||||||
Rules: []*RulesConfig{
|
|
||||||
&RulesConfig{
|
|
||||||
Alert: "InstanceDown",
|
|
||||||
Expr: "up == 0",
|
|
||||||
For: "3m",
|
|
||||||
Labels: map[string]string{},
|
|
||||||
Annotations: map[string]string{"summary": "builder {{$labels.instance}} down", "description":"{{$labels.instance}} of job {{$labels.job}} has been down for more than 3 minutes"},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
&AlertingNameConfig{
|
&AlertingNameConfig{
|
||||||
|
|
||||||
Name: "BuilderHealth",
|
Name: "BuilderHealth",
|
||||||
@ -161,6 +147,47 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
&AlertingNameConfig{
|
||||||
|
|
||||||
|
Name: "NodeHealth",
|
||||||
|
Rules: []*RulesConfig{
|
||||||
|
&RulesConfig{
|
||||||
|
Alert: "high_cpu_usage_on_node",
|
||||||
|
Expr: "sum by(instance) (rate(process_cpu_seconds_total[5m])) * 100 > 70",
|
||||||
|
For: "5m",
|
||||||
|
Labels: map[string]string{"service": "node_cpu"},
|
||||||
|
Annotations: map[string]string{"description": "{{ $labels.instance }} is using a LOT of CPU. CPU usage is {{ humanize $value}}%.", "summary": "HIGH CPU USAGE WARNING ON '{{ $labels.instance }}'"},
|
||||||
|
},
|
||||||
|
&RulesConfig{
|
||||||
|
Alert: "high_la_usage_on_node",
|
||||||
|
Expr: "node_load5 > 5",
|
||||||
|
For: "5m",
|
||||||
|
Labels: map[string]string{"service": "node_load5"},
|
||||||
|
Annotations: map[string]string{"description": "{{ $labels.instance }} has a high load average. Load Average 5m is {{ humanize $value}}.", "summary": "HIGH LOAD AVERAGE WARNING ON '{{ $labels.instance }}'"},
|
||||||
|
},
|
||||||
|
&RulesConfig{
|
||||||
|
Alert: "node_running_out_of_disk_space",
|
||||||
|
Expr: "(node_filesystem_size{mountpoint='/'} - node_filesystem_free{mountpoint='/'}) * 100 / node_filesystem_size{mountpoint='/'} > 80",
|
||||||
|
For: "5m",
|
||||||
|
Labels: map[string]string{"service": "node_running_out_of_disk_space"},
|
||||||
|
Annotations: map[string]string{"description": "More than 80% of disk used. Disk usage {{ humanize $value }}%.", "summary": "LOW DISK SPACE WARING:NODE '{{ $labels.instance }}"},
|
||||||
|
},
|
||||||
|
&RulesConfig{
|
||||||
|
Alert: "monitoring_service_down",
|
||||||
|
Expr: "up == 0",
|
||||||
|
For: "5m",
|
||||||
|
Labels: map[string]string{"service": "service_down"},
|
||||||
|
Annotations: map[string]string{"description": "The monitoring service '{{ $labels.job }}' is down.", "summary": "MONITORING SERVICE DOWN WARNING:NODE '{{ $labels.instance }}'"},
|
||||||
|
},
|
||||||
|
&RulesConfig{
|
||||||
|
Alert: "high_memory_usage_on_node",
|
||||||
|
Expr: "((node_memory_MemTotal - node_memory_MemAvailable) / node_memory_MemTotal) * 100 > 80",
|
||||||
|
For: "5m",
|
||||||
|
Labels: map[string]string{"service": "node_memory"},
|
||||||
|
Annotations: map[string]string{"description": "{{ $labels.instance }} is using a LOT of MEMORY. MEMORY usage is over {{ humanize $value}}%.", "summary": "HIGH MEMORY USAGE WARNING TASK ON '{{ $labels.instance }}'"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
config: config,
|
config: config,
|
||||||
|
Loading…
Reference in New Issue
Block a user