Skip to content

Commit 3ae33bf

Browse files
add ceph healthcheck alerts
add ceph healthcheck alerts
1 parent 9016850 commit 3ae33bf

1 file changed

Lines changed: 78 additions & 0 deletions

File tree

charts/ceph-operations/alerts/healthchecks.yaml

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,82 @@ groups:
3131
description: "`{{`{{ $labels.ceph_daemon }}`}}` operations are taking too long to process (complaint time exceeded)"
3232
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
3333
{{- end }}
34+
35+
{{- if not (.Values.prometheusRules.disabled.HardwareStorageError | default false) }}
# HardwareStorageError: critical alert raised once the ceph_health_detail
# series named HARDWARE_STORAGE has stayed above 0 for 30s.
# The rule is rendered unless prometheusRules.disabled.HardwareStorageError
# is set to a true value in the chart values (unset counts as false).
36+
- alert: "HardwareStorageError"
37+
expr: "ceph_health_detail{name=\"HARDWARE_STORAGE\"} > 0"
38+
for: "30s"
39+
labels:
40+
oid: "1.3.6.1.4.1.50495.1.2.1.13.1"
41+
severity: "critical"
42+
type: "ceph_default"
43+
annotations:
44+
description: "Some storage devices are in error. Check `ceph health detail`."
45+
summary: "Storage devices error(s) detected"
46+
{{- end }}
47+
48+
{{- if not (.Values.prometheusRules.disabled.HardwareMemoryError | default false) }}
# HardwareMemoryError: critical alert raised once the ceph_health_detail
# series named HARDWARE_MEMORY has stayed above 0 for 30s.
# The rule is rendered unless prometheusRules.disabled.HardwareMemoryError
# is set to a true value in the chart values (unset counts as false).
49+
- alert: "HardwareMemoryError"
50+
expr: "ceph_health_detail{name=\"HARDWARE_MEMORY\"} > 0"
51+
for: "30s"
52+
labels:
53+
oid: "1.3.6.1.4.1.50495.1.2.1.13.2"
54+
severity: "critical"
55+
type: "ceph_default"
56+
annotations:
57+
description: "DIMM error(s) detected. Check `ceph health detail`."
58+
summary: "DIMM error(s) detected"
59+
{{- end }}
60+
61+
{{- if not (.Values.prometheusRules.disabled.HardwareProcessorError | default false) }}
# HardwareProcessorError: critical alert raised once the ceph_health_detail
# series named HARDWARE_PROCESSOR has stayed above 0 for 30s.
# The rule is rendered unless prometheusRules.disabled.HardwareProcessorError
# is set to a true value in the chart values (unset counts as false).
62+
- alert: "HardwareProcessorError"
63+
expr: "ceph_health_detail{name=\"HARDWARE_PROCESSOR\"} > 0"
64+
for: "30s"
65+
labels:
66+
oid: "1.3.6.1.4.1.50495.1.2.1.13.3"
67+
severity: "critical"
68+
type: "ceph_default"
69+
annotations:
70+
description: "Processor error(s) detected. Check `ceph health detail`."
71+
summary: "Processor error(s) detected"
72+
{{- end }}
73+
74+
{{- if not (.Values.prometheusRules.disabled.HardwareNetworkError | default false) }}
# HardwareNetworkError: critical alert raised once the ceph_health_detail
# series named HARDWARE_NETWORK has stayed above 0 for 30s.
# The rule is rendered unless prometheusRules.disabled.HardwareNetworkError
# is set to a true value in the chart values (unset counts as false).
75+
- alert: "HardwareNetworkError"
76+
expr: "ceph_health_detail{name=\"HARDWARE_NETWORK\"} > 0"
77+
for: "30s"
78+
labels:
79+
oid: "1.3.6.1.4.1.50495.1.2.1.13.4"
80+
severity: "critical"
81+
type: "ceph_default"
82+
annotations:
83+
description: "Network error(s) detected. Check `ceph health detail`."
84+
summary: "Network error(s) detected"
85+
{{- end }}
86+
87+
{{- if not (.Values.prometheusRules.disabled.HardwarePowerError | default false) }}
# HardwarePowerError: critical alert raised once the ceph_health_detail
# series named HARDWARE_POWER has stayed above 0 for 30s.
# The rule is rendered unless prometheusRules.disabled.HardwarePowerError
# is set to a true value in the chart values (unset counts as false).
88+
- alert: "HardwarePowerError"
89+
expr: "ceph_health_detail{name=\"HARDWARE_POWER\"} > 0"
90+
for: "30s"
91+
labels:
92+
oid: "1.3.6.1.4.1.50495.1.2.1.13.5"
93+
severity: "critical"
94+
type: "ceph_default"
95+
annotations:
96+
description: "Power supply error(s) detected. Check `ceph health detail`."
97+
summary: "Power supply error(s) detected"
98+
{{- end }}
99+
100+
{{- if not (.Values.prometheusRules.disabled.HardwareFanError | default false) }}
# HardwareFanError: critical alert raised once the ceph_health_detail
# series named HARDWARE_FANS (plural, unlike the alert name) has stayed
# above 0 for 30s.
# The rule is rendered unless prometheusRules.disabled.HardwareFanError
# is set to a true value in the chart values (unset counts as false).
101+
- alert: "HardwareFanError"
102+
expr: "ceph_health_detail{name=\"HARDWARE_FANS\"} > 0"
103+
for: "30s"
104+
labels:
105+
oid: "1.3.6.1.4.1.50495.1.2.1.13.6"
106+
severity: "critical"
107+
type: "ceph_default"
108+
annotations:
109+
description: "Fan error(s) detected. Check `ceph health detail`."
110+
summary: "Fan error(s) detected"
111+
{{- end }}
34112
{{- end }}

0 commit comments

Comments
 (0)