@@ -31,4 +31,82 @@ groups:
3131 description : " `{{`{{ $labels.ceph_daemon }}`}}` operations are taking too long to process (complaint time exceeded)"
3232 documentation : " https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
3333{{- end }}
34+
{{- /*
HardwareStorageError: fires when ceph_health_detail{name="HARDWARE_STORAGE"}
has been above 0 for 30s. The rule is rendered unless
.Values.prometheusRules.disabled.HardwareStorageError is set to a truthy value.
NOTE(review): the list-item indentation below must match the sibling rules in
this group -- confirm against the surrounding file.
*/}}
{{- if not (.Values.prometheusRules.disabled.HardwareStorageError | default false) }}
      - alert: "HardwareStorageError"
        expr: "ceph_health_detail{name=\"HARDWARE_STORAGE\"} > 0"
        for: "30s"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.13.1"
          severity: "critical"
          type: "ceph_default"
        annotations:
          description: "Some storage devices are in error. Check `ceph health detail`."
          summary: "Storage devices error(s) detected"
{{- end }}
47+
{{- /*
HardwareMemoryError: fires when ceph_health_detail{name="HARDWARE_MEMORY"}
has been above 0 for 30s. The rule is rendered unless
.Values.prometheusRules.disabled.HardwareMemoryError is set to a truthy value.
NOTE(review): the list-item indentation below must match the sibling rules in
this group -- confirm against the surrounding file.
*/}}
{{- if not (.Values.prometheusRules.disabled.HardwareMemoryError | default false) }}
      - alert: "HardwareMemoryError"
        expr: "ceph_health_detail{name=\"HARDWARE_MEMORY\"} > 0"
        for: "30s"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.13.2"
          severity: "critical"
          type: "ceph_default"
        annotations:
          description: "DIMM error(s) detected. Check `ceph health detail`."
          summary: "DIMM error(s) detected"
{{- end }}
60+
{{- /*
HardwareProcessorError: fires when ceph_health_detail{name="HARDWARE_PROCESSOR"}
has been above 0 for 30s. The rule is rendered unless
.Values.prometheusRules.disabled.HardwareProcessorError is set to a truthy value.
NOTE(review): the list-item indentation below must match the sibling rules in
this group -- confirm against the surrounding file.
*/}}
{{- if not (.Values.prometheusRules.disabled.HardwareProcessorError | default false) }}
      - alert: "HardwareProcessorError"
        expr: "ceph_health_detail{name=\"HARDWARE_PROCESSOR\"} > 0"
        for: "30s"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.13.3"
          severity: "critical"
          type: "ceph_default"
        annotations:
          description: "Processor error(s) detected. Check `ceph health detail`."
          summary: "Processor error(s) detected"
{{- end }}
73+
{{- /*
HardwareNetworkError: fires when ceph_health_detail{name="HARDWARE_NETWORK"}
has been above 0 for 30s. The rule is rendered unless
.Values.prometheusRules.disabled.HardwareNetworkError is set to a truthy value.
NOTE(review): the list-item indentation below must match the sibling rules in
this group -- confirm against the surrounding file.
*/}}
{{- if not (.Values.prometheusRules.disabled.HardwareNetworkError | default false) }}
      - alert: "HardwareNetworkError"
        expr: "ceph_health_detail{name=\"HARDWARE_NETWORK\"} > 0"
        for: "30s"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.13.4"
          severity: "critical"
          type: "ceph_default"
        annotations:
          description: "Network error(s) detected. Check `ceph health detail`."
          summary: "Network error(s) detected"
{{- end }}
86+
{{- /*
HardwarePowerError: fires when ceph_health_detail{name="HARDWARE_POWER"}
has been above 0 for 30s. The rule is rendered unless
.Values.prometheusRules.disabled.HardwarePowerError is set to a truthy value.
NOTE(review): the list-item indentation below must match the sibling rules in
this group -- confirm against the surrounding file.
*/}}
{{- if not (.Values.prometheusRules.disabled.HardwarePowerError | default false) }}
      - alert: "HardwarePowerError"
        expr: "ceph_health_detail{name=\"HARDWARE_POWER\"} > 0"
        for: "30s"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.13.5"
          severity: "critical"
          type: "ceph_default"
        annotations:
          description: "Power supply error(s) detected. Check `ceph health detail`."
          summary: "Power supply error(s) detected"
{{- end }}
99+
{{- /*
HardwareFanError: fires when ceph_health_detail{name="HARDWARE_FANS"}
has been above 0 for 30s. The rule is rendered unless
.Values.prometheusRules.disabled.HardwareFanError is set to a truthy value.
NOTE(review): the list-item indentation below must match the sibling rules in
this group -- confirm against the surrounding file.
*/}}
{{- if not (.Values.prometheusRules.disabled.HardwareFanError | default false) }}
      - alert: "HardwareFanError"
        expr: "ceph_health_detail{name=\"HARDWARE_FANS\"} > 0"
        for: "30s"
        labels:
          oid: "1.3.6.1.4.1.50495.1.2.1.13.6"
          severity: "critical"
          type: "ceph_default"
        annotations:
          description: "Fan error(s) detected. Check `ceph health detail`."
          summary: "Fan error(s) detected"
{{- end }}
34112{{- end }}
0 commit comments