You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
{{- if not (.Values.prometheusRules.disabled.PrometheusJobMissing | default false) }}
23
-
- alert: "PrometheusJobMissing"
24
+
- alert: "PrometheusMgrJobMissing"
24
25
expr: "absent(up{job=\"rook-ceph-mgr\"})"
25
-
for: "30s"
26
+
for: "1m"
26
27
labels:
27
-
oid: "1.3.6.1.4.1.50495.1.2.1.12.1"
28
-
severity: "critical"
28
+
service: ceph
29
+
oid: "1.3.6.1.4.1.50495.1.2.1.12.2"
30
+
severity: "warning"
29
31
type: "ceph_default"
32
+
{{- include "cloud-storage-operations.additionalRuleLabels" . | nindent 6 }}
30
33
annotations:
31
34
description: "The prometheus job that scrapes from Ceph MGR is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance."
32
35
summary: "The scrape job for Ceph MGR is missing from Prometheus"
@@ -35,13 +38,45 @@ groups:
35
38
{{- if not (.Values.prometheusRules.disabled.PrometheusJobExporterMissing | default false) }}
36
39
- alert: "PrometheusJobExporterMissing"
37
40
expr: "sum(absent(up{job=\"rook-ceph-exporter\"})) and sum(ceph_osd_metadata{ceph_version=~\"^ceph version (1[89]|[2-9][0-9]).*\"}) > 0"
38
-
for: "30s"
41
+
for: "1m"
39
42
labels:
40
-
oid: "1.3.6.1.4.1.50495.1.2.1.12.1"
41
-
severity: "critical"
43
+
service: ceph
44
+
oid: "1.3.6.1.4.1.50495.1.2.1.12.3"
45
+
severity: "warning"
42
46
type: "ceph_default"
47
+
{{- include "cloud-storage-operations.additionalRuleLabels" . | nindent 6 }}
43
48
annotations:
44
49
description: "The prometheus job that scrapes from Ceph Exporter is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance."
45
50
summary: "The scrape job for Ceph Exporter is missing from Prometheus"
46
51
{{- end }}
52
+
53
+
{{- if not (.Values.prometheusRules.disabled.CephMgrServiceScrapeIssue | default false) }}
54
+
- alert: "CephMgrServiceScrapeIssue"
55
+
expr: up{service="rook-ceph-mgr"} == 0
56
+
for: "1m"
57
+
labels:
58
+
service: ceph
59
+
oid: "1.3.6.1.4.1.50495.1.2.1.12.4"
60
+
severity: "warning"
61
+
type: "ceph_default"
62
+
{{- include "cloud-storage-operations.additionalRuleLabels" . | nindent 6 }}
63
+
annotations:
64
+
description: "The ServiceMonitor target 'rook-ceph-mgr' has not responded to Prometheus scrapes for more than 1 minute."
65
+
summary: "Prometheus cannot scrape the Ceph MGR service"
66
+
{{- end }}
67
+
68
+
{{- if not (.Values.prometheusRules.disabled.CephExporterServiceScrapeIssue | default false) }}
69
+
- alert: "CephExporterServiceScrapeIssue"
70
+
expr: up{service="rook-ceph-exporter"} == 0
71
+
for: "1m"
72
+
labels:
73
+
service: ceph
74
+
oid: "1.3.6.1.4.1.50495.1.2.1.12.5"
75
+
severity: "warning"
76
+
type: "ceph_default"
77
+
{{- include "cloud-storage-operations.additionalRuleLabels" . | nindent 6 }}
78
+
annotations:
79
+
description: "The ServiceMonitor target 'rook-ceph-exporter' has not responded to Prometheus scrapes for more than 1 minute."
80
+
summary: "Prometheus cannot scrape the Ceph Exporter service"
0 commit comments