Skip to content

Commit 05ebf65

Browse files
labels missing for alert to fire (#57)
* labels missing * Update generic.yaml * Update healthchecks.yaml * Update mon.yaml * Update nodes.yaml * Update osd.yaml * Update pgr.yaml * Update pools.yaml * Update Chart.yaml * Update plugindefinition.yaml * Update generic.yaml
1 parent 669811e commit 05ebf65

10 files changed

Lines changed: 105 additions & 10 deletions

File tree

charts/ceph-operations/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ apiVersion: v2
22
name: ceph-operations
33
description: Ceph operations bundle
44
type: application
5-
version: 1.7.8
5+
version: 1.7.9
66
maintainers:
77
- name: sumitarora2786
88
- name: richardtief

charts/ceph-operations/alerts/cluster.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ groups:
99
expr: ceph_health_status == 2
1010
for: 5m
1111
labels:
12+
service: ceph
1213
oid: "1.3.6.1.4.1.50495.1.2.1.2.1"
1314
severity: critical
1415
type: ceph_default
@@ -24,6 +25,7 @@ groups:
2425
expr: ceph_health_status == 1
2526
for: 15m
2627
labels:
28+
service: ceph
2729
severity: warning
2830
type: ceph_default
2931
inhibited_by: cluster-maintenance

charts/ceph-operations/alerts/generic.yaml

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ groups:
99
expr: ceph_health_detail{name="RECENT_CRASH"} == 1
1010
for: 1m
1111
labels:
12+
service: ceph
1213
oid: "1.3.6.1.4.1.50495.1.2.1.1.2"
1314
severity: critical
1415
type: ceph_default
@@ -20,13 +21,15 @@ groups:
2021
{{- end }}
2122

2223
{{- if not (.Values.prometheusRules.disabled.PrometheusJobMissing | default false) }}
23-
- alert: "PrometheusJobMissing"
24+
- alert: "PrometheusMgrJobMissing"
2425
expr: "absent(up{job=\"rook-ceph-mgr\"})"
25-
for: "30s"
26+
for: "1m"
2627
labels:
27-
oid: "1.3.6.1.4.1.50495.1.2.1.12.1"
28-
severity: "critical"
28+
service: ceph
29+
oid: "1.3.6.1.4.1.50495.1.2.1.12.2"
30+
severity: "warning"
2931
type: "ceph_default"
32+
{{- include "cloud-storage-operations.additionalRuleLabels" . | nindent 6 }}
3033
annotations:
3134
description: "The prometheus job that scrapes from Ceph MGR is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance."
3235
summary: "The scrape job for Ceph MGR is missing from Prometheus"
@@ -35,13 +38,45 @@ groups:
3538
{{- if not (.Values.prometheusRules.disabled.PrometheusJobExporterMissing | default false) }}
3639
- alert: "PrometheusJobExporterMissing"
3740
expr: "sum(absent(up{job=\"rook-ceph-exporter\"})) and sum(ceph_osd_metadata{ceph_version=~\"^ceph version (1[89]|[2-9][0-9]).*\"}) > 0"
38-
for: "30s"
41+
for: "1m"
3942
labels:
40-
oid: "1.3.6.1.4.1.50495.1.2.1.12.1"
41-
severity: "critical"
43+
service: ceph
44+
oid: "1.3.6.1.4.1.50495.1.2.1.12.3"
45+
severity: "warning"
4246
type: "ceph_default"
47+
{{- include "cloud-storage-operations.additionalRuleLabels" . | nindent 6 }}
4348
annotations:
4449
description: "The prometheus job that scrapes from Ceph Exporter is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance."
4550
summary: "The scrape job for Ceph Exporter is missing from Prometheus"
4651
{{- end }}
52+
53+
{{- if not (.Values.prometheusRules.disabled.CephMgrServiceScrapeIssue | default false) }}
54+
- alert: "CephMgrServiceScrapeIssue"
55+
expr: up{service="rook-ceph-mgr"} == 0
56+
for: "1m"
57+
labels:
58+
service: ceph
59+
oid: "1.3.6.1.4.1.50495.1.2.1.12.4"
60+
severity: "warning"
61+
type: "ceph_default"
62+
{{- include "cloud-storage-operations.additionalRuleLabels" . | nindent 6 }}
63+
annotations:
64+
description: "The ServiceMonitor target 'rook-ceph-mgr' has not been responding to Prometheus scrapes for more than 1 minute."
65+
summary: "Prometheus cannot scrape the Ceph MGR service"
66+
{{- end }}
67+
68+
{{- if not (.Values.prometheusRules.disabled.CephExporterServiceScrapeIssue | default false) }}
69+
- alert: "CephExporterServiceScrapeIssue"
70+
expr: up{service="rook-ceph-exporter"} == 0
71+
for: "1m"
72+
labels:
73+
service: ceph
74+
oid: "1.3.6.1.4.1.50495.1.2.1.12.5"
75+
severity: "warning"
76+
type: "ceph_default"
77+
{{- include "cloud-storage-operations.additionalRuleLabels" . | nindent 6 }}
78+
annotations:
79+
description: "The ServiceMonitor target 'rook-ceph-exporter' has not been responding to Prometheus scrapes for more than 1 minute."
80+
summary: "Prometheus cannot scrape the Ceph Exporter service"
81+
{{- end }}
4782
{{- end }}

charts/ceph-operations/alerts/healthchecks.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ groups:
99
expr: ceph_healthcheck_slow_ops > 0
1010
for: 30s
1111
labels:
12+
service: ceph
1213
severity: warning
1314
type: ceph_default
1415
{{- include "cloud-storage-operations.additionalRuleLabels" . | nindent 6 }}
@@ -23,6 +24,7 @@ groups:
2324
for: 30s
2425
expr: ceph_daemon_health_metrics{type="SLOW_OPS"} > 0
2526
labels:
27+
service: ceph
2628
severity: warning
2729
type: ceph_default
2830
{{- include "cloud-storage-operations.additionalRuleLabels" . | nindent 6 }}
@@ -37,6 +39,7 @@ groups:
3739
expr: "ceph_health_detail{name=\"HARDWARE_STORAGE\"} > 0"
3840
for: "30s"
3941
labels:
42+
service: ceph
4043
oid: "1.3.6.1.4.1.50495.1.2.1.13.1"
4144
severity: "critical"
4245
type: "ceph_default"
@@ -51,6 +54,7 @@ groups:
5154
expr: "ceph_health_detail{name=\"HARDWARE_MEMORY\"} > 0"
5255
for: "30s"
5356
labels:
57+
service: ceph
5458
oid: "1.3.6.1.4.1.50495.1.2.1.13.2"
5559
severity: "critical"
5660
type: "ceph_default"
@@ -65,6 +69,7 @@ groups:
6569
expr: "ceph_health_detail{name=\"HARDWARE_PROCESSOR\"} > 0"
6670
for: "30s"
6771
labels:
72+
service: ceph
6873
oid: "1.3.6.1.4.1.50495.1.2.1.13.3"
6974
severity: "critical"
7075
type: "ceph_default"
@@ -79,6 +84,7 @@ groups:
7984
expr: "ceph_health_detail{name=\"HARDWARE_NETWORK\"} > 0"
8085
for: "30s"
8186
labels:
87+
service: ceph
8288
oid: "1.3.6.1.4.1.50495.1.2.1.13.4"
8389
severity: "critical"
8490
type: "ceph_default"
@@ -93,6 +99,7 @@ groups:
9399
expr: "ceph_health_detail{name=\"HARDWARE_POWER\"} > 0"
94100
for: "30s"
95101
labels:
102+
service: ceph
96103
oid: "1.3.6.1.4.1.50495.1.2.1.13.5"
97104
severity: "critical"
98105
type: "ceph_default"
@@ -107,6 +114,7 @@ groups:
107114
expr: "ceph_health_detail{name=\"HARDWARE_FANS\"} > 0"
108115
for: "30s"
109116
labels:
117+
service: ceph
110118
oid: "1.3.6.1.4.1.50495.1.2.1.13.6"
111119
severity: "critical"
112120
type: "ceph_default"

charts/ceph-operations/alerts/mon.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ groups:
1414
) == 1
1515
for: 30s
1616
labels:
17+
service: ceph
1718
oid: "1.3.6.1.4.1.50495.1.2.1.3.1"
1819
severity: critical
1920
type: ceph_default
@@ -32,6 +33,7 @@ groups:
3233
count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1)
3334
for: 30s
3435
labels:
36+
service: ceph
3537
severity: warning
3638
type: ceph_default
3739
inhibited_by: node-maintenance
@@ -48,6 +50,7 @@ groups:
4850
expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
4951
for: 1m
5052
labels:
53+
service: ceph
5154
oid: "1.3.6.1.4.1.50495.1.2.1.3.2"
5255
severity: critical
5356
type: ceph_default
@@ -64,6 +67,7 @@ groups:
6467
expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
6568
for: 5m
6669
labels:
70+
service: ceph
6771
severity: warning
6872
type: ceph_default
6973
{{- include "cloud-storage-operations.additionalRuleLabels" . | nindent 6 }}
@@ -79,6 +83,7 @@ groups:
7983
expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
8084
for: 1m
8185
labels:
86+
service: ceph
8287
severity: warning
8388
type: ceph_default
8489
{{- include "cloud-storage-operations.additionalRuleLabels" . | nindent 6 }}

charts/ceph-operations/alerts/nodes.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ groups:
99
expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5
1010
for: 5m
1111
labels:
12+
service: ceph
1213
oid: "1.3.6.1.4.1.50495.1.2.1.8.1"
1314
severity: critical
1415
type: ceph_default
@@ -32,6 +33,7 @@ groups:
3233
rate(node_network_transmit_drop_total{device="bond0"}[1m])
3334
) >= 10
3435
labels:
36+
service: ceph
3537
oid: "1.3.6.1.4.1.50495.1.2.1.8.2"
3638
severity: warning
3739
type: ceph_default
@@ -55,6 +57,7 @@ groups:
5557
rate(node_network_transmit_errs_total{device="bond0"}[1m])
5658
) >= 10
5759
labels:
60+
service: ceph
5861
oid: "1.3.6.1.4.1.50495.1.2.1.8.3"
5962
severity: warning
6063
type: ceph_default
@@ -69,6 +72,7 @@ groups:
6972
expr: node_bonding_slaves - node_bonding_active != 0
7073
for: 10m
7174
labels:
75+
service: ceph
7276
severity: warning
7377
type: ceph_default
7478
{{- include "cloud-storage-operations.additionalRuleLabels" . | nindent 6 }}
@@ -82,6 +86,7 @@ groups:
8286
expr: node_bonding_active != 2
8387
for: 10m
8488
labels:
89+
service: ceph
8590
severity: warning
8691
type: ceph_default
8792
{{- include "cloud-storage-operations.additionalRuleLabels" . | nindent 6 }}
@@ -95,6 +100,7 @@ groups:
95100
expr: max_over_time(node_bonding_active[6h]) unless ignoring(instance, pod) node_bonding_active
96101
for: 30m
97102
labels:
103+
service: ceph
98104
severity: warning
99105
type: ceph_default
100106
{{- include "cloud-storage-operations.additionalRuleLabels" . | nindent 6 }}
@@ -108,6 +114,7 @@ groups:
108114
expr: up{job="node-exporter"} == 0
109115
for: 10m
110116
labels:
117+
service: ceph
111118
severity: warning
112119
type: ceph_default
113120
{{- include "cloud-storage-operations.additionalRuleLabels" . | nindent 6 }}
@@ -120,6 +127,7 @@ groups:
120127
- alert: CephNodeDiskspaceWarning
121128
expr: predict_linear(node_filesystem_free_bytes{device=~"/.*", mountpoint!="/boot"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0
122129
labels:
130+
service: ceph
123131
oid: "1.3.6.1.4.1.50495.1.2.1.8.4"
124132
severity: warning
125133
type: ceph_default
@@ -135,6 +143,7 @@ groups:
135143
expr: |
136144
node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) )
137145
labels:
146+
service: ceph
138147
severity: warning
139148
type: ceph_default
140149
{{- include "cloud-storage-operations.additionalRuleLabels" . | nindent 6 }}

0 commit comments

Comments
 (0)