Skip to content

Commit 1b5a0e9

Browse files
committed
ROX-33584: Reduce the severity of the fleetshard-sync unavailability alerts
1 parent 4648a91 commit 1b5a0e9

4 files changed

Lines changed: 6 additions & 6 deletions

File tree

resources/prometheus/prometheus-rules.yaml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -106,7 +106,7 @@ spec:
106106
(avg_over_time(up{pod=~"fleetshard-sync-.*"}[10m]) < 0.5 and ON(pod) kube_pod_container_status_ready{pod=~"fleetshard-sync-.*"} == 1) or absent(up{pod=~"fleetshard-sync-.*"})
107107
for: 20m
108108
labels:
109-
severity: critical
109+
severity: warning
110110
annotations:
111111
summary: "Prometheus unable to scrape metrics from target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}`."
112112
description: "During the last 10 minutes, only `{{ $value | humanizePercentage }}` of scrapes of target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}` were successful. This alert is raised when less than 50% of scrapes are successful."
@@ -116,15 +116,15 @@ spec:
116116
avg_over_time(kube_pod_container_status_ready{pod=~"fleetshard-sync-.*"}[10m]) < 0.5
117117
for: 20m
118118
labels:
119-
severity: critical
119+
severity: warning
120120
annotations:
121121
summary: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` is down or in a CrashLoopBackOff status."
122122
description: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has been down or in a CrashLoopBackOff status for at least 10 minutes."
123123
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-005-fleetshard-sync-unavailable.md"
124124
- alert: RHACSFleetshardSyncContainerFrequentlyRestarting
125125
expr: increase(kube_pod_container_status_restarts_total{pod=~"fleetshard-sync-.*"}[30m]) > 3
126126
labels:
127-
severity: critical
127+
severity: warning
128128
annotations:
129129
summary: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` restarted more than 3 times."
130130
description: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has restarted more than 3 times during the last 30 minutes."

resources/prometheus/unit_tests/RHACSFleetshardSyncContainerDown.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ tests:
2020
container: fleetshard-sync
2121
namespace: rhacs
2222
pod: fleetshard-sync-1234
23-
severity: critical
23+
severity: warning
2424
exp_annotations:
2525
summary: "Fleetshard synchronizer container `fleetshard-sync-1234/fleetshard-sync` in namespace `rhacs` is down or in a CrashLoopBackOff status."
2626
description: "Fleetshard synchronizer container `fleetshard-sync-1234/fleetshard-sync` in namespace `rhacs` has been down or in a CrashLoopBackOff status for at least 10 minutes."

resources/prometheus/unit_tests/RHACSFleetshardSyncContainerFrequentlyRestarting.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ tests:
2020
container: fleetshard-sync
2121
namespace: rhacs
2222
pod: fleetshard-sync-1234
23-
severity: critical
23+
severity: warning
2424
exp_annotations:
2525
summary: "Fleetshard synchronizer container `fleetshard-sync-1234/fleetshard-sync` in namespace `rhacs` restarted more than 3 times."
2626
description: "Fleetshard synchronizer container `fleetshard-sync-1234/fleetshard-sync` in namespace `rhacs` has restarted more than 3 times during the last 30 minutes."

resources/prometheus/unit_tests/RHACSFleetshardSyncScrapeFailed.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ tests:
2222
instance: 1.2.3.4:9090
2323
namespace: rhacs
2424
pod: fleetshard-sync-1234
25-
severity: critical
25+
severity: warning
2626
exp_annotations:
2727
summary: "Prometheus unable to scrape metrics from target `fleetshard-sync-1234` in namespace `rhacs`."
2828
description: "During the last 10 minutes, only `40%` of scrapes of target `fleetshard-sync-1234` in namespace `rhacs` were successful. This alert is raised when less than 50% of scrapes are successful."

0 commit comments

Comments (0)