From 1b5a0e928961013a380eebf5a4475c34d7252abc Mon Sep 17 00:00:00 2001 From: Yury Kovalev Date: Mon, 16 Mar 2026 10:53:35 +0100 Subject: [PATCH] ROX-33584: Reduce the severity of the fleetshard-sync unavailability alerts --- resources/prometheus/prometheus-rules.yaml | 6 +++--- .../unit_tests/RHACSFleetshardSyncContainerDown.yaml | 2 +- .../RHACSFleetshardSyncContainerFrequentlyRestarting.yaml | 2 +- .../unit_tests/RHACSFleetshardSyncScrapeFailed.yaml | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 19bdcafb..a4c94ee0 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -106,7 +106,7 @@ spec: (avg_over_time(up{pod=~"fleetshard-sync-.*"}[10m]) < 0.5 and ON(pod) kube_pod_container_status_ready{pod=~"fleetshard-sync-.*"} == 1) or absent(up{pod=~"fleetshard-sync-.*"}) for: 20m labels: - severity: critical + severity: warning annotations: summary: "Prometheus unable to scrape metrics from target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}`." description: "During the last 10 minutes, only `{{ $value | humanizePercentage }}` of scrapes of target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}` were successful. This alert is raised when less than 50% of scrapes are successful." @@ -116,7 +116,7 @@ spec: avg_over_time(kube_pod_container_status_ready{pod=~"fleetshard-sync-.*"}[10m]) < 0.5 for: 20m labels: - severity: critical + severity: warning annotations: summary: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` is down or in a CrashLoopBackOff status." description: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has been down or in a CrashLoopBackOff status for at least 10 minutes." @@ -124,7 +124,7 @@ spec: - alert: RHACSFleetshardSyncContainerFrequentlyRestarting expr: increase(kube_pod_container_status_restarts_total{pod=~"fleetshard-sync-.*"}[30m]) > 3 labels: - severity: critical + severity: warning annotations: summary: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` restarted more than 3 times." description: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has restarted more than 3 times during the last 30 minutes." diff --git a/resources/prometheus/unit_tests/RHACSFleetshardSyncContainerDown.yaml b/resources/prometheus/unit_tests/RHACSFleetshardSyncContainerDown.yaml index 571d641f..31f64239 100644 --- a/resources/prometheus/unit_tests/RHACSFleetshardSyncContainerDown.yaml +++ b/resources/prometheus/unit_tests/RHACSFleetshardSyncContainerDown.yaml @@ -20,7 +20,7 @@ tests: container: fleetshard-sync namespace: rhacs pod: fleetshard-sync-1234 - severity: critical + severity: warning exp_annotations: summary: "Fleetshard synchronizer container `fleetshard-sync-1234/fleetshard-sync` in namespace `rhacs` is down or in a CrashLoopBackOff status." description: "Fleetshard synchronizer container `fleetshard-sync-1234/fleetshard-sync` in namespace `rhacs` has been down or in a CrashLoopBackOff status for at least 10 minutes." diff --git a/resources/prometheus/unit_tests/RHACSFleetshardSyncContainerFrequentlyRestarting.yaml b/resources/prometheus/unit_tests/RHACSFleetshardSyncContainerFrequentlyRestarting.yaml index f9a64490..f1cd10d1 100644 --- a/resources/prometheus/unit_tests/RHACSFleetshardSyncContainerFrequentlyRestarting.yaml +++ b/resources/prometheus/unit_tests/RHACSFleetshardSyncContainerFrequentlyRestarting.yaml @@ -20,7 +20,7 @@ tests: container: fleetshard-sync namespace: rhacs pod: fleetshard-sync-1234 - severity: critical + severity: warning exp_annotations: summary: "Fleetshard synchronizer container `fleetshard-sync-1234/fleetshard-sync` in namespace `rhacs` restarted more than 3 times." description: "Fleetshard synchronizer container `fleetshard-sync-1234/fleetshard-sync` in namespace `rhacs` has restarted more than 3 times during the last 30 minutes." diff --git a/resources/prometheus/unit_tests/RHACSFleetshardSyncScrapeFailed.yaml b/resources/prometheus/unit_tests/RHACSFleetshardSyncScrapeFailed.yaml index 0d45493c..362bd4c9 100644 --- a/resources/prometheus/unit_tests/RHACSFleetshardSyncScrapeFailed.yaml +++ b/resources/prometheus/unit_tests/RHACSFleetshardSyncScrapeFailed.yaml @@ -22,7 +22,7 @@ tests: instance: 1.2.3.4:9090 namespace: rhacs pod: fleetshard-sync-1234 - severity: critical + severity: warning exp_annotations: summary: "Prometheus unable to scrape metrics from target `fleetshard-sync-1234` in namespace `rhacs`." description: "During the last 10 minutes, only `40%` of scrapes of target `fleetshard-sync-1234` in namespace `rhacs` were successful. This alert is raised when less than 50% of scrapes are successful."