Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions resources/prometheus/prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ spec:
(avg_over_time(up{pod=~"fleetshard-sync-.*"}[10m]) < 0.5 and ON(pod) kube_pod_container_status_ready{pod=~"fleetshard-sync-.*"} == 1) or absent(up{pod=~"fleetshard-sync-.*"})
for: 20m
labels:
- severity: critical
+ severity: warning
annotations:
summary: "Prometheus unable to scrape metrics from target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}`."
description: "During the last 10 minutes, only `{{ $value | humanizePercentage }}` of scrapes of target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}` were successful. This alert is raised when less than 50% of scrapes are successful."
Expand All @@ -116,15 +116,15 @@ spec:
avg_over_time(kube_pod_container_status_ready{pod=~"fleetshard-sync-.*"}[10m]) < 0.5
for: 20m
labels:
- severity: critical
+ severity: warning
annotations:
summary: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` is down or in a CrashLoopBackOff status."
description: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has been down or in a CrashLoopBackOff status for at least 10 minutes."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-005-fleetshard-sync-unavailable.md"
- alert: RHACSFleetshardSyncContainerFrequentlyRestarting
expr: increase(kube_pod_container_status_restarts_total{pod=~"fleetshard-sync-.*"}[30m]) > 3
labels:
- severity: critical
+ severity: warning
annotations:
summary: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` restarted more than 3 times."
description: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has restarted more than 3 times during the last 30 minutes."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ tests:
container: fleetshard-sync
namespace: rhacs
pod: fleetshard-sync-1234
- severity: critical
+ severity: warning
exp_annotations:
summary: "Fleetshard synchronizer container `fleetshard-sync-1234/fleetshard-sync` in namespace `rhacs` is down or in a CrashLoopBackOff status."
description: "Fleetshard synchronizer container `fleetshard-sync-1234/fleetshard-sync` in namespace `rhacs` has been down or in a CrashLoopBackOff status for at least 10 minutes."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ tests:
container: fleetshard-sync
namespace: rhacs
pod: fleetshard-sync-1234
- severity: critical
+ severity: warning
exp_annotations:
summary: "Fleetshard synchronizer container `fleetshard-sync-1234/fleetshard-sync` in namespace `rhacs` restarted more than 3 times."
description: "Fleetshard synchronizer container `fleetshard-sync-1234/fleetshard-sync` in namespace `rhacs` has restarted more than 3 times during the last 30 minutes."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ tests:
instance: 1.2.3.4:9090
namespace: rhacs
pod: fleetshard-sync-1234
- severity: critical
+ severity: warning
exp_annotations:
summary: "Prometheus unable to scrape metrics from target `fleetshard-sync-1234` in namespace `rhacs`."
description: "During the last 10 minutes, only `40%` of scrapes of target `fleetshard-sync-1234` in namespace `rhacs` were successful. This alert is raised when less than 50% of scrapes are successful."
Expand Down
Loading