You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: rhobs/alerting/data_plane/prometheus.kueue_alerts.yaml
+46-78Lines changed: 46 additions & 78 deletions
Original file line number
Diff line number
Diff line change
@@ -9,82 +9,6 @@ spec:
9
9
- name: kueue.health
10
10
interval: 30s
11
11
rules:
12
-
- alert: TektonKueueControllerDown
13
-
expr: |
14
-
konflux_up{
15
-
namespace="tekton-kueue",
16
-
check="replicas-available",
17
-
service="tekton-kueue-controller-manager",
18
-
} != 1
19
-
for: 5m
20
-
labels:
21
-
severity: critical
22
-
component: tekton-kueue
23
-
slo: "true"
24
-
annotations:
25
-
summary: "Tekton Kueue controller is down"
26
-
description: "The Tekton Kueue controller has no available replicas in cluster {{ $labels.source_cluster }}. PipelineRuns will be created in Pending state but cannot be processed, resulting in stuck builds."
description: "The Tekton Kueue webhook has no available replicas in cluster {{ $labels.source_cluster }}. PipelineRuns may not be created with Pending state and Workload resources, bypassing Kueue admission control."
summary: "Tekton Kueue pods are repeatedly restarting"
63
-
description: "Tekton Kueue pods have restarted {{ $value }} times in the last 5 minutes in cluster {{ $labels.source_cluster }}. This may indicate resource pressure or application issues."
summary: "Kueue mutating webhook success rate is below 99%"
119
43
description: "The mutating webhook 'pipelinerun-kueue-defaulter.tekton-kueue.io' has had a success rate below 99% over the past 10 minutes in cluster {{ $labels.source_cluster }}. Possible causes include webhook errors, rejections, or unreachability (e.g., code=600)."
0 commit comments