Skip to content

Commit 8538f57

Browse files
committed
SPRE-5085: Add predictive alerts for Kueue resource saturation and admission wait times
1 parent 8e34853 commit 8538f57

2 files changed

Lines changed: 85 additions & 247 deletions

File tree

rhobs/alerting/data_plane/prometheus.kueue_alerts.yaml

Lines changed: 46 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -9,82 +9,6 @@ spec:
99
- name: kueue.health
1010
interval: 30s
1111
rules:
12-
- alert: TektonKueueControllerDown
13-
expr: |
14-
konflux_up{
15-
namespace="tekton-kueue",
16-
check="replicas-available",
17-
service="tekton-kueue-controller-manager",
18-
} != 1
19-
for: 5m
20-
labels:
21-
severity: critical
22-
component: tekton-kueue
23-
slo: "true"
24-
annotations:
25-
summary: "Tekton Kueue controller is down"
26-
description: "The Tekton Kueue controller has no available replicas in cluster {{ $labels.source_cluster }}. PipelineRuns will be created in Pending state but cannot be processed, resulting in stuck builds."
27-
runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/infra/queue/alert-TektonKueueControllerDown.md
28-
alert_team_handle: <!subteam^S05Q1P4Q2TG>
29-
team: konflux-infra
30-
31-
- alert: TektonKueueWebhookDown
32-
expr: |
33-
konflux_up{
34-
namespace="tekton-kueue",
35-
check="replicas-available",
36-
service="tekton-kueue-webhook",
37-
} != 1
38-
for: 5m
39-
labels:
40-
severity: critical
41-
component: tekton-kueue
42-
slo: "true"
43-
annotations:
44-
summary: "Tekton Kueue webhook is down"
45-
description: "The Tekton Kueue webhook has no available replicas in cluster {{ $labels.source_cluster }}. PipelineRuns may not be created with Pending state and Workload resources, bypassing Kueue admission control."
46-
runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/infra/queue/alert-TektonKueueWebhookDown.md
47-
alert_team_handle: <!subteam^S05Q1P4Q2TG>
48-
team: konflux-infra
49-
50-
- alert: TektonKueuePodsRepeatedRestarts
51-
expr: |
52-
sum by (source_cluster) (
53-
increase(kube_pod_container_status_restarts_total{
54-
namespace="tekton-kueue"
55-
}[5m])
56-
) > 0
57-
for: 5m
58-
labels:
59-
severity: warning
60-
component: tekton-kueue
61-
annotations:
62-
summary: "Tekton Kueue pods are repeatedly restarting"
63-
description: "Tekton Kueue pods have restarted {{ $value }} times in the last 5 minutes in cluster {{ $labels.source_cluster }}. This may indicate resource pressure or application issues."
64-
runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/infra/queue/alert-TektonKueuePodsRepeatedRestarts.md
65-
alert_routing_key: infra
66-
team: konflux-infra
67-
68-
- alert: TektonKueuePodsCrashLoopBackOff
69-
expr: |
70-
sum by (source_cluster) (
71-
max_over_time(kube_pod_container_status_waiting_reason{
72-
namespace="tekton-kueue",
73-
reason="CrashLoopBackOff"
74-
}[3m]) or vector(0)
75-
) > 0
76-
for: 3m
77-
labels:
78-
severity: critical
79-
component: tekton-kueue
80-
slo: "true"
81-
annotations:
82-
summary: "Tekton Kueue pod is in a crash loop"
83-
description: "Tekton Kueue pod has degraded into CrashLoopBackOff status in cluster {{ $labels.source_cluster }} and is not starting up."
84-
runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/infra/queue/alert-TektonKueuePodsCrashLoopBackOff.md
85-
alert_team_handle: <!subteam^S05Q1P4Q2TG>
86-
team: konflux-infra
87-
8812
- alert: KueueCELEvaluationFailures
8913
expr: increase(tekton_kueue_cel_evaluations_total{result="failure"}[5m]) > 0
9014
for: 1m
@@ -111,14 +35,13 @@ spec:
11135
< 99
11236
for: 10m
11337
labels:
114-
severity: high
38+
severity: critical
11539
component: kueue
11640
slo: "false"
11741
annotations:
11842
summary: "Kueue mutating webhook success rate is below 99%"
11943
description: "The mutating webhook 'pipelinerun-kueue-defaulter.tekton-kueue.io' has had a success rate below 99% over the past 10 minutes in cluster {{ $labels.source_cluster }}. Possible causes include webhook errors, rejections, or unreachability (e.g., code=600)."
12044
runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/infra/queue/queue.md?ref_type=heads
121-
# alert_team_handle: <!subteam^S05Q1P4Q2TG>
12245
alert_routing_key: infra
12346
team: konflux-infra
12447

@@ -150,3 +73,48 @@ spec:
15073
description: "99th percentile admission wait time is {{ $value }}s, which is above 30 minutes in cluster {{ $labels.source_cluster }}"
15174
alert_routing_key: infra
15275
team: konflux-infra
76+
77+
# SPRE-5085: Resource Saturation - fires when reservation / nominal quota per (cluster_queue, resource, source_cluster) stays above 90% for 10m; zero-quota series are dropped so x/0 = +Inf cannot fire the alert.
78+
- alert: KueueClusterQueueResourceSaturation
79+
expr: |
80+
(
81+
sum by (cluster_queue, resource, source_cluster) (kueue_cluster_queue_resource_reservation)
82+
/
83+
(sum by (cluster_queue, resource, source_cluster) (kueue_cluster_queue_nominal_quota) > 0)
84+
) > 0.9
85+
for: 10m
86+
labels:
87+
severity: high
88+
component: kueue
89+
annotations:
90+
summary: "Kueue Resource Saturation: {{ $labels.resource }} above 90%"
91+
description: "Resource {{ $labels.resource }} in cluster {{ $labels.source_cluster }} (queue: {{ $labels.cluster_queue }}) is nearing capacity (>90%). Future workloads may experience delays."
92+
runbook_url: "https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/infra/queue/alert-KueueResourceSaturation.md"
93+
alert_routing_key: infra
94+
team: konflux-infra
95+
96+
# SPRE-5085: Pre-Merge Latency - warns when the p99 admission wait (10m window) for workload_name_prefix="pre-merge" on cluster-pipeline-queue exceeds 600s for 5m. NOTE(review): assumes the histogram exports a workload_name_prefix label - confirm.
97+
- alert: KueueHighAdmissionWaitTimePreMerge
98+
expr: histogram_quantile(0.99, sum(increase(kueue_admission_wait_time_seconds_bucket{cluster_queue="cluster-pipeline-queue", workload_name_prefix="pre-merge"}[10m])) by (le, source_cluster)) > 600
99+
for: 5m
100+
labels:
101+
severity: warning
102+
component: kueue
103+
annotations:
104+
summary: "High admission wait time for Pre-Merge pipelines"
105+
description: "99th percentile wait time for Pre-Merge is {{ $value }}s (>10m) in cluster {{ $labels.source_cluster }}"
106+
alert_routing_key: infra
107+
team: konflux-infra
108+
109+
# SPRE-5085: Mintmaker Latency - warns when the p99 admission wait (10m window) for workload_name_prefix="mintmaker" on cluster-pipeline-queue exceeds 300s for 5m. NOTE(review): assumes the histogram exports a workload_name_prefix label - confirm.
110+
- alert: KueueHighAdmissionWaitTimeMintmaker
111+
expr: histogram_quantile(0.99, sum(increase(kueue_admission_wait_time_seconds_bucket{cluster_queue="cluster-pipeline-queue", workload_name_prefix="mintmaker"}[10m])) by (le, source_cluster)) > 300
112+
for: 5m
113+
labels:
114+
severity: warning
115+
component: kueue
116+
annotations:
117+
summary: "High admission wait time for Mintmaker pipelines"
118+
description: "99th percentile wait time for Mintmaker is {{ $value }}s (>5m) in cluster {{ $labels.source_cluster }}"
119+
alert_routing_key: infra
120+
team: konflux-infra

0 commit comments

Comments
 (0)