diff --git a/class/defaults.yml b/class/defaults.yml index d3de5ee..e2cbe8e 100644 --- a/class/defaults.yml +++ b/class/defaults.yml @@ -163,14 +163,14 @@ parameters: severity: critical K8upBackupNotRunning: annotations: - message: No K8up jobs were run in {{ $labels.exported_namespace }} within the last 24 hours. Check the operator, there might be a deadlock - expr: sum(rate(k8up_jobs_total[25h])) == 0 and on(namespace) k8up_schedules_gauge > 0 + message: No K8up jobs were run in {{ $labels.namespace }} within the last 24 hours. Check the operator, there might be a deadlock + expr: sum by (namespace) (rate(k8up_jobs_total[25h])) == 0 and on(namespace) k8up_schedules_gauge > 0 for: 1m labels: severity: critical K8upJobStuck: annotations: - message: Queued K8up jobs in {{ $labels.exported_namespace }} for the last hour. + message: Queued K8up jobs in {{ $labels.namespace }} for the last hour. expr: k8up_jobs_queued_gauge{jobType="backup"} > 0 and on(namespace) k8up_schedules_gauge > 0 for: 1h labels: diff --git a/component/monitoring.jsonnet b/component/monitoring.jsonnet index 5394fa3..8d2e869 100644 --- a/component/monitoring.jsonnet +++ b/component/monitoring.jsonnet @@ -27,6 +27,19 @@ local service_monitor = com.namespaced(params.namespace, { { interval: '10s', port: 'http', + // NOTE(sg): This is required to ensure that the backup namespace is + // preserved as label `namespace`. Without this, the scraped metrics + // have the backup namespace as `exported_namespace` and are useless + // for OCP User Workload monitoring users, because UWM only allows + // querying metrics whose `namespace` label matches the alert rule + // source namespace. + honorLabels: true, + // add k8up namespace as label `k8up_namespace`. + relabelings: [ { + action: 'replace', + sourceLabels: [ 'namespace' ], + targetLabel: 'k8up_namespace', + } ], }, ], selector: { diff --git a/tests/golden/defaults/backup-k8up/backup-k8up/30_monitoring.yaml b/tests/golden/defaults/backup-k8up/backup-k8up/30_monitoring.yaml index ba15d24..6e13616 100644 --- a/tests/golden/defaults/backup-k8up/backup-k8up/30_monitoring.yaml +++ b/tests/golden/defaults/backup-k8up/backup-k8up/30_monitoring.yaml @@ -13,10 +13,10 @@ spec: rules: - alert: K8upBackupNotRunning annotations: - message: No K8up jobs were run in {{ $labels.exported_namespace }} within - the last 24 hours. Check the operator, there might be a deadlock - expr: sum(rate(k8up_jobs_total[25h])) == 0 and on(namespace) k8up_schedules_gauge - > 0 + message: No K8up jobs were run in {{ $labels.namespace }} within the last + 24 hours. Check the operator, there might be a deadlock + expr: sum by (namespace) (rate(k8up_jobs_total[25h])) == 0 and on(namespace) + k8up_schedules_gauge > 0 for: 1m labels: severity: critical @@ -24,8 +24,7 @@ spec: syn_component: backup-k8up - alert: K8upJobStuck annotations: - message: Queued K8up jobs in {{ $labels.exported_namespace }} for the - last hour. + message: Queued K8up jobs in {{ $labels.namespace }} for the last hour. expr: k8up_jobs_queued_gauge{jobType="backup"} > 0 and on(namespace) k8up_schedules_gauge > 0 for: 1h @@ -127,8 +126,14 @@ metadata: namespace: syn-backup-k8up spec: endpoints: - - interval: 10s + - honorLabels: true + interval: 10s port: http + relabelings: + - action: replace + sourceLabels: + - namespace + targetLabel: k8up_namespace selector: matchLabels: app.kubernetes.io/instance: k8up