Skip to content

Commit 9644aca

Browse files
added service monitor for metrics
1 parent 800d17e commit 9644aca

5 files changed

Lines changed: 81 additions & 0 deletions

File tree

charts/openstack-hypervisor-operator/alerts/eviction.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ groups:
1111
labels:
1212
severity: warning
1313
type: hypervisor_operator
14+
playbook: docs/compute/kvm/playbooks/evictionfailed
1415
annotations:
1516
summary: "Eviction {{ $labels.name }} has failed"
1617
description: "The eviction {{ $labels.name }} for hypervisor {{ $labels.hypervisor }} has reached a terminal failure state. Manual intervention is required — check if the hypervisor exists in OpenStack."
@@ -24,6 +25,7 @@ groups:
2425
labels:
2526
severity: warning
2627
type: hypervisor_operator
28+
playbook: docs/compute/kvm/playbooks/evictionmigrationfailing
2729
annotations:
2830
summary: "Eviction {{ $labels.name }} has failing instance migrations for over 1 hour"
2931
description: "The eviction {{ $labels.name }} has had MigratingInstance=Failed for more than 1 hour while still running. Instances may be in ERROR state, blocking eviction progress."
@@ -37,6 +39,7 @@ groups:
3739
labels:
3840
severity: warning
3941
type: hypervisor_operator
42+
playbook: docs/compute/kvm/playbooks/evictionoutstandingram
4043
annotations:
4144
summary: "Eviction {{ $labels.name }} has outstanding RAM for over 6 hours"
4245
description: "The eviction {{ $labels.name }} has had {{ $value }}MB of outstanding RAM for more than 6 hours. Check for stuck live-migrations or instances that cannot be moved."

charts/openstack-hypervisor-operator/alerts/operator.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ groups:
1111
labels:
1212
severity: warning
1313
type: hypervisor_operator
14+
playbook: docs/compute/kvm/playbooks/hypervisoronboardingstuck
1415
annotations:
1516
summary: "Hypervisor {{ $labels.name }} onboarding stuck for over 1 hour"
1617
description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has been onboarding for more than 1 hour. Check nova registration, test VM status, or trait/aggregate sync."
@@ -22,6 +23,7 @@ groups:
2223
labels:
2324
severity: warning
2425
type: hypervisor_operator
26+
playbook: docs/compute/kvm/playbooks/hypervisorevictionstuck
2527
annotations:
2628
summary: "Hypervisor {{ $labels.name }} eviction running for over 4 hours"
2729
description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had an active eviction for more than 4 hours. Check for stuck live-migrations or failed VMs."
@@ -35,6 +37,7 @@ groups:
3537
labels:
3638
severity: info
3739
type: hypervisor_operator
40+
playbook: docs/compute/kvm/playbooks/hypervisorevictedtoolong
3841
annotations:
3942
summary: "Hypervisor {{ $labels.name }} has been evicted for over 7 days"
4043
description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has been evicted for more than 7 days without being offboarded. Consider re-enabling or decommissioning."
@@ -50,6 +53,7 @@ groups:
5053
labels:
5154
severity: warning
5255
type: hypervisor_operator
56+
playbook: docs/compute/kvm/playbooks/hypervisortraitsyncfailed
5357
annotations:
5458
summary: "Hypervisor {{ $labels.name }} trait sync has been failing"
5559
description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had TraitsUpdated=False for more than 30 minutes outside of onboarding. Check OpenStack Placement API connectivity."
@@ -65,6 +69,7 @@ groups:
6569
labels:
6670
severity: warning
6771
type: hypervisor_operator
72+
playbook: docs/compute/kvm/playbooks/hypervisoraggregatesyncfailed
6873
annotations:
6974
summary: "Hypervisor {{ $labels.name }} aggregate sync has been failing"
7075
description: "The hypervisor {{ $labels.name }} in zone {{ $labels.zone }} has had AggregatesUpdated=False for more than 30 minutes outside of onboarding and eviction. Check OpenStack Nova API connectivity."
@@ -78,6 +83,7 @@ groups:
7883
labels:
7984
severity: warning
8085
type: hypervisor_operator
86+
playbook: docs/compute/kvm/playbooks/hypervisorreconcileerrors
8187
annotations:
8288
summary: "Hypervisor operator controller {{ $labels.controller }} has persistent reconcile errors"
8389
description: "The controller {{ $labels.controller }} has been producing sustained reconciliation errors for more than 15 minutes."
@@ -89,6 +95,7 @@ groups:
8995
labels:
9096
severity: critical
9197
type: hypervisor_operator
98+
playbook: docs/compute/kvm/playbooks/hypervisoroperatordown
9299
annotations:
93100
summary: "Hypervisor operator is down"
94101
description: "The hypervisor operator metrics endpoint has been unreachable for more than 5 minutes."

charts/openstack-hypervisor-operator/templates/metrics-reader-rbac.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,18 @@ rules:
99
- /metrics
1010
verbs:
1111
- get
---
# Binds the ClusterRole named "<fullname>-metrics-reader" to the service
# account returned by the chart's serviceAccountName helper, in the release
# namespace.
# NOTE(review): the subject is the chart's own service account. If the metrics
# scraper authenticates with a different service account, that account needs
# its own binding to this ClusterRole - confirm against the monitoring setup.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  # Quoted like the other templated scalars below so the rendered value is
  # always parsed as a string.
  name: '{{ include "openstack-hypervisor-operator.fullname" . }}-metrics-reader-binding'
  labels:
  {{- include "openstack-hypervisor-operator.labels" . | nindent 4 }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: '{{ include "openstack-hypervisor-operator.fullname" . }}-metrics-reader'
subjects:
  - kind: ServiceAccount
    name: '{{ include "openstack-hypervisor-operator.serviceAccountName" . }}'
    namespace: '{{ .Release.Namespace }}'
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and cobaltcore-dev contributors
# SPDX-License-Identifier: Apache-2.0

{{- if .Values.serviceMonitor.create }}
# ServiceMonitor for the operator's metrics endpoint. Rendered only when
# serviceMonitor.create is truthy. It selects objects labelled
# control-plane=controller-manager (plus the chart's selector labels) in the
# release namespace and scrapes their "https" port.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: '{{ include "openstack-hypervisor-operator.fullname" . }}-metrics'
  labels:
    control-plane: controller-manager
  {{- include "openstack-hypervisor-operator.labels" . | nindent 4 }}
  {{- with .Values.serviceMonitor.labels }}
  {{- toYaml . | nindent 4 }}
  {{- end }}
  {{- with .Values.serviceMonitor.annotations }}
  annotations:
  {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  jobLabel: control-plane
  selector:
    matchLabels:
      control-plane: controller-manager
    {{- include "openstack-hypervisor-operator.selectorLabels" . | nindent 6 }}
  namespaceSelector:
    matchNames:
      # Quoted so that a namespace whose name is all digits still renders as a
      # YAML string rather than an integer.
      - {{ .Release.Namespace | quote }}
  endpoints:
    - port: https
      scheme: https
      tlsConfig:
        # Certificate verification is switched off for this scrape.
        # NOTE(review): presumably the endpoint serves a self-signed
        # certificate - confirm, and prefer a CA / serverName if available.
        insecureSkipVerify: true
      # NOTE(review): newer prometheus-operator API docs mark bearerTokenFile
      # as deprecated in favour of `authorization` - verify against the
      # operator version in use before migrating.
      bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
      # The optional fields below are emitted only when the corresponding
      # value is non-empty.
      {{- with .Values.serviceMonitor.interval }}
      interval: {{ . | quote }}
      {{- end }}
      {{- with .Values.serviceMonitor.scrapeTimeout }}
      scrapeTimeout: {{ . | quote }}
      {{- end }}
      {{- with .Values.serviceMonitor.metricRelabelings }}
      metricRelabelings:
      {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.serviceMonitor.relabelings }}
      relabelings:
      {{- toYaml . | nindent 8 }}
      {{- end }}
{{- end }}

charts/openstack-hypervisor-operator/values.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,14 @@ metricsService:
4545
protocol: TCP
4646
targetPort: 8443
4747
type: ClusterIP
# Settings consumed by the chart's ServiceMonitor template.
serviceMonitor:
  # When true, the ServiceMonitor resource is rendered.
  create: true
  # Extra labels appended to the ServiceMonitor's metadata.labels.
  labels: {}
  # Annotations for the ServiceMonitor; the annotations block is omitted
  # while this map is empty.
  annotations: {}
  # Scrape interval for the endpoint; the field is omitted while empty.
  interval: ""
  # Scrape timeout for the endpoint; the field is omitted while empty.
  scrapeTimeout: ""
  # Entries copied verbatim into the endpoint's metricRelabelings list.
  metricRelabelings: []
  # Entries copied verbatim into the endpoint's relabelings list.
  relabelings: []
4856
secret:
4957
servicePassword: ""
5058
serviceAccount:

0 commit comments

Comments
 (0)