Skip to content

Commit b769b21

Browse files
authored
Release core 0.0.39 and bundles to 0.0.52 (#675)
## Changes - Fixed duplicate UUIDs returned by the Nova API - Added alerts and metrics to monitor whether a VM comes up and whether the syncers are working as expected - Fixed an auth bug in the CR syncer
2 parents 7300a90 + 12bedc8 commit b769b21

34 files changed

Lines changed: 1696 additions & 132 deletions

File tree

helm/bundles/cortex-cinder/Chart.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ apiVersion: v2
55
name: cortex-cinder
66
description: A Helm chart deploying Cortex for Cinder.
77
type: application
8-
version: 0.0.51
8+
version: 0.0.52
99
appVersion: 0.1.0
1010
dependencies:
1111
# from: file://../../library/cortex-postgres
@@ -16,12 +16,12 @@ dependencies:
1616
# from: file://../../library/cortex
1717
- name: cortex
1818
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
19-
version: 0.0.38
19+
version: 0.0.39
2020
alias: cortex-knowledge-controllers
2121
# from: file://../../library/cortex
2222
- name: cortex
2323
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
24-
version: 0.0.38
24+
version: 0.0.39
2525
alias: cortex-scheduling-controllers
2626

2727
# Owner info adds a configmap to the kubernetes cluster with information on

helm/bundles/cortex-crds/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@ apiVersion: v2
55
name: cortex-crds
66
description: A Helm chart deploying Cortex CRDs.
77
type: application
8-
version: 0.0.51
8+
version: 0.0.52
99
appVersion: 0.1.0
1010
dependencies:
1111
# from: file://../../library/cortex
1212
- name: cortex
1313
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
14-
version: 0.0.38
14+
version: 0.0.39
1515

1616
# Owner info adds a configmap to the kubernetes cluster with information on
1717
# the service owner. This makes it easier to find out who to contact in case

helm/bundles/cortex-ironcore/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@ apiVersion: v2
55
name: cortex-ironcore
66
description: A Helm chart deploying Cortex for IronCore.
77
type: application
8-
version: 0.0.51
8+
version: 0.0.52
99
appVersion: 0.1.0
1010
dependencies:
1111
# from: file://../../library/cortex
1212
- name: cortex
1313
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
14-
version: 0.0.38
14+
version: 0.0.39
1515

1616
# Owner info adds a configmap to the kubernetes cluster with information on
1717
# the service owner. This makes it easier to find out who to contact in case

helm/bundles/cortex-manila/Chart.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ apiVersion: v2
55
name: cortex-manila
66
description: A Helm chart deploying Cortex for Manila.
77
type: application
8-
version: 0.0.51
8+
version: 0.0.52
99
appVersion: 0.1.0
1010
dependencies:
1111
# from: file://../../library/cortex-postgres
@@ -16,12 +16,12 @@ dependencies:
1616
# from: file://../../library/cortex
1717
- name: cortex
1818
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
19-
version: 0.0.38
19+
version: 0.0.39
2020
alias: cortex-knowledge-controllers
2121
# from: file://../../library/cortex
2222
- name: cortex
2323
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
24-
version: 0.0.38
24+
version: 0.0.39
2525
alias: cortex-scheduling-controllers
2626

2727
# Owner info adds a configmap to the kubernetes cluster with information on

helm/bundles/cortex-nova/Chart.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ apiVersion: v2
55
name: cortex-nova
66
description: A Helm chart deploying Cortex for Nova.
77
type: application
8-
version: 0.0.51
8+
version: 0.0.52
99
appVersion: 0.1.0
1010
dependencies:
1111
# from: file://../../library/cortex-postgres
@@ -16,12 +16,12 @@ dependencies:
1616
# from: file://../../library/cortex
1717
- name: cortex
1818
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
19-
version: 0.0.38
19+
version: 0.0.39
2020
alias: cortex-knowledge-controllers
2121
# from: file://../../library/cortex
2222
- name: cortex
2323
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
24-
version: 0.0.38
24+
version: 0.0.39
2525
alias: cortex-scheduling-controllers
2626

2727
# Owner info adds a configmap to the kubernetes cluster with information on

helm/bundles/cortex-nova/alerts/nova.alerts.yaml

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -592,3 +592,141 @@ groups:
592592
corruption, bugs in reservation creation, or external modifications.
593593
Reservations are automatically repaired, but the root cause should be
594594
investigated if this alert persists.
595+
596+
- alert: CortexNovaDoesntFindValidKVMHosts
597+
expr: sum by (az, hvtype) (cortex_vm_faults{hvtype=~"CH|QEMU",faultmsg=~".*No valid host was found.*"}) > 0
598+
for: 5m
599+
labels:
600+
context: scheduling
601+
dashboard: cortex/cortex
602+
service: cortex
603+
severity: warning
604+
support_group: workload-management
605+
annotations:
606+
summary: "Nova scheduling cannot find valid KVM hosts"
607+
description: >
608+
Cortex is seeing faulty VMs in `{{$labels.az}}` where Nova scheduling
609+
failed to find a valid `{{$labels.hvtype}}` host. This may indicate
610+
capacity issues, misconfigured filters, or resource constraints in the
611+
datacenter. Investigate the affected VMs and hypervisor availability.
612+
613+
- alert: CortexNovaNewDatasourcesNotReconciling
614+
expr: count by(datasource) (cortex_datasource_seconds_until_reconcile{queued="false",domain="nova"}) > 0
615+
for: 60m
616+
labels:
617+
context: datasources
618+
dashboard: cortex/cortex
619+
service: cortex
620+
severity: warning
621+
support_group: workload-management
622+
annotations:
623+
summary: "New datasource `{{$labels.datasource}}` has not reconciled"
624+
description: >
625+
A new datasource `{{$labels.datasource}}` has been added but has not
626+
completed its first reconciliation yet. This may indicate issues with
627+
the datasource controller's workqueue overprioritizing other datasources.
628+
629+
- alert: CortexNovaExistingDatasourcesLackingBehind
630+
expr: sum by(datasource) (cortex_datasource_seconds_until_reconcile{queued="true",domain="nova"}) < -600
631+
for: 10m
632+
labels:
633+
context: datasources
634+
dashboard: cortex/cortex
635+
service: cortex
636+
severity: warning
637+
support_group: workload-management
638+
annotations:
639+
summary: "Existing datasource `{{$labels.datasource}}` is lagging behind"
640+
description: >
641+
An existing datasource `{{$labels.datasource}}` has been queued for
642+
reconciliation for more than 10 minutes. This may indicate issues with
643+
the datasource controller's workqueue or that this or another datasource
644+
is taking an unusually long time to reconcile.
645+
646+
- alert: CortexNovaReconcileErrorsHigh
647+
expr: |
648+
(sum by (controller) (rate(controller_runtime_reconcile_errors_total{service="cortex-nova-metrics"}[5m])))
649+
/ (sum by (controller) (rate(controller_runtime_reconcile_total{service="cortex-nova-metrics"}[5m]))) > 0.1
650+
for: 15m
651+
labels:
652+
context: controller-errors
653+
dashboard: cortex/cortex
654+
service: cortex
655+
severity: warning
656+
support_group: workload-management
657+
annotations:
658+
summary: "Controller reconcile error rate >10%"
659+
description: >
660+
More than 10% of controller reconciles are resulting in errors. This may
661+
indicate issues with the controller logic, connectivity problems, or
662+
external factors causing failures. Check the controller logs for error
663+
details and investigate the affected resources.
664+
665+
- alert: CortexNovaReconcileDurationHigher10Min
666+
expr: |
667+
(sum by (controller) (rate(controller_runtime_reconcile_time_seconds_sum{service="cortex-nova-metrics"}[5m])))
668+
/ (sum by (controller) (rate(controller_runtime_reconcile_time_seconds_count{service="cortex-nova-metrics"}[5m]))) > 600
669+
for: 15m
670+
labels:
671+
context: controller-duration
672+
dashboard: cortex/cortex
673+
service: cortex
674+
severity: warning
675+
support_group: workload-management
676+
annotations:
677+
summary: "Controller reconciliation takes longer than ({{ $value | humanizeDuration }})"
678+
description: "Reconcile duration higher than 10m while reconciling {{ $labels.controller }}"
679+
680+
- alert: CortexNovaWorkqueueNotDrained
681+
expr: |
682+
sum by (name) (workqueue_depth{service="cortex-nova-metrics"}) > 0
683+
for: 60m
684+
labels:
685+
context: controller-workqueue
686+
dashboard: cortex/cortex
687+
service: cortex
688+
severity: warning
689+
support_group: workload-management
690+
annotations:
691+
summary: "Controller {{ $labels.name }}'s backlog is not being drained."
692+
description: >
693+
The workqueue for controller {{ $labels.name }} has a backlog that is
694+
not being drained. This may indicate that the controller is overwhelmed
695+
with work or is stuck on certain resources. Check the controller logs
696+
and the state of the resources it manages for more details.
697+
698+
- alert: CortexNovaWebhookLatencyHigh
699+
expr: |
700+
histogram_quantile(0.9, avg(rate(controller_runtime_webhook_latency_seconds_bucket{service="cortex-nova-metrics"}[5m])) by (webhook, le)) > 0.2
701+
for: 15m
702+
labels:
703+
context: controller-webhook
704+
dashboard: cortex/cortex
705+
service: cortex
706+
severity: warning
707+
support_group: workload-management
708+
annotations:
709+
summary: "Controller webhook {{ $labels.webhook }} latency is high"
710+
description: >
711+
The latency for webhook {{ $labels.webhook }} is higher than expected (p90 > 200ms).
712+
This may indicate performance issues with the webhook server or the logic it executes.
713+
Check the webhook server logs and monitor its resource usage for more insights.
714+
715+
- alert: CortexNovaWebhookErrorsHigh
716+
expr: |
717+
(sum by (webhook) (rate(controller_runtime_webhook_requests_total{code!="200", service="cortex-nova-metrics"}[5m])))
718+
/ (sum by (webhook) (rate(controller_runtime_webhook_requests_total{service="cortex-nova-metrics"}[5m]))) > 0.1
719+
for: 15m
720+
labels:
721+
context: controller-webhook
722+
dashboard: cortex/cortex
723+
service: cortex
724+
severity: warning
725+
support_group: workload-management
726+
annotations:
727+
summary: "Controller webhook {{ $labels.webhook }} is experiencing errors"
728+
description: >
729+
The webhook {{ $labels.webhook }} has experienced errors in the last 5 minutes.
730+
This may indicate issues with the webhook logic, connectivity problems, or
731+
external factors causing failures. Check the webhook server logs for error
732+
details and investigate the affected resources.

helm/bundles/cortex-nova/templates/kpis.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,23 @@ spec:
110110
---
111111
apiVersion: cortex.cloud/v1alpha1
112112
kind: KPI
113+
metadata:
114+
name: vm-faults
115+
spec:
116+
schedulingDomain: nova
117+
impl: vm_faults_kpi
118+
dependencies:
119+
datasources:
120+
- name: nova-servers
121+
- name: nova-flavors
122+
description: |
123+
This KPI tracks VM faults in the datacenter. It exposes helpful information
124+
about the faults, such as the availability zone, hypervisor type, VM state,
125+
and error info if available. This can be used to identify issues in the
126+
datacenter and to monitor the overall health of the VMs.
127+
---
128+
apiVersion: cortex.cloud/v1alpha1
129+
kind: KPI
113130
metadata:
114131
name: cortex-nova-datasource-state
115132
spec:

helm/bundles/cortex-pods/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@ apiVersion: v2
55
name: cortex-pods
66
description: A Helm chart deploying Cortex for Pods.
77
type: application
8-
version: 0.0.51
8+
version: 0.0.52
99
appVersion: 0.1.0
1010
dependencies:
1111
# from: file://../../library/cortex
1212
- name: cortex
1313
repository: oci://ghcr.io/cobaltcore-dev/cortex/charts
14-
version: 0.0.38
14+
version: 0.0.39
1515

1616
# Owner info adds a configmap to the kubernetes cluster with information on
1717
# the service owner. This makes it easier to find out who to contact in case

helm/library/cortex/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ apiVersion: v2
22
name: cortex
33
description: A Helm chart to distribute cortex.
44
type: application
5-
version: 0.0.38
6-
appVersion: "sha-b3cf6dc8"
5+
version: 0.0.39
6+
appVersion: "sha-f437366b"
77
icon: "https://example.com/icon.png"
88
dependencies: []

internal/knowledge/datasources/plugins/openstack/cinder/cinder_sync_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ func TestCinderSyncer_Sync(t *testing.T) {
5555
}
5656

5757
ctx := t.Context()
58+
if err := syncer.Init(ctx); err != nil {
59+
t.Fatalf("failed to init cinder syncer: %v", err)
60+
}
5861
_, err := syncer.Sync(ctx)
5962
if err != nil {
6063
t.Fatalf("expected no error, got %v", err)
@@ -76,6 +79,9 @@ func TestCinderSyncer_SyncAllStoragePools(t *testing.T) {
7679
}
7780

7881
ctx := t.Context()
82+
if err := syncer.Init(ctx); err != nil {
83+
t.Fatalf("failed to init cinder syncer: %v", err)
84+
}
7985
n, err := syncer.SyncAllStoragePools(ctx)
8086
if err != nil {
8187
t.Fatalf("expected no error, got %v", err)

0 commit comments

Comments
 (0)