Skip to content

Commit f13348e

Browse files
authored
feat: add kvm project usage metrics (#803)
## Changes
- Add a project utilization KPI for KVM
- Minor adjustments to how the usage is calculated for VMware
1 parent e62e018 commit f13348e

7 files changed

Lines changed: 1176 additions & 14 deletions

File tree

helm/bundles/cortex-nova/templates/kpis_kvm.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,17 @@ spec:
1313
- name: host-utilization
1414
description: |
1515
This KPI tracks the total, utilized, reserved and failover capacity of KVM hosts.
16+
---
17+
# KPI resource that registers the kvm_project_utilization_kpi implementation
# for the nova scheduling domain and declares the datasources it depends on.
apiVersion: cortex.cloud/v1alpha1
18+
kind: KPI
19+
metadata:
20+
name: kvm-project-utilization
21+
spec:
22+
schedulingDomain: nova
23+
# Must match the string returned by the Go implementation's GetName().
impl: kvm_project_utilization_kpi
24+
dependencies:
25+
datasources:
26+
- name: nova-servers
27+
- name: nova-flavors
28+
- name: identity-projects
# NOTE(review): both SQL queries of this KPI also LEFT JOIN the identity
# domains table (identity.Domain{}.TableName()). Confirm whether a domains
# datasource needs to be listed here as well.
1629
{{- end }}
Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
// Copyright SAP SE
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package infrastructure
5+
6+
import (
7+
"context"
8+
"log/slog"
9+
10+
"github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/identity"
11+
"github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova"
12+
"github.com/cobaltcore-dev/cortex/internal/knowledge/db"
13+
"github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins"
14+
"github.com/cobaltcore-dev/cortex/pkg/conf"
15+
hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
16+
"github.com/prometheus/client_golang/prometheus"
17+
"sigs.k8s.io/controller-runtime/pkg/client"
18+
)
19+
20+
// kvmProjectInstanceCount is a single result row of the instance-count query.
// Each row carries the number of instances for one combination of project,
// compute host, flavor, and availability zone. Fields are mapped from the
// result columns named in their `db` tags.
type kvmProjectInstanceCount struct {
	// Owning project and its domain. The query substitutes an empty string
	// for name/domain values it cannot resolve.
	ProjectID   string `db:"project_id"`
	ProjectName string `db:"project_name"`
	DomainID    string `db:"domain_id"`
	DomainName  string `db:"domain_name"`

	// Placement and flavor the instances are grouped by.
	ComputeHost      string `db:"compute_host"`
	FlavorName       string `db:"flavor_name"`
	AvailabilityZone string `db:"availability_zone"`

	// Number of instances in the group, kept as float64 so it can be used
	// directly as a Prometheus sample value.
	InstanceCount float64 `db:"instance_count"`
}
30+
31+
// kvmProjectCapacityUsage is a single result row of the capacity-usage query.
// Each row carries the summed flavor resources of all instances for one
// combination of project, compute host, and availability zone. Fields are
// mapped from the result columns named in their `db` tags.
type kvmProjectCapacityUsage struct {
	// Owning project and its domain. The query substitutes an empty string
	// for name/domain values it cannot resolve.
	ProjectID   string `db:"project_id"`
	ProjectName string `db:"project_name"`
	DomainID    string `db:"domain_id"`
	DomainName  string `db:"domain_name"`

	// Placement the usage is grouped by.
	ComputeHost      string `db:"compute_host"`
	AvailabilityZone string `db:"availability_zone"`

	// Summed flavor resources. Collect exports vCPUs as-is and scales RAM by
	// 1024*1024 and disk by 1024*1024*1024 to report bytes.
	TotalVCPUs  float64 `db:"total_vcpus"`
	TotalRAMMB  float64 `db:"total_ram_mb"`
	TotalDiskGB float64 `db:"total_disk_gb"`
}
42+
43+
type KVMProjectUtilizationKPI struct {
44+
// BaseKPI provides common fields and methods for all KPIs, such as database connection and Kubernetes client.
45+
plugins.BaseKPI[struct{}]
46+
47+
// instanceCountPerProjectAndHostAndFlavor is a Prometheus descriptor for the metric that counts the number of instances per project, host, and flavor.
48+
instanceCountPerProjectAndHostAndFlavor *prometheus.Desc
49+
50+
// capacityUsagePerProjectAndHost is a Prometheus descriptor for the metric that measures the capacity usage per project and host.
51+
capacityUsagePerProjectAndHost *prometheus.Desc
52+
}
53+
54+
func (k *KVMProjectUtilizationKPI) GetName() string {
55+
return "kvm_project_utilization_kpi"
56+
}
57+
58+
func (k *KVMProjectUtilizationKPI) Init(dbConn *db.DB, c client.Client, opts conf.RawOpts) error {
59+
if err := k.BaseKPI.Init(dbConn, c, opts); err != nil {
60+
return err
61+
}
62+
63+
k.instanceCountPerProjectAndHostAndFlavor = prometheus.NewDesc(
64+
"cortex_kvm_project_instances",
65+
"Number of running instances per project, hypervisor, and flavor on KVM.",
66+
append(kvmHostLabels, "project_id", "project_name", "domain_id", "domain_name", "flavor_name"), nil,
67+
)
68+
k.capacityUsagePerProjectAndHost = prometheus.NewDesc(
69+
"cortex_kvm_project_capacity_usage",
70+
"Resource capacity used by a project per KVM hypervisor and flavor. CPU in vCPUs, memory and disk in bytes.",
71+
append(kvmHostLabels, "project_id", "project_name", "domain_id", "domain_name", "resource"), nil,
72+
)
73+
return nil
74+
}
75+
76+
func (k *KVMProjectUtilizationKPI) Describe(ch chan<- *prometheus.Desc) {
77+
ch <- k.instanceCountPerProjectAndHostAndFlavor
78+
ch <- k.capacityUsagePerProjectAndHost
79+
}
80+
81+
func (k *KVMProjectUtilizationKPI) Collect(ch chan<- prometheus.Metric) {
82+
hosts, err := k.getKVMHosts()
83+
if err != nil {
84+
slog.Error("kvm_project_utilization: failed to get KVM hosts", "error", err)
85+
return
86+
}
87+
88+
// Export project x flavor x compute_host instance count metric
89+
projectInstanceCounts, err := k.queryProjectInstanceCount()
90+
if err != nil {
91+
slog.Error("kvm_project_utilization: Failed to query project instance count for project utilization KPI", "error", err)
92+
return
93+
}
94+
for _, projectInstanceCount := range projectInstanceCounts {
95+
host, ok := hosts[projectInstanceCount.ComputeHost]
96+
if !ok {
97+
slog.Warn("kvm_project_utilization: Compute host not found for project instance count", "compute_host", projectInstanceCount.ComputeHost)
98+
continue
99+
}
100+
hostLabels := host.getHostLabels()
101+
hostLabels = append(hostLabels, projectInstanceCount.ProjectID, projectInstanceCount.ProjectName, projectInstanceCount.DomainID, projectInstanceCount.DomainName, projectInstanceCount.FlavorName)
102+
ch <- prometheus.MustNewConstMetric(k.instanceCountPerProjectAndHostAndFlavor, prometheus.GaugeValue, projectInstanceCount.InstanceCount, hostLabels...)
103+
}
104+
105+
// Export project x compute_host x resource capacity usage metric
106+
projectCapacityUsages, err := k.queryProjectCapacityUsage()
107+
if err != nil {
108+
slog.Error("kvm_project_utilization: Failed to query project capacity usage for project utilization KPI", "error", err)
109+
return
110+
}
111+
for _, projectCapacityUsage := range projectCapacityUsages {
112+
host, ok := hosts[projectCapacityUsage.ComputeHost]
113+
if !ok {
114+
slog.Warn("kvm_project_utilization: Compute host not found for project capacity usage", "compute_host", projectCapacityUsage.ComputeHost)
115+
continue
116+
}
117+
hostLabels := host.getHostLabels()
118+
hostLabels = append(hostLabels, projectCapacityUsage.ProjectID, projectCapacityUsage.ProjectName, projectCapacityUsage.DomainID, projectCapacityUsage.DomainName)
119+
120+
ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalVCPUs, append(hostLabels, "vcpu")...)
121+
ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalRAMMB*1024*1024, append(hostLabels, "memory")...)
122+
ch <- prometheus.MustNewConstMetric(k.capacityUsagePerProjectAndHost, prometheus.GaugeValue, projectCapacityUsage.TotalDiskGB*1024*1024*1024, append(hostLabels, "disk")...)
123+
}
124+
}
125+
126+
// getKVMHosts retrieves the list of KVM hosts and their details from the database, returning a map keyed by compute host name.
127+
func (k *KVMProjectUtilizationKPI) getKVMHosts() (map[string]kvmHost, error) {
128+
hvs := &hv1.HypervisorList{}
129+
if err := k.Client.List(context.Background(), hvs); err != nil {
130+
return nil, err
131+
}
132+
133+
hosts := make(map[string]kvmHost, len(hvs.Items))
134+
for _, hv := range hvs.Items {
135+
host := kvmHost{Hypervisor: hv}
136+
hosts[host.Name] = host
137+
}
138+
return hosts, nil
139+
}
140+
141+
// queryProjectInstanceCount retrieves the number of running instances per project, hypervisor, and flavor on KVM from the database.
142+
func (k *KVMProjectUtilizationKPI) queryProjectCapacityUsage() ([]kvmProjectCapacityUsage, error) {
143+
// This query will fetch all active instances. It will perform a join with the openstack projects to get the project name.
144+
// It will also join with the flavors table to get the flavor information, which is needed for the capacity usage metrics.
145+
// The results will be grouped by project, compute host, and availability zone to get the total capacity usage per project and hypervisor.
146+
// We will filter the results to only include instances that are running on KVM hypervisors by checking the compute host name pattern.
147+
// This assumes that all KVM hypervisors have a compute host name that follows the pattern "nodeXXX-bbYYY",
148+
// which is a naming convention in SAP Cloud Infrastructure and may need to be adjusted based on the actual environment.
149+
query := `
150+
SELECT
151+
s.tenant_id AS project_id,
152+
COALESCE(p.name, '') AS project_name,
153+
COALESCE(p.domain_id, '') AS domain_id,
154+
COALESCE(d.name, '') AS domain_name,
155+
s.os_ext_srv_attr_host AS compute_host,
156+
s.os_ext_az_availability_zone AS availability_zone,
157+
COALESCE(SUM(f.vcpus), 0) AS total_vcpus,
158+
COALESCE(SUM(f.ram), 0) AS total_ram_mb,
159+
COALESCE(SUM(f.disk), 0) AS total_disk_gb
160+
FROM ` + nova.Server{}.TableName() + ` s
161+
LEFT JOIN ` + nova.Flavor{}.TableName() + ` f ON s.flavor_name = f.name
162+
LEFT JOIN ` + identity.Project{}.TableName() + ` p ON p.id = s.tenant_id
163+
LEFT JOIN ` + identity.Domain{}.TableName() + ` d ON d.id = p.domain_id
164+
WHERE s.status NOT IN ('DELETED', 'ERROR')
165+
AND s.os_ext_srv_attr_host LIKE '` + kvmComputeHostPattern + `'
166+
GROUP BY s.tenant_id, p.name, p.domain_id, d.name, s.os_ext_srv_attr_host, s.os_ext_az_availability_zone
167+
`
168+
var usages []kvmProjectCapacityUsage
169+
if _, err := k.DB.Select(&usages, query); err != nil {
170+
return nil, err
171+
}
172+
return usages, nil
173+
}
174+
175+
// queryProjectInstanceCount retrieves the number of running instances per project, hypervisor, and flavor on KVM.
176+
func (k *KVMProjectUtilizationKPI) queryProjectInstanceCount() ([]kvmProjectInstanceCount, error) {
177+
// This query will fetch all active instances. It will perform a join with the openstack projects to get the project name.
178+
// The results will be grouped by project, hypervisor, flavor, and availability zone to get the instance count.
179+
// We will filter the results to only include instances that are running on KVM hypervisors by checking the compute host name pattern.
180+
// This assumes that all KVM hypervisors have a compute host name that follows the pattern "nodeXXX-bbYYY",
181+
// which is a naming convention in SAP Cloud Infrastructure and may need to be adjusted based on the actual environment.
182+
query := `
183+
SELECT
184+
s.tenant_id AS project_id,
185+
COALESCE(p.name, '') AS project_name,
186+
COALESCE(p.domain_id, '') AS domain_id,
187+
COALESCE(d.name, '') AS domain_name,
188+
s.os_ext_srv_attr_host AS compute_host,
189+
s.os_ext_az_availability_zone AS availability_zone,
190+
s.flavor_name,
191+
COUNT(*) AS instance_count
192+
FROM ` + nova.Server{}.TableName() + ` s
193+
LEFT JOIN ` + identity.Project{}.TableName() + ` p ON p.id = s.tenant_id
194+
LEFT JOIN ` + identity.Domain{}.TableName() + ` d ON d.id = p.domain_id
195+
WHERE s.status NOT IN ('DELETED', 'ERROR')
196+
AND s.os_ext_srv_attr_host LIKE '` + kvmComputeHostPattern + `'
197+
GROUP BY s.tenant_id, p.name, p.domain_id, d.name, s.os_ext_srv_attr_host, s.flavor_name, s.os_ext_az_availability_zone
198+
`
199+
var usages []kvmProjectInstanceCount
200+
if _, err := k.DB.Select(&usages, query); err != nil {
201+
return nil, err
202+
}
203+
return usages, nil
204+
}

0 commit comments

Comments
 (0)