fix: eliminate per-scrape Kubernetes API call causing 15s+ metrics latency

charford · charford · commit 4242c1558d1c · 2026-03-04T10:27:40.000-08:00
Collect() was calling containerLister.Update() on every Prometheus scrape,
which issues a synchronous cluster-wide Pods().List() against the API server.
The watchAndFeedback goroutine already refreshes container state every 5s,
so Collect() only needs to read cached data.

Also remove nvml.Init() from Collect() — NVML is already initialized in
watchAndFeedback, and Init() should only be called once at startup.

Fix a data race in ListContainers(): Update() holds the mutex while writing
the containers map, but the previous ListContainers() returned the raw map
with no lock. It now returns a snapshot under the mutex.

Co-Authored-By: charford <casey@caseyharford.com>
Signed-off-by: charford <casey@caseyharford.com>
diff --git a/cmd/vgpu-monitor/metrics.go b/cmd/vgpu-monitor/metrics.go
@@ -139,14 +139,7 @@ func (cc ClusterManagerCollector) Describe(ch chan<- *prometheus.Desc) {
 func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
 	klog.Info("Starting to collect metrics for vGPUMonitor")
 	containerLister := cc.ClusterManager.containerLister
-	if err := containerLister.Update(); err != nil {
-		klog.Error("Update container error: %s", err.Error())
-	}
 
-	nvret := config.Nvml().Init()
-	if nvret != nvml.SUCCESS {
-		klog.Errorf("nvml Init err= %v", nvret)
-	}
 	devnum, nvret := config.Nvml().DeviceGetCount()
 	if nvret != nvml.SUCCESS {
 		klog.Errorf("nvml GetDeviceCount err= %v", nvret)
diff --git a/pkg/monitor/nvidia/cudevshr.go b/pkg/monitor/nvidia/cudevshr.go
@@ -112,7 +112,13 @@ func (l *ContainerLister) UnLock() {
 }
 
 func (l *ContainerLister) ListContainers() map[string]*ContainerUsage {
-	return l.containers
+	l.mutex.Lock()
+	defer l.mutex.Unlock()
+	snapshot := make(map[string]*ContainerUsage, len(l.containers))
+	for k, v := range l.containers {
+		snapshot[k] = v
+	}
+	return snapshot
 }
 
 func (l *ContainerLister) Clientset() *kubernetes.Clientset {

Original file line number	Diff line number	Diff line change
`@@ -112,7 +112,13 @@ func (l *ContainerLister) UnLock() {`
`112`	`112`	`}`
`113`	`113`
`114`	`114`	`func (l *ContainerLister) ListContainers() map[string]*ContainerUsage {`
`115`		`- return l.containers`
	`115`	`+ l.mutex.Lock()`
	`116`	`+ defer l.mutex.Unlock()`
	`117`	`+ snapshot := make(map[string]*ContainerUsage, len(l.containers))`
	`118`	`+ for k, v := range l.containers {`
	`119`	`+ snapshot[k] = v`
	`120`	`+ }`
	`121`	`+ return snapshot`
`116`	`122`	`}`
`117`	`123`
`118`	`124`	`func (l *ContainerLister) Clientset() *kubernetes.Clientset {`