Skip to content

Commit 4242c15

Browse files
committed
fix: eliminate per-scrape Kubernetes API call causing 15s+ metrics latency
Collect() was calling containerLister.Update() on every Prometheus scrape, which issues a synchronous cluster-wide Pods().List() against the API server. The watchAndFeedback goroutine already refreshes container state every 5s, so Collect() only needs to read the cached data.

Also remove the NVML Init() call from Collect(): NVML is already initialized in watchAndFeedback, and Init() should only be called once at startup.

Fix a data race in ListContainers(): Update() holds the mutex while writing the containers map, but the previous ListContainers() returned the raw map without taking the lock. It now returns a snapshot of the map copied under the mutex.

Co-Authored-By: charford <casey@caseyharford.com>
Signed-off-by: charford <casey@caseyharford.com>
1 parent 51ddae6 commit 4242c15

2 files changed

Lines changed: 7 additions & 8 deletions

File tree

cmd/vgpu-monitor/metrics.go

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -139,14 +139,7 @@ func (cc ClusterManagerCollector) Describe(ch chan<- *prometheus.Desc) {
139139
func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
140140
klog.Info("Starting to collect metrics for vGPUMonitor")
141141
containerLister := cc.ClusterManager.containerLister
142-
if err := containerLister.Update(); err != nil {
143-
klog.Error("Update container error: %s", err.Error())
144-
}
145142

146-
nvret := config.Nvml().Init()
147-
if nvret != nvml.SUCCESS {
148-
klog.Errorf("nvml Init err= %v", nvret)
149-
}
150143
devnum, nvret := config.Nvml().DeviceGetCount()
151144
if nvret != nvml.SUCCESS {
152145
klog.Errorf("nvml GetDeviceCount err= %v", nvret)

pkg/monitor/nvidia/cudevshr.go

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -112,7 +112,13 @@ func (l *ContainerLister) UnLock() {
112112
}
113113

114114
func (l *ContainerLister) ListContainers() map[string]*ContainerUsage {
115-
return l.containers
115+
l.mutex.Lock()
116+
defer l.mutex.Unlock()
117+
snapshot := make(map[string]*ContainerUsage, len(l.containers))
118+
for k, v := range l.containers {
119+
snapshot[k] = v
120+
}
121+
return snapshot
116122
}
117123

118124
func (l *ContainerLister) Clientset() *kubernetes.Clientset {

0 commit comments

Comments
 (0)