Skip to content

Commit ab97420

Browse files
Merge pull request #107 from charford/fix/metrics-endpoint-latency
Remove unnecessary K8s API call and data race from `/metrics` scrape path
2 parents 51ddae6 + 4242c15 commit ab97420

2 files changed

Lines changed: 7 additions & 8 deletions

File tree

cmd/vgpu-monitor/metrics.go

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -139,14 +139,7 @@ func (cc ClusterManagerCollector) Describe(ch chan<- *prometheus.Desc) {
139139
func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
140140
klog.Info("Starting to collect metrics for vGPUMonitor")
141141
containerLister := cc.ClusterManager.containerLister
142-
if err := containerLister.Update(); err != nil {
143-
klog.Error("Update container error: %s", err.Error())
144-
}
145142

146-
nvret := config.Nvml().Init()
147-
if nvret != nvml.SUCCESS {
148-
klog.Errorf("nvml Init err= %v", nvret)
149-
}
150143
devnum, nvret := config.Nvml().DeviceGetCount()
151144
if nvret != nvml.SUCCESS {
152145
klog.Errorf("nvml GetDeviceCount err= %v", nvret)

pkg/monitor/nvidia/cudevshr.go

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -112,7 +112,13 @@ func (l *ContainerLister) UnLock() {
112112
}
113113

114114
func (l *ContainerLister) ListContainers() map[string]*ContainerUsage {
115-
return l.containers
115+
l.mutex.Lock()
116+
defer l.mutex.Unlock()
117+
snapshot := make(map[string]*ContainerUsage, len(l.containers))
118+
for k, v := range l.containers {
119+
snapshot[k] = v
120+
}
121+
return snapshot
116122
}
117123

118124
func (l *ContainerLister) Clientset() *kubernetes.Clientset {

0 commit comments

Comments
 (0)