Skip to content

Commit ee27b0a

Browse files
committed
fix: add Docker stats timeout and safe map iteration
- Add a 5-second timeout to Docker ContainerStatsOneShot calls to prevent hanging indefinitely if the Docker daemon is unresponsive
- Rework map cleanup in cleanupStaleCPUStats to collect the keys to delete first and remove them in a second pass (note: the Go spec permits deleting map entries during range iteration, so this is a readability change rather than an undefined-behavior fix)
1 parent 29bfb41 commit ee27b0a

1 file changed

Lines changed: 13 additions & 2 deletions

File tree

engine/internal/srv/metrics/collector.go

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ import (
2828
const (
2929
// cpuPercentMultiplier converts a CPU usage ratio to a percentage.
3030
cpuPercentMultiplier = 100.0
31+
// dockerStatsTimeout bounds a single ContainerStatsOneShot call so an unresponsive Docker daemon cannot block stats collection.
32+
dockerStatsTimeout = 5 * time.Second
3133
)
3234

3335
// containerCPUState stores previous CPU stats for delta calculation.
@@ -249,7 +251,9 @@ func (c *Collector) getContainerStats(ctx context.Context, clones []*models.Clon
249251

250252
activeCloneIDs[clone.ID] = struct{}{}
251253

252-
stats, err := c.dockerClient.ContainerStatsOneShot(ctx, clone.ID)
254+
statsCtx, cancel := context.WithTimeout(ctx, dockerStatsTimeout)
255+
stats, err := c.dockerClient.ContainerStatsOneShot(statsCtx, clone.ID)
256+
cancel()
253257
if err != nil {
254258
log.Dbg(fmt.Sprintf("failed to get container stats for clone %s: %v", clone.ID, err))
255259
continue
@@ -328,11 +332,18 @@ func (c *Collector) calculateCPUPercent(cloneID string, stats *container.StatsRe
328332
}
329333

330334
func (c *Collector) cleanupStaleCPUStats(activeCloneIDs map[string]struct{}) {
335+
// Two-pass cleanup: gather clone IDs in prevCPUStats that are absent from activeCloneIDs, then delete them. NOTE(review): Go permits delete during range, so the two-pass form is a style choice, not a safety requirement — confirm intent.
336+
keysToDelete := make([]string, 0)
337+
331338
for cloneID := range c.prevCPUStats {
332339
if _, ok := activeCloneIDs[cloneID]; !ok {
333-
delete(c.prevCPUStats, cloneID)
340+
keysToDelete = append(keysToDelete, cloneID)
334341
}
335342
}
343+
344+
for _, cloneID := range keysToDelete {
345+
delete(c.prevCPUStats, cloneID)
346+
}
336347
}
337348

338349
func (c *Collector) collectSnapshotMetrics() {

0 commit comments

Comments
 (0)