Skip to content

Commit dba8173

Browse files
fix(k8s): cap buildLogCache size to bound memory on failure bursts (bug-bash #2)
buildLogCache snapshots kaniko build logs of FAILED builds for the failure autopsy. It was TTL-bounded (30m) and swept stale entries on each new failure — but had NO size cap. A burst of failing builds inside one TTL window (a broken base image, a wedged registry, a fork-bomb of bad deploys) accumulates one ≤200-line snapshot per failure with no ceiling: unbounded memory growth on a long-lived api pod. Add buildLogCacheMaxEntries=256 and capBuildLogCacheSize(), called evict-after-store from snapshotBuildLogs. When a store pushes the live count past the cap, evict oldest-first (a recent failure's autopsy is far likelier to be read than one from hundreds of failures ago). sync.Map has no length, so we snapshot keys+times in one Range pass, sort by capturedAt, and delete the excess oldest. Tests: TestCapBuildLogCacheSize_EvictsOldestOverCap (over-cap → newest survive, oldest evicted), TestCapBuildLogCacheSize_NoOpUnderCap. Both new funcs 100% covered; verified locally. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 3f0f87c commit dba8173

2 files changed

Lines changed: 102 additions & 0 deletions

File tree

internal/providers/compute/k8s/build_log_cache_test.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ package k8s
1414

1515
import (
1616
"context"
17+
"fmt"
1718
"testing"
1819
"time"
1920

@@ -106,3 +107,61 @@ func TestEvictStaleBuildLogs_KeepsFreshDropsStale(t *testing.T) {
106107
t.Error("stale entry was not evicted")
107108
}
108109
}
110+
111+
// TestCapBuildLogCacheSize_EvictsOldestOverCap verifies the size cap: a burst
112+
// of fresh (non-stale) snapshots beyond buildLogCacheMaxEntries is bounded by
113+
// evicting the OLDEST entries first, while the most-recent cap-many survive.
114+
// This is the memory-leak guard the TTL sweep alone does not provide — many
115+
// failures inside one TTL window would otherwise grow the map without bound.
116+
func TestCapBuildLogCacheSize_EvictsOldestOverCap(t *testing.T) {
117+
p := &K8sProvider{clientset: fake.NewSimpleClientset()}
118+
119+
base := time.Now()
120+
overflow := 10
121+
total := buildLogCacheMaxEntries + overflow
122+
// All fresh (well within TTL) so eviction is driven purely by the size cap,
123+
// not by staleness. Older capturedAt for lower indexes → those evict first.
124+
for i := 0; i < total; i++ {
125+
key := fmt.Sprintf("app-%04d", i)
126+
p.buildLogCache.Store(key, &buildLogCacheEntry{
127+
lines: []string{key},
128+
capturedAt: base.Add(time.Duration(i) * time.Second),
129+
})
130+
}
131+
132+
p.capBuildLogCacheSize()
133+
134+
var count int
135+
p.buildLogCache.Range(func(_, _ any) bool { count++; return true })
136+
if count != buildLogCacheMaxEntries {
137+
t.Fatalf("cache not capped: got %d entries, want %d", count, buildLogCacheMaxEntries)
138+
}
139+
140+
// The oldest `overflow` entries (lowest indexes) must be gone; the newest
141+
// must remain.
142+
for i := 0; i < overflow; i++ {
143+
if _, ok := p.buildLogCache.Load(fmt.Sprintf("app-%04d", i)); ok {
144+
t.Errorf("oldest entry app-%04d should have been evicted", i)
145+
}
146+
}
147+
if _, ok := p.buildLogCache.Load(fmt.Sprintf("app-%04d", total-1)); !ok {
148+
t.Errorf("newest entry app-%04d was wrongly evicted", total-1)
149+
}
150+
}
151+
152+
// TestCapBuildLogCacheSize_NoOpUnderCap verifies the cap is a no-op when the
153+
// cache holds fewer than buildLogCacheMaxEntries entries (the common case).
154+
func TestCapBuildLogCacheSize_NoOpUnderCap(t *testing.T) {
155+
p := &K8sProvider{clientset: fake.NewSimpleClientset()}
156+
157+
p.buildLogCache.Store("only", &buildLogCacheEntry{
158+
lines: []string{"x"},
159+
capturedAt: time.Now(),
160+
})
161+
162+
p.capBuildLogCacheSize()
163+
164+
if _, ok := p.buildLogCache.Load("only"); !ok {
165+
t.Error("under-cap entry was wrongly evicted")
166+
}
167+
}

internal/providers/compute/k8s/client.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"log/slog"
1616
"os"
1717
"path/filepath"
18+
"sort"
1819
"strings"
1920
"sync"
2021
"time"
@@ -417,6 +418,15 @@ type buildLogCacheEntry struct {
417418
// failed build.
418419
const buildLogCacheTTL = 30 * time.Minute
419420

421+
// buildLogCacheMaxEntries caps the number of build-log snapshots held at once.
422+
// The TTL alone is not a memory bound: a burst of failing builds inside one
423+
// 30-minute window (a broken base image, a wedged registry) would otherwise
424+
// accumulate one ≤buildLogMaxLines-line snapshot per failure with no ceiling.
425+
// When a store pushes the cache past this cap we evict the oldest entries —
426+
// the autopsy for a recent failure is far likelier to be read than one from
427+
// hundreds of failures ago. 256 entries × 200 lines is a comfortable bound.
428+
const buildLogCacheMaxEntries = 256
429+
420430
// New creates a K8sProvider targeting the given namespace.
421431
// buildCtx is optional — when unset, builds fall back to the 1 MiB Secret path.
422432
// Returns an error if the k8s clientset cannot be initialized; the caller
@@ -1288,10 +1298,43 @@ func (p *K8sProvider) snapshotBuildLogs(ctx context.Context, ns, appID, jobName
12881298
lines: lines,
12891299
capturedAt: time.Now(),
12901300
})
1301+
// Evict-after-store: the TTL sweep above only drops *expired* entries, so a
1302+
// burst of failures inside one TTL window still grows the map unbounded.
1303+
// Cap the live entry count, evicting oldest-first.
1304+
p.capBuildLogCacheSize()
12911305
slog.Info("k8s.snapshotBuildLogs: captured failed build logs for autopsy",
12921306
"app_id", appID, "lines", len(lines))
12931307
}
12941308

1309+
// capBuildLogCacheSize bounds buildLogCache to buildLogCacheMaxEntries, evicting
1310+
// the oldest snapshots first. sync.Map has no length, so we snapshot keys +
1311+
// capture times in one Range pass; if over the cap we sort by capturedAt and
1312+
// delete the excess oldest. A concurrent Store between the Range and the
1313+
// Deletes can only leave us at most a few entries over the cap until the next
1314+
// failure re-runs this — an acceptable slop for a best-effort autopsy cache.
1315+
func (p *K8sProvider) capBuildLogCacheSize() {
1316+
type keyedEntry struct {
1317+
key any
1318+
capturedAt time.Time
1319+
}
1320+
var entries []keyedEntry
1321+
p.buildLogCache.Range(func(k, v any) bool {
1322+
if entry, ok := v.(*buildLogCacheEntry); ok {
1323+
entries = append(entries, keyedEntry{key: k, capturedAt: entry.capturedAt})
1324+
}
1325+
return true
1326+
})
1327+
if len(entries) <= buildLogCacheMaxEntries {
1328+
return
1329+
}
1330+
sort.Slice(entries, func(i, j int) bool {
1331+
return entries[i].capturedAt.Before(entries[j].capturedAt)
1332+
})
1333+
for _, e := range entries[:len(entries)-buildLogCacheMaxEntries] {
1334+
p.buildLogCache.Delete(e.key)
1335+
}
1336+
}
1337+
12951338
// evictStaleBuildLogs drops buildLogCache entries older than buildLogCacheTTL.
12961339
func (p *K8sProvider) evictStaleBuildLogs() {
12971340
now := time.Now()

0 commit comments

Comments
 (0)