Skip to content

Commit cd13e0c

Browse files
committed
build: review pass 4 — address HIGH + MED findings from PR #177
HIGH: #1 PruneResult.FreedBytes now reflects bytes actually reclaimed (sum over result.Removed after the loop), not the optimistic plan total. Honors the build-prune.schema.json contract under partial failure. #2 Prune now holds the FR-015 per-key flock around each entry's removal — same lock builder.Run() acquires — so a concurrent `trond build` cannot interleave with our manifest+artifact delete. New non-blocking TryAcquireCacheLock variant (posix: LOCK_EX|LOCK_NB; windows: TryLock) so a background prune never stalls an interactive build; locked entries are skipped with an info-level stderr line. MEDIUM: #3 Sha256IfExists's *ssh.ExitError classification is verified against the actual wrap shape Exec emits (fmt.Errorf with %w) via 3 new sub-tests in internal/target/ssh_sha256_classify_test.go. Documents the invariant: changing Exec's wrap verb breaks this and the test catches it. #4 resolveBuild omits JDKVersion from CacheKey when Builder=host — the BuilderImageDigest (sha256 of `java -version`) already captures the actual JVM, so including --jdk would fragment the cache across two invocations that produce identical artifacts. #5 findLargestFatJAR matches `find -path '*/build/libs/*.jar'` exactly (parent dir must be `build/libs/`), not just any `/build/libs/` substring. Pins cross-builder cache reuse: host and docker builders pick the same fat JAR even in unusual layouts (`build/libs/sub/x.jar`, `libs-archive/`, wrong-order segments). #6 TestRunner interface widened to pass outDir through so test fakes can populate image-snapshot files. Adapter updated; two existing test callers (apply, cmd) updated to the new 5-arg signature. #7 build_prune footgun guard: `--keep-last N --confirm` with no scoping filter is now rejected at both CLI and MCP layers. Operator must either add --all (explicit acknowledge of the near-wipe) or narrow with --orphan/--older-than. Dry-run is exempt — the plan output is the affordance for "what would keep_last remove". Tests added (all green): - TestPrune_FreedBytesOnlyCountsActuallyRemoved - TestPrune_FreedBytesOnDryRunMatchesPlan - TestPrune_AcquiresCacheLock - TestFindLargestFatJAR_NestedAndDecoyPaths - TestResolveBuild_HostBuilder_JDKVersionDoesNotFragmentCacheKey - TestBuildPrune_KeepLastFootgunGuard (cobra, 3 sub-cases, properly isolated via paths.SetBaseDir so the test never walks the dev's real ~/.trond cache) - TestBuildPrune_KeepLastAloneConfirmRejected (MCP, 3 sub-cases) - TestSha256_ExitErrorClassification (3 sub-cases)
1 parent 93071ad commit cd13e0c

15 files changed

Lines changed: 607 additions & 38 deletions

cmd/build_flags_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ type flagsCaptureRunner struct {
2424

2525
func (f *flagsCaptureRunner) RunDockerBuild(
2626
_ context.Context,
27-
sourcePath, _ string,
27+
sourcePath, _ /* outDir */, _ /* outTmpPath */ string,
2828
gradleTask string,
2929
gradleArgs []string,
3030
env map[string]string,

cmd/build_mgmt_test.go

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
package cmd
22

33
import (
4+
"strings"
45
"testing"
56
"time"
67

78
"github.com/tronprotocol/tron-deployment/internal/build"
9+
"github.com/tronprotocol/tron-deployment/internal/paths"
810
)
911

1012
// Cobra-layer tests for `trond build list / inspect / prune`. The
@@ -82,6 +84,71 @@ func TestSortEntries(t *testing.T) {
8284
})
8385
}
8486

87+
// TestBuildPrune_KeepLastFootgunGuard pins the review-pass-4 CLI
88+
// guard: --keep-last with --confirm but no scoping filter would
89+
// wipe everything-but-N — required either --all (explicit
90+
// acknowledge) or a scoping filter (--orphan / --older-than). Dry-
91+
// run is exempt; the plan output is the affordance.
92+
//
93+
// Isolates paths.BaseDir to a TempDir so the sub-tests that pass the
94+
// guard and reach the real Prune call don't walk the developer's
95+
// actual ~/.trond cache — a footgun that would have deleted real
96+
// build artifacts during test runs without this isolation.
97+
func TestBuildPrune_KeepLastFootgunGuard(t *testing.T) {
98+
dir := t.TempDir()
99+
paths.SetBaseDir(dir)
100+
t.Cleanup(func() { paths.SetBaseDir("") })
101+
102+
// Reset flag state between sub-tests. Cobra StringVars/BoolVars
103+
// persist between Execute() calls because they're package vars,
104+
// so explicitly clear before each scenario.
105+
reset := func() {
106+
buildPruneAll = false
107+
buildPruneOlderThan = 0
108+
buildPruneKeepLast = 0
109+
buildPruneOrphan = false
110+
buildPruneConfirm = false
111+
}
112+
113+
t.Run("--keep-last --confirm alone is rejected", func(t *testing.T) {
114+
reset()
115+
buildPruneKeepLast = 1
116+
buildPruneConfirm = true
117+
err := runBuildPrune(buildPruneCmd, nil)
118+
if err == nil {
119+
t.Fatal("expected validation error")
120+
}
121+
if !strings.Contains(err.Error(), "would wipe everything except") {
122+
t.Errorf("error %q should mention the footgun", err)
123+
}
124+
})
125+
126+
t.Run("--keep-last (dry-run) is allowed", func(t *testing.T) {
127+
reset()
128+
buildPruneKeepLast = 1
129+
// confirm=false → dry-run path; should NOT trigger the guard.
130+
err := runBuildPrune(buildPruneCmd, nil)
131+
// May succeed or fail at the actual Prune call (empty cache
132+
// dir in test) — what we're pinning is that the guard does
133+
// not fire. So the error, if any, must NOT mention the
134+
// footgun string.
135+
if err != nil && strings.Contains(err.Error(), "would wipe everything except") {
136+
t.Errorf("dry-run keep_last triggered the footgun guard; got %q", err)
137+
}
138+
})
139+
140+
t.Run("--keep-last --all --confirm is allowed (explicit)", func(t *testing.T) {
141+
reset()
142+
buildPruneKeepLast = 1
143+
buildPruneAll = true
144+
buildPruneConfirm = true
145+
err := runBuildPrune(buildPruneCmd, nil)
146+
if err != nil && strings.Contains(err.Error(), "would wipe everything except") {
147+
t.Errorf("explicit --all+keep-last+confirm triggered the footgun guard; got %q", err)
148+
}
149+
})
150+
}
151+
85152
// TestHumanBytes pins the table formatter. The numbers feed both
86153
// 'trond build list' (table mode) and 'trond build prune' (dry-run
87154
// output), so a regression here is loudly visible.

cmd/build_prune.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,27 @@ func runBuildPrune(cmd *cobra.Command, _ []string) error {
9696
"To remove entries older than a week: trond build prune --older-than 168h --confirm",
9797
)
9898
}
99+
// Footgun guard: `--keep-last N --confirm` with NO other filter
100+
// is equivalent to "delete every entry except the N newest" —
101+
// a near-wipe operation that looks small at a glance. Require
102+
// either an explicit second filter (--orphan / --older-than)
103+
// to scope what gets pruned, OR an explicit --all to acknowledge
104+
// the near-wipe intent. Dry-run is exempt: the plan output
105+
// shows exactly what would be deleted, which IS the obvious
106+
// affordance an interactive operator wants.
107+
if buildPruneConfirm && buildPruneKeepLast > 0 &&
108+
!buildPruneAll && !buildPruneOrphan && buildPruneOlderThan == 0 {
109+
return output.NewError("VALIDATION_ERROR", output.ExitValidationError,
110+
"--keep-last alone with --confirm would wipe everything except "+
111+
"the N newest entries; combine with --all to acknowledge, OR "+
112+
"narrow with --orphan / --older-than").
113+
WithSuggestions(
114+
"Preview first: trond build prune --keep-last "+
115+
fmt.Sprintf("%d", buildPruneKeepLast)+" (dry-run shows the plan)",
116+
"To genuinely wipe-all-but-N: trond build prune --all --keep-last "+
117+
fmt.Sprintf("%d", buildPruneKeepLast)+" --confirm",
118+
)
119+
}
99120

100121
opts := build.PruneOptions{
101122
All: buildPruneAll,

internal/apply/build_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ type fakeBuilderRunner struct {
2525

2626
func (f *fakeBuilderRunner) RunDockerBuild(
2727
_ context.Context,
28-
_ /* sourcePath */, outTmp string,
28+
_ /* sourcePath */, _ /* outDir */, outTmp string,
2929
_ /* gradleTask */ string,
3030
_ /* gradleArgs */ []string,
3131
_ /* env */ map[string]string,

internal/build/builder.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,11 +268,23 @@ func resolveBuild(ctx context.Context, req Request) (*resolved, error) {
268268
)
269269
}
270270

271+
// For host builds, BuilderImageDigest already captures the exact
272+
// JVM in use (sha256 of `java -version` output). Including
273+
// req.JDKVersion in the key on top of that would fragment the
274+
// cache pointlessly: two `--builder host` invocations with the
275+
// same actual JDK but different --jdk flags (e.g. --jdk 8 vs
276+
// --jdk 17, both falling back to whatever the host has)
277+
// rebuild identically. Drop JDKVersion from the host-builder
278+
// key so cache hits work as expected.
279+
keyJDK := req.JDKVersion
280+
if req.Builder == "host" {
281+
keyJDK = ""
282+
}
271283
key := CacheKey{
272284
GitRevision: src.ResolvedRevision,
273285
PatchHash: src.PatchHash,
274286
BuilderImageDigest: imageDigest,
275-
JDKVersion: req.JDKVersion,
287+
JDKVersion: keyJDK,
276288
ArtifactKind: req.ArtifactKind,
277289
GradleTask: req.GradleTask,
278290
GradleArgs: append([]string(nil), req.GradleArgs...),

internal/build/cache_ops.go

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -244,8 +244,10 @@ type PruneOptions struct {
244244
// PruneResult is the structured output the CLI/MCP layer renders.
245245
// Plan is what WOULD be removed; Removed is what was actually
246246
// removed (nil when DryRun). FreedBytes is the sum of SizeBytes for
247-
// Removed entries (DryRun: sum over Plan, since we know what we
248-
// would have freed).
247+
// successfully-removed entries (or, on DryRun, the sum over Plan).
248+
// Schema contract: schemas/output/build-prune.schema.json describes
249+
// freed_bytes as "Total bytes reclaimed"; the post-loop recomputation
250+
// below honors that even when one entry's removal partially failed.
249251
type PruneResult struct {
250252
Plan []*Entry `json:"plan"`
251253
Removed []*Entry `json:"removed,omitempty"`
@@ -254,10 +256,25 @@ type PruneResult struct {
254256
}
255257

256258
// Prune evaluates the cache against opts, builds a deletion plan, and
257-
// (unless DryRun) executes it. Image artifacts get a best-effort
258-
// `docker rmi <tag>` so the docker storage actually reclaims layers;
259-
// failures there don't abort the prune (the trond cache files still
260-
// get cleaned).
259+
// (unless DryRun) executes it under per-entry flocks so concurrent
260+
// `trond build` runs against the same cache key cannot interleave
261+
// with our manifest/artifact deletion. Image artifacts also get a
262+
// best-effort `docker rmi <tag>` so the docker storage actually
263+
// reclaims layers; failures there don't abort the prune (the trond
264+
// cache files still get cleaned).
265+
//
266+
// Concurrency invariants:
267+
//
268+
// - The flock per cache key matches the one builder.go acquires in
269+
// Run() (AcquireCacheLock). A concurrent build of the same key
270+
// either finishes first (we then prune the produced artifact,
271+
// unsurprising) or blocks until we release (the build then sees
272+
// no manifest and rebuilds, also unsurprising). No race window
273+
// where Prune deletes a half-written manifest mid-build.
274+
// - Two concurrent Prune invocations on the same entry race only
275+
// on the lock acquisition; the second sees the manifest already
276+
// gone in removeEntry and treats it as best-effort (the
277+
// errors.Is(..., os.ErrNotExist) branch).
261278
func Prune(ctx context.Context, opts PruneOptions) (*PruneResult, error) {
262279
all, err := ListEntries(ctx, IncludeOrphans())
263280
if err != nil {
@@ -269,25 +286,48 @@ func Prune(ctx context.Context, opts PruneOptions) (*PruneResult, error) {
269286
Plan: plan,
270287
DryRun: opts.DryRun,
271288
}
272-
for _, e := range plan {
273-
result.FreedBytes += e.SizeBytes
274-
}
275289
if opts.DryRun {
290+
// Dry-run reports what WOULD be freed — the plan's full
291+
// size, since nothing is removed.
292+
for _, e := range plan {
293+
result.FreedBytes += e.SizeBytes
294+
}
276295
return result, nil
277296
}
278297

279298
for _, e := range plan {
280-
if err := removeEntry(ctx, e); err != nil {
299+
// FR-015 lock — same key the builder grabs in Run(). We use
300+
// the non-blocking try-variant: if a build holds this key
301+
// right now, prune skips the entry rather than waiting (a
302+
// background prune should NEVER stall an interactive build).
303+
release, ok, lockErr := TryAcquireCacheLock(CacheDir(), e.CacheKey)
304+
if lockErr != nil {
305+
fmt.Fprintf(os.Stderr,
306+
"warning: prune skip %s — lock open failed: %v\n",
307+
e.CacheKey, lockErr)
308+
continue
309+
}
310+
if !ok {
311+
fmt.Fprintf(os.Stderr,
312+
"info: prune skip %s — build in progress for this key\n",
313+
e.CacheKey)
314+
continue
315+
}
316+
err := removeEntry(ctx, e)
317+
release()
318+
if err != nil {
281319
// Don't abort the whole prune — one wedged docker rmi
282320
// shouldn't block the rest of the cleanup. Surface to
283-
// stderr; the result.Removed list reflects what
284-
// succeeded.
321+
// stderr; result.Removed + FreedBytes reflect only what
322+
// actually succeeded so the JSON contract holds even
323+
// after partial failures.
285324
fmt.Fprintf(os.Stderr,
286325
"warning: prune partial failure for %s: %v\n",
287326
e.CacheKey, err)
288327
continue
289328
}
290329
result.Removed = append(result.Removed, e)
330+
result.FreedBytes += e.SizeBytes
291331
}
292332
return result, nil
293333
}

internal/build/cache_ops_test.go

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,123 @@ func TestPrune_OrphanCleansManifestEvenWithoutArtifact(t *testing.T) {
318318
}
319319
}
320320

321+
// TestPrune_AcquiresCacheLock pins the review-pass-4 race fix: Prune
322+
// MUST hold the same per-key flock that builder.Run() acquires, so a
323+
// concurrent build of the entry being pruned cannot interleave with
324+
// our manifest+artifact deletion. Verified by holding the flock
325+
// externally and observing Prune's "lock unavailable" skip path.
326+
//
327+
// (We can't easily reproduce the data corruption directly in a unit
328+
// test — too much process orchestration. This test pins the
329+
// contract: "if the lock can't be acquired, the entry is skipped",
330+
// which is the post-condition that protects the cache.)
331+
func TestPrune_AcquiresCacheLock(t *testing.T) {
332+
withTempBaseDir(t)
333+
if err := EnsureCacheDirs(); err != nil {
334+
t.Fatalf("EnsureCacheDirs: %v", err)
335+
}
336+
locked := seedJARManifest(t, "locked-by-build", time.Now(), 100)
337+
338+
// Simulate a concurrent build: hold the flock for this entry's
339+
// key. Prune must observe the lock and skip without touching
340+
// anything.
341+
release, err := AcquireCacheLock(CacheDir(), locked.CacheKey)
342+
if err != nil {
343+
t.Fatalf("test AcquireCacheLock: %v", err)
344+
}
345+
t.Cleanup(release)
346+
347+
res, err := Prune(context.Background(), PruneOptions{All: true})
348+
if err != nil {
349+
t.Fatalf("Prune: %v", err)
350+
}
351+
// Plan still lists the entry (we never released the lock until
352+
// AFTER Prune returns), but Removed must NOT contain it because
353+
// Prune couldn't acquire the lock.
354+
if len(res.Removed) != 0 {
355+
t.Errorf("Removed should be empty when lock is held externally; got %v", keys(res.Removed))
356+
}
357+
// And the JAR + manifest are still on disk — no partial
358+
// deletion.
359+
if _, statErr := os.Stat(locked.ArtifactPath); statErr != nil {
360+
t.Errorf("locked JAR should be untouched; stat err = %v", statErr)
361+
}
362+
if _, statErr := os.Stat(manifestPath(locked.CacheKey)); statErr != nil {
363+
t.Errorf("locked manifest should be untouched; stat err = %v", statErr)
364+
}
365+
}
366+
367+
// TestPrune_FreedBytesOnlyCountsActuallyRemoved is the review-pass-4
368+
// regression guard: PruneResult.FreedBytes MUST reflect bytes
369+
// actually reclaimed, not the plan's optimistic total. Otherwise an
370+
// MCP agent surfacing "freed N bytes" would report a number that
371+
// doesn't match the bytes the OS actually got back.
372+
//
373+
// Constructed scenario: two entries in the plan, simulate partial
374+
// failure by pre-deleting one entry's manifest under our feet so
375+
// removeEntry's os.Remove(manifestPath) fails. The successful entry
376+
// contributes its bytes; the failed entry does not.
377+
func TestPrune_FreedBytesOnlyCountsActuallyRemoved(t *testing.T) {
378+
withTempBaseDir(t)
379+
good := seedJARManifest(t, "good", time.Now(), 500)
380+
bad := seedJARManifest(t, "bad", time.Now().Add(-time.Hour), 1000)
381+
382+
// Sabotage the "bad" entry's artifact: swap the JAR file for a
383+
// non-empty directory at the same path. removeEntry's
384+
// os.Remove(e.ArtifactPath) refuses to delete a non-empty dir
385+
// → entry removal fails. ListEntries still parses the (intact)
386+
// manifest, so the entry IS in the plan but won't end up in
387+
// Removed. Faithfully models real failures the reviewer flagged
388+
// (docker rmi wedged, fs permissions, etc.).
389+
if err := os.Remove(bad.ArtifactPath); err != nil {
390+
t.Fatalf("setup: remove jar for sabotage: %v", err)
391+
}
392+
if err := os.MkdirAll(bad.ArtifactPath, 0o755); err != nil {
393+
t.Fatalf("setup: mkdir jar path for sabotage: %v", err)
394+
}
395+
if err := os.WriteFile(filepath.Join(bad.ArtifactPath, "wedge"), []byte("x"), 0o600); err != nil {
396+
t.Fatalf("setup: wedge file: %v", err)
397+
}
398+
399+
res, err := Prune(context.Background(), PruneOptions{All: true})
400+
if err != nil {
401+
t.Fatalf("Prune: %v", err)
402+
}
403+
// Plan has both entries; Removed has only the good one.
404+
if len(res.Plan) != 2 {
405+
t.Errorf("Plan should list 2 entries; got %d", len(res.Plan))
406+
}
407+
if len(res.Removed) != 1 || res.Removed[0].CacheKey != "good" {
408+
t.Fatalf("Removed should contain only 'good'; got %v", keys(res.Removed))
409+
}
410+
// THE FIX: FreedBytes must equal Removed-only sum, NOT Plan sum.
411+
if res.FreedBytes != 500 {
412+
t.Errorf("FreedBytes = %d; want 500 (bytes actually reclaimed, not plan total of 1500)", res.FreedBytes)
413+
}
414+
// And good's artifact should be gone.
415+
if _, statErr := os.Stat(good.ArtifactPath); !os.IsNotExist(statErr) {
416+
t.Errorf("good JAR should be deleted; stat err = %v", statErr)
417+
}
418+
}
419+
420+
// TestPrune_FreedBytesOnDryRunMatchesPlan: on dry-run, FreedBytes
421+
// reflects what WOULD be freed (== plan sum), since Removed is
422+
// empty by design. Without this branch in Prune, the post-Removed-
423+
// loop accumulation would report 0 freed for dry-runs.
424+
func TestPrune_FreedBytesOnDryRunMatchesPlan(t *testing.T) {
425+
withTempBaseDir(t)
426+
seedJARManifest(t, "a", time.Now(), 100)
427+
seedJARManifest(t, "b", time.Now(), 200)
428+
429+
res, err := Prune(context.Background(), PruneOptions{All: true, DryRun: true})
430+
if err != nil {
431+
t.Fatalf("Prune: %v", err)
432+
}
433+
if res.FreedBytes != 300 {
434+
t.Errorf("DryRun FreedBytes = %d; want 300 (full plan total)", res.FreedBytes)
435+
}
436+
}
437+
321438
// --- helpers ---
322439

323440
func keys(entries []*Entry) []string {

0 commit comments

Comments
 (0)