
Commit 97b4fc5

fix(s3): pass appendPartBlobKeys' pending by pointer so flush observes appends
Gemini high caught it: the previous round split the cleanup paths onto s3MetaBatchOps but missed that `appendPartBlobKeys` still took `pending []*kv.Elem[kv.OP]` by value. The flush closure inside `cleanupManifestBlobs` captures the outer `pending` slice header; appends performed inside the helper update the local header only, so flush sees length 0 every iteration and silently no-ops. After the helper returns, the caller reassigns `pending` from the helper's return value, but by then the entire part's chunks have accumulated into one slice that bypasses the s3MetaBatchOps cap entirely.

For a manifest with thousands of chunks (e.g. a 5 GiB object cleanup with ~5120 chunks), the broken helper builds one giant batch and hands it to coordinator.Dispatch in a single call, re-introducing exactly the OOM / oversized-MsgApp risk the s3MetaBatchOps cap was meant to prevent. This is the same shape as the pre-PR-#636 behaviour s3ChunkBatchOps was tightening, just on the cleanup side.

Fix: take `pending` by pointer (`*[]*kv.Elem[kv.OP]`) and write through it. Appends inside the helper now update the variable the flush closure reads, so threshold-triggered flushes fire correctly and the slice length never exceeds s3MetaBatchOps.

Test:

- TestAppendPartBlobKeys_FlushFiresEveryS3MetaBatchOps builds a part with 2 × s3MetaBatchOps + 7 chunks, calls the helper directly with a recording flush closure, and asserts that flush fired exactly twice with batch sizes [s3MetaBatchOps, s3MetaBatchOps], leaving 7 trailing entries for the caller's tail flush. The test catches the bug: under the value-passing version the closure always saw length 0, so the recorded batch sizes would be wrong and the assertions would fail loudly.

Build / vet / lint clean. All S3 + cleanup tests pass.
1 parent 5338c05 commit 97b4fc5
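The slice-header mechanics generalize beyond this codebase, so here is a minimal standalone sketch of the failure and of the fix. All names are invented (`batchCap` stands in for s3MetaBatchOps, plain ints for `*kv.Elem[kv.OP]`); it is an illustration, not repository code:

```go
package main

import "fmt"

const batchCap = 3 // stands in for s3MetaBatchOps

// brokenAppend mirrors the pre-fix helper: it receives a copy of the
// slice header. The local copy grows and trips the cap check, so flush
// IS called, but flush reads the caller's variable, whose header still
// says len 0, so it silently no-ops. The appended items only surface
// as one giant batch via the return value.
func brokenAppend(pending []int, n int, flush func()) []int {
	for i := 0; i < n; i++ {
		pending = append(pending, i)
		if len(pending) >= batchCap {
			flush()
		}
	}
	return pending
}

// fixedAppend mirrors the post-fix helper: it writes through the
// pointer, so the caller's variable (and therefore the closure)
// observes every append, and flush's truncation is observed back here.
func fixedAppend(pending *[]int, n int, flush func()) {
	for i := 0; i < n; i++ {
		*pending = append(*pending, i)
		if len(*pending) >= batchCap {
			flush()
		}
	}
}

func run(byPointer bool) {
	pending := []int{}
	var dispatched []int // size of each batch actually dispatched
	flush := func() {    // captures the outer pending, like cleanupManifestBlobs
		if len(pending) == 0 {
			return // the silent no-op in the broken case
		}
		dispatched = append(dispatched, len(pending))
		pending = pending[:0]
	}

	if byPointer {
		fixedAppend(&pending, 7, flush)
	} else {
		pending = brokenAppend(pending, 7, flush)
	}
	flush() // caller's tail flush
	fmt.Printf("byPointer=%v dispatched batch sizes: %v\n", byPointer, dispatched)
}

func main() {
	run(false) // byPointer=false dispatched batch sizes: [7]     (one giant batch)
	run(true)  // byPointer=true  dispatched batch sizes: [3 3 1] (capped correctly)
}
```

Note that in the broken variant flush is still called; it just observes a stale header, which is why the regression was silent rather than an error.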

2 files changed

Lines changed: 72 additions & 7 deletions


adapter/s3.go

Lines changed: 17 additions & 7 deletions
```diff
@@ -2237,29 +2237,39 @@ func (s *S3Server) cleanupManifestBlobs(ctx context.Context, bucket string, gene
 		pending = pending[:0]
 	}
 	for _, part := range manifest.Parts {
-		var ok bool
-		if pending, ok = s.appendPartBlobKeys(pending, bucket, generation, objectKey, manifest.UploadID, part, flush); !ok {
+		if !s.appendPartBlobKeys(&pending, bucket, generation, objectKey, manifest.UploadID, part, flush) {
 			return
 		}
 	}
 	flush()
 }
 
-func (s *S3Server) appendPartBlobKeys(pending []*kv.Elem[kv.OP], bucket string, generation uint64, objectKey string, uploadID string, part s3ObjectPart, flush func()) ([]*kv.Elem[kv.OP], bool) {
+// appendPartBlobKeys queues every blob-chunk Del for one manifest part
+// onto *pending and triggers flush whenever the batch reaches
+// s3MetaBatchOps. The slice is taken by pointer so that the caller's
+// `flush` closure (which captures pending from the enclosing
+// cleanupManifestBlobs scope) observes appends performed here. A
+// previous value-passing version silently no-op'd flush — flush saw
+// the outer `pending` whose header still pointed at length 0, and the
+// helper accumulated every chunk into one batch on return, defeating
+// the s3MetaBatchOps cap and re-opening the OOM / oversized-MsgApp
+// risk the cap was meant to bound. See the regression guard
+// TestAppendPartBlobKeys_FlushFiresEveryS3MetaBatchOps.
+func (s *S3Server) appendPartBlobKeys(pending *[]*kv.Elem[kv.OP], bucket string, generation uint64, objectKey string, uploadID string, part s3ObjectPart, flush func()) bool {
 	for chunkNo := range part.ChunkSizes {
 		chunkIndex, err := uint64FromInt(chunkNo)
 		if err != nil {
-			return pending, false
+			return false
 		}
-		pending = append(pending, &kv.Elem[kv.OP]{
+		*pending = append(*pending, &kv.Elem[kv.OP]{
 			Op:  kv.Del,
 			Key: s3keys.VersionedBlobKey(bucket, generation, objectKey, uploadID, part.PartNo, chunkIndex, part.PartVersion),
 		})
-		if len(pending) >= s3MetaBatchOps {
+		if len(*pending) >= s3MetaBatchOps {
 			flush()
 		}
 	}
-	return pending, true
+	return true
 }
 
 //nolint:cyclop // Proxying depends on root, bucket, and object-level leadership decisions.
```
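For contrast, a different design would sidestep the shared-header subtlety entirely by keeping the batch and its cap in one type, so no closure has to capture a slice variable. This is a hypothetical sketch (`opBatcher`, `limit`, and `dispatch` are invented names), not what the commit does:

```go
package batch

// opBatcher is a hypothetical alternative shape: the pending batch and
// its flush threshold live in one struct. All methods use a pointer
// receiver, so appends and the post-dispatch truncation act on the
// same slice header by construction, and the pointer-vs-value pitfall
// cannot arise. T would be *kv.Elem[kv.OP] at the s3.go call site.
type opBatcher[T any] struct {
	pending  []T
	limit    int             // flush threshold, e.g. s3MetaBatchOps
	dispatch func([]T) error // e.g. a wrapper around coordinator.Dispatch
}

// add queues one element and flushes when the threshold is reached.
func (b *opBatcher[T]) add(e T) error {
	b.pending = append(b.pending, e)
	if len(b.pending) >= b.limit {
		return b.flush()
	}
	return nil
}

// flush dispatches the pending batch, if any, and truncates it.
func (b *opBatcher[T]) flush() error {
	if len(b.pending) == 0 {
		return nil
	}
	if err := b.dispatch(b.pending); err != nil {
		return err
	}
	b.pending = b.pending[:0]
	return nil
}
```

The pointer-parameter fix in the commit is the smaller change; a struct like this would only pay off if more call sites adopt the flush-on-threshold pattern.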

adapter/s3_chunk_batch_test.go

Lines changed: 55 additions & 0 deletions
```diff
@@ -5,6 +5,7 @@ import (
 	"testing"
 
 	"github.com/bootjp/elastickv/internal/s3keys"
+	"github.com/bootjp/elastickv/kv"
 	pb "github.com/bootjp/elastickv/proto"
 	"github.com/stretchr/testify/require"
 	"google.golang.org/protobuf/proto"
@@ -137,3 +138,57 @@ func TestS3MetaBatchFitsInRaftMaxSize(t *testing.T) {
 		raftMaxSizePerMsgPostPR593, totalEntrySize, s3MetaBatchOps, len(objectKey),
 	)
 }
+
+// TestAppendPartBlobKeys_FlushFiresEveryS3MetaBatchOps is the
+// regression guard for the slice-by-value bug Gemini caught: a
+// previous version of appendPartBlobKeys took `pending` by value, so
+// the flush closure (captured from cleanupManifestBlobs's enclosing
+// scope) saw the outer slice header at length 0 and never fired,
+// silently accumulating every chunk into one giant batch. This test
+// pins the contract that the helper drains via flush exactly every
+// s3MetaBatchOps appends, never building a slice longer than the cap.
+func TestAppendPartBlobKeys_FlushFiresEveryS3MetaBatchOps(t *testing.T) {
+	t.Parallel()
+
+	// Build a manifest part with chunkCount > 2× s3MetaBatchOps so the
+	// flush closure must fire at least twice, plus a tail flush from
+	// the caller's final flush() in cleanupManifestBlobs.
+	const chunkCount = 2*s3MetaBatchOps + 7
+	chunkSizes := make([]uint64, chunkCount)
+	for i := range chunkSizes {
+		chunkSizes[i] = 1
+	}
+	part := s3ObjectPart{
+		PartNo:      1,
+		PartVersion: 1,
+		ChunkSizes:  chunkSizes,
+	}
+
+	pending := make([]*kv.Elem[kv.OP], 0, s3MetaBatchOps)
+	flushCalls := 0
+	flushBatchSizes := make([]int, 0, 4)
+	flush := func() {
+		// Mirror cleanupManifestBlobs's flush: record the batch size
+		// and then truncate. If the helper's pointer plumbing is
+		// broken, len(pending) here would always be 0 and the
+		// recorded batch sizes would never match s3MetaBatchOps.
+		flushCalls++
+		flushBatchSizes = append(flushBatchSizes, len(pending))
+		pending = pending[:0]
+	}
+
+	srv := (*S3Server)(nil) // method body does not touch s
+	ok := srv.appendPartBlobKeys(&pending, "bucket", 1, "key", "upload", part, flush)
+	require.True(t, ok)
+
+	// Exactly two threshold-triggered flushes inside the helper:
+	// at append #s3MetaBatchOps and #2×s3MetaBatchOps. The 7-entry
+	// remainder is left in pending for the caller's tail flush().
+	require.Equal(t, 2, flushCalls,
+		"expected flush to fire twice (at append %d and %d); slice-by-value bug regressed?",
+		s3MetaBatchOps, 2*s3MetaBatchOps)
+	require.Equal(t, []int{s3MetaBatchOps, s3MetaBatchOps}, flushBatchSizes,
+		"each flush must drain exactly s3MetaBatchOps entries; pending must not silently overflow the cap")
+	require.Len(t, pending, 7,
+		"trailing 7 entries should remain for the caller's final flush()")
+}
```
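One test detail worth flagging: `srv := (*S3Server)(nil)` relies on the Go rule that calling a method through a typed nil pointer is legal as long as the method body never dereferences the receiver. A tiny self-contained illustration with a hypothetical `server` type, unrelated to the repo:

```go
package main

import "fmt"

type server struct{ name string }

// ok never reads through s, so a nil receiver is safe.
func (s *server) ok() bool { return true }

// label dereferences s and would panic on a nil receiver.
func (s *server) label() string { return s.name }

func main() {
	var s *server       // typed nil, like (*S3Server)(nil) in the test
	fmt.Println(s.ok()) // prints: true
	// s.label() here would panic with a nil pointer dereference
}
```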
