@@ -3,6 +3,7 @@ package adapter
33import (
44 "bytes"
55 "context"
6+ "log/slog"
67 "sort"
78 "strings"
89
@@ -359,29 +360,48 @@ func (s *S3Server) AdminPutBucketAcl(ctx context.Context, principal AdminPrincip
359360// bucket-must-be-empty rule mirrors the SigV4 deleteBucket path —
360361// the dashboard cannot force a recursive delete, by design.
361362//
362- // Known orphan-race limitation (coderabbitai 🔴 / 🟠 on PR #669):
363- // the empty-bucket probe (ScanAt with limit=1 on
364- // ObjectManifestPrefixForBucket) reads at readTS but the
365- // subsequent BucketMetaKey delete only carries that single point
366- // key in its ReadKeys set. A concurrent PutObject that inserts a
367- // manifest key in the scanned prefix between readTS and the
368- // delete's commitTS will not conflict — the OCC validator only
369- // inspects keys that appear in ReadKeys, and there is no
370- // ReadRanges mechanism today. The object's manifest key survives
371- // under a now-deleted bucket meta and becomes orphaned.
363+ // The dispatch happens in two phases because the production
364+ // coordinator (kv/sharded_coordinator.go: dispatchDelPrefixBroadcast)
365+ // rejects DEL_PREFIX inside a transaction and rejects DEL_PREFIX
366+ // mixed with Del or Put in the same OperationGroup:
372367//
373- // This race is pre-existing in the SigV4 path
374- // (adapter/s3.go:deleteBucket — same shape, same limitation), so
375- // AdminDeleteBucket inherits the contract; closing the gap
376- // requires either (a) bumping BucketGenerationKey on every
377- // PutObject so it can serve as an OCC token in this read set, or
378- // (b) extending OperationGroup with ReadRanges and teaching the
379- // FSM to validate range emptiness atomically with commit. Both
380- // are larger changes outside this PR's scope; tracked in
381- // docs/design/2026_04_24_partial_admin_dashboard.md under the
382- // Outstanding open items section. Operators concerned about the
383- // orphan window today should pause writes against the target
384- // bucket before issuing the admin delete.
368+ // Phase 1: Del BucketMetaKey in a txn (OCC-protected against
369+ // a concurrent AdminCreateBucket landing between our
370+ // readTS and commitTS).
371+ // Phase 2: DEL_PREFIX over every per-bucket key family in a
372+ // non-txn broadcast — the safety net that sweeps
373+ // orphans left by any PutObject that committed
374+ // chunks/manifest between the empty-probe and the
375+ // Phase-1 commit. See design doc
376+ // 2026_04_28_proposed_admin_delete_bucket_safety_net.md
377+ // §6.2 for the original single-OperationGroup design
378+ // and the dispatch-shape rejection that forced the
379+ // two-phase split.
380+ //
381+ // Phase 2 is best-effort: a Phase-2 failure leaves the bucket meta
382+ // already deleted (Phase 1 succeeded) but per-bucket prefixes
383+ // possibly still containing orphans. That state is no worse than
384+ // the pre-fix behaviour on main and can be repaired by an operator-
385+ // driven re-cleanup. We log a warning rather than propagate the error so
386+ // the operator-visible delete reports success — the bucket really
387+ // is gone from the API surface, and a retry would 404 because
388+ // loadBucketMetaAt no longer finds the meta.
389+ //
390+ // BucketGenerationKey is intentionally NOT deleted. Re-creating
391+ // the bucket bumps the generation; orphan blobs that escaped this
392+ // delete (e.g. on an older generation) stay isolated under the
393+ // old generation prefix and never surface in the new bucket.
394+ // Pinned by TestS3Server_AdminDeleteBucket_BucketGenerationKeySurvives.
395+ //
396+ // The contract change for clients: a PutObject that returned 200
397+ // OK during the race window can have its data swept by the
398+ // concurrent delete. Operators are advised to pause writes before
399+ // AdminDeleteBucket; the alternative (orphan objects that no API
400+ // can enumerate or remove) is strictly worse.
401+ //
402+ // The same shape is mirrored on the SigV4 path
403+ // (adapter/s3.go:deleteBucket) so both delete entrypoints share
404+ // the same race-window guarantees.
385405func (s * S3Server ) AdminDeleteBucket (ctx context.Context , principal AdminPrincipal , name string ) error {
386406 if ! principal .Role .canWrite () {
387407 return ErrAdminForbidden
@@ -390,6 +410,7 @@ func (s *S3Server) AdminDeleteBucket(ctx context.Context, principal AdminPrincip
390410 return ErrAdminNotLeader
391411 }
392412
413+ var deletedGeneration uint64
393414 err := s .retryS3Mutation (ctx , func () error {
394415 readTS := s .readTS ()
395416 startTS := s .txnStartTS (readTS )
@@ -411,21 +432,76 @@ func (s *S3Server) AdminDeleteBucket(ctx context.Context, principal AdminPrincip
411432 if len (kvs ) > 0 {
412433 return ErrAdminBucketNotEmpty
413434 }
435+ // Phase 1: Del BucketMetaKey in a txn so a concurrent
436+ // AdminCreateBucket racing the delete is rejected by OCC.
437+ // retryS3Mutation handles ErrWriteConflict / ErrTxnLocked
438+ // by re-running this whole closure.
414439 _ , err = s .coordinator .Dispatch (ctx , & kv.OperationGroup [kv.OP ]{
415440 IsTxn : true ,
416441 StartTS : startTS ,
417- Elems : []* kv.Elem [kv.OP ]{
418- {Op : kv .Del , Key : s3keys .BucketMetaKey (name )},
419- },
442+ Elems : []* kv.Elem [kv.OP ]{{Op : kv .Del , Key : s3keys .BucketMetaKey (name )}},
420443 })
421- return errors .WithStack (err )
444+ if err != nil {
445+ return errors .WithStack (err )
446+ }
447+ deletedGeneration = meta .Generation
448+ return nil
422449 })
423450 if err != nil {
424451 return err //nolint:wrapcheck // sentinel errors propagate as-is.
425452 }
453+ // Phase 2: best-effort safety-net DEL_PREFIX. Outside the
454+ // retryS3Mutation closure because retrying after Phase 1
455+ // committed would 404 at loadBucketMetaAt; we want the error
456+ // (if any) logged but not propagated to the operator.
457+ s .runBucketDeleteSafetyNet (ctx , name , deletedGeneration )
426458 return nil
427459}
428460
461+ // bucketDeleteSafetyNetElems returns the DEL_PREFIX elem list for
462+ // the Phase-2 safety-net dispatch shared between AdminDeleteBucket
463+ // and the SigV4 deleteBucket path. One helper so a future
464+ // per-bucket key family added to the data plane covers both delete
465+ // entrypoints in lockstep.
466+ //
467+ // BucketGenerationKey is intentionally not in the list — see the
468+ // AdminDeleteBucket doc comment for the orphan-isolation rationale.
469+ //
470+ // The 6 DEL_PREFIX ops are broadcast to every shard
471+ // (kv/sharded_coordinator.go: DEL_PREFIX cannot be routed to a
472+ // single shard). That cost is acceptable because (a) the empty-probe
473+ // already confirmed the manifest prefix is empty in the common case,
474+ // so per-shard scans return 0 keys, and (b) bucket deletes happen at
475+ // operator frequency, not data-plane frequency.
476+ func bucketDeleteSafetyNetElems (bucket string , generation uint64 ) []* kv.Elem [kv.OP ] {
477+ return []* kv.Elem [kv.OP ]{
478+ {Op : kv .DelPrefix , Key : s3keys .ObjectManifestPrefixForBucket (bucket , generation )},
479+ {Op : kv .DelPrefix , Key : s3keys .UploadMetaPrefixForBucket (bucket , generation )},
480+ {Op : kv .DelPrefix , Key : s3keys .UploadPartPrefixForBucket (bucket , generation )},
481+ {Op : kv .DelPrefix , Key : s3keys .BlobPrefixForBucket (bucket , generation )},
482+ {Op : kv .DelPrefix , Key : s3keys .GCUploadPrefixForBucket (bucket , generation )},
483+ {Op : kv .DelPrefix , Key : s3keys .RoutePrefixForBucket (bucket , generation )},
484+ }
485+ }
486+
487+ // runBucketDeleteSafetyNet runs the Phase-2 DEL_PREFIX dispatch
488+ // and swallows any dispatch error (transport, cluster, or otherwise)
489+ // after logging a warning — the caller has already deleted the bucket
490+ // meta and the operator-visible state is consistent with that. Shared
491+ // between admin and SigV4 paths.
492+ func (s * S3Server ) runBucketDeleteSafetyNet (ctx context.Context , bucket string , generation uint64 ) {
493+ if _ , err := s .coordinator .Dispatch (ctx , & kv.OperationGroup [kv.OP ]{
494+ Elems : bucketDeleteSafetyNetElems (bucket , generation ),
495+ }); err != nil {
496+ slog .WarnContext (ctx ,
497+ "bucket delete safety-net DEL_PREFIX failed; bucket meta is gone but orphan sweep incomplete" ,
498+ slog .String ("bucket" , bucket ),
499+ slog .Uint64 ("generation" , generation ),
500+ slog .String ("error" , err .Error ()),
501+ )
502+ }
503+ }
504+
429505// adminCanonicalACL normalises an empty input to the canned
430506// "private" default. The SigV4 createBucket / putBucketAcl paths
431507// apply the same default after trimming the x-amz-acl header.
0 commit comments