Skip to content

Commit eb85c3d

Browse files
authored
s3: AdminDeleteBucket DEL_PREFIX safety net (closes TOCTOU) (#695)
2 parents 303ebe2 + 69c808c commit eb85c3d

9 files changed

Lines changed: 823 additions & 34 deletions

adapter/dynamodb_migration_test.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88

99
"github.com/bootjp/elastickv/kv"
1010
"github.com/bootjp/elastickv/store"
11+
"github.com/cockroachdb/errors"
1112
"github.com/stretchr/testify/require"
1213
)
1314

@@ -27,6 +28,9 @@ func (c *localAdapterCoordinator) Dispatch(ctx context.Context, req *kv.Operatio
2728
if req == nil {
2829
return &kv.CoordinateResponse{}, nil
2930
}
31+
if err := c.validateDispatchShape(req); err != nil {
32+
return nil, err
33+
}
3034
commitTS, err := c.commitTSForRequest(req)
3135
if err != nil {
3236
return nil, err
@@ -37,6 +41,45 @@ func (c *localAdapterCoordinator) Dispatch(ctx context.Context, req *kv.Operatio
3741
return &kv.CoordinateResponse{}, nil
3842
}
3943

44+
// validateDispatchShape mirrors the production coordinator's
45+
// dispatch-time rejection rules so tests catch the same class of
46+
// bug that real clusters would reject. Specifically:
47+
//
48+
// - DEL_PREFIX cannot be in a transactional OperationGroup
49+
// (kv/sharded_coordinator.go: dispatchDelPrefixBroadcast
50+
// refuses IsTxn=true).
51+
// - DEL_PREFIX cannot be mixed with Put or Del in the same
52+
// OperationGroup (validateDelPrefixOnly enforces all-or-none).
53+
//
54+
// Without these checks, a regression that ships
55+
// `IsTxn:true with [Del, DelPrefix...]` (Codex P1 on PR #695)
56+
// would silently pass the local coordinator while production
57+
// rejected every bucket delete with ErrInvalidRequest.
58+
func (c *localAdapterCoordinator) validateDispatchShape(req *kv.OperationGroup[kv.OP]) error {
59+
hasDelPrefix := false
60+
hasOther := false
61+
for _, elem := range req.Elems {
62+
if elem == nil {
63+
continue
64+
}
65+
if elem.Op == kv.DelPrefix {
66+
hasDelPrefix = true
67+
} else {
68+
hasOther = true
69+
}
70+
}
71+
if !hasDelPrefix {
72+
return nil
73+
}
74+
if req.IsTxn {
75+
return errors.Wrap(kv.ErrInvalidRequest, "DEL_PREFIX not supported in transactions")
76+
}
77+
if hasOther {
78+
return errors.Wrap(kv.ErrInvalidRequest, "DEL_PREFIX cannot be mixed with other operations")
79+
}
80+
return nil
81+
}
82+
4083
func (c *localAdapterCoordinator) commitTSForRequest(req *kv.OperationGroup[kv.OP]) (uint64, error) {
4184
if req == nil {
4285
return 0, nil

adapter/s3.go

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -636,6 +636,7 @@ func (s *S3Server) headBucket(w http.ResponseWriter, r *http.Request, bucket str
636636
}
637637

638638
func (s *S3Server) deleteBucket(w http.ResponseWriter, r *http.Request, bucket string) {
639+
var deletedGeneration uint64
639640
err := s.retryS3Mutation(r.Context(), func() error {
640641
readTS := s.readTS()
641642
startTS := s.txnStartTS(readTS)
@@ -669,19 +670,32 @@ func (s *S3Server) deleteBucket(w http.ResponseWriter, r *http.Request, bucket s
669670
}
670671
}
671672

673+
// Phase 1: Del BucketMetaKey in a txn (OCC-protected
674+
// against concurrent createBucket racing this delete).
675+
// Phase 2 (DEL_PREFIX safety net) runs outside the txn
676+
// because the production coordinator rejects DEL_PREFIX
677+
// inside transactions and rejects mixed Del+DelPrefix
678+
// groups (kv/sharded_coordinator.go: dispatchDelPrefixBroadcast).
679+
// See AdminDeleteBucket's doc comment for the full
680+
// rationale.
672681
_, err = s.coordinator.Dispatch(r.Context(), &kv.OperationGroup[kv.OP]{
673682
IsTxn: true,
674683
StartTS: startTS,
675-
Elems: []*kv.Elem[kv.OP]{
676-
{Op: kv.Del, Key: s3keys.BucketMetaKey(bucket)},
677-
},
684+
Elems: []*kv.Elem[kv.OP]{{Op: kv.Del, Key: s3keys.BucketMetaKey(bucket)}},
678685
})
679-
return errors.WithStack(err)
686+
if err != nil {
687+
return errors.WithStack(err)
688+
}
689+
deletedGeneration = meta.Generation
690+
return nil
680691
})
681692
if err != nil {
682693
writeS3MutationError(w, err, bucket, "")
683694
return
684695
}
696+
// Phase 2: best-effort DEL_PREFIX safety net. See
697+
// AdminDeleteBucket / runBucketDeleteSafetyNet for the contract.
698+
s.runBucketDeleteSafetyNet(r.Context(), bucket, deletedGeneration)
685699
w.WriteHeader(http.StatusNoContent)
686700
}
687701

adapter/s3_admin.go

Lines changed: 102 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package adapter
33
import (
44
"bytes"
55
"context"
6+
"log/slog"
67
"sort"
78
"strings"
89

@@ -359,29 +360,48 @@ func (s *S3Server) AdminPutBucketAcl(ctx context.Context, principal AdminPrincip
359360
// bucket-must-be-empty rule mirrors the SigV4 deleteBucket path —
360361
// the dashboard cannot force a recursive delete, by design.
361362
//
362-
// Known orphan-race limitation (coderabbitai 🔴 / 🟠 on PR #669):
363-
// the empty-bucket probe (ScanAt with limit=1 on
364-
// ObjectManifestPrefixForBucket) reads at readTS but the
365-
// subsequent BucketMetaKey delete only carries that single point
366-
// key in its ReadKeys set. A concurrent PutObject that inserts a
367-
// manifest key in the scanned prefix between readTS and the
368-
// delete's commitTS will not conflict — the OCC validator only
369-
// inspects keys that appear in ReadKeys, and there is no
370-
// ReadRanges mechanism today. The object's manifest key survives
371-
// under a now-deleted bucket meta and becomes orphaned.
363+
// The dispatch happens in two phases because the production
364+
// coordinator (kv/sharded_coordinator.go: dispatchDelPrefixBroadcast)
365+
// rejects DEL_PREFIX inside a transaction and rejects DEL_PREFIX
366+
// mixed with Del or Put in the same OperationGroup:
372367
//
373-
// This race exists pre-existing in the SigV4 path
374-
// (adapter/s3.go:deleteBucket — same shape, same limitation), so
375-
// AdminDeleteBucket inherits the contract; closing the gap
376-
// requires either (a) bumping BucketGenerationKey on every
377-
// PutObject so it can serve as an OCC token in this read set, or
378-
// (b) extending OperationGroup with ReadRanges and teaching the
379-
// FSM to validate range emptiness atomically with commit. Both
380-
// are larger changes outside this PR's scope; tracked in
381-
// docs/design/2026_04_24_partial_admin_dashboard.md under the
382-
// Outstanding open items section. Operators concerned about the
383-
// orphan window today should pause writes against the target
384-
// bucket before issuing the admin delete.
368+
// Phase 1: Del BucketMetaKey in a txn (OCC-protected against
369+
// a concurrent AdminCreateBucket landing between our
370+
// readTS and commitTS).
371+
// Phase 2: DEL_PREFIX over every per-bucket key family in a
372+
// non-txn broadcast — the safety net that sweeps
373+
// orphans left by any PutObject that committed
374+
// chunks/manifest between the empty-probe and the
375+
// Phase-1 commit. See design doc
376+
// 2026_04_28_proposed_admin_delete_bucket_safety_net.md
377+
// §6.2 for the original single-OperationGroup design
378+
// and the dispatch-shape rejection that forced the
379+
// two-phase split.
380+
//
381+
// Phase 2 is best-effort: a Phase-2 failure leaves the bucket meta
382+
// already deleted (Phase 1 succeeded) but per-bucket prefixes
383+
// possibly still containing orphans. That state is no worse than
384+
// the pre-fix behaviour on main and recovers on operator-driven
385+
// re-cleanup. We log a warning rather than propagate the error so
386+
// the operator-visible delete reports success — the bucket really
387+
// is gone from the API surface, and a retry would 404 because
388+
// loadBucketMetaAt no longer finds the meta.
389+
//
390+
// BucketGenerationKey is intentionally NOT deleted. Re-creating
391+
// the bucket bumps the generation; orphan blobs that escaped this
392+
// delete (e.g. on an older generation) stay isolated under the
393+
// old generation prefix and never surface in the new bucket.
394+
// Pinned by TestS3Server_AdminDeleteBucket_BucketGenerationKeySurvives.
395+
//
396+
// The contract change for clients: a PutObject that returned 200
397+
// OK during the race window can have its data swept by the
398+
// concurrent delete. Operators are advised to pause writes before
399+
// AdminDeleteBucket; the alternative (orphan objects that no API
400+
// can enumerate or remove) is strictly worse.
401+
//
402+
// The same shape is mirrored on the SigV4 path
403+
// (adapter/s3.go:deleteBucket) so both delete entrypoints share
404+
// the same race-window guarantees.
385405
func (s *S3Server) AdminDeleteBucket(ctx context.Context, principal AdminPrincipal, name string) error {
386406
if !principal.Role.canWrite() {
387407
return ErrAdminForbidden
@@ -390,6 +410,7 @@ func (s *S3Server) AdminDeleteBucket(ctx context.Context, principal AdminPrincip
390410
return ErrAdminNotLeader
391411
}
392412

413+
var deletedGeneration uint64
393414
err := s.retryS3Mutation(ctx, func() error {
394415
readTS := s.readTS()
395416
startTS := s.txnStartTS(readTS)
@@ -411,21 +432,76 @@ func (s *S3Server) AdminDeleteBucket(ctx context.Context, principal AdminPrincip
411432
if len(kvs) > 0 {
412433
return ErrAdminBucketNotEmpty
413434
}
435+
// Phase 1: Del BucketMetaKey in a txn so a concurrent
436+
// AdminCreateBucket racing the delete is rejected by OCC.
437+
// retryS3Mutation handles ErrWriteConflict / ErrTxnLocked
438+
// by re-running this whole closure.
414439
_, err = s.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{
415440
IsTxn: true,
416441
StartTS: startTS,
417-
Elems: []*kv.Elem[kv.OP]{
418-
{Op: kv.Del, Key: s3keys.BucketMetaKey(name)},
419-
},
442+
Elems: []*kv.Elem[kv.OP]{{Op: kv.Del, Key: s3keys.BucketMetaKey(name)}},
420443
})
421-
return errors.WithStack(err)
444+
if err != nil {
445+
return errors.WithStack(err)
446+
}
447+
deletedGeneration = meta.Generation
448+
return nil
422449
})
423450
if err != nil {
424451
return err //nolint:wrapcheck // sentinel errors propagate as-is.
425452
}
453+
// Phase 2: best-effort safety-net DEL_PREFIX. Outside the
454+
// retryS3Mutation closure because retrying after Phase 1
455+
// committed would 404 at loadBucketMetaAt; we want the error
456+
// (if any) logged but not propagated to the operator.
457+
s.runBucketDeleteSafetyNet(ctx, name, deletedGeneration)
426458
return nil
427459
}
428460

461+
// bucketDeleteSafetyNetElems returns the DEL_PREFIX elem list for
462+
// the Phase-2 safety-net dispatch shared between AdminDeleteBucket
463+
// and the SigV4 deleteBucket path. One helper so a future
464+
// per-bucket key family added to the data plane covers both delete
465+
// entrypoints in lockstep.
466+
//
467+
// BucketGenerationKey is intentionally not in the list — see the
468+
// AdminDeleteBucket doc comment for the orphan-isolation rationale.
469+
//
470+
// The 6 DEL_PREFIX ops broadcast across every shard
471+
// (kv/sharded_coordinator.go: DEL_PREFIX cannot be routed to a
472+
// single shard). Acceptable because (a) the empty-probe already
473+
// confirmed the manifest prefix is empty in the common case, so
474+
// per-shard scans return 0 keys, (b) bucket delete is operator-
475+
// frequency, not data-plane.
476+
func bucketDeleteSafetyNetElems(bucket string, generation uint64) []*kv.Elem[kv.OP] {
477+
return []*kv.Elem[kv.OP]{
478+
{Op: kv.DelPrefix, Key: s3keys.ObjectManifestPrefixForBucket(bucket, generation)},
479+
{Op: kv.DelPrefix, Key: s3keys.UploadMetaPrefixForBucket(bucket, generation)},
480+
{Op: kv.DelPrefix, Key: s3keys.UploadPartPrefixForBucket(bucket, generation)},
481+
{Op: kv.DelPrefix, Key: s3keys.BlobPrefixForBucket(bucket, generation)},
482+
{Op: kv.DelPrefix, Key: s3keys.GCUploadPrefixForBucket(bucket, generation)},
483+
{Op: kv.DelPrefix, Key: s3keys.RoutePrefixForBucket(bucket, generation)},
484+
}
485+
}
486+
487+
// runBucketDeleteSafetyNet runs the Phase-2 DEL_PREFIX dispatch
488+
// and swallows transport / cluster errors after logging — the
489+
// caller has already deleted the bucket meta and the operator-
490+
// visible state is consistent with that. Shared between admin and
491+
// SigV4 paths.
492+
func (s *S3Server) runBucketDeleteSafetyNet(ctx context.Context, bucket string, generation uint64) {
493+
if _, err := s.coordinator.Dispatch(ctx, &kv.OperationGroup[kv.OP]{
494+
Elems: bucketDeleteSafetyNetElems(bucket, generation),
495+
}); err != nil {
496+
slog.WarnContext(ctx,
497+
"bucket delete safety-net DEL_PREFIX failed; bucket meta is gone but orphan sweep incomplete",
498+
slog.String("bucket", bucket),
499+
slog.Uint64("generation", generation),
500+
slog.String("error", err.Error()),
501+
)
502+
}
503+
}
504+
429505
// adminCanonicalACL normalises an empty input to the canned
430506
// "private" default. The SigV4 createBucket / putBucketAcl paths
431507
// apply the same default after trimming the x-amz-acl header.

0 commit comments

Comments
 (0)