Skip to content

Commit 8edd54b

Browse files
fix(resource): atomic single-shot deprovision on DELETE — close double-deprovision TOCTOU
Design-review follow-up to #229. The idempotent-DELETE fix used a `resource.Status == "deleted"` PRE-READ before SoftDeleteResource + deprovision — a check-then-act TOCTOU: two concurrent DELETEs both read status='active', both pass the pre-read, and BOTH fire the destructive provisioner Deprovision / storage Deprovision RPC against the same backend. That is the truehomie-db DROP incident class (an active customer DB dropped via a path that re-ran teardown). Fix: make the soft-delete the authoritative atomic guard. New models.SoftDeleteResourceIfActive does `UPDATE ... SET status='deleted' WHERE id=$1 AND status != 'deleted'` and returns RowsAffected>0. The handler deprovisions ONLY when this call won the transition (deleted==true); any racer or sequential retry gets deleted==false → idempotent {"ok":true,"already_deleted":true} with NO second deprovision. Removes the redundant pre-read (the rows-affected check subsumes it and is race-safe). SoftDeleteResource is kept unchanged for the ~18 provision-rollback callers that always operate on a just-created row. Tests: models.TestSoftDeleteResourceIfActive_Branches (sqlmock: won / no-op / error — 100% of the new fn). The existing TestResourceDelete_Idempotent_ DoubleDelete now exercises the rows-affected !deleted branch via a sequential retry (200 already_deleted, no re-deprovision). All changed handler lines covered. Verified vs real Postgres+Redis. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 893bed9 commit 8edd54b

3 files changed

Lines changed: 74 additions & 14 deletions

File tree

internal/handlers/resource.go

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -216,22 +216,29 @@ func (h *ResourceHandler) Delete(c *fiber.Ctx) error {
216216
return respondError(c, fiber.StatusNotFound, "not_found", "Resource not found")
217217
}
218218

219-
// Idempotent DELETE (bug-bash #4/#12): a repeated DELETE (retry, double-click,
220-
// concurrent call) must NOT re-soft-delete and, worse, re-deprovision the
221-
// backend a second time. The first DELETE already tore it down — report
222-
// success and do nothing.
223-
if resource.Status == "deleted" {
224-
return c.JSON(fiber.Map{"ok": true, "already_deleted": true, "id": resource.ID.String()})
225-
}
226-
227-
if err := models.SoftDeleteResource(c.Context(), h.db, resource.ID); err != nil {
219+
// Idempotent DELETE (bug-bash #4/#12, hardened post-review): a repeated
220+
// DELETE (retry, double-click, CONCURRENT call) must NOT re-fire the
221+
// destructive deprovision against already-gone backend infra. The original
222+
// fix used a `status == "deleted"` pre-read, but that is a check-then-act
223+
// TOCTOU: two concurrent DELETEs both read status='active' and both
224+
// deprovision. The authoritative guard is the atomic status-gated UPDATE —
225+
// only the call that actually transitions active→deleted (RowsAffected==1)
226+
// proceeds to deprovision; any racer/retry gets a no-op success.
227+
deleted, err := models.SoftDeleteResourceIfActive(c.Context(), h.db, resource.ID)
228+
if err != nil {
228229
slog.Error("resource.delete.failed",
229230
"error", err,
230231
"resource_id", resource.ID,
231232
"request_id", requestID,
232233
)
233234
return respondError(c, fiber.StatusServiceUnavailable, "delete_failed", "Failed to delete resource")
234235
}
236+
if !deleted {
237+
// Already deleted (sequential retry) or a concurrent DELETE won the
238+
// race. Either way the backend is being / was torn down by the winner —
239+
// report idempotent success WITHOUT re-firing deprovision.
240+
return c.JSON(fiber.Map{"ok": true, "already_deleted": true, "id": resource.ID.String()})
241+
}
235242

236243
// Deprovision the physical resource.
237244
// Fail open: a provisioner error is logged but does not affect the 200 response.

internal/models/coverage_resource_test.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,34 @@ func TestSoftDeleteResource_Branches(t *testing.T) {
234234
require.ErrorContains(t, SoftDeleteResource(ctx, db2, uuid.New()), "boom")
235235
}
236236

237+
func TestSoftDeleteResourceIfActive_Branches(t *testing.T) {
238+
ctx := context.Background()
239+
240+
// Winner: the status-gated UPDATE flips a row → deleted==true.
241+
db, mock := newMock(t)
242+
mock.ExpectExec(`UPDATE resources SET status = 'deleted' WHERE id = \$1 AND status != 'deleted'`).
243+
WillReturnResult(sqlmock.NewResult(0, 1))
244+
got, err := SoftDeleteResourceIfActive(ctx, db, uuid.New())
245+
require.NoError(t, err)
246+
require.True(t, got, "a row was transitioned → deleted must be true")
247+
248+
// Loser/retry: row already deleted, 0 rows affected → deleted==false
249+
// (this is the guard that stops the second concurrent DELETE from
250+
// re-firing the destructive deprovision).
251+
db2, mock2 := newMock(t)
252+
mock2.ExpectExec(`UPDATE resources SET status = 'deleted' WHERE id = \$1 AND status != 'deleted'`).
253+
WillReturnResult(sqlmock.NewResult(0, 0))
254+
got2, err := SoftDeleteResourceIfActive(ctx, db2, uuid.New())
255+
require.NoError(t, err)
256+
require.False(t, got2, "no row affected → deleted must be false (no deprovision)")
257+
258+
// Error path.
259+
db3, mock3 := newMock(t)
260+
mock3.ExpectExec(`UPDATE resources SET status = 'deleted'`).WillReturnError(errors.New("boom"))
261+
_, err = SoftDeleteResourceIfActive(ctx, db3, uuid.New())
262+
require.ErrorContains(t, err, "boom")
263+
}
264+
237265
func TestPauseResource_Branches(t *testing.T) {
238266
ctx := context.Background()
239267
db, mock := newMock(t)

internal/models/resource.go

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -454,7 +454,9 @@ func SetWebhookHMACSecret(ctx context.Context, db *sql.DB, resourceID uuid.UUID,
454454
return nil
455455
}
456456

457-
// SoftDeleteResource marks a resource status as 'deleted'.
457+
// SoftDeleteResource marks a resource status as 'deleted'. Used by the
458+
// provision-rollback paths (db/cache/nosql/queue/storage/vector), which always
459+
// operate on a row they just created — they don't need the concurrency guard.
458460
func SoftDeleteResource(ctx context.Context, db *sql.DB, id uuid.UUID) error {
459461
_, err := db.ExecContext(ctx, `
460462
UPDATE resources SET status = 'deleted' WHERE id = $1
@@ -465,6 +467,28 @@ func SoftDeleteResource(ctx context.Context, db *sql.DB, id uuid.UUID) error {
465467
return nil
466468
}
467469

470+
// SoftDeleteResourceIfActive flips status to 'deleted' ONLY when the row is not
471+
// already deleted, returning whether THIS call performed the transition.
472+
//
473+
// The `status != 'deleted'` guard makes a concurrent DELETE single-shot: when
474+
// two requests race (both read the row before either commits), the UPDATE
475+
// serializes at the row and exactly one reports deleted==true. The DELETE
476+
// handler fires the DESTRUCTIVE deprovision RPC only on that winning call, so
477+
// the physical backend is torn down once — never twice against already-gone
478+
// infra (the truehomie-db DROP incident class). This is the authoritative
479+
// idempotency guard for DELETE /resources/:id, replacing a status pre-read
480+
// that had a check-then-act TOCTOU under concurrency.
481+
func SoftDeleteResourceIfActive(ctx context.Context, db *sql.DB, id uuid.UUID) (bool, error) {
482+
res, err := db.ExecContext(ctx, `
483+
UPDATE resources SET status = 'deleted' WHERE id = $1 AND status != 'deleted'
484+
`, id)
485+
if err != nil {
486+
return false, fmt.Errorf("models.SoftDeleteResourceIfActive: %w", err)
487+
}
488+
n, _ := res.RowsAffected()
489+
return n > 0, nil
490+
}
491+
468492
// ErrResourceNotActive is returned by PauseResource when the row exists but
469493
// status != 'active'. The handler maps this to 409 conflict (already paused
470494
// or terminal). Distinct error type so the handler doesn't have to second-guess
@@ -676,10 +700,11 @@ func UpdateProviderResourceID(ctx context.Context, db *sql.DB, resourceID uuid.U
676700
// resource to newTier and clears its TTL (expires_at = NULL).
677701
//
678702
// Called from the Razorpay subscription.charged webhook. Picks up two cases:
679-
// 1) Resources that are already permanent (expires_at IS NULL) — a hobby
680-
// user upgrading to pro: lift their existing resources to the new tier.
681-
// 2) Resources still on anonymous TTL (expires_at > now()) — a freshly
682-
// claimed user paying for the first time: clear the TTL + set tier.
703+
// 1. Resources that are already permanent (expires_at IS NULL) — a hobby
704+
// user upgrading to pro: lift their existing resources to the new tier.
705+
// 2. Resources still on anonymous TTL (expires_at > now()) — a freshly
706+
// claimed user paying for the first time: clear the TTL + set tier.
707+
//
683708
// This is the second half of "pay from day one": claim transfers team
684709
// ownership but does NOT clear the TTL or change tier. Only payment does.
685710
//

0 commit comments

Comments
 (0)