Skip to content

Commit 773c913

Browse files
fix(deploy): SetTTL terminal/permanent guard authoritative in SQL — close TOCTOU
Design-review follow-up to #228. SetTTL guarded permanent + terminal deploys via a PRE-READ of the looked-up row, then called SetDeploymentTTL whose UPDATE only guarded `ttl_policy != 'permanent'`. Two problems the panel surfaced: 1. TOCTOU: a deploy reaped to expired/deleted/stopped (or made permanent) between the handler's status read and the UPDATE would still get expires_at re-armed + reminders_sent reset to 0 — re-entering the 6-email reminder cycle for a deploy the user thinks is gone. A lost race also returned 200 "TTL set" for a write that never happened. 2. The terminal SQL guard was missing entirely (only permanent was guarded). Fix: make the status-guarded UPDATE the AUTHORITATIVE guard. SetDeploymentTTL now `WHERE id=$2 AND ttl_policy != 'permanent' AND status NOT IN ('deleted','expired','stopped')` and returns whether a row was actually updated. The handler drops the stale pre-read; on !applied it re-reads (now authoritative, not racy) ONLY to pick the precise code — already_permanent vs invalid_state — so a concurrent change yields an honest 409, never a phantom 200. Added lifecycleTerminalStatusesSQL (deleted/expired/stopped) as a NAMED const, deliberately distinct from terminalDeploymentStatusesSQL (deleted/expired): the former is "reject lifecycle mutations" (stopped is terminal), the latter is "hide from the user's list" (stopped is still visible). Documented why they differ — the panel's "unify them" call was wrong; they answer different questions. Tests: model SetDeploymentTTL now 100% (added the 0-rows→applied=false sqlmock case); the existing TestSetTTL_{Permanent,Terminal}Rejected now deterministically cover the handler's re-read !applied sub-branches (permanent→already_permanent, terminal→invalid_state). Signature change threaded through 3 callers. Verified vs real Postgres+Redis; all changed lines covered. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 53e312b commit 773c913

5 files changed

Lines changed: 68 additions & 27 deletions

File tree

internal/handlers/deploy_ttl.go

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -143,19 +143,6 @@ func (h *DeployHandler) SetTTL(c *fiber.Ctx) error {
143143
return respondError(c, fiber.StatusNotFound, "not_found", "Deployment not found")
144144
}
145145

146-
// Bug-bash #3/#5/#6: a permanent deploy must not be silently flipped back to
147-
// an expiring TTL, and a terminal (expired/deleted/stopped) deploy can't have
148-
// its lifecycle changed. Guard both before the write (the model adds a
149-
// defense-in-depth WHERE clause for the permanent case too).
150-
if d.TTLPolicy == models.DeployTTLPolicyPermanent {
151-
return respondError(c, fiber.StatusConflict, "already_permanent",
152-
"This deployment is permanent — it has no TTL to set. (Downgrading a permanent deploy is support-only.)")
153-
}
154-
if models.IsDeploymentTerminal(d.Status) {
155-
return respondError(c, fiber.StatusConflict, "invalid_state",
156-
fmt.Sprintf("Cannot set a TTL on a %s deployment", d.Status))
157-
}
158-
159146
if team.PlanTier == "anonymous" {
160147
// B7-P1-7 (see MakePermanent above): emit `claim_required` so an
161148
// agent branching on error code routes the user to the free claim
@@ -180,13 +167,33 @@ func (h *DeployHandler) SetTTL(c *fiber.Ctx) error {
180167
"")
181168
}
182169

183-
if err := models.SetDeploymentTTL(c.Context(), h.db, d.ID, body.Hours); err != nil {
170+
// The status-guarded UPDATE is the AUTHORITATIVE guard (bug-bash #3/#5/#6,
171+
// hardened post-review): it only touches a row whose ttl_policy != 'permanent'
172+
// AND whose status is non-terminal. Doing the check in SQL — rather than a
173+
// pre-read of the looked-up row — removes the check-then-act TOCTOU where a
174+
// deploy reaped/made-permanent between the read and the write would still get
175+
// a re-armed TTL + reminders_sent=0 reset.
176+
applied, err := models.SetDeploymentTTL(c.Context(), h.db, d.ID, body.Hours)
177+
if err != nil {
184178
slog.Error("deploy.set_ttl.failed",
185179
"deploy_id", d.ID, "team_id", team.ID, "error", err,
186180
"request_id", middleware.GetRequestID(c))
187181
return respondError(c, fiber.StatusServiceUnavailable, "update_failed",
188182
"Failed to set TTL")
189183
}
184+
if !applied {
185+
// The UPDATE matched no row → the deploy is permanent or terminal. Re-read
186+
// (authoritative, not a stale pre-read) only to choose the precise code:
187+
// `already_permanent` for a permanent deploy, else `invalid_state`. A
188+
// fetch error here still yields an honest 409 rather than a phantom 200.
189+
cur, ferr := models.GetDeploymentByID(c.Context(), h.db, d.ID)
190+
if ferr == nil && cur.TTLPolicy == models.DeployTTLPolicyPermanent {
191+
return respondError(c, fiber.StatusConflict, "already_permanent",
192+
"This deployment is permanent — it has no TTL to set. (Downgrading a permanent deploy is support-only.)")
193+
}
194+
return respondError(c, fiber.StatusConflict, "invalid_state",
195+
"Cannot set a TTL: the deployment is in a terminal state (expired/deleted/stopped) or changed concurrently.")
196+
}
190197

191198
updated, err := models.GetDeploymentByID(c.Context(), h.db, d.ID)
192199
if err != nil {

internal/handlers/deploy_ttl_guards_test.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,11 @@ func TestSetDeploymentTTL_PermanentGuard(t *testing.T) {
9292
_, err := db.Exec(`UPDATE deployments SET ttl_policy='permanent', expires_at=NULL WHERE id=$1`, deployID)
9393
require.NoError(t, err)
9494

95-
// Direct model call must NOT flip the permanent row.
96-
require.NoError(t, models.SetDeploymentTTL(context.Background(), db, deployID, 24))
95+
// Direct model call must NOT flip the permanent row — the WHERE guard
96+
// matches no row, so it reports applied=false (no error).
97+
appliedPerm, err := models.SetDeploymentTTL(context.Background(), db, deployID, 24)
98+
require.NoError(t, err)
99+
require.False(t, appliedPerm, "permanent row must not be updated → applied=false")
97100
var policy string
98101
var expiresAt sql.NullTime
99102
require.NoError(t, db.QueryRow(`SELECT ttl_policy, expires_at FROM deployments WHERE id=$1`, deployID).Scan(&policy, &expiresAt))

internal/models/coverage_deployment_test.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,10 +201,19 @@ func TestDeploymentSimpleUpdaters_Branches(t *testing.T) {
201201
// SetDeploymentTTL
202202
db8, mock8 := newMock(t)
203203
mock8.ExpectExec(`SET expires_at = \$1,\s+ttl_policy = 'custom'`).WillReturnResult(sqlmock.NewResult(0, 1))
204-
require.NoError(t, SetDeploymentTTL(ctx, db8, id, 48))
204+
ok8, err8 := SetDeploymentTTL(ctx, db8, id, 48)
205+
require.NoError(t, err8)
206+
require.True(t, ok8, "1 row affected → applied")
205207
db8b, mock8b := newMock(t)
206208
mock8b.ExpectExec(`ttl_policy = 'custom'`).WillReturnError(errors.New("boom"))
207-
require.ErrorContains(t, SetDeploymentTTL(ctx, db8b, id, 48), "boom")
209+
_, err8b := SetDeploymentTTL(ctx, db8b, id, 48)
210+
require.ErrorContains(t, err8b, "boom")
211+
// 0 rows affected (permanent/terminal guard matched nothing) → applied=false.
212+
db8c, mock8c := newMock(t)
213+
mock8c.ExpectExec(`ttl_policy = 'custom'`).WillReturnResult(sqlmock.NewResult(0, 0))
214+
ok8c, err8c := SetDeploymentTTL(ctx, db8c, id, 48)
215+
require.NoError(t, err8c)
216+
require.False(t, ok8c, "0 rows affected → not applied")
208217

209218
// MarkDeploymentExpired
210219
db9, mock9 := newMock(t)

internal/models/deployment.go

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -738,25 +738,34 @@ func ElevateDeploymentTiersByTeam(ctx context.Context, db *sql.DB, teamID uuid.U
738738
// (1..8760) BEFORE invoking this — the model trusts its input. Resets
739739
// reminders_sent so a freshly-extended deploy gets the full 6-email
740740
// warning cycle again instead of skipping reminders that fired earlier.
741-
func SetDeploymentTTL(ctx context.Context, db *sql.DB, id uuid.UUID, hours int) error {
741+
func SetDeploymentTTL(ctx context.Context, db *sql.DB, id uuid.UUID, hours int) (bool, error) {
742742
expiresAt := time.Now().UTC().Add(time.Duration(hours) * time.Hour)
743-
// `ttl_policy != 'permanent'` is defense-in-depth (the handler already
744-
// rejects a permanent deploy with 409): if a concurrent MakePermanent races
745-
// this UPDATE, the WHERE clause prevents silently un-permanenting the deploy
746-
// — the row simply isn't touched (bug-bash #6).
747-
_, err := db.ExecContext(ctx, `
743+
// The WHERE clause is the AUTHORITATIVE guard, closing the handler's
744+
// check-then-act TOCTOU (bug-bash #6, hardened post-review):
745+
// - `ttl_policy != 'permanent'` — a concurrent MakePermanent must not be
746+
// silently un-permanented.
747+
// - `status NOT IN <lifecycleTerminalStatusesSQL>` — a deploy reaped to
748+
// expired/deleted/stopped between the handler's status read and this
749+
// UPDATE must NOT get a re-armed expires_at + reminders_sent=0 reset
750+
// (which would re-enter the reminder-email cycle for a gone deploy).
751+
// Returns whether a row was actually updated so the handler can surface a
752+
// 409 when the deploy changed underneath it rather than reporting a phantom
753+
// success for a write that never happened.
754+
res, err := db.ExecContext(ctx, `
748755
UPDATE deployments
749756
SET expires_at = $1,
750757
ttl_policy = 'custom',
751758
reminders_sent = 0,
752759
last_reminder_at = NULL,
753760
updated_at = now()
754761
WHERE id = $2 AND ttl_policy != 'permanent'
762+
AND status NOT IN `+lifecycleTerminalStatusesSQL+`
755763
`, expiresAt, id)
756764
if err != nil {
757-
return fmt.Errorf("models.SetDeploymentTTL: %w", err)
765+
return false, fmt.Errorf("models.SetDeploymentTTL: %w", err)
758766
}
759-
return nil
767+
n, _ := res.RowsAffected()
768+
return n > 0, nil
760769
}
761770

762771
// GetDeploymentsExpiringSoon returns deployments whose expires_at falls
@@ -955,6 +964,17 @@ const activeDeploymentStatusesSQL = `('building', 'deploying', 'healthy')`
955964
// "user-visible deployments" row set — see deploymentVisibleClause.
956965
const terminalDeploymentStatusesSQL = `('deleted', 'expired')`
957966

967+
// lifecycleTerminalStatusesSQL is the IN-list for "this deploy has reached an
968+
// end state — reject lifecycle MUTATIONS on it." It deliberately DIFFERS from
969+
// terminalDeploymentStatusesSQL: that one answers "hide from the user's list /
970+
// usage count" (a 'stopped' deploy is still VISIBLE — paused, not gone — so
971+
// it's excluded there). This one answers "may we re-arm a TTL on it", where
972+
// 'stopped' IS terminal — matching IsDeploymentTerminal. The two lists encode
973+
// different questions; keeping them separate is intentional, not drift. Used
974+
// by SetDeploymentTTL's UPDATE guard so a deploy reaped between the handler's
975+
// status read and the UPDATE cannot get a re-armed TTL + reminder-cycle reset.
976+
const lifecycleTerminalStatusesSQL = `('deleted', 'expired', 'stopped')`
977+
958978
// deploymentVisibleClause is the shared WHERE predicate for "deployments the
959979
// user sees" — i.e. every non-terminal row. GET /api/v1/deployments (the list)
960980
// and GET /api/v1/billing/usage's deployment count MUST use this same clause

internal/models/deployment_ttl_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,9 @@ func TestSetDeploymentTTL_SetsCustomExpiryAndResetsReminders(t *testing.T) {
162162
`, d.ID)
163163
require.NoError(t, err)
164164

165-
require.NoError(t, models.SetDeploymentTTL(ctx, db, d.ID, 72))
165+
appliedTTL, err := models.SetDeploymentTTL(ctx, db, d.ID, 72)
166+
require.NoError(t, err)
167+
require.True(t, appliedTTL, "healthy deploy → TTL applied")
166168

167169
refreshed, err := models.GetDeploymentByID(ctx, db, d.ID)
168170
require.NoError(t, err)

0 commit comments

Comments
 (0)