Skip to content

Commit c334806

Browse files
fix(stack): harden MergeStackEnvVars — in-tx deleting guard, lock_timeout, honest keys_set (panel #7) (#238)
Design-review follow-up to #235. Three hardenings to the atomic env merge: 1. In-tx teardown guard (closes a TOCTOU). The "stack is mid-teardown" check was a pre-read in the handler (stack.Status == deleting before MergeStackEnvVars) — racy: the teardown worker can flip status between GetStackBySlug and the merge, committing an env change onto a deleting stack. The SELECT now fetches status under the FOR UPDATE lock and returns models.ErrStackDeleting (→ 409 stack_deleting); the handler pre-read is removed so the in-tx check is authoritative. Same posture as the resource-DELETE / SetTTL hardenings this session. 2. SET LOCAL lock_timeout = '3s' as the first tx statement, so a PATCH that races a long-held row lock fails fast to a retryable 503 instead of hanging the request goroutine indefinitely. 3. keys_set audit count now counts only actual upserts (v != ""). The old `len(body.Env) - deletes` over-counted a no-op delete (empty value for an absent key: not a delete, not a set), making the rule-12 audit lie. Tests: TestMergeStackEnvVars_Branches rewritten for the new SQL (SET LOCAL exec + status,env_vars SELECT) and adds the lock_timeout-error and status='deleting'→ErrStackDeleting cases — MergeStackEnvVars stays 100%. The existing real-DB deleting handler tests now exercise the in-tx path deterministically. The two stack-env fault tests' failAfter bumped +1 for the added in-tx SET LOCAL exec (which the fault harness counts). Verified vs real Postgres+Redis; all changed lines covered. Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent be32484 commit c334806

4 files changed

Lines changed: 108 additions & 38 deletions

File tree

internal/handlers/stack.go

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1187,13 +1187,10 @@ func (h *StackHandler) UpdateEnv(c *fiber.Ctx) error {
11871187
return respondError(c, fiber.StatusNotFound, "not_found", "Stack not found")
11881188
}
11891189

1190-
// A stack mid-teardown cannot accept an env change — the teardown
1191-
// worker will delete the row. 409 so the caller knows the request was
1192-
// valid but lost the race, not malformed.
1193-
if stack.Status == stackStatusDeleting {
1194-
return respondError(c, fiber.StatusConflict, "stack_deleting",
1195-
"This stack is being deleted and can no longer be modified.")
1196-
}
1190+
// Note: the "stack is mid-teardown" guard is NOT a pre-read here — it lives
1191+
// inside MergeStackEnvVars under the FOR UPDATE lock (returns ErrStackDeleting,
1192+
// mapped to 409 below). A pre-read would be a TOCTOU: the teardown worker can
1193+
// flip status between this handler's GetStackBySlug and the merge.
11971194

11981195
var body updateStackEnvBody
11991196
if err := c.BodyParser(&body); err != nil {
@@ -1228,6 +1225,12 @@ func (h *StackHandler) UpdateEnv(c *fiber.Ctx) error {
12281225
return respondError(c, fiber.StatusRequestEntityTooLarge, "env_too_large",
12291226
"Total env_vars payload exceeds 64KiB. Trim values or split across services.")
12301227
}
1228+
if errors.Is(err, models.ErrStackDeleting) {
1229+
// Authoritative teardown check (under the FOR UPDATE lock): the stack
1230+
// is being deleted and can no longer be modified.
1231+
return respondError(c, fiber.StatusConflict, "stack_deleting",
1232+
"This stack is being deleted and can no longer be modified.")
1233+
}
12311234
var notFound *models.ErrStackNotFound
12321235
if errors.As(err, &notFound) {
12331236
// Row vanished between GetStackBySlug and the merge tx. Treat as 404.
@@ -1239,9 +1242,20 @@ func (h *StackHandler) UpdateEnv(c *fiber.Ctx) error {
12391242
"Failed to persist env vars")
12401243
}
12411244

1245+
// keys_set counts only actual upserts (non-empty values). The old
1246+
// `len(body.Env) - deletes` over-counted when a PATCH sent an empty value
1247+
// for a key that wasn't present (a no-op delete: not counted in `deletes`,
1248+
// yet not a "set" either) — making the rule-12 audit surface lie.
1249+
keysSet := 0
1250+
for _, v := range body.Env {
1251+
if v != "" {
1252+
keysSet++
1253+
}
1254+
}
1255+
12421256
// Best-effort audit emit — never block the response on this.
12431257
auditMeta, _ := json.Marshal(map[string]any{
1244-
"keys_set": len(body.Env) - deletes,
1258+
"keys_set": keysSet,
12451259
"keys_deleted": deletes,
12461260
"total_after": len(merged),
12471261
})
@@ -1267,7 +1281,7 @@ func (h *StackHandler) UpdateEnv(c *fiber.Ctx) error {
12671281

12681282
slog.Info("stack.env.updated",
12691283
"slug", slug, "team_id", team.ID, "stack_id", stack.ID,
1270-
"keys_set", len(body.Env)-deletes, "keys_deleted", deletes, "total_after", len(merged))
1284+
"keys_set", keysSet, "keys_deleted", deletes, "total_after", len(merged))
12711285

12721286
return c.JSON(fiber.Map{
12731287
"ok": true,

internal/handlers/stack_final2_test.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,10 @@ func TestStackFinal2_UpdateEnv_MergeSelectFailed(t *testing.T) {
113113
_, slug := seedStack(t, seedDB, &teamID, "healthy")
114114
jwt := testhelpers.MustSignSessionJWT(t, uuid.NewString(), teamIDStr, "stkf2@example.com")
115115

116-
// team(1)+GetStackBySlug(2) ok, merge tx SELECT ... FOR UPDATE(3) errors.
117-
app := stackFaultApp(t, openFaultDB(t, 2))
116+
// team(1)+GetStackBySlug(2) ok, merge tx SET LOCAL lock_timeout(3) ok, then the
117+
// SELECT ... FOR UPDATE(4) errors → persist_failed. (failAfter bumped 2→3 when
118+
// the in-tx SET LOCAL lock_timeout exec was added — panel #7 hardening.)
119+
app := stackFaultApp(t, openFaultDB(t, 3))
118120
status, body := patchStackEnvF2(t, app, slug, jwt, `{"env":{"FOO":"bar"}}`)
119121
assert.Equal(t, http.StatusServiceUnavailable, status)
120122
assert.Contains(t, body, "persist_failed")
@@ -132,8 +134,9 @@ func TestStackFinal2_UpdateEnv_PersistFailed(t *testing.T) {
132134
_, slug := seedStack(t, seedDB, &teamID, "healthy")
133135
jwt := testhelpers.MustSignSessionJWT(t, uuid.NewString(), teamIDStr, "stkf2@example.com")
134136

135-
// team(1)+slug(2)+merge SELECT FOR UPDATE(3) ok, merge UPDATE(4) errors.
136-
app := stackFaultApp(t, openFaultDB(t, 3))
137+
// team(1)+slug(2)+SET LOCAL(3)+SELECT FOR UPDATE(4) ok, merge UPDATE(5) errors.
138+
// (failAfter bumped 3→4 for the in-tx SET LOCAL lock_timeout exec — panel #7.)
139+
app := stackFaultApp(t, openFaultDB(t, 4))
137140
status, body := patchStackEnvF2(t, app, slug, jwt, `{"env":{"FOO":"bar"}}`)
138141
assert.Equal(t, http.StatusServiceUnavailable, status)
139142
assert.Contains(t, body, "persist_failed")

internal/models/coverage_stack_test.go

Lines changed: 47 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -337,75 +337,100 @@ func TestMergeStackEnvVars_Branches(t *testing.T) {
337337
ctx := context.Background()
338338
id := uuid.New()
339339
patch := map[string]string{"A": "1"}
340-
341-
// 1) BeginTx error.
340+
// selRe matches the status+env_vars FOR UPDATE select (post-hardening).
341+
selRe := `SELECT status, COALESCE\(env_vars.*FOR UPDATE`
342+
lockRe := `SET LOCAL lock_timeout`
343+
okRows := func(status, env string) *sqlmock.Rows {
344+
return sqlmock.NewRows([]string{"status", "env_vars"}).AddRow(status, []byte(env))
345+
}
346+
347+
// 1) BeginTx error (before SET LOCAL).
342348
dbB, mockB := newMock(t)
343349
mockB.ExpectBegin().WillReturnError(errors.New("begin-boom"))
344350
_, _, err := MergeStackEnvVars(ctx, dbB, id, patch)
345351
require.ErrorContains(t, err, "begin-boom")
346352

347-
// 2) SELECT ... FOR UPDATE → no rows → ErrStackNotFound (rolls back).
353+
// 2) SET LOCAL lock_timeout error (rolls back).
354+
dbLT, mockLT := newMock(t)
355+
mockLT.ExpectBegin()
356+
mockLT.ExpectExec(lockRe).WillReturnError(errors.New("lock-boom"))
357+
mockLT.ExpectRollback()
358+
_, _, err = MergeStackEnvVars(ctx, dbLT, id, patch)
359+
require.ErrorContains(t, err, "lock_timeout")
360+
361+
// 3) SELECT ... FOR UPDATE → no rows → ErrStackNotFound (rolls back).
348362
dbNF, mockNF := newMock(t)
349363
mockNF.ExpectBegin()
350-
mockNF.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).WillReturnError(errNoRows())
364+
mockNF.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0))
365+
mockNF.ExpectQuery(selRe).WillReturnError(errNoRows())
351366
mockNF.ExpectRollback()
352367
var nf *ErrStackNotFound
353368
_, _, err = MergeStackEnvVars(ctx, dbNF, id, patch)
354369
require.ErrorAs(t, err, &nf)
355370

356-
// 3) SELECT error (non-NoRows).
371+
// 4) SELECT error (non-NoRows).
357372
dbSE, mockSE := newMock(t)
358373
mockSE.ExpectBegin()
359-
mockSE.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).WillReturnError(errors.New("sel-boom"))
374+
mockSE.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0))
375+
mockSE.ExpectQuery(selRe).WillReturnError(errors.New("sel-boom"))
360376
mockSE.ExpectRollback()
361377
_, _, err = MergeStackEnvVars(ctx, dbSE, id, patch)
362378
require.ErrorContains(t, err, "sel-boom")
363379

364-
// 4) unmarshal error (malformed jsonb).
380+
// 5) status='deleting' → ErrStackDeleting (in-tx teardown guard, rolls back).
381+
dbDel, mockDel := newMock(t)
382+
mockDel.ExpectBegin()
383+
mockDel.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0))
384+
mockDel.ExpectQuery(selRe).WillReturnRows(okRows(StackStatusDeleting, `{}`))
385+
mockDel.ExpectRollback()
386+
_, _, err = MergeStackEnvVars(ctx, dbDel, id, patch)
387+
require.ErrorIs(t, err, ErrStackDeleting)
388+
389+
// 6) unmarshal error (malformed jsonb).
365390
dbUM, mockUM := newMock(t)
366391
mockUM.ExpectBegin()
367-
mockUM.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).
368-
WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`not json`)))
392+
mockUM.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0))
393+
mockUM.ExpectQuery(selRe).WillReturnRows(okRows("healthy", `not json`))
369394
mockUM.ExpectRollback()
370395
_, _, err = MergeStackEnvVars(ctx, dbUM, id, patch)
371396
require.ErrorContains(t, err, "unmarshal")
372397

373-
// 5) over-cap → ErrStackEnvVarsTooLarge, checked BEFORE the UPDATE (rolls back).
398+
// 7) over-cap → ErrStackEnvVarsTooLarge, checked BEFORE the UPDATE (rolls back).
374399
dbTL, mockTL := newMock(t)
375400
mockTL.ExpectBegin()
376-
mockTL.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).
377-
WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{}`)))
401+
mockTL.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0))
402+
mockTL.ExpectQuery(selRe).WillReturnRows(okRows("healthy", `{}`))
378403
mockTL.ExpectRollback()
379404
big := map[string]string{"K": strings.Repeat("x", maxStackEnvVarsBytes+1)}
380405
_, _, err = MergeStackEnvVars(ctx, dbTL, id, big)
381406
require.ErrorIs(t, err, ErrStackEnvVarsTooLarge)
382407

383-
// 6) UPDATE error.
408+
// 8) UPDATE error.
384409
dbUE, mockUE := newMock(t)
385410
mockUE.ExpectBegin()
386-
mockUE.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).
387-
WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{}`)))
411+
mockUE.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0))
412+
mockUE.ExpectQuery(selRe).WillReturnRows(okRows("healthy", `{}`))
388413
mockUE.ExpectExec(`UPDATE stacks SET env_vars`).WillReturnError(errors.New("upd-boom"))
389414
mockUE.ExpectRollback()
390415
_, _, err = MergeStackEnvVars(ctx, dbUE, id, patch)
391416
require.ErrorContains(t, err, "upd-boom")
392417

393-
// 7) Commit error.
418+
// 9) Commit error.
394419
dbCE, mockCE := newMock(t)
395420
mockCE.ExpectBegin()
396-
mockCE.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).
397-
WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{}`)))
421+
mockCE.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0))
422+
mockCE.ExpectQuery(selRe).WillReturnRows(okRows("healthy", `{}`))
398423
mockCE.ExpectExec(`UPDATE stacks SET env_vars`).WillReturnResult(sqlmock.NewResult(0, 1))
399424
mockCE.ExpectCommit().WillReturnError(errors.New("commit-boom"))
400425
_, _, err = MergeStackEnvVars(ctx, dbCE, id, patch)
401426
require.ErrorContains(t, err, "commit-boom")
402427

403-
// 8) Happy path: existing {A:old, B:keep}; patch upserts A, adds C, deletes B
404-
// (present → counted) and deletes MISSING (absent → NOT counted).
428+
// 10) Happy path: existing {A:old, B:keep}; patch upserts A, adds C, deletes B
429+
// (present → counted) and deletes MISSING (absent → NOT counted).
405430
dbOK, mockOK := newMock(t)
406431
mockOK.ExpectBegin()
407-
mockOK.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).
408-
WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{"A":"old","B":"keep"}`)))
432+
mockOK.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0))
433+
mockOK.ExpectQuery(selRe).WillReturnRows(okRows("healthy", `{"A":"old","B":"keep"}`))
409434
mockOK.ExpectExec(`UPDATE stacks SET env_vars`).WillReturnResult(sqlmock.NewResult(0, 1))
410435
mockOK.ExpectCommit()
411436
merged, deletes, err := MergeStackEnvVars(ctx, dbOK, id,

internal/models/stack.go

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -636,6 +636,19 @@ func GetStackEnvVars(ctx context.Context, db *sql.DB, stackID uuid.UUID) (map[st
636636
// pairs at ~64 bytes each).
637637
var ErrStackEnvVarsTooLarge = errors.New("stack env_vars payload exceeds 64KiB")
638638

639+
// StackStatusDeleting is the status of a stack mid-teardown. Lifecycle
640+
// mutations (env merge) must be rejected for it. Single source of truth shared
641+
// by the model guard and the handler.
642+
const StackStatusDeleting = "deleting"
643+
644+
// ErrStackDeleting is returned by MergeStackEnvVars when the row is mid-teardown
645+
// (status='deleting') at the moment the FOR UPDATE lock is taken. Checking this
646+
// INSIDE the locked transaction — rather than via a pre-read in the handler —
647+
// closes the TOCTOU where the teardown worker flips the row between the
648+
// handler's GetStackBySlug and the merge, which would otherwise commit an env
649+
// change onto a stack that's being deleted.
650+
var ErrStackDeleting = errors.New("stack is being deleted")
651+
639652
// maxStackEnvVarsBytes is the serialized-JSON byte cap on env_vars.
640653
// 64*1024 = 65536. See ErrStackEnvVarsTooLarge for rationale.
641654
const maxStackEnvVarsBytes = 64 * 1024
@@ -699,19 +712,34 @@ func MergeStackEnvVars(ctx context.Context, db *sql.DB, stackID uuid.UUID, patch
699712
return nil, 0, fmt.Errorf("models.MergeStackEnvVars: begin: %w", err)
700713
}
701714
// No-op after a successful Commit; rolls back on any early return (over-cap,
702-
// marshal error, scan error) so the row is never left half-updated.
715+
// marshal error, scan error, deleting) so the row is never left half-updated.
703716
defer func() { _ = tx.Rollback() }()
704717

718+
// Bound the time we'll wait for the row lock. Without this a PATCH that
719+
// races a long-held lock (another in-flight merge, or a teardown holding the
720+
// row) would block the request goroutine indefinitely; 3s fails fast to a
721+
// 503 the caller can retry. SET LOCAL is scoped to this transaction.
722+
if _, err := tx.ExecContext(ctx, `SET LOCAL lock_timeout = '3s'`); err != nil {
723+
return nil, 0, fmt.Errorf("models.MergeStackEnvVars: set lock_timeout: %w", err)
724+
}
725+
726+
// Fetch status under the same FOR UPDATE lock so the teardown check is
727+
// race-free: a stack flipped to 'deleting' between the handler's
728+
// GetStackBySlug and here is caught authoritatively, not via a stale pre-read.
705729
var raw []byte
730+
var status string
706731
err = tx.QueryRowContext(ctx, `
707-
SELECT COALESCE(env_vars, '{}'::jsonb) FROM stacks WHERE id = $1 FOR UPDATE
708-
`, stackID).Scan(&raw)
732+
SELECT status, COALESCE(env_vars, '{}'::jsonb) FROM stacks WHERE id = $1 FOR UPDATE
733+
`, stackID).Scan(&status, &raw)
709734
if err == sql.ErrNoRows {
710735
return nil, 0, &ErrStackNotFound{Slug: stackID.String()}
711736
}
712737
if err != nil {
713738
return nil, 0, fmt.Errorf("models.MergeStackEnvVars: select: %w", err)
714739
}
740+
if status == StackStatusDeleting {
741+
return nil, 0, ErrStackDeleting
742+
}
715743

716744
merged := map[string]string{}
717745
if len(raw) > 0 {

0 commit comments

Comments
 (0)