Skip to content

Commit 91a947b

Browse files
fix(stacks): mark promote approval executed AFTER preflight, not before (#11)
The Promote handler called MarkPromoteApprovalExecuted inside consumeApprovedPromote BEFORE the promote preflight (source-services fetch, image_ref check, target create/update, vault copy, env load) ran. Any preflight failure (412 missing_image_ref / no_services, 503 lookup/create/ env_load, 400 vault, 402 cap) therefore burned the single-use approval to 'executed' while the promote never launched — stranding the operator with a consumed, non-retryable approval and forcing a fresh email round-trip. Split consumeApprovedPromote into: - validateApprovedPromote — read-only checks (uuid, lookup, ownership, status, from/to/kind match, expiry), runs at the original position so an invalid approval still 4xxs early. Returns the row, no mutation. - markApprovedPromoteExecuted — the 'executed' CAS flip + audit, called ONLY after preflight fully succeeds, immediately before the runStackDeploy launch. A preflight failure now leaves the approval 'approved' and retryable. The single-use guarantee is preserved: MarkPromoteApprovalExecuted's CAS still returns 0 rows (409 approval_already_executed) on a concurrent double-consume. Twin promote (consumeApprovedTwin in twin.go) has its own consume path and is out of scope for this fix. Coverage: Symptom: preflight 412/503 strands a consumed (status='executed') approval Enumeration: rg MarkPromoteApprovalExecuted + consumeApprovedPromote call sites in stack.go Sites found: 1 (stack.go Promote; twin.go has a separate consumeApprovedTwin) Sites touched: 1 Coverage test: TestStackPromote_ApprovalID_PreflightFails_StaysApproved (missing_image_ref), TestStackPromote_ApprovalID_NoServices_StaysApproved, TestStackPromote_ApprovalID_HappyPath_ExecutesOnce (executed once + single-use), TestMarkApprovedPromoteExecuted_AlreadyExecuted_409 (CAS-miss arm), TestStackFinal_ConsumeApproved_ExecuteError_503 (updated for new call order) Live verified: awaiting post-merge auto-deploy (rule 14 build-SHA gate in CI) Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 498f5cf commit 91a947b

4 files changed

Lines changed: 314 additions & 21 deletions

File tree

internal/handlers/export_final_test.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,15 @@ func (h *StackHandler) CheckStackDeployLimitForTest(ctx context.Context, fp stri
4343
return h.checkStackDeployLimit(ctx, fp)
4444
}
4545

46+
// MarkApprovedPromoteExecutedForTest re-exports the package-private
47+
// markApprovedPromoteExecuted so the approval_already_executed CAS-miss arm
48+
// (an approval flipped to 'executed' between validate and execute — only
49+
// reachable under a concurrent double-consume in prod) can be driven
50+
// deterministically by pre-seeding the row as already executed.
51+
func (h *StackHandler) MarkApprovedPromoteExecutedForTest(c *fiber.Ctx, row *models.PromoteApproval, from, to string) error {
52+
return h.markApprovedPromoteExecuted(c, row, from, to)
53+
}
54+
4655
// ── agent_action.go empty-arg default-branch coverage ────────────────────────
4756
// These re-exports drive the `if x == "" { x = "..." }` default branches that
4857
// the happy-path callers (always passing a non-empty value) leave open.

internal/handlers/stack.go

Lines changed: 67 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2010,10 +2010,19 @@ func (h *StackHandler) Promote(c *fiber.Ctx) error {
20102010
// row for THIS team, with matching from/to/kind. The worker (when it
20112011
// lands) will short-circuit this branch and run the promote on its
20122012
// own poll cadence; until then this path is the manual trigger.
2013+
//
2014+
// #11 (sweep 2026-06-04): we only VALIDATE here. The single-use
2015+
// 'executed' flip is deferred to markApprovedPromoteExecuted, called
2016+
// just before runStackDeploy below — so a preflight failure (412/503/
2017+
// 400/402 in Steps A–C) leaves the approval 'approved' and retryable
2018+
// instead of burning it on a promote that never ran.
2019+
var approvalRow *models.PromoteApproval
20132020
if body.ApprovalID != "" {
2014-
if err := h.consumeApprovedPromote(c, team, body, from, to, models.PromoteApprovalKindStack); err != nil {
2015-
return err
2021+
row, vErr := h.validateApprovedPromote(c, team, body, from, to, models.PromoteApprovalKindStack)
2022+
if vErr != nil {
2023+
return vErr
20162024
}
2025+
approvalRow = row
20172026
}
20182027

20192028
// Step A: Pull the source's services. If ANY service is missing
@@ -2307,6 +2316,20 @@ func (h *StackHandler) Promote(c *fiber.Ctx) error {
23072316
})
23082317
}
23092318

2319+
// #11 (sweep 2026-06-04): preflight (Steps A–C above) has fully
2320+
// succeeded — every failure path before this point returns early, so
2321+
// reaching here means the promote WILL launch. Burn the single-use
2322+
// approval to 'executed' now, immediately before the deploy launch.
2323+
// A failure here (503 execute_failed / 409 already_executed) returns
2324+
// before the launch; the target stack rows are already written but the
2325+
// approval state is the authoritative single-use gate, so re-calling
2326+
// with the same approval is the operator's retry path.
2327+
if approvalRow != nil {
2328+
if execErr := h.markApprovedPromoteExecuted(c, approvalRow, from, to); execErr != nil {
2329+
return execErr
2330+
}
2331+
}
2332+
23102333
// Step D: Hand off to the goroutine that calls the provider with
23112334
// SkipBuild=true. The dashboard's EnvironmentsGrid polls /family so it
23122335
// picks up the building → healthy transition automatically.
@@ -2408,65 +2431,91 @@ func (h *StackHandler) beginPromoteApproval(
24082431
return row, nil
24092432
}
24102433

2411-
// consumeApprovedPromote verifies that an explicit approval_id supplied
2412-
// by the caller matches an APPROVED but NOT-YET-EXECUTED row for the
2413-
// same team / from / to / kind, and atomically flips the row to
2414-
// 'executed'. Used by the manual-trigger fallback path until the
2415-
// worker-side polling lands.
2434+
// validateApprovedPromote verifies that an explicit approval_id supplied by
2435+
// the caller matches an APPROVED, NOT-YET-EXECUTED, non-expired row for the
2436+
// same team / from / to / kind. It returns the approval row on success but
2437+
// does NOT mutate it — the actual 'executed' flip is deferred to
2438+
// markApprovedPromoteExecuted, which the handler calls only AFTER the promote
2439+
// preflight (source-services, image_ref, target create/update, vault, env
2440+
// load) has succeeded.
2441+
//
2442+
// #11 (sweep 2026-06-04): the flip used to happen here, BEFORE preflight. A
2443+
// preflight failure (412 missing_image_ref / no_services, 503 lookup, 400
2444+
// vault, 402 cap) therefore burned the single-use approval to 'executed'
2445+
// while the promote never ran — leaving the operator with a non-retryable
2446+
// approval and forcing a fresh email round-trip. Splitting validate/execute
2447+
// keeps the approval 'approved' (retryable) on any preflight failure.
24162448
//
24172449
// Why we check from/to/kind in addition to the id: the approval row's
24182450
// payload is what the worker would replay. If a caller passes an
24192451
// approval_id for env=preprod but the request is to=production, we
24202452
// refuse — the row's authority covers the env pair it was issued for,
24212453
// not whatever the caller is asking for now.
2422-
func (h *StackHandler) consumeApprovedPromote(
2454+
func (h *StackHandler) validateApprovedPromote(
24232455
c *fiber.Ctx,
24242456
team *models.Team,
24252457
body promoteBody,
24262458
from, to, kind string,
2427-
) error {
2459+
) (*models.PromoteApproval, error) {
24282460
id, err := uuid.Parse(body.ApprovalID)
24292461
if err != nil {
2430-
return respondError(c, fiber.StatusBadRequest, "invalid_approval_id",
2462+
return nil, respondError(c, fiber.StatusBadRequest, "invalid_approval_id",
24312463
"approval_id must be a valid UUID")
24322464
}
24332465
row, err := models.GetPromoteApprovalByID(c.Context(), h.db, id)
24342466
if errors.Is(err, models.ErrPromoteApprovalNotFound) {
2435-
return respondError(c, fiber.StatusNotFound, "approval_not_found",
2467+
return nil, respondError(c, fiber.StatusNotFound, "approval_not_found",
24362468
"approval_id does not match any approval row")
24372469
}
24382470
if err != nil {
24392471
slog.Error("stack.promote.approval_lookup_failed",
24402472
"error", err, "approval_id", id,
24412473
"request_id", middleware.GetRequestID(c))
2442-
return respondError(c, fiber.StatusServiceUnavailable, "lookup_failed",
2474+
return nil, respondError(c, fiber.StatusServiceUnavailable, "lookup_failed",
24432475
"Failed to look up approval")
24442476
}
24452477
if row.TeamID != team.ID {
24462478
// Cross-team — same posture as stack ownership: 404 not 403.
2447-
return respondError(c, fiber.StatusNotFound, "approval_not_found",
2479+
return nil, respondError(c, fiber.StatusNotFound, "approval_not_found",
24482480
"approval_id does not match any approval row for this team")
24492481
}
24502482
if row.Status != models.PromoteApprovalStatusApproved {
2451-
return respondError(c, fiber.StatusConflict, "approval_not_approved",
2483+
return nil, respondError(c, fiber.StatusConflict, "approval_not_approved",
24522484
"approval row is in status="+row.Status+" — must be 'approved' to consume")
24532485
}
24542486
if row.PromoteKind != kind || row.FromEnv != from || row.ToEnv != to {
2455-
return respondError(c, fiber.StatusBadRequest, "approval_mismatch",
2487+
return nil, respondError(c, fiber.StatusBadRequest, "approval_mismatch",
24562488
"approval_id's recorded (kind,from,to) does not match this request")
24572489
}
24582490
if row.ExpiresAt.Before(time.Now().UTC()) {
24592491
// Even approved rows have an outer expiry — once the 24h window
24602492
// has fully passed since the original request we refuse to
24612493
// execute. This is belt-and-suspenders defence; the worker
24622494
// repo's polling job would refuse for the same reason.
2463-
return respondError(c, fiber.StatusGone, "approval_expired",
2495+
return nil, respondError(c, fiber.StatusGone, "approval_expired",
24642496
"approval window has fully expired")
24652497
}
2466-
ok, err := models.MarkPromoteApprovalExecuted(c.Context(), h.db, id)
2498+
return row, nil
2499+
}
2500+
2501+
// markApprovedPromoteExecuted atomically flips a validated approval row to
2502+
// 'executed' and audits the transition. It is called by Promote ONLY after
2503+
// the entire promote preflight has succeeded and immediately before the
2504+
// runStackDeploy launch, so a preflight failure leaves the row 'approved'
2505+
// (retryable). See validateApprovedPromote for the #11 rationale.
2506+
//
2507+
// The CAS inside MarkPromoteApprovalExecuted still guards against a concurrent
2508+
// double-consume: if a second request raced through validate + preflight and
2509+
// flipped the row first, this returns 0 rows and we 409 approval_already_executed.
2510+
func (h *StackHandler) markApprovedPromoteExecuted(
2511+
c *fiber.Ctx,
2512+
row *models.PromoteApproval,
2513+
from, to string,
2514+
) error {
2515+
ok, err := models.MarkPromoteApprovalExecuted(c.Context(), h.db, row.ID)
24672516
if err != nil {
24682517
slog.Error("stack.promote.approval_execute_failed",
2469-
"error", err, "approval_id", id,
2518+
"error", err, "approval_id", row.ID,
24702519
"request_id", middleware.GetRequestID(c))
24712520
return respondError(c, fiber.StatusServiceUnavailable, "execute_failed",
24722521
"Failed to mark approval executed")

internal/handlers/stack_final_test.go

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,17 @@ func TestStackFinal_ConsumeApproved_LookupError_503(t *testing.T) {
136136
}
137137

138138
// TestStackFinal_ConsumeApproved_ExecuteError_503 — MarkPromoteApprovalExecuted
139-
// errors after a fully-valid approved row (stack.go:2425). team(1) + stack(2) +
140-
// approval-read(3) succeed; the UPDATE(4) errors. failAfter=3.
139+
// errors on the deferred 'executed' flip, after a fully-valid approved row AND
140+
// after the entire promote preflight succeeds (markApprovedPromoteExecuted,
141+
// stack.go ~2520).
142+
//
143+
// #11 (sweep 2026-06-04): the flip moved from BEFORE preflight to AFTER it, so
144+
// the fault must now land on a LATER DB call. The fresh-target preflight runs
145+
// ~10 reads/writes (source services, family lookup, CreateStackWithCap, vault
146+
// copy, source+target env_vars, vault resolve) between the approval read and
147+
// the MarkPromoteApprovalExecuted UPDATE — failAfter=13 lands the injected
148+
// failure on that UPDATE (verified: 12 → env_load_failed, 13 → execute_failed,
149+
// 14 → success/202).
141150
func TestStackFinal_ConsumeApproved_ExecuteError_503(t *testing.T) {
142151
seedDB, clean := testhelpers.SetupTestDB(t)
143152
defer clean()
@@ -148,7 +157,7 @@ func TestStackFinal_ConsumeApproved_ExecuteError_503(t *testing.T) {
148157
slug, _ := seedPromoteSourceStack(t, seedDB, teamIDStr, "staging", "stkfinal-exec")
149158
id := mustSeedApprovedPromote(t, seedDB, teamID, "staging", "production")
150159

151-
faultDB := openFaultDB(t, 3)
160+
faultDB := openFaultDB(t, 13)
152161
app := stackFaultPromoteApp(t, faultDB)
153162
resp := postPromote(t, app, jwt, slug, map[string]any{
154163
"from": "staging", "to": "production", "approval_id": id,

0 commit comments

Comments
 (0)