Merge branch 'master' into test/integration-cov-brevo-thin

mastermanas805 · web-flow · commit ae94c80e9486 · 2026-06-04T22:11:05.000+05:30
diff --git a/internal/handlers/export_final_test.go b/internal/handlers/export_final_test.go
@@ -43,6 +43,15 @@ func (h *StackHandler) CheckStackDeployLimitForTest(ctx context.Context, fp stri
 	return h.checkStackDeployLimit(ctx, fp)
 }
 
+// MarkApprovedPromoteExecutedForTest re-exports the package-private
+// markApprovedPromoteExecuted so the approval_already_executed CAS-miss arm
+// (an approval flipped to 'executed' between validate and execute — only
+// reachable under a concurrent double-consume in prod) can be driven
+// deterministically by pre-seeding the row as already executed.
+func (h *StackHandler) MarkApprovedPromoteExecutedForTest(c *fiber.Ctx, row *models.PromoteApproval, from, to string) error {
+	return h.markApprovedPromoteExecuted(c, row, from, to)
+}
+
 // ── agent_action.go empty-arg default-branch coverage ────────────────────────
 // These re-exports drive the `if x == "" { x = "..." }` default branches that
 // the happy-path callers (always passing a non-empty value) leave open.
diff --git a/internal/handlers/openapi.go b/internal/handlers/openapi.go
@@ -183,10 +183,21 @@ func OpenAPISpecProduction() string {
 const openAPISpec = `{
   "openapi": "3.1.0",
   "info": {
-    "title": "InstaNode API",
+    "title": "instanode.dev — zero-friction dev infrastructure for AI agents",
     "version": "1.0.0",
-    "description": "Zero-friction developer infrastructure. Provision real databases, caches, and queues with a single HTTP call — no account, no Docker, no setup.\n\n## Idempotency\n\nEvery POST endpoint that creates a resource is idempotent. Two layered protections cover every retry pattern:\n\n1. Explicit Idempotency-Key header (Stripe-shape, 24h TTL). Pass the same opaque key on each retry of a logical operation and the server replays the first response verbatim. Reusing a key with a different body returns 409.\n2. Body-fingerprint fallback (120s TTL). When the header is absent, the server synthesises a key from sha256(scope, route, canonical-body) and dedups identical retries inside a 120s window. Absorbs double-clicks, mobile double-taps, agent retries on transient 5xx, and reverse-proxy retries on network blips. Use the explicit header for true exactly-once across longer windows.\n\nEvery response from a create endpoint carries:\n- X-Idempotency-Source: explicit | fingerprint | miss — which dedup path matched (explicit = caller passed an Idempotency-Key; fingerprint = the body-fingerprint cache replayed; miss = handler ran fresh).\n- X-Idempotent-Replay: true — present only when the response was served from the cache (either path).\n\n## Rate limit (applies to every route)\n\nA global per-IP rate limit (100 req/min) is applied to EVERY documented endpoint by the router middleware. Exceeding it returns 429 with the standard ErrorResponse envelope (error=rate_limited), a Retry-After HTTP header, and retry_after_seconds in the JSON body. The per-route response maps below may omit 429 for brevity; the canonical 429 shape is documented under components.responses.TooManyRequests and applies to every path. 
T19 P1-1 (BugHunt 2026-05-20).\n\n## Payload size (applies to every route)\n\nFiber's global BodyLimit is set to 50 MiB — only /deploy/new and /stacks/new (multipart tarballs) and /webhooks/github/* (push payloads) approach that cap; JSON endpoints are bounded to sub-KB bodies by the per-handler shape. Oversized requests return 413 payload_too_large with the standard JSON ErrorResponse envelope (NOT the upstream nginx HTML 502 the older shape returned — T19 P1-2). The canonical 413 shape is documented under components.responses.PayloadTooLarge.\n\n## Security headers (applies to every response)\n\nEvery response from EVERY route — including liveness/readiness probes, OpenAPI document fetch, 4xx error envelopes, 5xx error envelopes, and 404/405 Fiber-default responses — carries the following defense-in-depth response headers, set by the SecurityHeaders middleware ahead of RequestID in the router middleware chain (task #311 wave-3 chaos-verify redo):\n\n- Strict-Transport-Security: max-age=63072000; includeSubDomains — production only (omitted on ENVIRONMENT=development so local http://localhost:8080 doesn't poison the host's HSTS cache). 2-year max-age, includeSubDomains for *.api.instanode.dev.\n- X-Content-Type-Options: nosniff — disables MIME sniffing.\n- X-Frame-Options: SAMEORIGIN — clickjacking defense.\n- Referrer-Policy: strict-origin-when-cross-origin — prevents URL-token leakage across origin downgrades.\n- Permissions-Policy: geolocation=(), microphone=(), camera=(), payment=() — denies powerful browser APIs.\n- Cross-Origin-Resource-Policy: same-origin — blocks no-cors cross-origin fetches.\n\nThese headers are not enumerated in each per-route responses block to keep the spec readable; they apply globally. Coverage test: TestSecurityHeaders_AllEndpoints_AllHeaders_Prod (internal/handlers/security_headers_test.go) iterates 5 representative endpoints (healthz, readyz, openapi.json, db/new, claim) and asserts all 6 headers land on every response."
+    "description": "Zero-friction developer infrastructure built for AI coding agents (Claude Code, Cursor, MCP tool-use) and humans alike. Provision real Postgres + pgvector + Redis + MongoDB + NATS JetStream queues + S3-compatible object storage + webhook receivers — AND deploy your app on top of them — each with a single HTTP call. No account, no Docker, no setup. A free anonymous tier (24h TTL) lets an agent claim infrastructure the moment it hits a limit, with no signup. Also available as an MCP server, language SDKs, and a CLI for agent tool-use. The unit of value is the whole bundle: everything an agent needs to ship a working app, claimed and provisioned in one flow. Keywords: AI agent infrastructure, MCP, Postgres, pgvector / vector database, Redis cache, MongoDB, NATS queue, S3 object storage, webhooks, app deployment, free tier, single HTTP call, no signup.\n\n## Idempotency\n\nEvery POST endpoint that creates a resource is idempotent. Two layered protections cover every retry pattern:\n\n1. Explicit Idempotency-Key header (Stripe-shape, 24h TTL). Pass the same opaque key on each retry of a logical operation and the server replays the first response verbatim. Reusing a key with a different body returns 409.\n2. Body-fingerprint fallback (120s TTL). When the header is absent, the server synthesises a key from sha256(scope, route, canonical-body) and dedups identical retries inside a 120s window. Absorbs double-clicks, mobile double-taps, agent retries on transient 5xx, and reverse-proxy retries on network blips. 
Use the explicit header for true exactly-once across longer windows.\n\nEvery response from a create endpoint carries:\n- X-Idempotency-Source: explicit | fingerprint | miss — which dedup path matched (explicit = caller passed an Idempotency-Key; fingerprint = the body-fingerprint cache replayed; miss = handler ran fresh).\n- X-Idempotent-Replay: true — present only when the response was served from the cache (either path).\n\n## Rate limit (applies to every route)\n\nA global per-IP rate limit (100 req/min) is applied to EVERY documented endpoint by the router middleware. Exceeding it returns 429 with the standard ErrorResponse envelope (error=rate_limited), a Retry-After HTTP header, and retry_after_seconds in the JSON body. The per-route response maps below may omit 429 for brevity; the canonical 429 shape is documented under components.responses.TooManyRequests and applies to every path. T19 P1-1 (BugHunt 2026-05-20).\n\n## Payload size (applies to every route)\n\nFiber's global BodyLimit is set to 50 MiB — only /deploy/new and /stacks/new (multipart tarballs) and /webhooks/github/* (push payloads) approach that cap; JSON endpoints are bounded to sub-KB bodies by the per-handler shape. Oversized requests return 413 payload_too_large with the standard JSON ErrorResponse envelope (NOT the upstream nginx HTML 502 the older shape returned — T19 P1-2). 
The canonical 413 shape is documented under components.responses.PayloadTooLarge.\n\n## Security headers (applies to every response)\n\nEvery response from EVERY route — including liveness/readiness probes, OpenAPI document fetch, 4xx error envelopes, 5xx error envelopes, and 404/405 Fiber-default responses — carries the following defense-in-depth response headers, set by the SecurityHeaders middleware ahead of RequestID in the router middleware chain (task #311 wave-3 chaos-verify redo):\n\n- Strict-Transport-Security: max-age=63072000; includeSubDomains — production only (omitted on ENVIRONMENT=development so local http://localhost:8080 doesn't poison the host's HSTS cache). 2-year max-age, includeSubDomains for *.api.instanode.dev.\n- X-Content-Type-Options: nosniff — disables MIME sniffing.\n- X-Frame-Options: SAMEORIGIN — clickjacking defense.\n- Referrer-Policy: strict-origin-when-cross-origin — prevents URL-token leakage across origin downgrades.\n- Permissions-Policy: geolocation=(), microphone=(), camera=(), payment=() — denies powerful browser APIs.\n- Cross-Origin-Resource-Policy: same-origin — blocks no-cors cross-origin fetches.\n\nThese headers are not enumerated in each per-route responses block to keep the spec readable; they apply globally. Coverage test: TestSecurityHeaders_AllEndpoints_AllHeaders_Prod (internal/handlers/security_headers_test.go) iterates 5 representative endpoints (healthz, readyz, openapi.json, db/new, claim) and asserts all 6 headers land on every response."
   },
+  "tags": [
+    { "name": "database", "description": "Real Postgres connection strings via POST /db/new — encrypted at rest, per-token isolation, instant." },
+    { "name": "vector", "description": "pgvector-enabled Postgres via POST /vector/new — embedding stores with HNSW + IVFFlat for AI/RAG workloads." },
+    { "name": "cache", "description": "Real Redis connection strings via POST /cache/new — ACL namespace isolation." },
+    { "name": "nosql", "description": "Real MongoDB connection strings via POST /nosql/new — per-token database scoping." },
+    { "name": "queue", "description": "NATS JetStream URLs via POST /queue/new — per-account subject isolation." },
+    { "name": "storage", "description": "S3-compatible object storage via POST /storage/new plus broker-mode signed URLs via POST /storage/{token}/presign." },
+    { "name": "webhook", "description": "Public webhook receiver URLs via POST /webhook/new — captures any HTTP method payload." },
+    { "name": "deploy", "description": "The deployment wedge: ship an app via POST /deploy/new (single app) or POST /stacks/new (multi-service) — Docker build to public HTTPS URL with TLS, no Dockerfile-on-disk required." },
+    { "name": "auth", "description": "Magic-link, GitHub OAuth, and CLI device-flow login — used to claim a free anonymous resource bundle." }
+  ],
   "servers": [{ "url": "https://api.instanode.dev", "description": "Production" }],
   "paths": {
     "/livez": {
diff --git a/internal/handlers/stack.go b/internal/handlers/stack.go
@@ -2010,10 +2010,19 @@ func (h *StackHandler) Promote(c *fiber.Ctx) error {
 	// row for THIS team, with matching from/to/kind. The worker (when it
 	// lands) will short-circuit this branch and run the promote on its
 	// own poll cadence; until then this path is the manual trigger.
+	//
+	// #11 (sweep 2026-06-04): we only VALIDATE here. The single-use
+	// 'executed' flip is deferred to markApprovedPromoteExecuted, called
+	// just before runStackDeploy below — so a preflight failure (412/503/
+	// 400/402 in Steps A–C) leaves the approval 'approved' and retryable
+	// instead of burning it on a promote that never ran.
+	var approvalRow *models.PromoteApproval
 	if body.ApprovalID != "" {
-		if err := h.consumeApprovedPromote(c, team, body, from, to, models.PromoteApprovalKindStack); err != nil {
-			return err
+		row, vErr := h.validateApprovedPromote(c, team, body, from, to, models.PromoteApprovalKindStack)
+		if vErr != nil {
+			return vErr
 		}
+		approvalRow = row
 	}
 
 	// Step A: Pull the source's services. If ANY service is missing
@@ -2307,6 +2316,20 @@ func (h *StackHandler) Promote(c *fiber.Ctx) error {
 		})
 	}
 
+	// #11 (sweep 2026-06-04): preflight (Steps A–C above) has fully
+	// succeeded — every failure path before this point returns early, so
+	// reaching here means the promote WILL launch. Burn the single-use
+	// approval to 'executed' now, immediately before the deploy launch.
+	// A failure here (503 execute_failed / 409 approval_already_executed)
+	// returns before the launch; the target stack rows are already written
+	// but the approval state is the authoritative single-use gate, so after
+	// a 503 re-calling with the same approval is the operator's retry path.
+	if approvalRow != nil {
+		if execErr := h.markApprovedPromoteExecuted(c, approvalRow, from, to); execErr != nil {
+			return execErr
+		}
+	}
+
 	// Step D: Hand off to the goroutine that calls the provider with
 	// SkipBuild=true. The dashboard's EnvironmentsGrid polls /family so it
 	// picks up the building → healthy transition automatically.
@@ -2408,65 +2431,91 @@ func (h *StackHandler) beginPromoteApproval(
 	return row, nil
 }
 
-// consumeApprovedPromote verifies that an explicit approval_id supplied
-// by the caller matches an APPROVED but NOT-YET-EXECUTED row for the
-// same team / from / to / kind, and atomically flips the row to
-// 'executed'. Used by the manual-trigger fallback path until the
-// worker-side polling lands.
+// validateApprovedPromote verifies that an explicit approval_id supplied by
+// the caller matches an APPROVED, NOT-YET-EXECUTED, non-expired row for the
+// same team / from / to / kind. It returns the approval row on success but
+// does NOT mutate it — the actual 'executed' flip is deferred to
+// markApprovedPromoteExecuted, which the handler calls only AFTER the promote
+// preflight (source-services, image_ref, target create/update, vault, env
+// load) has succeeded.
+//
+// #11 (sweep 2026-06-04): the flip used to happen here, BEFORE preflight. A
+// preflight failure (412 missing_image_ref / no_services, 503 lookup, 400
+// vault, 402 cap) therefore burned the single-use approval to 'executed'
+// while the promote never ran — leaving the operator with a non-retryable
+// approval and forcing a fresh email round-trip. Splitting validate/execute
+// keeps the approval 'approved' (retryable) on any preflight failure.
 //
 // Why we check from/to/kind in addition to the id: the approval row's
 // payload is what the worker would replay. If a caller passes an
 // approval_id for env=preprod but the request is to=production, we
 // refuse — the row's authority covers the env pair it was issued for,
 // not whatever the caller is asking for now.
-func (h *StackHandler) consumeApprovedPromote(
+func (h *StackHandler) validateApprovedPromote(
 	c *fiber.Ctx,
 	team *models.Team,
 	body promoteBody,
 	from, to, kind string,
-) error {
+) (*models.PromoteApproval, error) {
 	id, err := uuid.Parse(body.ApprovalID)
 	if err != nil {
-		return respondError(c, fiber.StatusBadRequest, "invalid_approval_id",
+		return nil, respondError(c, fiber.StatusBadRequest, "invalid_approval_id",
 			"approval_id must be a valid UUID")
 	}
 	row, err := models.GetPromoteApprovalByID(c.Context(), h.db, id)
 	if errors.Is(err, models.ErrPromoteApprovalNotFound) {
-		return respondError(c, fiber.StatusNotFound, "approval_not_found",
+		return nil, respondError(c, fiber.StatusNotFound, "approval_not_found",
 			"approval_id does not match any approval row")
 	}
 	if err != nil {
 		slog.Error("stack.promote.approval_lookup_failed",
 			"error", err, "approval_id", id,
 			"request_id", middleware.GetRequestID(c))
-		return respondError(c, fiber.StatusServiceUnavailable, "lookup_failed",
+		return nil, respondError(c, fiber.StatusServiceUnavailable, "lookup_failed",
 			"Failed to look up approval")
 	}
 	if row.TeamID != team.ID {
 		// Cross-team — same posture as stack ownership: 404 not 403.
-		return respondError(c, fiber.StatusNotFound, "approval_not_found",
+		return nil, respondError(c, fiber.StatusNotFound, "approval_not_found",
 			"approval_id does not match any approval row for this team")
 	}
 	if row.Status != models.PromoteApprovalStatusApproved {
-		return respondError(c, fiber.StatusConflict, "approval_not_approved",
+		return nil, respondError(c, fiber.StatusConflict, "approval_not_approved",
 			"approval row is in status="+row.Status+" — must be 'approved' to consume")
 	}
 	if row.PromoteKind != kind || row.FromEnv != from || row.ToEnv != to {
-		return respondError(c, fiber.StatusBadRequest, "approval_mismatch",
+		return nil, respondError(c, fiber.StatusBadRequest, "approval_mismatch",
 			"approval_id's recorded (kind,from,to) does not match this request")
 	}
 	if row.ExpiresAt.Before(time.Now().UTC()) {
 		// Even approved rows have an outer expiry — once the 24h window
 		// has fully passed since the original request we refuse to
 		// execute. This is belt-and-suspenders defence; the worker
 		// repo's polling job would refuse for the same reason.
-		return respondError(c, fiber.StatusGone, "approval_expired",
+		return nil, respondError(c, fiber.StatusGone, "approval_expired",
 			"approval window has fully expired")
 	}
-	ok, err := models.MarkPromoteApprovalExecuted(c.Context(), h.db, id)
+	return row, nil
+}
+
+// markApprovedPromoteExecuted atomically flips a validated approval row to
+// 'executed' and audits the transition. It is called by Promote ONLY after
+// the entire promote preflight has succeeded and immediately before the
+// runStackDeploy launch, so a preflight failure leaves the row 'approved'
+// (retryable). See validateApprovedPromote for the #11 rationale.
+//
+// The CAS inside MarkPromoteApprovalExecuted still guards against a concurrent
+// double-consume: if a second request raced through validate + preflight and
+// flipped the row first, this returns 0 rows and we 409 approval_already_executed.
+func (h *StackHandler) markApprovedPromoteExecuted(
+	c *fiber.Ctx,
+	row *models.PromoteApproval,
+	from, to string,
+) error {
+	ok, err := models.MarkPromoteApprovalExecuted(c.Context(), h.db, row.ID)
 	if err != nil {
 		slog.Error("stack.promote.approval_execute_failed",
-			"error", err, "approval_id", id,
+			"error", err, "approval_id", row.ID,
 			"request_id", middleware.GetRequestID(c))
 		return respondError(c, fiber.StatusServiceUnavailable, "execute_failed",
 			"Failed to mark approval executed")
diff --git a/internal/handlers/stack_final_test.go b/internal/handlers/stack_final_test.go
@@ -136,8 +136,17 @@ func TestStackFinal_ConsumeApproved_LookupError_503(t *testing.T) {
 }
 
 // TestStackFinal_ConsumeApproved_ExecuteError_503 — MarkPromoteApprovalExecuted
-// errors after a fully-valid approved row (stack.go:2425). team(1) + stack(2) +
-// approval-read(3) succeed; the UPDATE(4) errors. failAfter=3.
+// errors on the deferred 'executed' flip, after a fully-valid approved row AND
+// after the entire promote preflight succeeds (markApprovedPromoteExecuted,
+// stack.go ~2520).
+//
+// #11 (sweep 2026-06-04): the flip moved from BEFORE preflight to AFTER it, so
+// the fault must now land on a LATER DB call. The fresh-target preflight runs
+// ~10 reads/writes (source services, family lookup, CreateStackWithCap, vault
+// copy, source+target env_vars, vault resolve) between the approval read and
+// the MarkPromoteApprovalExecuted UPDATE — failAfter=13 lands the injected
+// failure on that UPDATE (verified: 12 → env_load_failed, 13 → execute_failed,
+// 14 → success/202).
 func TestStackFinal_ConsumeApproved_ExecuteError_503(t *testing.T) {
 	seedDB, clean := testhelpers.SetupTestDB(t)
 	defer clean()
@@ -148,7 +157,7 @@ func TestStackFinal_ConsumeApproved_ExecuteError_503(t *testing.T) {
 	slug, _ := seedPromoteSourceStack(t, seedDB, teamIDStr, "staging", "stkfinal-exec")
 	id := mustSeedApprovedPromote(t, seedDB, teamID, "staging", "production")
 
-	faultDB := openFaultDB(t, 3)
+	faultDB := openFaultDB(t, 13)
 	app := stackFaultPromoteApp(t, faultDB)
 	resp := postPromote(t, app, jwt, slug, map[string]any{
 		"from": "staging", "to": "production", "approval_id": id,
diff --git a/internal/handlers/stack_promote_approval_preflight_test.go b/internal/handlers/stack_promote_approval_preflight_test.go
diff --git a/openapi.snapshot.json b/openapi.snapshot.json