Skip to content

Commit ae94c80

Browse files
Merge branch 'master' into test/integration-cov-brevo-thin
2 parents 60ca62d + 8039d1d commit ae94c80

6 files changed

Lines changed: 367 additions & 25 deletions

File tree

internal/handlers/export_final_test.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,15 @@ func (h *StackHandler) CheckStackDeployLimitForTest(ctx context.Context, fp stri
4343
return h.checkStackDeployLimit(ctx, fp)
4444
}
4545

46+
// MarkApprovedPromoteExecutedForTest re-exports the package-private
47+
// markApprovedPromoteExecuted so the approval_already_executed CAS-miss arm
48+
// (an approval flipped to 'executed' between validate and execute — only
49+
// reachable under a concurrent double-consume in prod) can be driven
50+
// deterministically by pre-seeding the row as already executed.
51+
func (h *StackHandler) MarkApprovedPromoteExecutedForTest(c *fiber.Ctx, row *models.PromoteApproval, from, to string) error {
52+
return h.markApprovedPromoteExecuted(c, row, from, to)
53+
}
54+
4655
// ── agent_action.go empty-arg default-branch coverage ────────────────────────
4756
// These re-exports drive the `if x == "" { x = "..." }` default branches that
4857
// the happy-path callers (always passing a non-empty value) leave open.

internal/handlers/openapi.go

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -183,10 +183,21 @@ func OpenAPISpecProduction() string {
183183
const openAPISpec = `{
184184
"openapi": "3.1.0",
185185
"info": {
186-
"title": "InstaNode API",
186+
"title": "instanode.dev — zero-friction dev infrastructure for AI agents",
187187
"version": "1.0.0",
188-
"description": "Zero-friction developer infrastructure. Provision real databases, caches, and queues with a single HTTP call — no account, no Docker, no setup.\n\n## Idempotency\n\nEvery POST endpoint that creates a resource is idempotent. Two layered protections cover every retry pattern:\n\n1. Explicit Idempotency-Key header (Stripe-shape, 24h TTL). Pass the same opaque key on each retry of a logical operation and the server replays the first response verbatim. Reusing a key with a different body returns 409.\n2. Body-fingerprint fallback (120s TTL). When the header is absent, the server synthesises a key from sha256(scope, route, canonical-body) and dedups identical retries inside a 120s window. Absorbs double-clicks, mobile double-taps, agent retries on transient 5xx, and reverse-proxy retries on network blips. Use the explicit header for true exactly-once across longer windows.\n\nEvery response from a create endpoint carries:\n- X-Idempotency-Source: explicit | fingerprint | miss — which dedup path matched (explicit = caller passed an Idempotency-Key; fingerprint = the body-fingerprint cache replayed; miss = handler ran fresh).\n- X-Idempotent-Replay: true — present only when the response was served from the cache (either path).\n\n## Rate limit (applies to every route)\n\nA global per-IP rate limit (100 req/min) is applied to EVERY documented endpoint by the router middleware. Exceeding it returns 429 with the standard ErrorResponse envelope (error=rate_limited), a Retry-After HTTP header, and retry_after_seconds in the JSON body. The per-route response maps below may omit 429 for brevity; the canonical 429 shape is documented under components.responses.TooManyRequests and applies to every path. 
T19 P1-1 (BugHunt 2026-05-20).\n\n## Payload size (applies to every route)\n\nFiber's global BodyLimit is set to 50 MiB — only /deploy/new and /stacks/new (multipart tarballs) and /webhooks/github/* (push payloads) approach that cap; JSON endpoints are bounded to sub-KB bodies by the per-handler shape. Oversized requests return 413 payload_too_large with the standard JSON ErrorResponse envelope (NOT the upstream nginx HTML 502 the older shape returned — T19 P1-2). The canonical 413 shape is documented under components.responses.PayloadTooLarge.\n\n## Security headers (applies to every response)\n\nEvery response from EVERY route — including liveness/readiness probes, OpenAPI document fetch, 4xx error envelopes, 5xx error envelopes, and 404/405 Fiber-default responses — carries the following defense-in-depth response headers, set by the SecurityHeaders middleware ahead of RequestID in the router middleware chain (task #311 wave-3 chaos-verify redo):\n\n- Strict-Transport-Security: max-age=63072000; includeSubDomains — production only (omitted on ENVIRONMENT=development so local http://localhost:8080 doesn't poison the host's HSTS cache). 2-year max-age, includeSubDomains for *.api.instanode.dev.\n- X-Content-Type-Options: nosniff — disables MIME sniffing.\n- X-Frame-Options: SAMEORIGIN — clickjacking defense.\n- Referrer-Policy: strict-origin-when-cross-origin — prevents URL-token leakage across origin downgrades.\n- Permissions-Policy: geolocation=(), microphone=(), camera=(), payment=() — denies powerful browser APIs.\n- Cross-Origin-Resource-Policy: same-origin — blocks no-cors cross-origin fetches.\n\nThese headers are not enumerated in each per-route responses block to keep the spec readable; they apply globally. Coverage test: TestSecurityHeaders_AllEndpoints_AllHeaders_Prod (internal/handlers/security_headers_test.go) iterates 5 representative endpoints (healthz, readyz, openapi.json, db/new, claim) and asserts all 6 headers land on every response."
188+
"description": "Zero-friction developer infrastructure built for AI coding agents (Claude Code, Cursor, MCP tool-use) and humans alike. Provision real Postgres + pgvector + Redis + MongoDB + NATS JetStream queues + S3-compatible object storage + webhook receivers — AND deploy your app on top of them — each with a single HTTP call. No account, no Docker, no setup. A free anonymous tier (24h TTL) lets an agent claim infrastructure the moment it hits a limit, with no signup. Also available as an MCP server, language SDKs, and a CLI for agent tool-use. The unit of value is the whole bundle: everything an agent needs to ship a working app, claimed and provisioned in one flow. Keywords: AI agent infrastructure, MCP, Postgres, pgvector / vector database, Redis cache, MongoDB, NATS queue, S3 object storage, webhooks, app deployment, free tier, single HTTP call, no signup.\n\n## Idempotency\n\nEvery POST endpoint that creates a resource is idempotent. Two layered protections cover every retry pattern:\n\n1. Explicit Idempotency-Key header (Stripe-shape, 24h TTL). Pass the same opaque key on each retry of a logical operation and the server replays the first response verbatim. Reusing a key with a different body returns 409.\n2. Body-fingerprint fallback (120s TTL). When the header is absent, the server synthesises a key from sha256(scope, route, canonical-body) and dedups identical retries inside a 120s window. Absorbs double-clicks, mobile double-taps, agent retries on transient 5xx, and reverse-proxy retries on network blips. 
Use the explicit header for true exactly-once across longer windows.\n\nEvery response from a create endpoint carries:\n- X-Idempotency-Source: explicit | fingerprint | miss — which dedup path matched (explicit = caller passed an Idempotency-Key; fingerprint = the body-fingerprint cache replayed; miss = handler ran fresh).\n- X-Idempotent-Replay: true — present only when the response was served from the cache (either path).\n\n## Rate limit (applies to every route)\n\nA global per-IP rate limit (100 req/min) is applied to EVERY documented endpoint by the router middleware. Exceeding it returns 429 with the standard ErrorResponse envelope (error=rate_limited), a Retry-After HTTP header, and retry_after_seconds in the JSON body. The per-route response maps below may omit 429 for brevity; the canonical 429 shape is documented under components.responses.TooManyRequests and applies to every path. T19 P1-1 (BugHunt 2026-05-20).\n\n## Payload size (applies to every route)\n\nFiber's global BodyLimit is set to 50 MiB — only /deploy/new and /stacks/new (multipart tarballs) and /webhooks/github/* (push payloads) approach that cap; JSON endpoints are bounded to sub-KB bodies by the per-handler shape. Oversized requests return 413 payload_too_large with the standard JSON ErrorResponse envelope (NOT the upstream nginx HTML 502 the older shape returned — T19 P1-2). 
The canonical 413 shape is documented under components.responses.PayloadTooLarge.\n\n## Security headers (applies to every response)\n\nEvery response from EVERY route — including liveness/readiness probes, OpenAPI document fetch, 4xx error envelopes, 5xx error envelopes, and 404/405 Fiber-default responses — carries the following defense-in-depth response headers, set by the SecurityHeaders middleware ahead of RequestID in the router middleware chain (task #311 wave-3 chaos-verify redo):\n\n- Strict-Transport-Security: max-age=63072000; includeSubDomains — production only (omitted on ENVIRONMENT=development so local http://localhost:8080 doesn't poison the host's HSTS cache). 2-year max-age, includeSubDomains for *.api.instanode.dev.\n- X-Content-Type-Options: nosniff — disables MIME sniffing.\n- X-Frame-Options: SAMEORIGIN — clickjacking defense.\n- Referrer-Policy: strict-origin-when-cross-origin — prevents URL-token leakage across origin downgrades.\n- Permissions-Policy: geolocation=(), microphone=(), camera=(), payment=() — denies powerful browser APIs.\n- Cross-Origin-Resource-Policy: same-origin — blocks no-cors cross-origin fetches.\n\nThese headers are not enumerated in each per-route responses block to keep the spec readable; they apply globally. Coverage test: TestSecurityHeaders_AllEndpoints_AllHeaders_Prod (internal/handlers/security_headers_test.go) iterates 5 representative endpoints (healthz, readyz, openapi.json, db/new, claim) and asserts all 6 headers land on every response."
189189
},
190+
"tags": [
191+
{ "name": "database", "description": "Real Postgres connection strings via POST /db/new — encrypted at rest, per-token isolation, instant." },
192+
{ "name": "vector", "description": "pgvector-enabled Postgres via POST /vector/new — embedding stores with HNSW + IVFFlat for AI/RAG workloads." },
193+
{ "name": "cache", "description": "Real Redis connection strings via POST /cache/new — ACL namespace isolation." },
194+
{ "name": "nosql", "description": "Real MongoDB connection strings via POST /nosql/new — per-token database scoping." },
195+
{ "name": "queue", "description": "NATS JetStream URLs via POST /queue/new — per-account subject isolation." },
196+
{ "name": "storage", "description": "S3-compatible object storage via POST /storage/new plus broker-mode signed URLs via POST /storage/{token}/presign." },
197+
{ "name": "webhook", "description": "Public webhook receiver URLs via POST /webhook/new — captures any HTTP method payload." },
198+
{ "name": "deploy", "description": "The deployment wedge: ship an app via POST /deploy/new (single app) or POST /stacks/new (multi-service) — Docker build to public HTTPS URL with TLS, no Dockerfile-on-disk required." },
199+
{ "name": "auth", "description": "Magic-link, GitHub OAuth, and CLI device-flow login — used to claim a free anonymous resource bundle." }
200+
],
190201
"servers": [{ "url": "https://api.instanode.dev", "description": "Production" }],
191202
"paths": {
192203
"/livez": {

internal/handlers/stack.go

Lines changed: 67 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2010,10 +2010,19 @@ func (h *StackHandler) Promote(c *fiber.Ctx) error {
20102010
// row for THIS team, with matching from/to/kind. The worker (when it
20112011
// lands) will short-circuit this branch and run the promote on its
20122012
// own poll cadence; until then this path is the manual trigger.
2013+
//
2014+
// #11 (sweep 2026-06-04): we only VALIDATE here. The single-use
2015+
// 'executed' flip is deferred to markApprovedPromoteExecuted, called
2016+
// just before runStackDeploy below — so a preflight failure (412/503/
2017+
// 400/402 in Steps A–C) leaves the approval 'approved' and retryable
2018+
// instead of burning it on a promote that never ran.
2019+
var approvalRow *models.PromoteApproval
20132020
if body.ApprovalID != "" {
2014-
if err := h.consumeApprovedPromote(c, team, body, from, to, models.PromoteApprovalKindStack); err != nil {
2015-
return err
2021+
row, vErr := h.validateApprovedPromote(c, team, body, from, to, models.PromoteApprovalKindStack)
2022+
if vErr != nil {
2023+
return vErr
20162024
}
2025+
approvalRow = row
20172026
}
20182027

20192028
// Step A: Pull the source's services. If ANY service is missing
@@ -2307,6 +2316,20 @@ func (h *StackHandler) Promote(c *fiber.Ctx) error {
23072316
})
23082317
}
23092318

2319+
// #11 (sweep 2026-06-04): preflight (Steps A–C above) has fully
2320+
// succeeded — every failure path before this point returns early, so
2321+
// reaching here means the promote WILL launch. Burn the single-use
2322+
// approval to 'executed' now, immediately before the deploy launch.
2323+
// A failure here (503 execute_failed / 409 approval_already_executed)
2324+
// returns before the launch; the target stack rows are already written but
2325+
// the approval state is the authoritative single-use gate: after a 503,
2326+
// re-call with the same approval; after a 409 it is spent — request a new one.
2327+
if approvalRow != nil {
2328+
if execErr := h.markApprovedPromoteExecuted(c, approvalRow, from, to); execErr != nil {
2329+
return execErr
2330+
}
2331+
}
2332+
23102333
// Step D: Hand off to the goroutine that calls the provider with
23112334
// SkipBuild=true. The dashboard's EnvironmentsGrid polls /family so it
23122335
// picks up the building → healthy transition automatically.
@@ -2408,65 +2431,91 @@ func (h *StackHandler) beginPromoteApproval(
24082431
return row, nil
24092432
}
24102433

2411-
// consumeApprovedPromote verifies that an explicit approval_id supplied
2412-
// by the caller matches an APPROVED but NOT-YET-EXECUTED row for the
2413-
// same team / from / to / kind, and atomically flips the row to
2414-
// 'executed'. Used by the manual-trigger fallback path until the
2415-
// worker-side polling lands.
2434+
// validateApprovedPromote verifies that an explicit approval_id supplied by
2435+
// the caller matches an APPROVED, NOT-YET-EXECUTED, non-expired row for the
2436+
// same team / from / to / kind. It returns the approval row on success but
2437+
// does NOT mutate it — the actual 'executed' flip is deferred to
2438+
// markApprovedPromoteExecuted, which the handler calls only AFTER the promote
2439+
// preflight (source-services, image_ref, target create/update, vault, env
2440+
// load) has succeeded.
2441+
//
2442+
// #11 (sweep 2026-06-04): the flip used to happen here, BEFORE preflight. A
2443+
// preflight failure (412 missing_image_ref / no_services, 503 lookup, 400
2444+
// vault, 402 cap) therefore burned the single-use approval to 'executed'
2445+
// while the promote never ran — leaving the operator with a non-retryable
2446+
// approval and forcing a fresh email round-trip. Splitting validate/execute
2447+
// keeps the approval 'approved' (retryable) on any preflight failure.
24162448
//
24172449
// Why we check from/to/kind in addition to the id: the approval row's
24182450
// payload is what the worker would replay. If a caller passes an
24192451
// approval_id for env=preprod but the request is to=production, we
24202452
// refuse — the row's authority covers the env pair it was issued for,
24212453
// not whatever the caller is asking for now.
2422-
func (h *StackHandler) consumeApprovedPromote(
2454+
func (h *StackHandler) validateApprovedPromote(
24232455
c *fiber.Ctx,
24242456
team *models.Team,
24252457
body promoteBody,
24262458
from, to, kind string,
2427-
) error {
2459+
) (*models.PromoteApproval, error) {
24282460
id, err := uuid.Parse(body.ApprovalID)
24292461
if err != nil {
2430-
return respondError(c, fiber.StatusBadRequest, "invalid_approval_id",
2462+
return nil, respondError(c, fiber.StatusBadRequest, "invalid_approval_id",
24312463
"approval_id must be a valid UUID")
24322464
}
24332465
row, err := models.GetPromoteApprovalByID(c.Context(), h.db, id)
24342466
if errors.Is(err, models.ErrPromoteApprovalNotFound) {
2435-
return respondError(c, fiber.StatusNotFound, "approval_not_found",
2467+
return nil, respondError(c, fiber.StatusNotFound, "approval_not_found",
24362468
"approval_id does not match any approval row")
24372469
}
24382470
if err != nil {
24392471
slog.Error("stack.promote.approval_lookup_failed",
24402472
"error", err, "approval_id", id,
24412473
"request_id", middleware.GetRequestID(c))
2442-
return respondError(c, fiber.StatusServiceUnavailable, "lookup_failed",
2474+
return nil, respondError(c, fiber.StatusServiceUnavailable, "lookup_failed",
24432475
"Failed to look up approval")
24442476
}
24452477
if row.TeamID != team.ID {
24462478
// Cross-team — same posture as stack ownership: 404 not 403.
2447-
return respondError(c, fiber.StatusNotFound, "approval_not_found",
2479+
return nil, respondError(c, fiber.StatusNotFound, "approval_not_found",
24482480
"approval_id does not match any approval row for this team")
24492481
}
24502482
if row.Status != models.PromoteApprovalStatusApproved {
2451-
return respondError(c, fiber.StatusConflict, "approval_not_approved",
2483+
return nil, respondError(c, fiber.StatusConflict, "approval_not_approved",
24522484
"approval row is in status="+row.Status+" — must be 'approved' to consume")
24532485
}
24542486
if row.PromoteKind != kind || row.FromEnv != from || row.ToEnv != to {
2455-
return respondError(c, fiber.StatusBadRequest, "approval_mismatch",
2487+
return nil, respondError(c, fiber.StatusBadRequest, "approval_mismatch",
24562488
"approval_id's recorded (kind,from,to) does not match this request")
24572489
}
24582490
if row.ExpiresAt.Before(time.Now().UTC()) {
24592491
// Even approved rows have an outer expiry — once the 24h window
24602492
// has fully passed since the original request we refuse to
24612493
// execute. This is belt-and-suspenders defence; the worker
24622494
// repo's polling job would refuse for the same reason.
2463-
return respondError(c, fiber.StatusGone, "approval_expired",
2495+
return nil, respondError(c, fiber.StatusGone, "approval_expired",
24642496
"approval window has fully expired")
24652497
}
2466-
ok, err := models.MarkPromoteApprovalExecuted(c.Context(), h.db, id)
2498+
return row, nil
2499+
}
2500+
2501+
// markApprovedPromoteExecuted atomically flips a validated approval row to
2502+
// 'executed' and audits the transition. It is called by Promote ONLY after
2503+
// the entire promote preflight has succeeded and immediately before the
2504+
// runStackDeploy launch, so a preflight failure leaves the row 'approved'
2505+
// (retryable). See validateApprovedPromote for the #11 rationale.
2506+
//
2507+
// The CAS inside MarkPromoteApprovalExecuted still guards against a concurrent
2508+
// double-consume: if a second request raced through validate + preflight and
2509+
// flipped the row first, the UPDATE matches 0 rows and we 409 approval_already_executed.
2510+
func (h *StackHandler) markApprovedPromoteExecuted(
2511+
c *fiber.Ctx,
2512+
row *models.PromoteApproval,
2513+
from, to string,
2514+
) error {
2515+
ok, err := models.MarkPromoteApprovalExecuted(c.Context(), h.db, row.ID)
24672516
if err != nil {
24682517
slog.Error("stack.promote.approval_execute_failed",
2469-
"error", err, "approval_id", id,
2518+
"error", err, "approval_id", row.ID,
24702519
"request_id", middleware.GetRequestID(c))
24712520
return respondError(c, fiber.StatusServiceUnavailable, "execute_failed",
24722521
"Failed to mark approval executed")

internal/handlers/stack_final_test.go

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,17 @@ func TestStackFinal_ConsumeApproved_LookupError_503(t *testing.T) {
136136
}
137137

138138
// TestStackFinal_ConsumeApproved_ExecuteError_503 — MarkPromoteApprovalExecuted
139-
// errors after a fully-valid approved row (stack.go:2425). team(1) + stack(2) +
140-
// approval-read(3) succeed; the UPDATE(4) errors. failAfter=3.
139+
// errors on the deferred 'executed' flip, after a fully-valid approved row AND
140+
// after the entire promote preflight succeeds (markApprovedPromoteExecuted,
141+
// stack.go ~2520).
142+
//
143+
// #11 (sweep 2026-06-04): the flip moved from BEFORE preflight to AFTER it, so
144+
// the fault must now land on a LATER DB call. The fresh-target preflight runs
145+
// ~10 reads/writes (source services, family lookup, CreateStackWithCap, vault
146+
// copy, source+target env_vars, vault resolve) between the approval read and
147+
// the MarkPromoteApprovalExecuted UPDATE — failAfter=13 lands the injected
148+
// failure on that UPDATE (verified: 12 → env_load_failed, 13 → execute_failed,
149+
// 14 → success/202).
141150
func TestStackFinal_ConsumeApproved_ExecuteError_503(t *testing.T) {
142151
seedDB, clean := testhelpers.SetupTestDB(t)
143152
defer clean()
@@ -148,7 +157,7 @@ func TestStackFinal_ConsumeApproved_ExecuteError_503(t *testing.T) {
148157
slug, _ := seedPromoteSourceStack(t, seedDB, teamIDStr, "staging", "stkfinal-exec")
149158
id := mustSeedApprovedPromote(t, seedDB, teamID, "staging", "production")
150159

151-
faultDB := openFaultDB(t, 3)
160+
faultDB := openFaultDB(t, 13)
152161
app := stackFaultPromoteApp(t, faultDB)
153162
resp := postPromote(t, app, jwt, slug, map[string]any{
154163
"from": "staging", "to": "production", "approval_id": id,

0 commit comments

Comments
 (0)