@@ -142,6 +142,14 @@ const (
142142 idempotencyFingerprintTTL = 120 * time .Second
143143 // idempotencyKeyMaxLen caps the client-supplied key. Stripe uses 255.
144144 idempotencyKeyMaxLen = 255
145+ // idempotencyInflightTTL bounds an in-flight reservation marker (BugBash
146+ // 2026-06-02 #21). It must exceed the slowest synchronous handler — a
147+ // real provision (provisionDB/Cache/NoSQL runs in-request) can take
148+ // ~10-20s — so a concurrent retry is correctly told "in progress" rather
149+ // than racing past a too-early-expired marker. It is also the worst-case
150+ // self-heal window if the process dies mid-handler before the marker is
151+ // overwritten or deleted.
152+ idempotencyInflightTTL = 60 * time .Second
145153
146154 // X-Idempotency-Source values.
147155 idempotencySourceExplicit = "explicit"
@@ -157,6 +165,16 @@ type idemEntry struct {
157165 Body []byte `json:"b"`
158166 ContentType string `json:"c"`
159167 BodyHash string `json:"h"` // sha256 hex of the original request body
168+ // InFlight marks a reservation placeholder written (via SETNX) the
169+ // moment a cache-miss request begins running the handler, BEFORE the
170+ // real response exists. A concurrent same-key request that reads this
171+ // marker must NOT re-run the handler — it returns 409
172+ // idempotency_key_in_progress instead. The marker is overwritten by the
173+ // real entry when the handler completes (or deleted if the response is
174+ // non-cacheable), and self-expires after idempotencyInflightTTL if the
175+ // process dies mid-handler. Closes the check-then-act double-create race
176+ // (BugBash 2026-06-02 #21).
177+ InFlight bool `json:"f,omitempty"`
160178}
161179
162180// mutableErrorCodes lists machine-readable `error` strings whose 4xx
@@ -324,6 +342,39 @@ func Idempotency(rdb *redis.Client, endpoint string) fiber.Handler {
324342 }
325343}
326344
345+ // respondIdempotencyInProgress returns the 409 a request gets when it finds an
346+ // in-flight reservation marker for its key (BugBash 2026-06-02 #21). The
347+ // envelope mirrors the idempotency_key_conflict 409 (ok/error/message/
348+ // request_id/retry_after_seconds/agent_action) but is RETRYABLE: unlike a
349+ // body-mismatch conflict, the right move is to wait and retry the SAME key —
350+ // the original request's response will be replayed once it completes.
351+ func respondIdempotencyInProgress (c * fiber.Ctx ) error {
352+ return c .Status (fiber .StatusConflict ).JSON (fiber.Map {
353+ "ok" : false ,
354+ "error" : "idempotency_key_in_progress" ,
355+ "message" : "A request with this Idempotency-Key is still being processed." ,
356+ "request_id" : GetRequestID (c ),
357+ "retry_after_seconds" : 2 ,
358+ "agent_action" : "Another request with the same Idempotency-Key is still in flight. Do NOT mint a new key — wait ~2s and retry the SAME key; the original response will be replayed once it completes. See https://instanode.dev/docs/idempotency." ,
359+ })
360+ }
361+
362+ // reserveInflight best-effort places an in-flight marker at cacheKey via SETNX,
363+ // so a concurrent same-key request that arrives after this point reads the
364+ // marker and returns 409 in_progress instead of double-running the handler
365+ // (BugBash 2026-06-02 #21). Fire-and-forget: a SETNX error or a lost sub-
366+ // millisecond race just means the marker isn't placed, and the request falls
367+ // back to the documented best-effort dedup posture (run the handler; the GET
368+ // hit-block still catches the common case where the other request reserved
369+ // first). The marker is overwritten by the real entry on completion, or
370+ // cleared on a non-cacheable response.
371+ func reserveInflight (ctx context.Context , rdb * redis.Client , cacheKey , reqBodyHash string ) {
372+ // Marshal of this fixed-shape struct cannot fail; the marker is best-effort
373+ // regardless, so we ignore the error and let SETNX no-op on a nil payload.
374+ marker , _ := json .Marshal (idemEntry {InFlight : true , BodyHash : reqBodyHash })
375+ _ = rdb .SetNX (ctx , cacheKey , marker , idempotencyInflightTTL ).Err ()
376+ }
377+
327378// idempotencyExplicit handles the Stripe-shape Idempotency-Key path.
328379// Extracted from the main Idempotency wrapper so the fingerprint-fallback
329380// branch can live alongside it without nesting another layer of if-blocks.
@@ -371,6 +422,13 @@ func idempotencyExplicit(c *fiber.Ctx, rdb *redis.Client, endpoint, scope, rawKe
371422 slog .Warn ("idempotency.cache_unmarshal_failed" ,
372423 "error" , jerr , "endpoint" , endpoint )
373424 } else {
425+ if entry .InFlight {
426+ // A concurrent request with this key is still running (it
427+ // placed this reservation marker). Don't run the handler a
428+ // second time (BugBash #21) — tell the caller to retry the
429+ // same key shortly. No rate-limit refund: nothing was served.
430+ return respondIdempotencyInProgress (c )
431+ }
374432 if entry .BodyHash != reqBodyHash {
375433 // 409 is a genuine error response, not a replay — DO NOT
376434 // refund the rate-limit counter here. The agent did the
@@ -407,13 +465,24 @@ func idempotencyExplicit(c *fiber.Ctx, rdb *redis.Client, endpoint, scope, rawKe
407465 }
408466 }
409467
468+ // MISS — reserve the key (SETNX in-flight marker) before running the
469+ // handler so a concurrent same-key request reads the marker and 409s
470+ // instead of double-creating (BugBash #21). The real entry overwrites
471+ // the marker below; non-cacheable paths delete it.
472+ reserveInflight (ctx , rdb , cacheKey , reqBodyHash )
473+
410474 nextErr := c .Next ()
411475 if nextErr != nil && ! IsResponseWrittenErr (nextErr ) {
476+ // Handler failed before writing a response — clear the reservation so
477+ // an immediate retry isn't told "in progress" for the marker's TTL.
478+ rdb .Del (context .Background (), cacheKey )
412479 return nextErr
413480 }
414481
415482 status := c .Response ().StatusCode ()
416483 if status >= 500 {
484+ // 5xx is retryable — never leave the marker stranding the retry.
485+ rdb .Del (context .Background (), cacheKey )
417486 return nextErr
418487 }
419488
@@ -426,6 +495,9 @@ func idempotencyExplicit(c *fiber.Ctx, rdb *redis.Client, endpoint, scope, rawKe
426495 // the agent's 24h Idempotency-Key. Success + stable failures still
427496 // cache as before.
428497 if ! shouldCacheResponse (status , body , ct ) {
498+ // Clear the reservation: a mutable 4xx is meant to be retried once
499+ // the user resolves it, so the marker must not block that retry.
500+ rdb .Del (context .Background (), cacheKey )
429501 slog .Info ("idempotency.skip_cache_mutable_error" ,
430502 "endpoint" , endpoint ,
431503 "status" , status ,
@@ -516,6 +588,12 @@ func idempotencyFingerprint(c *fiber.Ctx, rdb *redis.Client, endpoint, scope str
516588 slog .Warn ("idempotency.fingerprint_cache_unmarshal_failed" ,
517589 "error" , jerr , "endpoint" , endpoint )
518590 // Corrupt — fall through to handler and overwrite below.
591+ } else if entry .InFlight {
592+ // A concurrent same-fingerprint request is still running (BugBash
593+ // #21). Return 409 in_progress rather than double-running the
594+ // handler. The source header still reports the fingerprint path.
595+ c .Set (idempotencySourceHeader , idempotencySourceFingerprint )
596+ return respondIdempotencyInProgress (c )
519597 } else {
520598 // Cache HIT on the body-fingerprint fallback path. Same
521599 // refund semantics as the explicit-key branch: the
@@ -532,15 +610,19 @@ func idempotencyFingerprint(c *fiber.Ctx, rdb *redis.Client, endpoint, scope str
532610 }
533611 }
534612
535- // Miss — run the handler, then cache the response (non-5xx only).
613+ // Miss — reserve the key (BugBash #21), run the handler, then cache the
614+ // response (non-5xx only).
536615 c .Set (idempotencySourceHeader , idempotencySourceMiss )
616+ reserveInflight (ctx , rdb , cacheKey , sha256Hex (canonBody ))
537617 nextErr := c .Next ()
538618 if nextErr != nil && ! IsResponseWrittenErr (nextErr ) {
619+ rdb .Del (context .Background (), cacheKey )
539620 return nextErr
540621 }
541622
542623 status := c .Response ().StatusCode ()
543624 if status >= 500 {
625+ rdb .Del (context .Background (), cacheKey )
544626 return nextErr
545627 }
546628
@@ -553,6 +635,7 @@ func idempotencyFingerprint(c *fiber.Ctx, rdb *redis.Client, endpoint, scope str
553635 // path is the no-header default, so the silent strand is even more
554636 // likely here.
555637 if ! shouldCacheResponse (status , body , ct ) {
638+ rdb .Del (context .Background (), cacheKey )
556639 slog .Info ("idempotency.fingerprint_skip_cache_mutable_error" ,
557640 "endpoint" , endpoint ,
558641 "status" , status ,
0 commit comments