diff --git a/.changeset/chat-agent-tools.md b/.changeset/chat-agent-tools.md
new file mode 100644
index 00000000000..1d44ea2a659
--- /dev/null
+++ b/.changeset/chat-agent-tools.md
@@ -0,0 +1,15 @@
+---
+"@trigger.dev/sdk": patch
+---
+
+Add a `tools` option to `chat.agent`. Declaring your tools here threads them into the SDK's internal `convertToModelMessages`, so each tool's `toModelOutput` is re-applied when prior-turn history is re-converted.
+
+```ts
+chat.agent({
+  tools: { readFile, search },
+  run: async ({ messages, tools, signal }) =>
+    streamText({ model, messages, tools, abortSignal: signal }),
+});
+```
+
+Also exports `InferChatUIMessageFromTools<typeof tools>` to derive the chat `UIMessage` type (typed tool parts) directly from a tool set.
diff --git a/.changeset/coerce-concurrency-key-to-string.md b/.changeset/coerce-concurrency-key-to-string.md
new file mode 100644
index 00000000000..faccf7a48bf
--- /dev/null
+++ b/.changeset/coerce-concurrency-key-to-string.md
@@ -0,0 +1,5 @@
+---
+"@trigger.dev/core": patch
+---
+
+Coerce numeric `concurrencyKey` values to string at the API boundary across `tasks.trigger`, `tasks.batchTrigger`, and the Phase-2 streaming batch endpoint.
diff --git a/.changeset/mollifier-buffer-extensions.md b/.changeset/mollifier-buffer-extensions.md
new file mode 100644
index 00000000000..c2a3b1a0e8e
--- /dev/null
+++ b/.changeset/mollifier-buffer-extensions.md
@@ -0,0 +1,5 @@
+---
+"@trigger.dev/redis-worker": minor
+---
+
+Mollifier buffer extensions: idempotency dedup, an atomic `mutateSnapshot` API, metadata CAS, claim primitives, and a `MollifierSnapshot` type. The buffer's Redis client now reconnects with jittered backoff so a fleet of clients doesn't stampede Redis in lockstep after a blip.
diff --git a/.changeset/mollifier-buffer-pipeline-list-entries.md b/.changeset/mollifier-buffer-pipeline-list-entries.md
new file mode 100644
index 00000000000..2c55d9b18a8
--- /dev/null
+++ b/.changeset/mollifier-buffer-pipeline-list-entries.md
@@ -0,0 +1,5 @@
+---
+"@trigger.dev/redis-worker": patch
+---
+
+Pipeline the per-entry `HGETALL` fetches in `MollifierBuffer.listEntriesForEnv`. The previous serial implementation issued one Redis round-trip per runId returned by `LRANGE`, which dominated stale-sweep wall-time at any meaningful backlog (at the sweep's default maxCount=1000, this is ~1000 RTTs per env per pass). Behaviour is unchanged — entries are still skipped when the entry hash has been torn down by a concurrent drainer ack/fail between the LRANGE and the HGETALL.
diff --git a/.changeset/mollifier-drainer-terminal-failure-callback.md b/.changeset/mollifier-drainer-terminal-failure-callback.md
new file mode 100644
index 00000000000..e0ac3400ff3
--- /dev/null
+++ b/.changeset/mollifier-drainer-terminal-failure-callback.md
@@ -0,0 +1,5 @@
+---
+"@trigger.dev/redis-worker": minor
+---
+
+Add `onTerminalFailure` callback to `MollifierDrainerOptions` so the customer's run lands a SYSTEM_FAILURE PG row even when the drainer exhausts `maxAttempts` on a retryable PG error. Previously, retryable-error exhaustion called `buffer.fail()` directly, which atomically marks FAILED + DELs the entry hash with no PG write — silent data loss when PG was unreachable across the full retry budget. The callback fires before `buffer.fail()` on any terminal path (`cause: "non-retryable"` or `"max-attempts-exhausted"`); throwing a retryable error from the callback causes the drainer to requeue rather than fail.
diff --git a/.changeset/mollifier-tag-cap.md b/.changeset/mollifier-tag-cap.md
new file mode 100644
index 00000000000..b9057664fa7
--- /dev/null
+++ b/.changeset/mollifier-tag-cap.md
@@ -0,0 +1,5 @@
+---
+"@trigger.dev/redis-worker": patch
+---
+
+Mollifier `mutateSnapshot` now enforces a tag cap: an `append_tags` patch carrying `maxTags` returns `"limit_exceeded"` (writing nothing) when the deduped tag count would exceed the limit, so a buffered run can't accumulate more tags via the tags API than the trigger validator allows at creation.
diff --git a/.server-changes/mollifier-dashboard.md b/.server-changes/mollifier-dashboard.md
new file mode 100644
index 00000000000..1aad107063e
--- /dev/null
+++ b/.server-changes/mollifier-dashboard.md
@@ -0,0 +1,6 @@
+---
+area: webapp
+type: feature
+---
+
+Mollifier dashboard surface: run-detail page renders buffered runs via synthetic trace, header, and span shapes; admin-only "Buffered" indicator and drainer LOG event in the trace tree.
diff --git a/.server-changes/mollifier-drainer-replay.md b/.server-changes/mollifier-drainer-replay.md
new file mode 100644
index 00000000000..fb2c9dd37bc
--- /dev/null
+++ b/.server-changes/mollifier-drainer-replay.md
@@ -0,0 +1,6 @@
+---
+area: webapp
+type: feature
+---
+
+Mollifier drainer replay: replay buffered entries into `engine.trigger`, stale-entry sweep, a drainer-health gauge, and run-engine cancelled/failed run APIs. Known limitation: stale-sweep runs per-webapp instance, so stale-entry counter metrics multiply by N webapps in HA until a distributed lease lands as a follow-up.
diff --git a/.server-changes/mollifier-mutations.md b/.server-changes/mollifier-mutations.md
new file mode 100644
index 00000000000..d0d5a969cbc
--- /dev/null
+++ b/.server-changes/mollifier-mutations.md
@@ -0,0 +1,6 @@
+---
+area: webapp
+type: feature
+---
+
+Mollifier API mutations on buffered runs: tag, metadata, replay, reschedule, cancel, and idempotency-key reset via a buffer-snapshot fallback. When a mutation races a mid-drain run, the wait-and-bounce loop watches the buffer entry in Redis (cheap) and reads the primary exactly once for the actual mutation, instead of polling the writer on a fixed cadence; polls use jittered exponential backoff.
diff --git a/.server-changes/mollifier-reads.md b/.server-changes/mollifier-reads.md
new file mode 100644
index 00000000000..320310be1ee
--- /dev/null
+++ b/.server-changes/mollifier-reads.md
@@ -0,0 +1,6 @@
+---
+area: webapp
+type: feature
+---
+
+Mollifier API read-fallback: serve buffered runs from synthetic run/trace/span data on the retrieve, trace, spans, and events endpoints.
diff --git a/.server-changes/mollifier-trigger.md b/.server-changes/mollifier-trigger.md
new file mode 100644
index 00000000000..a289972ef87
--- /dev/null
+++ b/.server-changes/mollifier-trigger.md
@@ -0,0 +1,6 @@
+---
+area: webapp
+type: feature
+---
+
+Mollifier trigger-time decisions: gate `engine.trigger`, mollify bursts into the buffer, claim idempotency keys, and provide a read-fallback for buffered runs.
diff --git a/apps/webapp/app/components/runs/v3/CancelRunDialog.tsx b/apps/webapp/app/components/runs/v3/CancelRunDialog.tsx
index facff746c5e..566bc787daa 100644
--- a/apps/webapp/app/components/runs/v3/CancelRunDialog.tsx
+++ b/apps/webapp/app/components/runs/v3/CancelRunDialog.tsx
@@ -10,9 +10,18 @@ import { SpinnerWhite } from "~/components/primitives/Spinner";
 type CancelRunDialogProps = {
   runFriendlyId: string;
   redirectPath: string;
+  // Fired on submit so the parent can close the Radix Dialog without
+  // wrapping the submit button in `DialogClose` — that wrapper races
+  // submit (close fires first, unmounts the form, and the cancel POST
+  // never lands). Optional so existing call sites still type-check.
+  onCancelSubmitted?: () => void;
 };
 
-export function CancelRunDialog({ runFriendlyId, redirectPath }: CancelRunDialogProps) {
+export function CancelRunDialog({
+  runFriendlyId,
+  redirectPath,
+  onCancelSubmitted,
+}: CancelRunDialogProps) {
   const navigation = useNavigation();
 
   const formAction = `/resources/taskruns/${runFriendlyId}/cancel`;
@@ -27,7 +36,11 @@ export function CancelRunDialog({ runFriendlyId, redirectPath }: CancelRunDialog
         </Paragraph>
         <FormButtons
           confirmButton={
-            <Form action={`/resources/taskruns/${runFriendlyId}/cancel`} method="post">
+            <Form
+              action={`/resources/taskruns/${runFriendlyId}/cancel`}
+              method="post"
+              onSubmit={() => onCancelSubmitted?.()}
+            >
               <Button
                 type="submit"
                 name="redirectUrl"
diff --git a/apps/webapp/app/entry.server.tsx b/apps/webapp/app/entry.server.tsx
index 60c234402d5..9996eb7b30a 100644
--- a/apps/webapp/app/entry.server.tsx
+++ b/apps/webapp/app/entry.server.tsx
@@ -9,6 +9,7 @@ import { renderToPipeableStream } from "react-dom/server";
 import { PassThrough } from "stream";
 import * as Worker from "~/services/worker.server";
 import { initMollifierDrainerWorker } from "~/v3/mollifierDrainerWorker.server";
+import { initMollifierStaleSweepWorker } from "~/v3/mollifierStaleSweepWorker.server";
 import { bootstrap } from "./bootstrap";
 import { LocaleContextProvider } from "./components/primitives/LocaleProvider";
 import {
@@ -228,6 +229,7 @@ Worker.init().catch((error) => {
 });
 
 initMollifierDrainerWorker();
+initMollifierStaleSweepWorker();
 
 bootstrap().catch((error) => {
   logError(error);
diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts
index 97b8333e279..dafd67124b7 100644
--- a/apps/webapp/app/env.server.ts
+++ b/apps/webapp/app/env.server.ts
@@ -1063,13 +1063,16 @@ const EnvironmentSchema = z
     // Separate switch for the drainer (consumer side) so it can be split
     // off onto a dedicated worker service. Unset → inherits
     // TRIGGER_MOLLIFIER_ENABLED, so single-container self-hosters don't have to
-    // flip two switches. In multi-replica deployments, set this to "0"
-    // explicitly on every replica except the one dedicated drainer
-    // service — otherwise every replica's polling loop races for the
-    // same buffer entries. `TRIGGER_MOLLIFIER_ENABLED` is still the master kill
-    // switch; setting this to "1" while `TRIGGER_MOLLIFIER_ENABLED` is "0" is a
-    // no-op because the gate-side singleton refuses to construct a
-    // buffer when the system is off.
+    // flip two switches. Multi-replica drainers are correct — `popAndMarkDraining`
+    // is an atomic ZPOPMIN + status flip in one Lua call, so only one replica
+    // can win any given entry — but inefficient: polling load (SMEMBERS +
+    // per-env scans) multiplies by N, and `TRIGGER_MOLLIFIER_DRAIN_CONCURRENCY`
+    // is per-process so engine load also multiplies. Splitting the drainer
+    // onto a dedicated worker keeps that traffic off the request-serving
+    // replicas. `TRIGGER_MOLLIFIER_ENABLED` is still the master kill switch;
+    // setting this to "1" while `TRIGGER_MOLLIFIER_ENABLED` is "0" is a
+    // no-op because the gate-side singleton refuses to construct a buffer
+    // when the system is off.
     TRIGGER_MOLLIFIER_DRAINER_ENABLED: z.string().default(process.env.TRIGGER_MOLLIFIER_ENABLED ?? "0"),
     TRIGGER_MOLLIFIER_SHADOW_MODE: z.string().default("0"),
     TRIGGER_MOLLIFIER_REDIS_HOST: z
@@ -1095,10 +1098,35 @@ const EnvironmentSchema = z
     TRIGGER_MOLLIFIER_TRIP_THRESHOLD: z.coerce.number().int().positive().default(100),
     TRIGGER_MOLLIFIER_HOLD_MS: z.coerce.number().int().positive().default(500),
     TRIGGER_MOLLIFIER_DRAIN_CONCURRENCY: z.coerce.number().int().positive().default(50),
-    TRIGGER_MOLLIFIER_ENTRY_TTL_S: z.coerce.number().int().positive().default(600),
     TRIGGER_MOLLIFIER_DRAIN_MAX_ATTEMPTS: z.coerce.number().int().positive().default(3),
     TRIGGER_MOLLIFIER_DRAIN_SHUTDOWN_TIMEOUT_MS: z.coerce.number().int().positive().default(30_000),
     TRIGGER_MOLLIFIER_DRAIN_MAX_ORGS_PER_TICK: z.coerce.number().int().positive().default(500),
+    // Periodic sweep that scans buffer queue LISTs for entries whose
+    // dwell exceeds the stale threshold. Independent of the drainer —
+    // its job is exactly to make a stuck/offline drainer visible to
+    // ops. Defaults: explicitly opt-in (a separate kill switch from
+    // the mollifier itself), run every 5 minutes, alert on anything
+    // that's been dwelling for 5+ minutes (matches the sweep interval
+    // — "anything still here when we check" is the simplest threshold
+    // that converges).
+    //
+    // The sweep previously defaulted to inheriting
+    // `TRIGGER_MOLLIFIER_ENABLED`, which meant any deployment already
+    // running with the mollifier on would auto-start the sweep worker
+    // on upgrade — turning on new background load with no explicit
+    // rollout step. Hard-defaulting to "0" preserves the intent of
+    // exposing the sweep as a separate switch.
+    TRIGGER_MOLLIFIER_STALE_SWEEP_ENABLED: z.string().default("0"),
+    TRIGGER_MOLLIFIER_STALE_SWEEP_INTERVAL_MS: z.coerce
+      .number()
+      .int()
+      .positive()
+      .default(5 * 60_000),
+    TRIGGER_MOLLIFIER_STALE_SWEEP_THRESHOLD_MS: z.coerce
+      .number()
+      .int()
+      .positive()
+      .default(5 * 60_000),
 
     BATCH_TRIGGER_PROCESS_JOB_VISIBILITY_TIMEOUT_MS: z.coerce
       .number()
diff --git a/apps/webapp/app/presenters/v3/ApiRetrieveRunPresenter.server.ts b/apps/webapp/app/presenters/v3/ApiRetrieveRunPresenter.server.ts
index a392866afc9..3f102b4f41e 100644
--- a/apps/webapp/app/presenters/v3/ApiRetrieveRunPresenter.server.ts
+++ b/apps/webapp/app/presenters/v3/ApiRetrieveRunPresenter.server.ts
@@ -9,12 +9,17 @@ import {
   logger,
 } from "@trigger.dev/core/v3";
 import { parsePacketAsJson } from "@trigger.dev/core/v3/utils/ioSerialization";
+import { BatchId } from "@trigger.dev/core/v3/isomorphic";
 import { getUserProvidedIdempotencyKey } from "@trigger.dev/core/v3/serverOnly";
 import { Prisma, TaskRunAttemptStatus, TaskRunStatus } from "@trigger.dev/database";
 import assertNever from "assert-never";
 import { API_VERSIONS, CURRENT_API_VERSION, RunStatusUnspecifiedApiVersion } from "~/api/versions";
 import { $replica, prisma } from "~/db.server";
 import { AuthenticatedEnvironment } from "~/services/apiAuth.server";
+import {
+  findRunByIdWithMollifierFallback,
+  type SyntheticRun,
+} from "~/v3/mollifier/readFallback.server";
 import { generatePresignedUrl } from "~/v3/objectStore.server";
 import { tracer } from "~/v3/tracer.server";
 import { startSpanWithEnv } from "~/v3/tracing.server";
@@ -64,13 +69,46 @@ type CommonRelatedRun = Prisma.Result<
   "findFirstOrThrow"
 >;
 
-type FoundRun = NonNullable<Awaited<ReturnType<typeof ApiRetrieveRunPresenter.findRun>>>;
+// Full shape returned by findRun() — the commonRunSelect fields plus the
+// extras the route handler reads. Declared explicitly (not inferred via
+// ReturnType<typeof findRun>) so findRun can return a synthesised buffered
+// run without the type becoming self-referential.
+// Exported so the buffer-synthesis helper below can be unit-tested
+// against a stable shape without re-deriving it (a buffered run must match
+// FoundRun's exact field list for `call()` to handle it without surprises).
+export type FoundRun = CommonRelatedRun & {
+  traceId: string;
+  payload: string;
+  payloadType: string;
+  output: string | null;
+  outputType: string;
+  error: Prisma.JsonValue;
+  attempts: { id: string }[];
+  attemptNumber: number | null;
+  engine: "V1" | "V2";
+  taskEventStore: string;
+  parentTaskRun: CommonRelatedRun | null;
+  rootTaskRun: CommonRelatedRun | null;
+  childRuns: CommonRelatedRun[];
+  // True when this run was synthesised from the mollifier buffer rather
+  // than read from Postgres. Callers that would otherwise query backing
+  // stores keyed on PG identifiers (e.g. ClickHouse event lookups by
+  // traceId) can short-circuit to an empty response — buffered runs
+  // haven't executed and have no events to fetch. The events endpoint
+  // is the motivating case: without this gate it makes a wasted
+  // ClickHouse round-trip for every buffered run, so check this flag
+  // before issuing the query.
+  isBuffered: boolean;
+};
 
 export class ApiRetrieveRunPresenter {
   constructor(private readonly apiVersion: API_VERSIONS) {}
 
-  public static async findRun(friendlyId: string, env: AuthenticatedEnvironment) {
-    return $replica.taskRun.findFirst({
+  public static async findRun(
+    friendlyId: string,
+    env: AuthenticatedEnvironment,
+  ): Promise<FoundRun | null> {
+    const pgRow = await $replica.taskRun.findFirst({
       where: {
         friendlyId,
         runtimeEnvironmentId: env.id,
@@ -102,6 +140,23 @@ export class ApiRetrieveRunPresenter {
         },
       },
     });
+
+    if (pgRow) return { ...pgRow, isBuffered: false };
+
+    // Postgres miss → fall back to the mollifier buffer. When the gate
+    // diverted a trigger, the run lives in Redis until the drainer replays
+    // it through engine.trigger. Synthesise the FoundRun shape so call()
+    // returns a `QUEUED` (or `FAILED`) response with empty output, no
+    // attempts, no relations.
+    const buffered = await findRunByIdWithMollifierFallback({
+      runId: friendlyId,
+      environmentId: env.id,
+      organizationId: env.organizationId,
+    });
+
+    if (!buffered) return null;
+
+    return synthesiseFoundRunFromBuffer(buffered);
   }
 
   public async call(taskRun: FoundRun, env: AuthenticatedEnvironment) {
@@ -475,3 +530,162 @@ function resolveTriggerFunction(run: CommonRelatedRun): TriggerFunction {
     return run.resumeParentOnCompletion ? "triggerAndWait" : "trigger";
   }
 }
+
+// Map a buffered (mollified) run's snapshot status onto the Prisma
+// `TaskRunStatus` used by the synthesised FoundRun: "FAILED" becomes
+// "SYSTEM_FAILURE", "CANCELED" stays "CANCELED", and anything else is
+// "PENDING" — the run is still in the Redis buffer and engine.trigger
+// hasn't created the Postgres row yet, so call() treats it as queued.
+function bufferedStatusToTaskRunStatus(status: SyntheticRun["status"]): TaskRunStatus {
+  switch (status) {
+    case "FAILED":
+      return "SYSTEM_FAILURE";
+    case "CANCELED":
+      return "CANCELED";
+    default:
+      return "PENDING";
+  }
+}
+
+// Return the buffered snapshot's payload as the string FoundRun expects.
+// The PG path stores `TaskRun.payload` as `String?`, so in production the
+// snapshot value is already a string and is returned as-is. Defensively,
+// null/undefined becomes "", any other type is JSON-stringified (matching
+// how the trigger path would serialise it) with a warn log so ops see the
+// format drift, and anything unserialisable falls back to "".
+function synthesisePayload(buffered: SyntheticRun): string {
+  const payload = buffered.payload;
+  if (typeof payload === "string") return payload;
+  if (payload === undefined || payload === null) return "";
+  try {
+    const serialised = JSON.stringify(payload);
+    logger.warn("ApiRetrieveRunPresenter: buffered snapshot.payload non-string coerced", {
+      runFriendlyId: buffered.friendlyId,
+      payloadType: typeof payload,
+    });
+    return typeof serialised === "string" ? serialised : ""; // stringify may yield undefined
+  } catch {
+    logger.error("ApiRetrieveRunPresenter: buffered snapshot.payload unserialisable", {
+      runFriendlyId: buffered.friendlyId,
+      payloadType: typeof payload,
+    });
+    return "";
+  }
+}
+
+// Metadata counterpart of synthesisePayload. The PG path stores
+// `TaskRun.metadata` as `String?`, and the snapshot writes it from
+// `metadataPacket.data` (also a string), so in production it is always a
+// string or absent. Absent (null/undefined) yields null; any other type is
+// JSON-stringified (matching how the trigger path serialises it) with a
+// warn log for ops, and anything unserialisable falls back to null.
+function synthesiseMetadata(buffered: SyntheticRun): string | null {
+  const metadata = buffered.metadata;
+  if (typeof metadata === "string") return metadata;
+  if (metadata === undefined || metadata === null) return null;
+  try {
+    const serialised = JSON.stringify(metadata);
+    logger.warn("ApiRetrieveRunPresenter: buffered snapshot.metadata non-string coerced", {
+      runFriendlyId: buffered.friendlyId,
+      metadataType: typeof metadata,
+    });
+    return typeof serialised === "string" ? serialised : null; // stringify may yield undefined
+  } catch {
+    logger.error("ApiRetrieveRunPresenter: buffered snapshot.metadata unserialisable", {
+      runFriendlyId: buffered.friendlyId,
+      metadataType: typeof metadata,
+    });
+    return null;
+  }
+}
+
+// Build a FoundRun from a buffered (mollified) run; exported for unit
+// testing. `findRun()` uses it when the Postgres lookup misses and the
+// buffer carries the run. Execution-state fields (output, attempts, cost,
+// relations) take defaults; keep the shape in lockstep with `FoundRun`.
+export function synthesiseFoundRunFromBuffer(buffered: SyntheticRun): FoundRun {
+  const status: TaskRunStatus = bufferedStatusToTaskRunStatus(buffered.status);
+
+  const errorJson: Prisma.JsonValue = buffered.error
+    ? {
+        type: "STRING_ERROR",
+        raw: `${buffered.error.code}: ${buffered.error.message}`,
+      }
+    : null;
+
+  const metadata: string | null = synthesiseMetadata(buffered);
+
+  return {
+    // `id` is the internal cuid (Prisma TaskRun.id column), `friendlyId`
+    // is the user-facing `run_xxx` token. Downstream logging keyed off
+    // `taskRun.id` correlates with other systems via the cuid — using
+    // the friendlyId here breaks log correlation. `SyntheticRun` carries
+    // the cuid alongside the friendlyId for exactly this reason
+    // (RunId.fromFriendlyId in readFallback.server.ts).
+    id: buffered.id,
+    friendlyId: buffered.friendlyId,
+    status,
+    taskIdentifier: buffered.taskIdentifier ?? "",
+    createdAt: buffered.createdAt,
+    startedAt: null,
+    updatedAt: buffered.cancelledAt ?? buffered.createdAt,
+    // PG-resident SYSTEM_FAILURE rows always have `completedAt` set by
+    // the engine; the buffer-synth path must match so SDK consumers
+    // that poll on `isCompleted` and then read `finishedAt` see a real
+    // timestamp instead of `undefined`. A run with `buffered.cancelledAt`
+    // set uses that timestamp; a FAILED run falls back to
+    // `buffered.createdAt` (the buffer entry has no separate
+    // "failedAt" — the best-available approximation of when the
+    // terminal state landed is the entry's creation time).
+    completedAt:
+      buffered.cancelledAt ?? (status === "SYSTEM_FAILURE" ? buffered.createdAt : null),
+    expiredAt: null,
+    delayUntil: buffered.delayUntil ?? null,
+    metadata,
+    metadataType: buffered.metadataType ?? "application/json",
+    ttl: buffered.ttl ?? null,
+    costInCents: 0,
+    baseCostInCents: 0,
+    usageDurationMs: 0,
+    idempotencyKey: buffered.idempotencyKey ?? null,
+    idempotencyKeyOptions: buffered.idempotencyKeyOptions ?? null,
+    isTest: buffered.isTest,
+    depth: buffered.depth,
+    // Scheduled triggers go through the same TriggerTaskService path as
+    // API triggers and aren't bypassed by the mollifier gate, so a
+    // scheduled run can land in the buffer with its scheduleId set on the
+    // snapshot. Forward it so resolveSchedule() can hydrate the `schedule`
+    // field in the API response instead of silently dropping it until the
+    // drainer materialises.
+    scheduleId: buffered.scheduleId ?? null,
+    lockedToVersion: buffered.lockedToVersion ? { version: buffered.lockedToVersion } : null,
+    resumeParentOnCompletion: buffered.resumeParentOnCompletion,
+    // Reconstruct the batch from the snapshot's internal id so a buffered
+    // run reports the same `batchId` / triggerFunction as it will once
+    // materialised, and so batch-scoped JWTs authorise against it (the
+    // route authorization callbacks read `run.batch?.friendlyId`).
+    batch: buffered.batchId
+      ? { id: buffered.batchId, friendlyId: BatchId.toFriendlyId(buffered.batchId) }
+      : null,
+    runTags: buffered.tags,
+    traceId: buffered.traceId ?? "",
+    payload: synthesisePayload(buffered),
+    payloadType: buffered.payloadType ?? "application/json",
+    output: null,
+    outputType: "application/json",
+    error: errorJson,
+    attempts: [],
+    attemptNumber: null,
+    engine: "V2",
+    taskEventStore: "taskEvent",
+    // Empty string when absent (matches syntheticSpanRun.server.ts and lets
+    // `createCommonRunStructure`'s `run.workerQueue || undefined` coerce the
+    // API response's `region` to undefined instead of advertising a
+    // misleading "main" region for a not-yet-assigned buffered run).
+    workerQueue: buffered.workerQueue ?? "",
+    parentTaskRun: null,
+    rootTaskRun: null,
+    childRuns: [],
+    isBuffered: true,
+  };
+}
diff --git a/apps/webapp/app/presenters/v3/RunPresenter.server.ts b/apps/webapp/app/presenters/v3/RunPresenter.server.ts
index b4733144907..d965f74a77d 100644
--- a/apps/webapp/app/presenters/v3/RunPresenter.server.ts
+++ b/apps/webapp/app/presenters/v3/RunPresenter.server.ts
@@ -20,6 +20,20 @@ export class RunEnvironmentMismatchError extends Error {
   }
 }
 
+// Thrown by `call()` when the run isn't in PG. The route loader catches
+// this and falls back to the mollifier buffer via `tryMollifiedRunFallback`.
+// Using a typed error (rather than Prisma's `findFirstOrThrow` exception)
+// keeps the buffered case off the PrismaClient error path — that path
+// emits a `PrismaClient error` log every time it fires, which, as the
+// run-detail page polls, becomes per-tick log spam and Sentry noise for
+// any run that legitimately lives in the buffer.
+export class RunNotInPgError extends Error {
+  constructor(public readonly runFriendlyId: string) {
+    super(`Run ${runFriendlyId} not in PG`);
+    this.name = "RunNotInPgError";
+  }
+}
+
 export class RunPresenter {
   #prismaClient: PrismaClient;
 
@@ -42,7 +56,13 @@ export class RunPresenter {
     showDeletedLogs: boolean;
     showDebug: boolean;
   }) {
-    const run = await this.#prismaClient.taskRun.findFirstOrThrow({
+    // `findFirst` + explicit null check (not `findFirstOrThrow`) because
+    // a missing PG row is the *expected* path for buffered runs — the
+    // route catches `RunNotInPgError` and falls back to the synthesised
+    // buffer view. `findFirstOrThrow` would log a `PrismaClient error`
+    // every tick of the page poll, masking real DB issues with synthetic
+    // not-found noise.
+    const run = await this.#prismaClient.taskRun.findFirst({
       select: {
         id: true,
         createdAt: true,
@@ -106,6 +126,10 @@ export class RunPresenter {
       },
     });
 
+    if (!run) {
+      throw new RunNotInPgError(runFriendlyId);
+    }
+
     if (environmentSlug !== run.runtimeEnvironment.slug) {
       throw new RunEnvironmentMismatchError(
         `Run ${runFriendlyId} is not in environment ${environmentSlug}`
diff --git a/apps/webapp/app/presenters/v3/RunStreamPresenter.server.ts b/apps/webapp/app/presenters/v3/RunStreamPresenter.server.ts
index 69560c49e88..3a01f8f4397 100644
--- a/apps/webapp/app/presenters/v3/RunStreamPresenter.server.ts
+++ b/apps/webapp/app/presenters/v3/RunStreamPresenter.server.ts
@@ -3,6 +3,8 @@ import { logger } from "~/services/logger.server";
 import { singleton } from "~/utils/singleton";
 import { ABORT_REASON_SEND_ERROR, createSSELoader, SendFunction } from "~/utils/sse";
 import { throttle } from "~/utils/throttle";
+import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server";
+import { deserialiseMollifierSnapshot } from "~/v3/mollifier/mollifierSnapshot.server";
 import { tracePubSub } from "~/v3/services/tracePubSub.server";
 
 const PING_INTERVAL = 5_000;
@@ -37,17 +39,48 @@ export class RunStreamPresenter {
           },
         });
 
-        if (!run) {
+        // Fall back to the mollifier buffer when the run isn't in PG yet.
+        // The buffered run has no execution events to stream, but we still
+        // attach a trace-pubsub subscription using the snapshot's traceId
+        // so that the moment the drainer materialises the row and execution
+        // begins, those events flow to this open SSE connection. Closing
+        // with 404 would force the dashboard to keep retrying.
+        let traceId: string | null = run?.traceId ?? null;
+        if (!traceId) {
+          const buffer = getMollifierBuffer();
+          if (buffer) {
+            try {
+              const entry = await buffer.getEntry(runFriendlyId);
+              if (entry) {
+                // Go through the webapp wrapper so this read-side module
+                // shares a single deserialisation path with readFallback —
+                // see the contract comment in syntheticRedirectInfo.server.ts.
+                const snapshot = deserialiseMollifierSnapshot(entry.payload);
+                if (typeof snapshot.traceId === "string") {
+                  traceId = snapshot.traceId;
+                }
+              }
+            } catch (err) {
+              logger.warn("RunStreamPresenter buffer fallback failed", {
+                runFriendlyId,
+                err: err instanceof Error ? err.message : String(err),
+              });
+            }
+          }
+        }
+
+        if (!traceId) {
           throw new Response("Not found", { status: 404 });
         }
+        const resolvedRun = { traceId };
 
         logger.info("RunStreamPresenter.start", {
           runFriendlyId,
-          traceId: run.traceId,
+          traceId: resolvedRun.traceId,
         });
 
         // Subscribe to trace updates
-        const { unsubscribe, eventEmitter } = await tracePubSub.subscribeToTrace(run.traceId);
+        const { unsubscribe, eventEmitter } = await tracePubSub.subscribeToTrace(resolvedRun.traceId);
 
         // Only send max every 1 second
         const throttledSend = throttle(
@@ -105,7 +138,7 @@ export class RunStreamPresenter {
           cleanup: () => {
             logger.info("RunStreamPresenter.cleanup", {
               runFriendlyId,
-              traceId: run.traceId,
+              traceId: resolvedRun.traceId,
             });
 
             // Remove message listener
@@ -119,13 +152,13 @@ export class RunStreamPresenter {
               .then(() => {
                 logger.info("RunStreamPresenter.cleanup.unsubscribe succeeded", {
                   runFriendlyId,
-                  traceId: run.traceId,
+                  traceId: resolvedRun.traceId,
                 });
               })
               .catch((error) => {
                 logger.error("RunStreamPresenter.cleanup.unsubscribe failed", {
                   runFriendlyId,
-                  traceId: run.traceId,
+                  traceId: resolvedRun.traceId,
                   error: {
                     name: error.name,
                     message: error.message,
diff --git a/apps/webapp/app/presenters/v3/SpanPresenter.server.ts b/apps/webapp/app/presenters/v3/SpanPresenter.server.ts
index 61334ba96e3..47ae27bd17c 100644
--- a/apps/webapp/app/presenters/v3/SpanPresenter.server.ts
+++ b/apps/webapp/app/presenters/v3/SpanPresenter.server.ts
@@ -32,6 +32,8 @@ import {
   extractAIEmbedData,
 } from "~/components/runs/v3/ai";
 import { getEventRepositoryForStore } from "~/v3/eventRepository/index.server";
+import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server";
+import { buildSyntheticSpanRun } from "~/v3/mollifier/syntheticSpanRun.server";
 
 export type PromptSpanData = {
   slug: string;
@@ -72,9 +74,21 @@ function extractPromptSpanData(properties: Record<string, unknown>): PromptSpanD
   };
 }
 
+// SpanRun is grounded in the PG-path `getRun` method rather than
+// inferred from `call`'s return type. The buffered branch of `call`
+// routes through `buildSyntheticSpanRun`, and that helper is annotated
+// `Promise<SpanRun>` — deriving SpanRun from `call` would make the alias
+// depend on itself, which TS rejects ("Type alias 'Result' circularly
+// references itself"). `getRun` is the canonical source for the shape
+// (the synthetic helper just rebuilds the same shape from a buffer
+// snapshot), and it doesn't recurse, so grounding here breaks the
+// cycle while keeping Span available off `call` (Span's path through
+// `#getSpan` has no synthetic indirection).
+export type SpanRun = NonNullable<
+  Awaited<ReturnType<InstanceType<typeof SpanPresenter>["getRun"]>>
+>;
 type Result = Awaited<ReturnType<SpanPresenter["call"]>>;
 export type Span = NonNullable<NonNullable<Result>["span"]>;
-export type SpanRun = NonNullable<NonNullable<Result>["run"]>;
 type FindRunResult = NonNullable<
   Awaited<ReturnType<InstanceType<typeof SpanPresenter>["findRun"]>>
 >;
@@ -84,12 +98,18 @@ export class SpanPresenter extends BasePresenter {
   public async call({
     userId,
     projectSlug,
+    envSlug,
     spanId,
     runFriendlyId,
     linkedRunId,
   }: {
     userId: string;
     projectSlug: string;
+    // Optional for backwards compatibility, required for the mollifier
+    // buffer fallback when the parent run isn't yet in PG — we need to
+    // resolve the env id to satisfy `findRunByIdWithMollifierFallback`'s
+    // auth check.
+    envSlug?: string;
     spanId: string;
     runFriendlyId: string;
     linkedRunId?: string;
@@ -127,7 +147,32 @@ export class SpanPresenter extends BasePresenter {
     });
 
     if (!parentRun) {
-      return;
+      // PG miss → fall back to the mollifier buffer. Without this the
+      // right-side span detail panel on the run-detail page never
+      // resolves for buffered runs: `call()` returns undefined, the
+      // resource route redirects with an "Event not found" toast, the
+      // run-detail page reloads, the toast fires again — a perpetual
+      // spin until the drainer materialises the row. Synthesise a
+      // SpanRun straight from the buffer snapshot, reusing
+      // `buildSyntheticSpanRun` (the same helper the run-detail
+      // loader's header fallback already uses).
+      if (!envSlug) return;
+      const envRow = await this._replica.runtimeEnvironment.findFirst({
+        where: { project: { id: project.id }, slug: envSlug },
+        select: { id: true, slug: true, type: true, organizationId: true },
+      });
+      if (!envRow) return;
+      const buffered = await findRunByIdWithMollifierFallback({
+        runId: runFriendlyId,
+        environmentId: envRow.id,
+        organizationId: envRow.organizationId,
+      });
+      if (!buffered) return;
+      const synth = await buildSyntheticSpanRun({
+        run: buffered,
+        environment: { id: envRow.id, slug: envRow.slug, type: envRow.type },
+      });
+      return { type: "run" as const, run: synth };
     }
 
     const { traceId } = parentRun;
@@ -373,6 +418,7 @@ export class SpanPresenter extends BasePresenter {
       traceId: run.traceId,
       spanId: run.spanId,
       isCached: !!linkedRunId,
+      isBuffered: false,
       machinePreset: machine?.name,
       taskEventStore: run.taskEventStore,
       externalTraceId,
diff --git a/apps/webapp/app/routes/@.runs.$runParam.ts b/apps/webapp/app/routes/@.runs.$runParam.ts
index a52600628d8..a709191271e 100644
--- a/apps/webapp/app/routes/@.runs.$runParam.ts
+++ b/apps/webapp/app/routes/@.runs.$runParam.ts
@@ -3,7 +3,8 @@ import { z } from "zod";
 import { prisma } from "~/db.server";
 import { redirectWithErrorMessage } from "~/models/message.server";
 import { requireUser } from "~/services/session.server";
-import { impersonate, rootPath, v3RunPath } from "~/utils/pathBuilder";
+import { impersonate, rootPath, v3RunPath, v3RunSpanPath } from "~/utils/pathBuilder";
+import { findBufferedRunRedirectInfo } from "~/v3/mollifier/syntheticRedirectInfo.server";
 
 const ParamsSchema = z.object({
   runParam: z.string(),
@@ -32,6 +33,7 @@ export async function loader({ params, request }: LoaderFunctionArgs) {
       friendlyId: runParam,
     },
     select: {
+      spanId: true,
       runtimeEnvironment: {
         select: {
           slug: true,
@@ -51,16 +53,45 @@ export async function loader({ params, request }: LoaderFunctionArgs) {
   });
 
   if (!run) {
+    // Admin impersonation route — bypass org membership so admins can
+    // open any buffered run by friendlyId, mirroring the existing PG
+    // behaviour above (no membership filter on the find).
+    const buffered = await findBufferedRunRedirectInfo({
+      runFriendlyId: runParam,
+      userId: user.id,
+      skipOrgMembershipCheck: true,
+    });
+    if (buffered) {
+      // Preselect the root span so the run-detail trace tree opens with
+      // the buffered run's span highlighted, matching the sibling
+      // redirect routes (runs.$runParam.ts, projects.v3.$projectRef…).
+      const path = buffered.spanId
+        ? v3RunSpanPath(
+            { slug: buffered.organizationSlug },
+            { slug: buffered.projectSlug },
+            { slug: buffered.environmentSlug },
+            { friendlyId: runParam },
+            { spanId: buffered.spanId }
+          )
+        : v3RunPath(
+            { slug: buffered.organizationSlug },
+            { slug: buffered.projectSlug },
+            { slug: buffered.environmentSlug },
+            { friendlyId: runParam }
+          );
+      return redirect(impersonate(path));
+    }
     return redirectWithErrorMessage(rootPath(), request, "Run doesn't exist", {
       ephemeral: false,
     });
   }
 
-  const path = v3RunPath(
+  const path = v3RunSpanPath(
     { slug: run.project.organization.slug },
     { slug: run.project.slug },
     { slug: run.runtimeEnvironment.slug },
-    { friendlyId: runParam }
+    { friendlyId: runParam },
+    { spanId: run.spanId }
   );
 
   return redirect(impersonate(path));
diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam/route.tsx
index d55511e7ff5..fbe4b9046c6 100644
--- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam/route.tsx
+++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam/route.tsx
@@ -88,10 +88,18 @@ import { useReplaceSearchParams } from "~/hooks/useReplaceSearchParams";
 import { useSearchParams } from "~/hooks/useSearchParam";
 import { type Shortcut, useShortcutKeys } from "~/hooks/useShortcutKeys";
 import { useHasAdminAccess } from "~/hooks/useUser";
+import { env } from "~/env.server";
 import { findProjectBySlug } from "~/models/project.server";
 import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server";
 import { NextRunListPresenter } from "~/presenters/v3/NextRunListPresenter.server";
-import { RunEnvironmentMismatchError, RunPresenter } from "~/presenters/v3/RunPresenter.server";
+import {
+  RunEnvironmentMismatchError,
+  RunNotInPgError,
+  RunPresenter,
+} from "~/presenters/v3/RunPresenter.server";
+import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server";
+import { buildSyntheticRunHeader } from "~/v3/mollifier/syntheticRunHeader.server";
+import { buildSyntheticTraceForBufferedRun } from "~/v3/mollifier/syntheticTrace.server";
 import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server";
 import { getImpersonationId } from "~/services/impersonation.server";
 import { logger } from "~/services/logger.server";
@@ -277,9 +285,78 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => {
       );
     }
 
+    // Only fall back to the mollifier buffer on a genuine PG miss. Any
+    // other error (DB timeout during trace queries, event-repository
+    // failure, etc.) means the run WAS in PG but a downstream lookup
+    // failed — falling back to the buffer here would either return a
+    // stale synth entry if one happens to exist in the brief drainer-
+    // materialisation race window, or quietly mask the real failure.
+    // `RunNotInPgError` is the typed signal RunPresenter throws for the
+    // route loader's specific case (`RunPresenter.server.ts:130`).
+    if (!(error instanceof RunNotInPgError)) {
+      throw error;
+    }
+
+    // PG miss → try the mollifier buffer. When the gate diverts a trigger
+    // the run sits in Redis until the drainer materialises it; without
+    // this fallback the run-detail page 404s for the brief buffered window
+    // even though the API has accepted the trigger and returned an id.
+    const buffered = await tryMollifiedRunFallback({
+      runFriendlyId: runParam,
+      organizationSlug,
+      projectSlug: projectParam,
+      envSlug: envParam,
+      userId,
+    });
+
+    if (buffered) {
+      // Preselect the root span on the initial page load when the URL
+      // doesn't already carry `?span=`. The sibling redirect routes
+      // (runs.$runParam.ts, @.runs.$runParam.ts,
+      // projects.v3.$projectRef.runs.$runParam.ts) all do this, but
+      // direct navigation to the canonical project-scoped URL never
+      // hit those redirects — leaving the right detail panel collapsed.
+      // Skip on `_data` requests (Remix data fetches): they're
+      // client-driven follow-ups and the client URL is what matters,
+      // not the loader's view of it.
+      if (
+        !url.searchParams.has("span") &&
+        !url.searchParams.has("_data") &&
+        buffered.run.spanId
+      ) {
+        url.searchParams.set("span", buffered.run.spanId);
+        throw redirect(url.pathname + "?" + url.searchParams.toString());
+      }
+
+      const parent = await getResizableSnapshot(request, resizableSettings.parent.autosaveId);
+      const tree = await getResizableSnapshot(request, resizableSettings.tree.autosaveId);
+
+      return json({
+        run: buffered.run,
+        trace: buffered.trace,
+        maximumLiveReloadingSetting: env.MAXIMUM_LIVE_RELOADING_EVENTS,
+        resizable: { parent, tree },
+        runsList: null,
+      });
+    }
+
     throw error;
   }
 
+  // Preselect the root span on the initial page load when the URL
+  // doesn't already carry `?span=`. See the comment on the equivalent
+  // block in the buffered fallback above — the sibling redirect routes
+  // do this, but direct navigation to the canonical project-scoped URL
+  // never hits them, leaving the right detail panel collapsed.
+  if (
+    !url.searchParams.has("span") &&
+    !url.searchParams.has("_data") &&
+    result.run.spanId
+  ) {
+    url.searchParams.set("span", result.run.spanId);
+    throw redirect(url.pathname + "?" + url.searchParams.toString());
+  }
+
   //resizable settings
   const parent = await getResizableSnapshot(request, resizableSettings.parent.autosaveId);
   const tree = await getResizableSnapshot(request, resizableSettings.tree.autosaveId);
@@ -305,6 +382,39 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => {
   });
 };
 
+async function tryMollifiedRunFallback(args: {
+  runFriendlyId: string;
+  organizationSlug: string;
+  projectSlug: string;
+  envSlug: string;
+  userId: string;
+}) {
+  const project = await findProjectBySlug(args.organizationSlug, args.projectSlug, args.userId);
+  if (!project) return null;
+  const environment = await findEnvironmentBySlug(project.id, args.envSlug, args.userId);
+  if (!environment) return null;
+
+  const buffered = await findRunByIdWithMollifierFallback({
+    runId: args.runFriendlyId,
+    environmentId: environment.id,
+    organizationId: project.organizationId,
+  });
+  if (!buffered) return null;
+
+  return {
+    run: buildSyntheticRunHeader({
+      run: buffered,
+      environment: {
+        id: environment.id,
+        organizationId: project.organizationId,
+        type: environment.type,
+        slug: environment.slug,
+      },
+    }),
+    trace: buildSyntheticTraceForBufferedRun(buffered),
+  };
+}
+
 type LoaderData = SerializeFrom<typeof loader>;
 
 export default function Page() {
@@ -407,23 +517,17 @@ export default function Page() {
             />
           </Dialog>
           {run.isFinished ? null : (
-            <Dialog key={`cancel-${run.friendlyId}`}>
-              <DialogTrigger asChild>
-                <Button variant="danger/small" LeadingIcon={StopCircleIcon} shortcut={{ key: "C" }}>
-                  Cancel run…
-                </Button>
-              </DialogTrigger>
-              <CancelRunDialog
-                runFriendlyId={run.friendlyId}
-                redirectPath={v3RunSpanPath(
-                  organization,
-                  project,
-                  environment,
-                  { friendlyId: run.friendlyId },
-                  { spanId: run.spanId }
-                )}
-              />
-            </Dialog>
+            <ControlledCancelRunDialog
+              key={`cancel-${run.friendlyId}`}
+              runFriendlyId={run.friendlyId}
+              redirectPath={v3RunSpanPath(
+                organization,
+                project,
+                environment,
+                { friendlyId: run.friendlyId },
+                { spanId: run.spanId }
+              )}
+            />
           )}
         </PageAccessories>
       </NavBar>
@@ -587,6 +691,35 @@ function TraceView({
   );
 }
 
+// Controlled wrapper around the cancel dialog. Owns the Radix open state
+// so the dialog closes itself once the cancel action transitions through
+// submission. We can't `<DialogClose asChild>`-wrap the submit button
+// because Radix's onClick handler swallows the button's name=value pair
+// that the form action depends on for `redirectUrl`.
+function ControlledCancelRunDialog({
+  runFriendlyId,
+  redirectPath,
+}: {
+  runFriendlyId: string;
+  redirectPath: string;
+}) {
+  const [open, setOpen] = useState(false);
+  return (
+    <Dialog open={open} onOpenChange={setOpen}>
+      <DialogTrigger asChild>
+        <Button variant="danger/small" LeadingIcon={StopCircleIcon} shortcut={{ key: "C" }}>
+          Cancel run…
+        </Button>
+      </DialogTrigger>
+      <CancelRunDialog
+        runFriendlyId={runFriendlyId}
+        redirectPath={redirectPath}
+        onCancelSubmitted={() => setOpen(false)}
+      />
+    </Dialog>
+  );
+}
+
 function NoLogsView({ run, resizable }: Pick<LoaderData, "run" | "resizable">) {
   const plan = useCurrentPlan();
   const organization = useOrganization();
@@ -616,6 +749,11 @@ function NoLogsView({ run, resizable }: Pick<LoaderData, "run" | "resizable">) {
         >
           <div className="grid h-full place-items-center">
             {daysSinceCompleted === undefined ? (
+              // NoLogsView only renders when the loader returns no trace.
+              // Buffered runs always carry a synthetic trace (see
+              // buildSyntheticTraceForBufferedRun) so they never reach
+              // this branch — the message here is the pre-mollifier
+              // copy for runs with no completedAt and no logs.
               <InfoPanel variant="info" icon={InformationCircleIcon} title="We delete old logs">
                 <Paragraph variant="small">
                   We tidy up older logs to keep things running smoothly.
diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.events.ts b/apps/webapp/app/routes/api.v1.runs.$runId.events.ts
index bfa3cab971b..42468f67604 100644
--- a/apps/webapp/app/routes/api.v1.runs.$runId.events.ts
+++ b/apps/webapp/app/routes/api.v1.runs.$runId.events.ts
@@ -38,6 +38,16 @@ export const loader = createLoaderApiRoute(
     },
   },
   async ({ resource: run, authentication }) => {
+    // Short-circuit for mollifier-buffered runs. The drainer hasn't
+    // materialised execution events yet (the gate intercepts before
+    // any trace event is written), so a ClickHouse round-trip is
+    // guaranteed to come back empty. `findRun` now sets `isBuffered`
+    // explicitly on its return value — gate on that rather than
+    // probing surrogate fields like `traceId === ""`.
+    if (run.isBuffered) {
+      return json({ events: [] }, { status: 200 });
+    }
+
     const eventRepository = await getEventRepositoryForStore(
       run.taskEventStore,
       authentication.environment.organization.id
diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts
index f27a9c13f98..f9c815f6ef1 100644
--- a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts
+++ b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts
@@ -1,15 +1,161 @@
+import type { LoaderFunctionArgs } from "@remix-run/server-runtime";
 import { json } from "@remix-run/server-runtime";
 import { tryCatch } from "@trigger.dev/core/utils";
+import type { RunMetadataChangeOperation } from "@trigger.dev/core/v3/schemas";
 import { UpdateMetadataRequestBody } from "@trigger.dev/core/v3";
 import { z } from "zod";
+import { $replica } from "~/db.server";
+// Aliased to avoid shadowing the local `env: AuthenticatedEnvironment`
+// parameter the route handler and `routeOperationsToRun` use.
+import { env as appEnv } from "~/env.server";
+import type { AuthenticatedEnvironment } from "~/services/apiAuth.server";
+import { authenticateApiRequest } from "~/services/apiAuth.server";
+import { logger } from "~/services/logger.server";
 import { updateMetadataService } from "~/services/metadata/updateMetadataInstance.server";
 import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server";
 import { ServiceValidationError } from "~/v3/services/common.server";
+import { applyMetadataMutationToBufferedRun } from "~/v3/mollifier/applyMetadataMutation.server";
+import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server";
 
 const ParamsSchema = z.object({
   runId: z.string(),
 });
 
+// GET handler added to fix the pre-existing route bug where this URL
+// returned a Remix "no loader" 400 — only PUT (update) was exported, so
+// GET had no handler. Returns `{ metadata, metadataType }` from either
+// the Postgres row or the mollifier buffer snapshot.
+export async function loader({ request, params }: LoaderFunctionArgs) {
+  const authenticationResult = await authenticateApiRequest(request);
+  if (!authenticationResult) {
+    return json({ error: "Invalid or Missing API Key" }, { status: 401 });
+  }
+
+  const parsed = ParamsSchema.safeParse(params);
+  if (!parsed.success) {
+    return json({ error: "Invalid or missing run ID" }, { status: 400 });
+  }
+
+  const env = authenticationResult.environment;
+
+  const pgRun = await $replica.taskRun.findFirst({
+    where: { friendlyId: parsed.data.runId, runtimeEnvironmentId: env.id },
+    select: { metadata: true, metadataType: true },
+  });
+  if (pgRun) {
+    return json({ metadata: pgRun.metadata, metadataType: pgRun.metadataType }, { status: 200 });
+  }
+
+  const buffered = await findRunByIdWithMollifierFallback({
+    runId: parsed.data.runId,
+    environmentId: env.id,
+    organizationId: env.organizationId,
+  });
+  if (buffered) {
+    return json(
+      {
+        metadata: buffered.metadata ?? null,
+        metadataType: buffered.metadataType ?? "application/json",
+      },
+      { status: 200 }
+    );
+  }
+
+  return json({ error: "Run not found" }, { status: 404 });
+}
+
+// Route parent/root operations to the existing PG service by directly
+// invoking it against the parent/root runId. The service ingests via
+// its batching worker, which targets PG by id. If the service throws
+// or finds no PG row, and the target id is a `run_` friendlyId, we
+// fall back to `applyMetadataMutationToBufferedRun` for that run.
+// The body passed to the service is synthetic: the operations are
+// promoted to top-level `operations` so they apply to `targetRunId`.
+// Exported so the silent-failure logging behaviour can be unit-tested.
+// The route handler itself isn't an attractive test target (createActionApiRoute
+// wraps it in auth + body parsing + error-handler middleware), but the
+// fan-out helper carries the load-bearing logic — including the ops-
+// visibility branch this change adds.
+export async function routeOperationsToRun(
+  targetRunId: string | undefined,
+  operations: RunMetadataChangeOperation[] | undefined,
+  env: AuthenticatedEnvironment
+): Promise<void> {
+  if (!targetRunId || !operations || operations.length === 0) return;
+
+  // Try PG first via the existing service (this is how parent/root
+  // operations have always landed; preserve that). Accepts the full
+  // AuthenticatedEnvironment so we don't have to recover the unsafe
+  // `as unknown` cast that the previous narrowed `{ id, organizationId }`
+  // signature forced on us.
+  //
+  // Two non-success outcomes from `call`:
+  //   * throws — PG threw (e.g. "Cannot update metadata for a completed
+  //     run", or a transient PG outage).
+  //   * resolves with undefined — PG row didn't exist (the target may be
+  //     buffered, not yet materialised).
+  // Either way we want to try the buffer fallback below; treating the
+  // undefined-return as success would make the fallback unreachable.
+  const [error, result] = await tryCatch(
+    updateMetadataService.call(targetRunId, { operations }, env)
+  );
+  if (!error && result !== undefined) return;
+
+  if (error) {
+    // PG threw — auxiliary op, stay best-effort and don't surface this
+    // to the caller (the caller's primary mutation already landed). But
+    // warn so a genuine PG outage on these ops isn't invisible.
+    logger.warn("metadata route: parent/root PG op failed", {
+      targetRunId,
+      error: error instanceof Error ? error.message : String(error),
+    });
+  }
+
+  // Buffer fallback only makes sense for friendlyId-keyed entries. The
+  // PG-side parent/root IDs are internal cuids; the buffer keys entries
+  // by friendlyId, so passing the internal id would silently no-op.
+  // Skip explicitly — a buffered child's parent is always materialised
+  // in PG already (a buffered run hasn't executed, so it can't have
+  // triggered the child), so the buffered-parent branch isn't actually
+  // reachable. Treating the no-op as intentional rather than incidental.
+  if (!targetRunId.startsWith("run_")) return;
+
+  // Best-effort buffer fallback. Wrap so a transient Redis throw on
+  // this auxiliary op can't 500 the request after the primary mutation
+  // already succeeded.
+  const [bufferError, bufferOutcome] = await tryCatch(
+    applyMetadataMutationToBufferedRun({
+      runId: targetRunId,
+      environmentId: env.id,
+      organizationId: env.organizationId,
+      maximumSize: appEnv.TASK_RUN_METADATA_MAXIMUM_SIZE,
+      body: { operations },
+    })
+  );
+  if (bufferError) {
+    logger.warn("metadata route: buffer fallback for parent/root op failed", {
+      targetRunId,
+      error: bufferError instanceof Error ? bufferError.message : String(bufferError),
+    });
+    return;
+  }
+  // `applyMetadataMutationToBufferedRun` reports non-throw failures via
+  // its returned outcome kind: `not_found`, `busy`, `version_exhausted`,
+  // `metadata_too_large`. Without inspecting `.kind`, the parent/root
+  // operation can silently disappear — no PG row landed it (handled
+  // above) and the buffer rejected it for one of these reasons but the
+  // helper returned cleanly. Surface a warn log per non-success branch
+  // so ops can trace why a parent/root op went missing. The customer's
+  // primary mutation has already succeeded by this point; this remains
+  // best-effort, so we still don't bubble these to the response.
+  if (bufferOutcome && bufferOutcome.kind !== "applied") {
+    logger.warn("metadata route: parent/root buffer op did not apply", {
+      targetRunId,
+      kind: bufferOutcome.kind,
+    });
+  }
+}
+
 const { action } = createActionApiRoute(
   {
     params: ParamsSchema,
@@ -18,23 +164,104 @@ const { action } = createActionApiRoute(
     method: "PUT",
   },
   async ({ authentication, body, params }) => {
-    const [error, result] = await tryCatch(
-      updateMetadataService.call(params.runId, body, authentication.environment)
-    );
+    const env = authentication.environment;
+    const runId = params.runId;
 
-    if (error) {
-      if (error instanceof ServiceValidationError) {
-        return json({ error: error.message }, { status: error.status ?? 422 });
+    // PG-canonical path. If the run is in PG, the existing service
+    // owns the full request shape including parent/root operations,
+    // metadataVersion CAS, batching, validation — none of which the
+    // buffer side needs to reimplement.
+    const [pgError, pgResult] = await tryCatch(
+      updateMetadataService.call(runId, body, env)
+    );
+    if (pgError) {
+      if (pgError instanceof ServiceValidationError) {
+        return json({ error: pgError.message }, { status: pgError.status ?? 422 });
       }
-
       return json({ error: "Internal Server Error" }, { status: 500 });
     }
+    if (pgResult) {
+      return json(pgResult, { status: 200 });
+    }
 
-    if (!result) {
+    // PG miss. Target run is either buffered or genuinely absent.
+    const bufferOutcome = await applyMetadataMutationToBufferedRun({
+      runId,
+      environmentId: env.id,
+      organizationId: env.organizationId,
+      maximumSize: appEnv.TASK_RUN_METADATA_MAXIMUM_SIZE,
+      body: { metadata: body.metadata, operations: body.operations },
+    });
+
+    if (bufferOutcome.kind === "not_found") {
       return json({ error: "Task Run not found" }, { status: 404 });
     }
+    if (bufferOutcome.kind === "metadata_too_large") {
+      // Mirror PG's `MetadataTooLargeError` (413).
+      return json(
+        {
+          error: `Metadata exceeds maximum size of ${bufferOutcome.maximumSize} bytes`,
+        },
+        { status: 413 }
+      );
+    }
+    if (bufferOutcome.kind === "busy") {
+      // Entry is materialising. The ideal fix would be to retry the PG
+      // call — the row may be visible now — but we don't spend that
+      // extra roundtrip here. A non-retryable error would be customer-
+      // visible breakage for legitimately bursty workloads, so hand
+      // back 503 with a retry hint; SDK retry policy converges.
+      return json({ error: "Run materialising, retry shortly" }, { status: 503 });
+    }
+    if (bufferOutcome.kind === "version_exhausted") {
+      // Pathological contention — many concurrent metadata writers on
+      // the same buffered runId. Surface as 503 rather than silently
+      // dropping the request.
+      return json({ error: "Metadata write contention; retry shortly" }, { status: 503 });
+    }
+
+    // Buffered metadata mutation succeeded. Fan parent/root operations
+    // out to their respective runs (parent/root are typically PG-
+    // materialised by the time the child is buffered, so the existing
+    // service handles them; if they're also buffered, the helper
+    // falls back to the buffered mutation path).
+    //
+    // Use the parent/root friendlyIds the buffered mutation captured
+    // during its internal read — NOT a second `findRunByIdWithMollifierFallback`
+    // call here. The drainer's terminal-failure path DELetes the entry
+    // hash atomically, so if it fires between the primary mutation
+    // landing and our route's second read, `bufferedEntry` would come
+    // back null and the route would silently drop `parentOperations` /
+    // `rootOperations` after the customer's primary mutation already
+    // landed on the snapshot. Capturing the ids in the helper's first
+    // CAS read closes that race.
+    //
+    // Self-fallback to `runId` matches PG semantics: the PG service
+    // routes to `taskRun.parentTaskRun?.id ?? taskRun.id` and
+    // `taskRun.rootTaskRun?.id ?? taskRun.id`, so a top-level run's
+    // parent/root ops land on itself rather than being silently
+    // dropped.
+    await Promise.all([
+      routeOperationsToRun(
+        bufferOutcome.parentTaskRunFriendlyId ?? runId,
+        body.parentOperations,
+        env,
+      ),
+      routeOperationsToRun(
+        bufferOutcome.rootTaskRunFriendlyId ?? runId,
+        body.rootOperations,
+        env,
+      ),
+    ]);
 
-    return json(result, { status: 200 });
+    // Wire-shape parity with the PG branch. `UpdateMetadataService.call`
+    // returns `{ metadata: <object> }` (see `updateMetadata.server.ts:356-358`),
+    // sourced from `applyResults.newMetadata` / `parsePacket(metadataPacket)`
+    // — both parsed `Record<string, unknown>`. `bufferOutcome.newMetadata`
+    // is typed identically (`applyMetadataMutation.server.ts:27`). SDK
+    // consumers see the same response shape regardless of which branch
+    // serves the request.
+    return json({ metadata: bufferOutcome.newMetadata }, { status: 200 });
   }
 );
 
diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts b/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts
index be0d12087b6..a5250e5b850 100644
--- a/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts
+++ b/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts
@@ -9,33 +9,69 @@ import {
 } from "~/services/routeBuilders/apiBuilder.server";
 import { getEventRepositoryForStore } from "~/v3/eventRepository/index.server";
 import { getTaskEventStoreTableForRun } from "~/v3/taskEventStore.server";
+import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server";
+import { buildSyntheticSpanDetailBody } from "~/v3/mollifier/syntheticApiResponses.server";
 
 const ParamsSchema = z.object({
   runId: z.string(),
   spanId: z.string(),
 });
 
+// Resolve the run from either Postgres or the mollifier buffer.
+// Buffered runs only have one valid spanId (the queued span recorded at
+// gate time and reused as the run's root spanId when the drainer
+// materialises). Any other spanId returns a deterministic 404; the queued
+// span returns a minimal synthesised shape so the customer's SDK sees the
+// same 200 contract they'd get for a freshly-triggered run.
+type ResolvedRun =
+  | { source: "pg"; run: Awaited<ReturnType<typeof findPgRun>> & {} }
+  | { source: "buffer"; run: NonNullable<Awaited<ReturnType<typeof findRunByIdWithMollifierFallback>>> };
+
+async function findPgRun(runId: string, environmentId: string) {
+  return $replica.taskRun.findFirst({
+    where: { friendlyId: runId, runtimeEnvironmentId: environmentId },
+  });
+}
+
 export const loader = createLoaderApiRoute(
   {
     params: ParamsSchema,
     allowJWT: true,
     corsStrategy: "all",
-    findResource: (params, auth) => {
-      return $replica.taskRun.findFirst({
-        where: {
-          friendlyId: params.runId,
-          runtimeEnvironmentId: auth.environment.id,
-        },
+    findResource: async (params, auth): Promise<ResolvedRun | null> => {
+      const pgRun = await findPgRun(params.runId, auth.environment.id);
+      if (pgRun) return { source: "pg", run: pgRun };
+
+      const buffered = await findRunByIdWithMollifierFallback({
+        runId: params.runId,
+        environmentId: auth.environment.id,
+        organizationId: auth.environment.organizationId,
       });
+      if (buffered) return { source: "buffer", run: buffered };
+
+      return null;
     },
     shouldRetryNotFound: true,
     authorization: {
       action: "read",
-      resource: (run) => {
+      resource: (resolved) => {
+        if (resolved.source === "pg") {
+          const run = resolved.run;
+          const resources = [
+            { type: "runs", id: run.friendlyId },
+            { type: "tasks", id: run.taskIdentifier },
+            ...run.runTags.map((tag) => ({ type: "tags", id: tag })),
+          ];
+          if (run.batchId) {
+            resources.push({ type: "batch", id: BatchId.toFriendlyId(run.batchId) });
+          }
+          return anyResource(resources);
+        }
+        const run = resolved.run;
         const resources = [
           { type: "runs", id: run.friendlyId },
-          { type: "tasks", id: run.taskIdentifier },
-          ...run.runTags.map((tag) => ({ type: "tags", id: tag })),
+          ...(run.taskIdentifier ? [{ type: "tasks", id: run.taskIdentifier }] : []),
+          ...run.tags.map((tag) => ({ type: "tags", id: tag })),
         ];
         if (run.batchId) {
           resources.push({ type: "batch", id: BatchId.toFriendlyId(run.batchId) });
@@ -44,7 +80,20 @@ export const loader = createLoaderApiRoute(
       },
     },
   },
-  async ({ params, resource: run, authentication }) => {
+  async ({ params, resource: resolved, authentication }) => {
+    if (resolved.source === "buffer") {
+      // Buffered runs have exactly one valid spanId — the queued span the
+      // mollifier gate recorded at trigger time, which becomes the run's
+      // root spanId once the drainer materialises. Any other spanId is a
+      // deterministic 404. The matching spanId returns a minimal shape
+      // representing "span exists, no execution data yet."
+      if (resolved.run.spanId !== params.spanId) {
+        return json({ error: "Span not found" }, { status: 404 });
+      }
+      return json(buildSyntheticSpanDetailBody(resolved.run), { status: 200 });
+    }
+
+    const run = resolved.run;
     const eventRepository = await getEventRepositoryForStore(
       run.taskEventStore,
       authentication.environment.organization.id
diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts
index eae94375b9f..ef7f3180bf3 100644
--- a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts
+++ b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts
@@ -1,22 +1,39 @@
 import { type ActionFunctionArgs, json } from "@remix-run/server-runtime";
 import { AddTagsRequestBody } from "@trigger.dev/core/v3";
+import type { BufferEntry } from "@trigger.dev/redis-worker";
 import { z } from "zod";
 import { prisma } from "~/db.server";
 import { MAX_TAGS_PER_RUN } from "~/models/taskRunTag.server";
 import { authenticateApiRequest } from "~/services/apiAuth.server";
+import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server";
 import { logger } from "~/services/logger.server";
+import { mutateWithFallback } from "~/v3/mollifier/mutateWithFallback.server";
+
+// Pull the existing tags out of a buffer entry's serialised payload so
+// the buffer-path response can dedup against them, matching the
+// PG-path's `newTags.length` count rather than the pre-dedup input
+// count. Returns null on any parse failure / shape mismatch so the
+// caller can fall back gracefully.
+function parseSnapshotTags(entry: BufferEntry | null): string[] | null {
+  if (!entry) return null;
+  try {
+    const snapshot = JSON.parse(entry.payload) as { tags?: unknown };
+    if (!Array.isArray(snapshot.tags)) return null;
+    return snapshot.tags.filter((t): t is string => typeof t === "string");
+  } catch {
+    return null;
+  }
+}
 
 const ParamsSchema = z.object({
   runId: z.string(),
 });
 
 export async function action({ request, params }: ActionFunctionArgs) {
-  // Ensure this is a POST request
   if (request.method.toUpperCase() !== "POST") {
     return { status: 405, body: "Method Not Allowed" };
   }
 
-  // Authenticate the request
   const authenticationResult = await authenticateApiRequest(request);
   if (!authenticationResult) {
     return json({ error: "Invalid or Missing API Key" }, { status: 401 });
@@ -32,59 +49,89 @@ export async function action({ request, params }: ActionFunctionArgs) {
 
   try {
     const anyBody = await request.json();
-
     const body = AddTagsRequestBody.safeParse(anyBody);
     if (!body.success) {
       return json({ error: "Invalid request body", issues: body.error.issues }, { status: 400 });
     }
-
-    const run = await prisma.taskRun.findFirst({
-      where: {
-        friendlyId: parsedParams.data.runId,
-        runtimeEnvironmentId: authenticationResult.environment.id,
-      },
-      select: {
-        runTags: true,
-      },
-    });
-
-    const existingTags = run?.runTags ?? [];
-
-    //remove duplicate tags from the new tags
     const bodyTags = typeof body.data.tags === "string" ? [body.data.tags] : body.data.tags;
-    const newTags = bodyTags.filter((tag) => {
-      if (tag.trim().length === 0) return false;
-      return !existingTags.includes(tag);
-    });
-
-    if (existingTags.length + newTags.length > MAX_TAGS_PER_RUN) {
-      return json(
-        {
-          error: `Runs can only have ${MAX_TAGS_PER_RUN} tags, you're trying to set ${
-            existingTags.length + newTags.length
-          }. These tags have not been set: ${newTags.map((t) => `'${t}'`).join(", ")}.`,
-        },
-        { status: 422 }
-      );
-    }
+    const nonEmptyTags = bodyTags.filter((t) => t.trim().length > 0);
 
-    if (newTags.length === 0) {
+    if (nonEmptyTags.length === 0) {
       return json({ message: "No new tags to add" }, { status: 200 });
     }
 
-    await prisma.taskRun.update({
-      where: {
-        friendlyId: parsedParams.data.runId,
-        runtimeEnvironmentId: authenticationResult.environment.id,
+    const env = authenticationResult.environment;
+    const outcome = await mutateWithFallback<Response>({
+      runId: parsedParams.data.runId,
+      environmentId: env.id,
+      organizationId: env.organizationId,
+      bufferPatch: { type: "append_tags", tags: nonEmptyTags, maxTags: MAX_TAGS_PER_RUN },
+      pgMutation: async (taskRun) => {
+        const existing = taskRun.runTags ?? [];
+        const newTags = nonEmptyTags.filter((t) => !existing.includes(t));
+
+        if (existing.length + newTags.length > MAX_TAGS_PER_RUN) {
+          return json(
+            {
+              error: `Runs can only have ${MAX_TAGS_PER_RUN} tags, you're trying to set ${
+                existing.length + newTags.length
+              }. These tags have not been set: ${newTags.map((t) => `'${t}'`).join(", ")}.`,
+            },
+            { status: 422 }
+          );
+        }
+        if (newTags.length === 0) {
+          return json({ message: "No new tags to add" }, { status: 200 });
+        }
+        await prisma.taskRun.update({
+          where: {
+            id: taskRun.id,
+            runtimeEnvironmentId: env.id,
+          },
+          data: { runTags: { push: newTags } },
+        });
+        return json({ message: `Successfully set ${newTags.length} new tags.` }, { status: 200 });
       },
-      data: {
-        runTags: {
-          push: newTags,
-        },
+      // Buffer-applied patch path. The mutateSnapshot Lua deduplicates
+      // against existing snapshot tags atomically and enforces
+      // MAX_TAGS_PER_RUN via the `maxTags` we pass in `bufferPatch` —
+      // matching the PG-path cap above so a buffered run can't exceed the
+      // limit the trigger validator applies at creation.
+      //
+      // Dedup the success-count off the pre-mutation entry (already
+      // fetched by mutateWithFallback's env-auth pre-check, so no extra
+      // Redis read) so the message reports the same `newTags.length` the
+      // PG path reports — not the pre-dedup request count, which would
+      // give an inconsistent number across the buffered/materialised
+      // boundary for the same input.
+      synthesisedResponse: ({ bufferEntry }) => {
+        const existing = parseSnapshotTags(bufferEntry);
+        const newTagsCount = existing
+          ? nonEmptyTags.filter((t) => !existing.includes(t)).length
+          : nonEmptyTags.length;
+        return json(
+          { message: `Successfully set ${newTagsCount} new tags.` },
+          { status: 200 }
+        );
       },
+      // Buffer rejected the append because it would exceed the cap. We
+      // don't know the exact deduped overflow count here (the Lua does),
+      // so report the limit rather than a precise "trying to set N".
+      rejectedResponse: () =>
+        json(
+          { error: `Runs can only have ${MAX_TAGS_PER_RUN} tags.` },
+          { status: 422 }
+        ),
+      abortSignal: getRequestAbortSignal(),
     });
 
-    return json({ message: `Successfully set ${newTags.length} new tags.` }, { status: 200 });
+    if (outcome.kind === "not_found") {
+      return json({ error: "Run not found" }, { status: 404 });
+    }
+    if (outcome.kind === "timed_out") {
+      return json({ error: "Run materialisation timed out" }, { status: 503 });
+    }
+    return outcome.response;
   } catch (error) {
     logger.error("Failed to add run tags", { error });
     return json({ error: "Something went wrong, please try again." }, { status: 500 });
diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.trace.ts b/apps/webapp/app/routes/api.v1.runs.$runId.trace.ts
index 77e6a4df043..04ae398194f 100644
--- a/apps/webapp/app/routes/api.v1.runs.$runId.trace.ts
+++ b/apps/webapp/app/routes/api.v1.runs.$runId.trace.ts
@@ -8,32 +8,68 @@ import {
 } from "~/services/routeBuilders/apiBuilder.server";
 import { getEventRepositoryForStore } from "~/v3/eventRepository/index.server";
 import { getTaskEventStoreTableForRun } from "~/v3/taskEventStore.server";
+import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server";
+import { buildSyntheticTraceBody } from "~/v3/mollifier/syntheticApiResponses.server";
 
 const ParamsSchema = z.object({
   runId: z.string(), // This is the run friendly ID
 });
 
+// Discriminator on the resolved resource — `pg` is the real Prisma TaskRun
+// row, `buffer` is a synthesised shape from the mollifier buffer for runs
+// whose drainer hasn't yet materialised them. The handler renders an empty
+// trace for buffered runs so the customer sees the same 200 shape they'd
+// get for a freshly-triggered PG run with no spans yet (matches the
+// pass-through control case in scripts/mollifier-api-parity.sh).
+type ResolvedRun =
+  | { source: "pg"; run: Awaited<ReturnType<typeof findPgRun>> & {} }
+  | { source: "buffer"; run: NonNullable<Awaited<ReturnType<typeof findRunByIdWithMollifierFallback>>> };
+
+async function findPgRun(runId: string, environmentId: string) {
+  return $replica.taskRun.findFirst({
+    where: { friendlyId: runId, runtimeEnvironmentId: environmentId },
+  });
+}
+
 export const loader = createLoaderApiRoute(
   {
     params: ParamsSchema,
     allowJWT: true,
     corsStrategy: "all",
-    findResource: (params, auth) => {
-      return $replica.taskRun.findFirst({
-        where: {
-          friendlyId: params.runId,
-          runtimeEnvironmentId: auth.environment.id,
-        },
+    findResource: async (params, auth): Promise<ResolvedRun | null> => {
+      const pgRun = await findPgRun(params.runId, auth.environment.id);
+      if (pgRun) return { source: "pg", run: pgRun };
+
+      const buffered = await findRunByIdWithMollifierFallback({
+        runId: params.runId,
+        environmentId: auth.environment.id,
+        organizationId: auth.environment.organizationId,
       });
+      if (buffered) return { source: "buffer", run: buffered };
+
+      return null;
     },
     shouldRetryNotFound: true,
     authorization: {
       action: "read",
-      resource: (run) => {
+      resource: (resolved) => {
+        if (resolved.source === "pg") {
+          const run = resolved.run;
+          const resources = [
+            { type: "runs", id: run.friendlyId },
+            { type: "tasks", id: run.taskIdentifier },
+            ...run.runTags.map((tag) => ({ type: "tags", id: tag })),
+          ];
+          if (run.batchId) {
+            resources.push({ type: "batch", id: BatchId.toFriendlyId(run.batchId) });
+          }
+          return anyResource(resources);
+        }
+        const run = resolved.run;
         const resources = [
           { type: "runs", id: run.friendlyId },
-          { type: "tasks", id: run.taskIdentifier },
-          ...run.runTags.map((tag) => ({ type: "tags", id: tag })),
+          ...(run.taskIdentifier ? [{ type: "tasks", id: run.taskIdentifier }] : []),
+          ...run.tags.map((tag) => ({ type: "tags", id: tag })),
         ];
         if (run.batchId) {
           resources.push({ type: "batch", id: BatchId.toFriendlyId(run.batchId) });
@@ -42,7 +78,17 @@ export const loader = createLoaderApiRoute(
       },
     },
   },
-  async ({ resource: run, authentication }) => {
+  async ({ resource: resolved, authentication }) => {
+    if (resolved.source === "buffer") {
+      // Buffered runs have no events ingested yet — the drainer hasn't
+      // materialised the PG row and the worker hasn't started executing.
+      // The helper synthesises a single root span that satisfies the SDK's
+      // RetrieveRunTraceResponseBody schema (rootSpan is non-nullable) and
+      // reflects the buffered terminal state.
+      return json(buildSyntheticTraceBody(resolved.run), { status: 200 });
+    }
+
+    const run = resolved.run;
     const eventRepository = await getEventRepositoryForStore(
       run.taskEventStore,
       authentication.environment.organization.id
diff --git a/apps/webapp/app/routes/api.v1.runs.$runParam.replay.ts b/apps/webapp/app/routes/api.v1.runs.$runParam.replay.ts
index 72ad202467d..4bb5922997f 100644
--- a/apps/webapp/app/routes/api.v1.runs.$runParam.replay.ts
+++ b/apps/webapp/app/routes/api.v1.runs.$runParam.replay.ts
@@ -1,10 +1,12 @@
 import type { ActionFunctionArgs } from "@remix-run/server-runtime";
 import { json } from "@remix-run/server-runtime";
+import type { TaskRun } from "@trigger.dev/database";
 import { z } from "zod";
 import { prisma } from "~/db.server";
 import { authenticateApiRequest } from "~/services/apiAuth.server";
 import { logger } from "~/services/logger.server";
 import { ReplayTaskRunService } from "~/v3/services/replayTaskRun.server";
+import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server";
 import { sanitizeTriggerSource } from "~/utils/triggerSource";
 
 const ParamsSchema = z.object({
@@ -12,6 +14,39 @@ const ParamsSchema = z.object({
   runParam: z.string(),
 });
 
+// Subset of TaskRun fields that ReplayTaskRunService.call actually
+// reads from `existingTaskRun`. Validate the buffered fallback against
+// this before casting to TaskRun so a buffer-format drift surfaces as a
+// 404 here rather than as a silent NaN/undefined deep inside
+// replay. The full TaskRun type has many more fields the service never
+// touches; we only assert the ones it reads.
+const BufferedReplayInputSchema = z.object({
+  id: z.string(),
+  friendlyId: z.string(),
+  runtimeEnvironmentId: z.string(),
+  taskIdentifier: z.string(),
+  payload: z.string(),
+  payloadType: z.string(),
+  queue: z.string(),
+  isTest: z.boolean(),
+  traceId: z.string(),
+  spanId: z.string(),
+  engine: z.string(),
+  runTags: z.array(z.string()),
+  // Nullable / optional fields the service tolerates via `??` fallbacks.
+  concurrencyKey: z.string().nullable().optional(),
+  workerQueue: z.string().nullable().optional(),
+  machinePreset: z.string().nullable().optional(),
+  realtimeStreamsVersion: z.string().nullable().optional(),
+  // ReplayTaskRunService.getExistingMetadata reads these to preserve
+  // the original run's metadata on replay. Without them in the schema
+  // they'd be dropped by Zod's default unknown-key stripping, and
+  // a buffered-source replay would silently lose metadata that a
+  // PG-source replay carries over.
+  seedMetadata: z.string().nullable().optional(),
+  seedMetadataType: z.string().nullable().optional(),
+});
+
 export async function action({ request, params }: ActionFunctionArgs) {
   // Ensure this is a POST request
   if (request.method.toUpperCase() !== "POST") {
@@ -32,12 +67,57 @@ export async function action({ request, params }: ActionFunctionArgs) {
   const { runParam } = parsed.data;
 
   try {
-    const taskRun = await prisma.taskRun.findUnique({
+    const env = authenticationResult.environment;
+    // PG-first. Per the audit, replay works on runs in any status, so
+    // the previous lookup filtered on friendlyId only; findFirst with
+    // env scoping tightens it minimally without changing behaviour for
+    // a correctly-authed caller.
+    let taskRun: TaskRun | null = await prisma.taskRun.findFirst({
       where: {
         friendlyId: runParam,
+        runtimeEnvironmentId: env.id,
       },
     });
 
+    if (!taskRun) {
+      // Buffered fallback. SyntheticRun carries every field
+      // ReplayTaskRunService reads from a TaskRun. Validate the subset of
+      // fields the service consumes (BufferedReplayInputSchema above)
+      // before casting; a schema mismatch surfaces as a 404 here rather
+      // than as a silent undefined deep inside the service.
+      const buffered = await findRunByIdWithMollifierFallback({
+        runId: runParam,
+        environmentId: env.id,
+        organizationId: env.organizationId,
+      });
+      if (buffered) {
+        const parsed = BufferedReplayInputSchema.safeParse(buffered);
+        if (parsed.success) {
+          // Manual sync point: `BufferedReplayInputSchema` covers only
+          // the subset of `TaskRun` fields `ReplayTaskRunService.call`
+          // currently reads from `existingTaskRun`. The cast is `as
+          // unknown as TaskRun` because the full `TaskRun` type carries
+          // ~40 fields the service never touches; mirroring all of them
+          // on a synthetic snapshot would be misleading. If a future
+          // change to `ReplayTaskRunService` reads an additional
+          // `existingTaskRun` field, **add it to the schema above** —
+          // otherwise the buffered path will silently feed the service
+          // `undefined` for that field while the PG-source replay
+          // works. The `safeParse` + warn-log + 404 below is the
+          // run-time fail-safe; this comment is the design fail-safe.
+          taskRun = parsed.data as unknown as TaskRun;
+        } else {
+          logger.warn("replay: buffered fallback failed schema validation", {
+            runParam,
+            issues: parsed.error.issues.map((issue) => ({
+              path: issue.path.join("."),
+              code: issue.code,
+            })),
+          });
+        }
+      }
+    }
+
     if (!taskRun) {
       return json({ error: "Run not found" }, { status: 404 });
     }
diff --git a/apps/webapp/app/routes/api.v1.runs.$runParam.reschedule.ts b/apps/webapp/app/routes/api.v1.runs.$runParam.reschedule.ts
index 0ac8aec8351..cbdd9807d8b 100644
--- a/apps/webapp/app/routes/api.v1.runs.$runParam.reschedule.ts
+++ b/apps/webapp/app/routes/api.v1.runs.$runParam.reschedule.ts
@@ -3,90 +3,162 @@ import { json } from "@remix-run/server-runtime";
 import { RescheduleRunRequestBody } from "@trigger.dev/core/v3/schemas";
 import { z } from "zod";
 import { getApiVersion } from "~/api/versions";
-import { prisma } from "~/db.server";
 import { ApiRetrieveRunPresenter } from "~/presenters/v3/ApiRetrieveRunPresenter.server";
 import { authenticateApiRequest } from "~/services/apiAuth.server";
+import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server";
 import { logger } from "~/services/logger.server";
 import { ServiceValidationError } from "~/v3/services/baseService.server";
 import { RescheduleTaskRunService } from "~/v3/services/rescheduleTaskRun.server";
+import { mutateWithFallback } from "~/v3/mollifier/mutateWithFallback.server";
+import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server";
+import { parseDelay } from "~/utils/delays";
 
 const ParamsSchema = z.object({
   runParam: z.string(),
 });
 
 export async function action({ request, params }: ActionFunctionArgs) {
-  // Ensure this is a POST request
   if (request.method.toUpperCase() !== "POST") {
     return { status: 405, body: "Method Not Allowed" };
   }
 
-  // Authenticate the request
   const authenticationResult = await authenticateApiRequest(request);
-
   if (!authenticationResult) {
     return json({ error: "Invalid or missing API Key" }, { status: 401 });
   }
 
   const parsed = ParamsSchema.safeParse(params);
-
   if (!parsed.success) {
     return json({ error: "Invalid or missing run ID" }, { status: 400 });
   }
 
-  const { runParam } = parsed.data;
-
-  const taskRun = await prisma.taskRun.findUnique({
-    where: {
-      friendlyId: runParam,
-      runtimeEnvironmentId: authenticationResult.environment.id,
-    },
-  });
-
-  if (!taskRun) {
-    return json({ error: "Run not found" }, { status: 404 });
-  }
-
   const anyBody = await request.json();
-
   const body = RescheduleRunRequestBody.safeParse(anyBody);
-
   if (!body.success) {
     return json({ error: "Invalid request body" }, { status: 400 });
   }
 
-  const service = new RescheduleTaskRunService();
+  const env = authenticationResult.environment;
+  // Pre-resolve the absolute Date the buffer snapshot should encode.
+  // RescheduleTaskRunService expects this to be present on the body for
+  // its PG-side flow; for the buffer-side patch we encode the same
+  // wall-clock value so the drainer's engine.trigger sees the intended
+  // delayUntil after materialisation.
+  //
+  // Wire-compat: pre-PR the validation lived inside
+  // `RescheduleTaskRunService.call` (rescheduleTaskRun.server.ts:14-18),
+  // which throws `ServiceValidationError("Invalid delay: …")`. The
+  // route's catch block below converts that to status **400** (not
+  // 422 — `ServiceValidationError` defaults to 422 but this route's
+  // catch block has always returned 400). Mirror that 400 + message
+  // shape here so SDK consumers keying retry/classification logic on
+  // 400 see no behavioural drift now that the parse is hoisted to the
+  // route layer.
+  const delayUntil = await parseDelay(body.data.delay);
+  if (!delayUntil) {
+    return json({ error: `Invalid delay: ${body.data.delay}` }, { status: 400 });
+  }
 
   try {
-    const updatedRun = await service.call(taskRun, body.data);
-
-    if (!updatedRun) {
-      return json({ error: "An unknown error occurred" }, { status: 500 });
+    // PG-side `RescheduleTaskRunService.call` requires
+    // `taskRun.status === "DELAYED"` and 422s otherwise — without an
+    // equivalent guard the buffer path would happily inject a
+    // `delayUntil` into the snapshot of a non-delayed buffered run, and
+    // the drainer would materialise it with an unintended delay. The
+    // SyntheticRun type doesn't carry a "DELAYED" enum value because
+    // it's not a terminal status the trace API needs to express; the
+    // buffered analogue is `delayUntil` set in the snapshot. Gate on
+    // that.
+    //
+    // Only apply the guard when the buffer entry is NOT yet
+    // materialised. Post-materialise the entry sticks around for a
+    // 30s grace TTL with `materialised: true`, but the PG row is now
+    // canonical — its DELAYED state may differ from what the snapshot
+    // recorded at trigger time (e.g. a prior reschedule via the PG
+    // path, or a delay set by the engine through another mechanism).
+    // Reading from the stale snapshot would 422 a legitimately-DELAYED
+    // PG row. When `materialised` we let `mutateWithFallback` route to
+    // PG, which runs its own canonical DELAYED check.
+    const buffer = getMollifierBuffer();
+    const entry = buffer ? await buffer.getEntry(parsed.data.runParam) : null;
+    const isLiveBuffered =
+      entry !== null &&
+      entry.materialised !== true &&
+      entry.envId === env.id &&
+      entry.orgId === env.organizationId;
+    if (isLiveBuffered) {
+      const snapshot = JSON.parse(entry.payload) as Record<string, unknown>;
+      const snapshotDelayUntil =
+        typeof snapshot.delayUntil === "string" ? snapshot.delayUntil : undefined;
+      if (!snapshotDelayUntil) {
+        return json(
+          { error: "Cannot reschedule a run that is not delayed" },
+          { status: 422 },
+        );
+      }
     }
 
-    const run = await ApiRetrieveRunPresenter.findRun(
-      updatedRun.friendlyId,
-      authenticationResult.environment
-    );
-
-    if (!run) {
+    const outcome = await mutateWithFallback<Response>({
+      runId: parsed.data.runParam,
+      environmentId: env.id,
+      organizationId: env.organizationId,
+      bufferPatch: {
+        type: "set_delay",
+        delayUntil: delayUntil.toISOString(),
+      },
+      pgMutation: async (taskRun) => {
+        const service = new RescheduleTaskRunService();
+        const updatedRun = await service.call(taskRun, body.data);
+        if (!updatedRun) {
+          return json({ error: "An unknown error occurred" }, { status: 500 });
+        }
+
+        const run = await ApiRetrieveRunPresenter.findRun(updatedRun.friendlyId, env);
+        if (!run) {
+          return json({ error: "Run not found" }, { status: 404 });
+        }
+        const apiVersion = getApiVersion(request);
+        const presenter = new ApiRetrieveRunPresenter(apiVersion);
+        const result = await presenter.call(run, env);
+        if (!result) {
+          return json({ error: "Run not found" }, { status: 404 });
+        }
+        return json(result);
+      },
+      // Buffered snapshot has been patched. Run it through the same
+      // ApiRetrieveRunPresenter the PG branch uses (it falls back to
+      // the buffer for the SyntheticRun lookup) so the response shape
+      // matches `RetrieveRunResponse` — that's what the SDK's
+      // `rescheduleRun` zod-validates against. Returning a stripped
+      // `{ id, delayUntil }` object fails the SDK schema on every
+      // existing SDK version.
+      synthesisedResponse: async () => {
+        const run = await ApiRetrieveRunPresenter.findRun(parsed.data.runParam, env);
+        if (!run) {
+          return json({ error: "Run not found" }, { status: 404 });
+        }
+        const apiVersion = getApiVersion(request);
+        const presenter = new ApiRetrieveRunPresenter(apiVersion);
+        const result = await presenter.call(run, env);
+        if (!result) {
+          return json({ error: "Run not found" }, { status: 404 });
+        }
+        return json(result);
+      },
+      abortSignal: getRequestAbortSignal(),
+    });
+
+    if (outcome.kind === "not_found") {
       return json({ error: "Run not found" }, { status: 404 });
     }
-
-    const apiVersion = getApiVersion(request);
-
-    const presenter = new ApiRetrieveRunPresenter(apiVersion);
-    const result = await presenter.call(run, authenticationResult.environment);
-
-    if (!result) {
-      return json({ error: "Run not found" }, { status: 404 });
+    if (outcome.kind === "timed_out") {
+      return json({ error: "Run materialisation timed out" }, { status: 503 });
     }
-
-    return json(result);
+    return outcome.response;
   } catch (error) {
     if (error instanceof ServiceValidationError) {
       return json({ error: error.message }, { status: 400 });
     }
-
     logger.error("Failed to reschedule run", { error });
     return json({ error: "Something went wrong, please try again." }, { status: 500 });
   }
diff --git a/apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts b/apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts
index 8206a90f320..1f8a42af08c 100644
--- a/apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts
+++ b/apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts
@@ -134,7 +134,20 @@ const { action, loader } = createActionApiRoute(
         return json({ error: "Task not found" }, { status: 404 });
       }
 
-      await saveRequestIdempotency(requestIdempotencyKey, "trigger", result.run.id);
+      // Skip request-idempotency caching when the gate diverted to the
+      // mollifier buffer. `result.run.id` is a synthesised cuid with no
+      // corresponding PG row, so a lost-response SDK retry that reaches
+      // `handleRequestIdempotency` would look up that id, miss in PG, and
+      // fall through to a fresh trigger — producing a duplicate buffer
+      // entry for triggers without a task-level idempotency key (the
+      // task-level path still dedupes via the buffer's SETNX in
+      // `findBufferedRunWithIdempotency`). Accepting the retry-as-fresh-
+      // trigger semantics here is bounded by the drainer's eventual
+      // materialisation: once the run lands in PG, normal request-
+      // idempotency from that point forward works as usual.
+      if (!result.isMollified) {
+        await saveRequestIdempotency(requestIdempotencyKey, "trigger", result.run.id);
+      }
 
       const $responseHeaders = await responseHeaders(result.run, authentication);
 
diff --git a/apps/webapp/app/routes/api.v2.runs.$runParam.cancel.ts b/apps/webapp/app/routes/api.v2.runs.$runParam.cancel.ts
index a636ca0cc1d..f02b058b272 100644
--- a/apps/webapp/app/routes/api.v2.runs.$runParam.cancel.ts
+++ b/apps/webapp/app/routes/api.v2.runs.$runParam.cancel.ts
@@ -1,8 +1,13 @@
 import { json } from "@remix-run/server-runtime";
 import { z } from "zod";
-import { $replica } from "~/db.server";
 import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server";
+import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server";
 import { CancelTaskRunService } from "~/v3/services/cancelTaskRun.server";
+import { mutateWithFallback } from "~/v3/mollifier/mutateWithFallback.server";
+import {
+  resolveRunForMutation,
+  type ResolvedRunForMutation,
+} from "~/v3/mollifier/resolveRunForMutation.server";
 
 const ParamsSchema = z.object({
   runParam: z.string(),
@@ -17,29 +22,55 @@ const { action } = createActionApiRoute(
       action: "write",
       resource: (params) => ({ type: "runs", id: params.runParam }),
     },
-    findResource: async (params, auth) => {
-      return $replica.taskRun.findFirst({
-        where: {
-          friendlyId: params.runParam,
-          runtimeEnvironmentId: auth.environment.id,
-        },
-      });
-    },
+    // PG-or-buffer resolver. Returning null here would 404 BEFORE the
+    // action runs (`apiBuilder.server.ts:321`), so buffered cancels need
+    // a buffer check at this layer too. Logic lives in a helper so the
+    // three paths (PG hit, buffer hit, both miss) are unit-tested
+    // independently of the route builder. The action's mutateWithFallback
+    // call repeats the lookup atomically — slightly redundant but keeps
+    // wait-and-bounce semantics intact.
+    findResource: async (params, auth): Promise<ResolvedRunForMutation | null> =>
+      resolveRunForMutation({
+        runParam: params.runParam,
+        environmentId: auth.environment.id,
+        organizationId: auth.environment.organizationId,
+      }),
   },
-  async ({ resource }) => {
-    if (!resource) {
-      return json({ error: "Run not found" }, { status: 404 });
-    }
+  async ({ params, authentication }) => {
+    const runId = params.runParam;
+    const env = authentication.environment;
+    const cancelledAt = new Date();
+    const cancelReason = "Canceled by user";
 
-    const service = new CancelTaskRunService();
+    const outcome = await mutateWithFallback({
+      runId,
+      environmentId: env.id,
+      organizationId: env.organizationId,
+      bufferPatch: {
+        type: "mark_cancelled",
+        cancelledAt: cancelledAt.toISOString(),
+        cancelReason,
+      },
+      pgMutation: async (taskRun) => {
+        const service = new CancelTaskRunService();
+        try {
+          await service.call(taskRun);
+        } catch {
+          return json({ error: "Internal Server Error" }, { status: 500 });
+        }
+        return json({ id: taskRun.friendlyId }, { status: 200 });
+      },
+      synthesisedResponse: () => json({ id: runId }, { status: 200 }),
+      abortSignal: getRequestAbortSignal(),
+    });
 
-    try {
-      await service.call(resource);
-    } catch (error) {
-      return json({ error: "Internal Server Error" }, { status: 500 });
+    if (outcome.kind === "not_found") {
+      return json({ error: "Run not found" }, { status: 404 });
     }
-
-    return json({ id: resource.friendlyId }, { status: 200 });
+    if (outcome.kind === "timed_out") {
+      return json({ error: "Run materialisation timed out" }, { status: 503 });
+    }
+    return outcome.response;
   }
 );
 
diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.spans.$spanParam/route.tsx b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.spans.$spanParam/route.tsx
index 09f3f33fcb3..7e825fe303d 100644
--- a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.spans.$spanParam/route.tsx
+++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.spans.$spanParam/route.tsx
@@ -120,6 +120,7 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => {
   try {
     const result = await presenter.call({
       projectSlug: projectParam,
+      envSlug: envParam,
       spanId: spanParam,
       runFriendlyId: runParam,
       userId,
@@ -1021,6 +1022,10 @@ function RunBody({
                     <Paragraph spacing variant="small" className="text-yellow-500">
                       Admin only
                     </Paragraph>
+                    <Property.Item>
+                      <Property.Label>Buffered</Property.Label>
+                      <Property.Value>{run.isBuffered ? "Yes" : "No"}</Property.Value>
+                    </Property.Item>
                     <Property.Item>
                       <Property.Label>Worker queue</Property.Label>
                       <Property.Value>{run.workerQueue}</Property.Value>
@@ -1096,7 +1101,7 @@ function RunBody({
               {run.isCached ? "Jump to original run" : "Focus on run"}
             </LinkButton>
           )}
-          <AdminDebugRun friendlyId={run.friendlyId} />
+          {!run.isBuffered && <AdminDebugRun friendlyId={run.friendlyId} />}
         </div>
         <div className="flex items-center">
           {run.logsDeletedAt === null ? (
diff --git a/apps/webapp/app/routes/resources.runs.$runParam.logs.download.ts b/apps/webapp/app/routes/resources.runs.$runParam.logs.download.ts
index 5c7725c510b..c2a6fa9590c 100644
--- a/apps/webapp/app/routes/resources.runs.$runParam.logs.download.ts
+++ b/apps/webapp/app/routes/resources.runs.$runParam.logs.download.ts
@@ -9,6 +9,8 @@ import { formatDurationMilliseconds } from "@trigger.dev/core/v3/utils/durations
 import { getTaskEventStoreTableForRun } from "~/v3/taskEventStore.server";
 import { TaskEventKind } from "@trigger.dev/database";
 import { getEventRepositoryForStore } from "~/v3/eventRepository/index.server";
+import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server";
+import { deserialiseMollifierSnapshot } from "~/v3/mollifier/mollifierSnapshot.server";
 
 export async function loader({ params, request }: LoaderFunctionArgs) {
   const user = await requireUser(request);
@@ -30,6 +32,67 @@ export async function loader({ params, request }: LoaderFunctionArgs) {
   });
 
   if (!run || !run.organizationId) {
+    // Buffered run? It hasn't executed, so there are no events to
+    // stream — but a 404 is wrong: the run does exist, the customer's
+    // "Download logs" button on the run-detail page generates this
+    // exact URL, and a 404 reads as "your run vanished" rather than
+    // "no logs yet". Verify the entry exists in the buffer (with the
+    // user as a member of the entry's org), and if so stream a single
+    // informational line in the same `<timestamp> <task> <level>
+    // <message>` shape `formatRunEvent` uses below — so a downstream
+    // log viewer / grep over the downloaded file produces a
+    // meaningful explanation, not a 0-byte mystery.
+    const buffer = getMollifierBuffer();
+    if (buffer) {
+      const entry = await buffer.getEntry(parsedParams.runParam);
+      if (entry) {
+        const member = await prisma.orgMember.findFirst({
+          where: { userId: user.id, organizationId: entry.orgId },
+          select: { id: true },
+        });
+        if (member) {
+          let taskIdentifier: string | undefined;
+          try {
+            // Use the shared webapp wrapper rather than raw JSON.parse so
+            // every read-side module shares a single deserialisation path
+            // (see contract comment in `mollifierSnapshot.server.ts` and
+            // `syntheticRedirectInfo.server.ts`). Keeps behaviour
+            // consistent if the snapshot encoding ever changes.
+            const snapshot = deserialiseMollifierSnapshot(entry.payload) as {
+              taskIdentifier?: unknown;
+            };
+            if (typeof snapshot.taskIdentifier === "string") {
+              taskIdentifier = snapshot.taskIdentifier;
+            }
+          } catch {
+            // Fall through — taskIdentifier stays undefined.
+          }
+          const placeholderParts = [
+            entry.createdAt.toISOString(),
+            ...(taskIdentifier ? [taskIdentifier] : []),
+            "INFO",
+            "Run is queued, has not started executing yet — no logs to download.",
+          ];
+          const placeholder = placeholderParts.join(" ") + "\n";
+          const placeholderReadable = new Readable({
+            read() {
+              this.push(placeholder);
+              this.push(null);
+            },
+          });
+          const gzipStream = createGzip();
+          const compressed = placeholderReadable.pipe(gzipStream);
+          return new Response(compressed as any, {
+            status: 200,
+            headers: {
+              "Content-Type": "application/octet-stream",
+              "Content-Disposition": `attachment; filename="${parsedParams.runParam}.log"`,
+              "Content-Encoding": "gzip",
+            },
+          });
+        }
+      }
+    }
     return new Response("Not found", { status: 404 });
   }
 
diff --git a/apps/webapp/app/routes/resources.taskruns.$runParam.cancel.ts b/apps/webapp/app/routes/resources.taskruns.$runParam.cancel.ts
index 240d7d3d8ed..fa6ee29f3db 100644
--- a/apps/webapp/app/routes/resources.taskruns.$runParam.cancel.ts
+++ b/apps/webapp/app/routes/resources.taskruns.$runParam.cancel.ts
@@ -6,6 +6,7 @@ import { redirectWithErrorMessage, redirectWithSuccessMessage } from "~/models/m
 import { logger } from "~/services/logger.server";
 import { requireUserId } from "~/services/session.server";
 import { CancelTaskRunService } from "~/v3/services/cancelTaskRun.server";
+import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server";
 
 export const cancelSchema = z.object({
   redirectUrl: z.string(),
@@ -42,15 +43,56 @@ export const action: ActionFunction = async ({ request, params }) => {
       },
     });
 
-    if (!taskRun) {
+    if (taskRun) {
+      const cancelRunService = new CancelTaskRunService();
+      await cancelRunService.call(taskRun);
+      return redirectWithSuccessMessage(submission.value.redirectUrl, request, `Canceled run`);
+    }
+
+    // PG miss — try the mollifier buffer. The customer can hit cancel
+    // on a buffered run from the dashboard during the burst window.
+    // Snapshot a `mark_cancelled` patch; the drainer's
+    // bifurcation routes the run to `engine.createCancelledRun` on
+    // next pop.
+    const buffer = getMollifierBuffer();
+    const entry = buffer ? await buffer.getEntry(runParam) : null;
+    if (!entry) {
       submission.error = { runParam: ["Run not found"] };
       return json(submission);
     }
 
-    const cancelRunService = new CancelTaskRunService();
-    await cancelRunService.call(taskRun);
+    // Dashboard auth: verify the requesting user is a member of the
+    // buffered run's org. The API path scopes by env id from the
+    // authenticated request; the dashboard route uses org-membership
+    // because the URL doesn't carry an envId.
+    const member = await prisma.orgMember.findFirst({
+      where: { userId, organizationId: entry.orgId },
+      select: { id: true },
+    });
+    if (!member) {
+      submission.error = { runParam: ["Run not found"] };
+      return json(submission);
+    }
 
-    return redirectWithSuccessMessage(submission.value.redirectUrl, request, `Canceled run`);
+    const result = await buffer!.mutateSnapshot(runParam, {
+      type: "mark_cancelled",
+      cancelledAt: new Date().toISOString(),
+      cancelReason: "Canceled by user",
+    });
+    if (result === "applied_to_snapshot") {
+      return redirectWithSuccessMessage(submission.value.redirectUrl, request, `Canceled run`);
+    }
+    // "not_found" or "busy" — both indicate the drainer raced us between
+    // the getEntry check above and mutateSnapshot. On "not_found" the
+    // entry was just popped and the PG row is in flight; on "busy" the
+    // drainer is mid-materialisation. Either way the customer should
+    // retry — by then the PG row exists and the regular cancel path at
+    // the top of this action takes over.
+    return redirectWithErrorMessage(
+      submission.value.redirectUrl,
+      request,
+      "Run is materialising — retry in a moment"
+    );
   } catch (error) {
     if (error instanceof Error) {
       logger.error("Failed to cancel run", {
diff --git a/apps/webapp/app/routes/resources.taskruns.$runParam.replay.ts b/apps/webapp/app/routes/resources.taskruns.$runParam.replay.ts
index 8a22822d06b..507d3cc706f 100644
--- a/apps/webapp/app/routes/resources.taskruns.$runParam.replay.ts
+++ b/apps/webapp/app/routes/resources.taskruns.$runParam.replay.ts
@@ -11,6 +11,12 @@ import { requireUser } from "~/services/session.server";
 import { sortEnvironments } from "~/utils/environmentSort";
 import { v3RunSpanPath } from "~/utils/pathBuilder";
 import { ReplayTaskRunService } from "~/v3/services/replayTaskRun.server";
+import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server";
+import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server";
+import {
+  buildSyntheticReplayTaskRun,
+  type SyntheticReplayTaskRun,
+} from "~/v3/mollifier/syntheticReplayTaskRun.server";
 import parseDuration from "parse-duration";
 import { findCurrentWorkerDeployment } from "~/v3/models/workerDeployment.server";
 import { queueTypeFromType } from "~/presenters/v3/QueueRetrievePresenter.server";
@@ -33,7 +39,7 @@ export async function loader({ request, params }: LoaderFunctionArgs) {
     Object.fromEntries(new URL(request.url).searchParams)
   );
 
-  const run = await $replica.taskRun.findFirst({
+  let run = await $replica.taskRun.findFirst({
     select: {
       payload: true,
       payloadType: true,
@@ -88,6 +94,83 @@ export async function loader({ request, params }: LoaderFunctionArgs) {
     where: { friendlyId: runParam, project: { organization: { members: { some: { userId } } } } },
   });
 
+  let synthetic:
+    | (Awaited<ReturnType<typeof findRunByIdWithMollifierFallback>> & { __synth: true })
+    | undefined;
+  if (!run) {
+    // Buffered fallback: read the snapshot and look up the env list via
+    // the snapshot's organizationId. Without this the Replay dialog
+    // 404s for runs queued in the mollifier buffer, which dumps the
+    // user back to the task list.
+    const buffer = getMollifierBuffer();
+    const entry = buffer ? await buffer.getEntry(runParam) : null;
+    if (!entry) throw new Response("Not Found", { status: 404 });
+    const member = await prisma.orgMember.findFirst({
+      where: { userId, organizationId: entry.orgId },
+      select: { id: true },
+    });
+    if (!member) throw new Response("Not Found", { status: 404 });
+    const buffered = await findRunByIdWithMollifierFallback({
+      runId: runParam,
+      environmentId: entry.envId,
+      organizationId: entry.orgId,
+    });
+    if (!buffered) throw new Response("Not Found", { status: 404 });
+    synthetic = Object.assign(buffered, { __synth: true as const });
+    // Scope the project lookup to the buffer entry's org as well as the
+    // env id. The prior `orgMember.findFirst` above confirms the user
+    // belongs to `entry.orgId`; pinning `organizationId` here means a
+    // malformed entry whose envId resolves to a different org can't leak
+    // that project's data through this loader. Mirrors the PG path's
+    // `project.organization.members.some.userId` scoping (lines 42-95)
+    // — the env filter and select shape are kept identical so the Replay
+    // dialog renders the same dropdown either way.
+    const orgProject = await $replica.project.findFirst({
+      where: {
+        organizationId: entry.orgId,
+        environments: { some: { id: entry.envId } },
+      },
+      select: {
+        slug: true,
+        environments: {
+          select: {
+            id: true,
+            type: true,
+            slug: true,
+            branchName: true,
+            orgMember: { select: { user: true } },
+          },
+          where: {
+            archivedAt: null,
+            OR: [
+              { type: { in: ["PREVIEW", "STAGING", "PRODUCTION"] } },
+              { type: "DEVELOPMENT", orgMember: { userId } },
+            ],
+          },
+        },
+      },
+    });
+    if (!orgProject) throw new Response("Not Found", { status: 404 });
+    run = {
+      payload: buffered.payload,
+      payloadType: buffered.payloadType ?? "application/json",
+      seedMetadata: buffered.seedMetadata ?? null,
+      seedMetadataType: buffered.seedMetadataType ?? null,
+      runtimeEnvironmentId: entry.envId,
+      concurrencyKey: buffered.concurrencyKey ?? null,
+      maxAttempts: buffered.maxAttempts ?? null,
+      maxDurationInSeconds: buffered.maxDurationInSeconds ?? null,
+      machinePreset: buffered.machinePreset ?? null,
+      workerQueue: buffered.workerQueue ?? null,
+      ttl: buffered.ttl ?? null,
+      idempotencyKey: buffered.idempotencyKey ?? null,
+      runTags: buffered.runTags,
+      queue: buffered.queue ?? "task/",
+      taskIdentifier: buffered.taskIdentifier ?? "",
+      project: orgProject,
+    } as unknown as typeof run;
+  }
+
   if (!run) {
     throw new Response("Not Found", { status: 404 });
   }
@@ -164,6 +247,15 @@ export async function loader({ request, params }: LoaderFunctionArgs) {
 }
 
 export const action: ActionFunction = async ({ request, params }) => {
+  // Dashboard auth: identical pattern to resources.taskruns.$runParam.cancel.ts.
+  // The loader above this action already gates with `requireUser`, but
+  // Remix's action runs independently — without this call any request
+  // with a valid runParam could submit a replay. The PG findFirst below
+  // also adds the org-membership filter so a PAT can't replay another
+  // org's run, and the buffered fallback verifies org membership via
+  // orgMember.findFirst against the snapshot's orgId.
+  const user = await requireUser(request);
+  const userId = user.id;
   const { runParam } = ParamSchema.parse(params);
 
   const formData = await request.formData();
@@ -174,9 +266,18 @@ export const action: ActionFunction = async ({ request, params }) => {
   }
 
   try {
-    const taskRun = await prisma.taskRun.findFirst({
+    const pgRun = await prisma.taskRun.findFirst({
       where: {
         friendlyId: runParam,
+        project: {
+          organization: {
+            members: {
+              some: {
+                userId,
+              },
+            },
+          },
+        },
       },
       include: {
         runtimeEnvironment: {
@@ -192,6 +293,50 @@ export const action: ActionFunction = async ({ request, params }) => {
       },
     });
 
+    // Mollifier read-fallback: if the original isn't in PG yet,
+    // synthesise a TaskRun from the buffered snapshot. The B4-extended
+    // SyntheticRun carries every field ReplayTaskRunService reads. We
+    // also need projectSlug + orgSlug + envSlug for the redirect path,
+    // so look those up via the snapshot's runtimeEnvironmentId.
+    let taskRun: SyntheticReplayTaskRun | null = pgRun ?? null;
+    if (!taskRun) {
+      const buffer = getMollifierBuffer();
+      const entry = buffer ? await buffer.getEntry(runParam) : null;
+      if (entry) {
+        // Same org-membership gate as the PG path above. Without this
+        // any authenticated user who knows a runId could replay the
+        // buffered run across orgs.
+        const member = await prisma.orgMember.findFirst({
+          where: { userId, organizationId: entry.orgId },
+          select: { id: true },
+        });
+        if (!member) {
+          return redirectWithErrorMessage(
+            submission.value.failedRedirect,
+            request,
+            "Run not found"
+          );
+        }
+        const synthetic = await findRunByIdWithMollifierFallback({
+          runId: runParam,
+          environmentId: entry.envId,
+          organizationId: entry.orgId,
+        });
+        if (synthetic) {
+          const envRow = await prisma.runtimeEnvironment.findFirst({
+            where: { id: entry.envId },
+            select: {
+              slug: true,
+              project: { select: { slug: true, organization: { select: { slug: true } } } },
+            },
+          });
+          if (envRow) {
+            taskRun = buildSyntheticReplayTaskRun({ synthetic, envRow });
+          }
+        }
+      }
+    }
+
     if (!taskRun) {
       return redirectWithErrorMessage(submission.value.failedRedirect, request, "Run not found");
     }
diff --git a/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts b/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts
index a6fe5babe2c..55cb3311441 100644
--- a/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts
+++ b/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts
@@ -2,13 +2,50 @@ import { RunId } from "@trigger.dev/core/v3/isomorphic";
 import type { PrismaClientOrTransaction, TaskRun } from "@trigger.dev/database";
 import { logger } from "~/services/logger.server";
 import { resolveIdempotencyKeyTTL } from "~/utils/idempotencyKeys.server";
+import { ServiceValidationError } from "~/v3/services/common.server";
 import type { RunEngine } from "~/v3/runEngine.server";
 import { shouldIdempotencyKeyBeCleared } from "~/v3/taskStatus";
+import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server";
+import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server";
+import { claimOrAwait } from "~/v3/mollifier/idempotencyClaim.server";
+import { makeResolveMollifierFlag } from "~/v3/mollifier/mollifierGate.server";
 import type { TraceEventConcern, TriggerTaskRequest } from "../types";
 
+// In-memory per-org mollifier-enabled check, shared with `evaluateGate`
+// (same `Organization.featureFlags` JSON, no DB read). Used to gate the
+// pre-gate claim's Redis round-trip so non-mollifier orgs don't pay it
+// during staged rollout — see the comment above the claim block in
+// handleTriggerRequest.
+const resolveOrgMollifierFlag = makeResolveMollifierFlag();
+
+// Claim ownership context returned to the caller when the
+// IdempotencyKeyConcern won a pre-gate claim. (envId, taskIdentifier,
+// idempotencyKey) are the values the claim was taken with. Caller MUST
+// `publishClaim` the winning runId on success or `releaseClaim` on failure.
+export type ClaimedIdempotency = {
+  envId: string;
+  taskIdentifier: string;
+  idempotencyKey: string;
+  // Ownership token from `claimOrAwait`. The caller's trigger pipeline
+  // MUST thread this into publishClaim/releaseClaim so the buffer's
+  // compare-and-act protects the slot against a stale predecessor.
+  token: string;
+};
+
 export type IdempotencyKeyConcernResult =
   | { isCached: true; run: TaskRun }
-  | { isCached: false; idempotencyKey?: string; idempotencyKeyExpiresAt?: Date };
+  | {
+      isCached: false;
+      idempotencyKey?: string;
+      idempotencyKeyExpiresAt?: Date;
+      // Set when this trigger holds a pre-gate claim. The caller's
+      // trigger pipeline MUST resolve the claim by either publishing
+      // the runId on success or releasing on failure. Undefined when
+      // no claim was taken: no idempotency key, buffer unavailable,
+      // claim-ineligible request (triggerAndWait, debounce, one-time-use
+      // token, org not mollifier-enabled), or a lost-run fall-through.
+      claim?: ClaimedIdempotency;
+    };
 
 export class IdempotencyKeyConcern {
   constructor(
@@ -17,6 +54,86 @@ export class IdempotencyKeyConcern {
     private readonly traceEventConcern: TraceEventConcern
   ) {}
 
+  // Buffer-side idempotency dedup. Resolves an idempotency key against the
+  // mollifier buffer when PG missed. Returns a SyntheticRun cast to
+  // TaskRun so the route handler (which only reads run.id / run.friendlyId)
+  // can echo the buffered run's friendlyId as a cached hit. Returns null
+  // when the buffer is unavailable, the lookup throws or misses, or the
+  // key has expired — buffer outages must not 500 the trigger hot path;
+  // we fail open to "no cache hit" and let the request through.
+  private async findBufferedRunWithIdempotency(
+    environmentId: string,
+    organizationId: string,
+    taskIdentifier: string,
+    idempotencyKey: string,
+  ): Promise<TaskRun | null> {
+    const buffer = getMollifierBuffer();
+    if (!buffer) return null;
+
+    let bufferedRunId: string | null;
+    try {
+      bufferedRunId = await buffer.lookupIdempotency({
+        envId: environmentId,
+        taskIdentifier,
+        idempotencyKey,
+      });
+    } catch (err) {
+      logger.error("IdempotencyKeyConcern: buffer lookupIdempotency failed", {
+        environmentId,
+        taskIdentifier,
+        err: err instanceof Error ? err.message : String(err),
+      });
+      return null;
+    }
+    if (!bufferedRunId) return null;
+
+    // NOTE(review): unlike the lookup above, this read is not guarded —
+    // failing open here relies on the helper not throwing. Confirm.
+    const synthetic = await findRunByIdWithMollifierFallback({
+      runId: bufferedRunId,
+      environmentId,
+      organizationId,
+    });
+    if (!synthetic) return null;
+    // PG-resident path enforces idempotency-key expiry below
+    // (`existingRun.idempotencyKeyExpiresAt < new Date()` clears the key
+    // and lets a new run go through). The buffer path needs the same
+    // check — without it a customer who passes `idempotencyKeyTTL: "2s"`
+    // gets the cached buffered runId returned indefinitely, because the
+    // buffer entry persists for its own (hours-long) TTL independent of
+    // the customer's key TTL.
+    //
+    // Returning null isn't enough on its own: the trigger pipeline then
+    // proceeds to `mollifyTrigger`, whose `buffer.accept` Lua dedupes by
+    // `(envId, taskIdentifier, idempotencyKey)` via SETNX on the same
+    // `mollifier:idempotency:*` key and would echo the stale runId as
+    // `duplicate_idempotency`. Clear the buffer-side idempotency
+    // binding (both the lookup and any in-flight claim) so the next
+    // accept goes through as a fresh trigger — best-effort, reusing the
+    // non-null `buffer` resolved above. Mirrors what
+    // `ResetIdempotencyKeyService` does for the explicit reset-via-API path.
+    if (
+      synthetic.idempotencyKeyExpiresAt &&
+      synthetic.idempotencyKeyExpiresAt < new Date()
+    ) {
+      try {
+        await buffer.resetIdempotency({
+          envId: environmentId,
+          taskIdentifier,
+          idempotencyKey,
+        });
+      } catch (err) {
+        logger.warn("IdempotencyKeyConcern: failed to reset expired buffer idempotency", {
+          envId: environmentId,
+          taskIdentifier,
+          err: err instanceof Error ? err.message : String(err),
+        });
+      }
+      return null;
+    }
+    return synthetic as unknown as TaskRun;
+  }
+
   async handleTriggerRequest(
     request: TriggerTaskRequest,
     parentStore: string | undefined
@@ -44,6 +161,25 @@ export class IdempotencyKeyConcern {
         })
       : undefined;
 
+    // Buffer fallback per the mollifier-idempotency design. PG missed —
+    // the same key may belong to a buffered run that hasn't materialised
+    // yet. Skipped when `resumeParentOnCompletion` is set: blocking a
+    // parent on a buffered child via waitpoint requires a PG row that
+    // doesn't exist yet. The follow-up accept's SETNX in mollifyTrigger
+    // still dedupes the trigger itself; the waitpoint just doesn't fire
+    // for this rare race window.
+    if (!existingRun && idempotencyKey && !request.body.options?.resumeParentOnCompletion) {
+      const buffered = await this.findBufferedRunWithIdempotency(
+        request.environment.id,
+        request.environment.organizationId,
+        request.taskId,
+        idempotencyKey,
+      );
+      if (buffered) {
+        return { isCached: true, run: buffered };
+      }
+    }
+
     if (existingRun) {
       // The idempotency key has expired
       if (existingRun.idempotencyKeyExpiresAt && existingRun.idempotencyKeyExpiresAt < new Date()) {
@@ -133,6 +269,133 @@ export class IdempotencyKeyConcern {
       return { isCached: true, run: existingRun };
     }
 
+    // Pre-gate claim — closes the PG+buffer race during gate transition.
+    // All same-key triggers serialise here before evaluateGate decides
+    // PG-pass-through vs mollify. Skipped for triggerAndWait
+    // (resumeParentOnCompletion) — that path bypasses the gate entirely
+    // and its existing PG-side dedup is sufficient.
+    //
+    // Also gated on the same per-org mollifier flag the gate uses: when
+    // `TRIGGER_MOLLIFIER_ENABLED=1` globally for staged rollout, the buffer
+    // singleton is constructed and `claimOrAwait` would otherwise issue a
+    // Redis SETNX for EVERY idempotency-keyed trigger — including orgs
+    // that haven't opted in. Those orgs never enter the mollify branch
+    // (the gate always returns pass_through for them), so there's no
+    // buffer activity to serialise against; PG's unique constraint
+    // already deduplicates concurrent same-key races. Resolving the org
+    // flag is a pure in-memory read of `Organization.featureFlags` — no
+    // DB query, same predicate the gate uses — keeping the claim's Redis
+    // RTT off the hot path for non-opted-in orgs during incremental
+    // rollout.
+    // Match the gate's bypass list (`mollifierGate.server.ts:158-175`).
+    // debounce + oneTimeUseToken triggers always return pass_through from
+    // the gate, so claiming a Redis SETNX here is wasted RTT on the
+    // trigger hot path. Excluding them keeps the claim aligned with the
+    // gate — if the gate would never mollify the request, there's no
+    // buffer to serialise against.
+    const claimEligible =
+      !request.body.options?.resumeParentOnCompletion &&
+      !request.body.options?.debounce &&
+      !request.options?.oneTimeUseToken &&
+      (await resolveOrgMollifierFlag({
+        envId: request.environment.id,
+        orgId: request.environment.organizationId,
+        taskId: request.taskId,
+        orgFeatureFlags:
+          ((request.environment.organization?.featureFlags as
+            | Record<string, unknown>
+            | null
+            | undefined) ?? null),
+      }));
+    if (claimEligible) {
+      const ttlSeconds = Math.max(
+        1,
+        Math.min(
+          30,
+          Math.ceil((idempotencyKeyExpiresAt.getTime() - Date.now()) / 1000),
+        ),
+      );
+      const outcome = await claimOrAwait({
+        envId: request.environment.id,
+        taskIdentifier: request.taskId,
+        idempotencyKey,
+        ttlSeconds,
+      });
+      if (outcome.kind === "resolved") {
+        // Another concurrent trigger committed first. Re-resolve via the
+        // existing checks: writer-side PG findFirst first (defeats
+        // replica lag), then buffer fallback for the buffered case.
+        const writerRun = await this.prisma.taskRun.findFirst({
+          where: {
+            runtimeEnvironmentId: request.environment.id,
+            idempotencyKey,
+            taskIdentifier: request.taskId,
+          },
+          include: { associatedWaitpoint: true },
+        });
+        if (writerRun) {
+          return { isCached: true, run: writerRun };
+        }
+        const buffered = await this.findBufferedRunWithIdempotency(
+          request.environment.id,
+          request.environment.organizationId,
+          request.taskId,
+          idempotencyKey,
+        );
+        if (buffered) {
+          return { isCached: true, run: buffered };
+        }
+        // Claim resolved to a runId nothing can find — the run was
+        // genuinely lost (claimant errored after publish, drain failed,
+        // or both the PG row and buffer entry TTL'd out). This is
+        // terminal, not transient: `lookupIdempotency` self-heals a
+        // dangling pointer, and `ack` keeps the entry hash as a
+        // read-fallback past the PG write, so re-polling cannot conjure
+        // a run that is gone. Falling through to a fresh trigger is the
+        // correct recovery.
+        //
+        // Why falling through claimless is safe (no duplicate runs):
+        // concurrent triggers that also fall through here converge on a
+        // single run via the same dedup backstops the claim layer relies
+        // on — the PG unique constraint on the idempotency key
+        // (RunDuplicateIdempotencyKeyError → retry resolves to the
+        // winner) for the pass-through path, and `accept`'s idempotency
+        // SETNX (`duplicate_idempotency`) for the mollify path. Once the
+        // first fall-through commits a run, later callers find it via the
+        // writer-PG / buffer lookups above despite the stale `resolved:`
+        // slot, which the slot's TTL clears within ~30s. The residual
+        // cost is a few redundant (deduped) trigger attempts in that
+        // window, not duplicate runs.
+        logger.warn("idempotency claim resolved but runId not findable", {
+          envId: request.environment.id,
+          taskIdentifier: request.taskId,
+          claimedRunId: outcome.runId,
+        });
+      }
+      if (outcome.kind === "timed_out") {
+        throw new ServiceValidationError(
+          "Idempotency claim resolution timed out",
+          503,
+        );
+      }
+      if (outcome.kind === "claimed") {
+        // Caller MUST publish/release. Signalled via the result's
+        // `claim` field, including the ownership token so the buffer
+        // can compare-and-act on the slot we now own.
+        return {
+          isCached: false,
+          idempotencyKey,
+          idempotencyKeyExpiresAt,
+          claim: {
+            envId: request.environment.id,
+            taskIdentifier: request.taskId,
+            idempotencyKey,
+            token: outcome.token,
+          },
+        };
+      }
+    }
+
     return { isCached: false, idempotencyKey, idempotencyKeyExpiresAt };
   }
 }
diff --git a/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts b/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts
index 5f985b684c1..a8a7cbf0f3b 100644
--- a/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts
+++ b/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts
@@ -6,6 +6,7 @@ import type { PrismaClientOrTransaction } from "@trigger.dev/database";
 import type { AuthenticatedEnvironment } from "~/services/apiAuth.server";
 import { logger } from "~/services/logger.server";
 import { getEventRepository } from "~/v3/eventRepository/index.server";
+import { PerformTaskRunAlertsService } from "~/v3/services/alerts/performTaskRunAlerts.server";
 import { DefaultQueueManager } from "../concerns/queues.server";
 import type { TriggerTaskRequest } from "../types";
 
@@ -176,6 +177,14 @@ export class TriggerFailedTaskService {
           event.setAttribute("runId", failedRunFriendlyId);
           event.failWithError(taskRunError);
 
+          // `emitRunFailedEvent: false` because this call site owns the
+          // trace-event lifecycle via the outer `traceEvent({
+          // incomplete: false, isError: true })`. Letting the engine
+          // emit `runFailed` here would race the
+          // `completeFailedRunEvent` listener against the outer trace
+          // event's own completion write for the same (traceId, spanId).
+          // We re-trigger the alerts side directly after the trace
+          // event closes, below.
           return await this.engine.createFailedTaskRun({
             friendlyId: failedRunFriendlyId,
             environment: {
@@ -200,12 +209,30 @@ export class TriggerFailedTaskService {
             spanId: event.spanId,
             traceContext: traceContext as Record<string, unknown>,
             taskEventStore: store,
+            emitRunFailedEvent: false,
             ...(queueName !== undefined && { queue: queueName }),
             ...(lockedQueueId !== undefined && { lockedQueueId }),
           });
         }
       );
 
+      // Alerts side of `runFailed` — the engine emit was suppressed
+      // above so the trace-event completion isn't double-written; we
+      // still need the alert pipeline to fire so customers' ERROR
+      // channels see the failure. Best-effort: a failed enqueue logs
+      // but doesn't block returning the friendlyId, mirroring the
+      // engine handler's behaviour at runEngineHandlers.server.ts:81.
+      try {
+        await PerformTaskRunAlertsService.enqueue(failedRun.id);
+      } catch (alertsError) {
+        logger.warn("TriggerFailedTaskService: alert enqueue failed", {
+          taskId: request.taskId,
+          friendlyId: failedRun.friendlyId,
+          error:
+            alertsError instanceof Error ? alertsError.message : String(alertsError),
+        });
+      }
+
       return failedRun.friendlyId;
     } catch (createError) {
       const createErrorMsg =
@@ -264,7 +291,7 @@ export class TriggerFailedTaskService {
         }
       }
 
-      await this.engine.createFailedTaskRun({
+      const failedRun = await this.engine.createFailedTaskRun({
         friendlyId: failedRunFriendlyId,
         environment: {
           id: opts.environmentId,
@@ -286,8 +313,32 @@ export class TriggerFailedTaskService {
         depth,
         resumeParentOnCompletion: opts.resumeParentOnCompletion,
         batch: opts.batch,
+        // Suppress the engine's `runFailed` bus emit — the listener
+        // (`runEngineHandlers.server.ts` `runFailed`) calls
+        // `completeFailedRunEvent`, which writes a ClickHouse trace event
+        // row keyed on (traceId, spanId). This caller has no trace
+        // context (the method name is literally `callWithoutTraceEvents`)
+        // so the emit would write a row with empty traceId/spanId —
+        // orphan event in the store. We still want alert coverage,
+        // though, so enqueue directly below.
+        emitRunFailedEvent: false,
       });
 
+      // Alerts side of `runFailed` — the engine emit was suppressed
+      // above so we don't create an orphan trace event; enqueue the
+      // alert directly so customers' ERROR channels still see the
+      // failure. Best-effort, mirroring the `call()` path.
+      try {
+        await PerformTaskRunAlertsService.enqueue(failedRun.id);
+      } catch (alertsError) {
+        logger.warn("TriggerFailedTaskService.callWithoutTraceEvents: alert enqueue failed", {
+          taskId: opts.taskId,
+          friendlyId: failedRun.friendlyId,
+          error:
+            alertsError instanceof Error ? alertsError.message : String(alertsError),
+        });
+      }
+
       return failedRunFriendlyId;
     } catch (createError) {
       logger.error("TriggerFailedTaskService: failed to create pre-failed TaskRun (no trace)", {
diff --git a/apps/webapp/app/runEngine/services/triggerTask.server.ts b/apps/webapp/app/runEngine/services/triggerTask.server.ts
index 2d9eeec0943..0049968e06a 100644
--- a/apps/webapp/app/runEngine/services/triggerTask.server.ts
+++ b/apps/webapp/app/runEngine/services/triggerTask.server.ts
@@ -30,7 +30,14 @@ import type {
   TriggerTaskServiceResult,
 } from "../../v3/services/triggerTask.server";
 import { clampMaxDuration } from "../../v3/utils/maxDuration";
-import { IdempotencyKeyConcern } from "../concerns/idempotencyKeys.server";
+import {
+  IdempotencyKeyConcern,
+  type ClaimedIdempotency,
+} from "../concerns/idempotencyKeys.server";
+import {
+  publishClaim as publishMollifierClaim,
+  releaseClaim as releaseMollifierClaim,
+} from "~/v3/mollifier/idempotencyClaim.server";
 import type {
   PayloadProcessor,
   QueueManager,
@@ -50,8 +57,8 @@ import {
   getMollifierBuffer as defaultGetMollifierBuffer,
   type MollifierGetBuffer,
 } from "~/v3/mollifier/mollifierBuffer.server";
-import { buildBufferedTriggerPayload } from "~/v3/mollifier/bufferedTriggerPayload.server";
-import { serialiseSnapshot } from "@trigger.dev/redis-worker";
+import { mollifyTrigger } from "~/v3/mollifier/mollifierMollify.server";
+import { type MollifierBuffer } from "@trigger.dev/redis-worker";
 import { QueueSizeLimitExceededError, ServiceValidationError } from "~/v3/services/common.server";
 
 class NoopTriggerRacepointSystem implements TriggerRacepointSystem {
@@ -124,474 +131,657 @@ export class RunEngineTriggerTaskService {
     options?: TriggerTaskServiceOptions;
     attempt?: number;
   }): Promise<TriggerTaskServiceResult | undefined> {
-    return await startSpan(this.tracer, "RunEngineTriggerTaskService.call()", async (span) => {
-      span.setAttribute("taskId", taskId);
-      span.setAttribute("attempt", attempt);
-
-      const runFriendlyId = options?.runFriendlyId ?? RunId.generate().friendlyId;
-      const triggerRequest = {
-        taskId,
-        friendlyId: runFriendlyId,
-        environment,
-        body,
-        options,
-      } satisfies TriggerTaskRequest;
-
-      // Validate max attempts
-      const maxAttemptsValidation = this.validator.validateMaxAttempts({
-        taskId,
-        attempt,
-      });
-
-      if (!maxAttemptsValidation.ok) {
-        throw maxAttemptsValidation.error;
-      }
+    // Pre-gate idempotency-claim ownership. Set inside the span when
+    // `IdempotencyKeyConcern.handleTriggerRequest` returns `claim:
+    // {...}`. The try/catch below resolves it once the span finishes.
+    let idempotencyClaim: ClaimedIdempotency | undefined;
+    try {
+      const result = await startSpan(
+        this.tracer,
+        "RunEngineTriggerTaskService.call()",
+        async (span) => {
+          span.setAttribute("taskId", taskId);
+          span.setAttribute("attempt", attempt);
+
+          const runFriendlyId = options?.runFriendlyId ?? RunId.generate().friendlyId;
+          const triggerRequest = {
+            taskId,
+            friendlyId: runFriendlyId,
+            environment,
+            body,
+            options,
+          } satisfies TriggerTaskRequest;
 
-      // Validate tags
-      const tagValidation = this.validator.validateTags({
-        tags: body.options?.tags,
-      });
+          // Validate max attempts
+          const maxAttemptsValidation = this.validator.validateMaxAttempts({
+            taskId,
+            attempt,
+          });
 
-      if (!tagValidation.ok) {
-        throw tagValidation.error;
-      }
+          if (!maxAttemptsValidation.ok) {
+            throw maxAttemptsValidation.error;
+          }
 
-      // Validate entitlement (unless skipChecks is enabled)
-      let planType: string | undefined;
+          // Validate tags
+          const tagValidation = this.validator.validateTags({
+            tags: body.options?.tags,
+          });
 
-      if (!options.skipChecks) {
-        const entitlementValidation = await this.validator.validateEntitlement({
-          environment,
-        });
+          if (!tagValidation.ok) {
+            throw tagValidation.error;
+          }
 
-        if (!entitlementValidation.ok) {
-          throw entitlementValidation.error;
-        }
+          // Validate entitlement (unless skipChecks is enabled)
+          let planType: string | undefined;
 
-        // Extract plan type from entitlement response
-        planType = entitlementValidation.plan?.type;
-      } else {
-        // When skipChecks is enabled, planType should be passed via options
-        planType = options.planType;
+          if (!options.skipChecks) {
+            const entitlementValidation = await this.validator.validateEntitlement({
+              environment,
+            });
 
-        if (!planType) {
-          logger.warn("Plan type not set but skipChecks is enabled", {
+            if (!entitlementValidation.ok) {
+              throw entitlementValidation.error;
+            }
+
+            // Extract plan type from entitlement response
+            planType = entitlementValidation.plan?.type;
+          } else {
+            // When skipChecks is enabled, planType should be passed via options
+            planType = options.planType;
+
+            if (!planType) {
+              logger.warn("Plan type not set but skipChecks is enabled", {
+                taskId,
+                environment: {
+                  id: environment.id,
+                  type: environment.type,
+                  projectId: environment.projectId,
+                  organizationId: environment.organizationId,
+                },
+              });
+            }
+          }
+
+          // Parse delay from either explicit delay option or debounce.delay
+          const delaySource = body.options?.delay ?? body.options?.debounce?.delay;
+          const [parseDelayError, delayUntil] = await tryCatch(parseDelay(delaySource));
+
+          if (parseDelayError) {
+            throw new ServiceValidationError(`Invalid delay ${delaySource}`);
+          }
+
+          // Validate debounce options
+          if (body.options?.debounce) {
+            if (!delayUntil) {
+              throw new ServiceValidationError(
+                `Debounce requires a valid delay duration. Provided: ${body.options.debounce.delay}`
+              );
+            }
+
+            // Always validate debounce.delay separately since it's used for rescheduling
+            // This catches the case where options.delay is valid but debounce.delay is invalid
+            const [debounceDelayError, debounceDelayUntil] = await tryCatch(
+              parseDelay(body.options.debounce.delay)
+            );
+
+            if (debounceDelayError || !debounceDelayUntil) {
+              throw new ServiceValidationError(
+                `Invalid debounce delay: ${body.options.debounce.delay}. ` +
+                `Supported formats: {number}s, {number}m, {number}h, {number}d, {number}w`
+              );
+            }
+          }
+
+          // Get parent run if specified
+          const parentRun = body.options?.parentRunId
+            ? await this.prisma.taskRun.findFirst({
+              where: {
+                id: RunId.fromFriendlyId(body.options.parentRunId),
+                runtimeEnvironmentId: environment.id,
+              },
+            })
+            : undefined;
+
+          // Validate parent run
+          const parentRunValidation = this.validator.validateParentRun({
             taskId,
-            environment: {
-              id: environment.id,
-              type: environment.type,
-              projectId: environment.projectId,
-              organizationId: environment.organizationId,
-            },
+            parentRun: parentRun ?? undefined,
+            resumeParentOnCompletion: body.options?.resumeParentOnCompletion,
           });
-        }
-      }
-
-      // Parse delay from either explicit delay option or debounce.delay
-      const delaySource = body.options?.delay ?? body.options?.debounce?.delay;
-      const [parseDelayError, delayUntil] = await tryCatch(parseDelay(delaySource));
 
-      if (parseDelayError) {
-        throw new ServiceValidationError(`Invalid delay ${delaySource}`);
-      }
+          if (!parentRunValidation.ok) {
+            throw parentRunValidation.error;
+          }
 
-      // Validate debounce options
-      if (body.options?.debounce) {
-        if (!delayUntil) {
-          throw new ServiceValidationError(
-            `Debounce requires a valid delay duration. Provided: ${body.options.debounce.delay}`
-          );
-        }
-
-        // Always validate debounce.delay separately since it's used for rescheduling
-        // This catches the case where options.delay is valid but debounce.delay is invalid
-        const [debounceDelayError, debounceDelayUntil] = await tryCatch(
-          parseDelay(body.options.debounce.delay)
-        );
-
-        if (debounceDelayError || !debounceDelayUntil) {
-          throw new ServiceValidationError(
-            `Invalid debounce delay: ${body.options.debounce.delay}. ` +
-            `Supported formats: {number}s, {number}m, {number}h, {number}d, {number}w`
+          const idempotencyKeyConcernResult = await this.idempotencyKeyConcern.handleTriggerRequest(
+            triggerRequest,
+            parentRun?.taskEventStore
           );
-        }
-      }
 
-      // Get parent run if specified
-      const parentRun = body.options?.parentRunId
-        ? await this.prisma.taskRun.findFirst({
-          where: {
-            id: RunId.fromFriendlyId(body.options.parentRunId),
-            runtimeEnvironmentId: environment.id,
-          },
-        })
-        : undefined;
-
-      // Validate parent run
-      const parentRunValidation = this.validator.validateParentRun({
-        taskId,
-        parentRun: parentRun ?? undefined,
-        resumeParentOnCompletion: body.options?.resumeParentOnCompletion,
-      });
-
-      if (!parentRunValidation.ok) {
-        throw parentRunValidation.error;
-      }
+          if (idempotencyKeyConcernResult.isCached) {
+            return idempotencyKeyConcernResult;
+          }
 
-      const idempotencyKeyConcernResult = await this.idempotencyKeyConcern.handleTriggerRequest(
-        triggerRequest,
-        parentRun?.taskEventStore
-      );
+          const { idempotencyKey, idempotencyKeyExpiresAt, claim: claimResult } =
+            idempotencyKeyConcernResult;
+
+          // If we own an idempotency claim, the trigger pipeline below MUST
+          // resolve it — publish on success so waiters see our runId,
+          // release on error so the next claimant can retry. Stored in an
+          // outer scope so the try/catch wrapping this span can act
+          // on whichever return path or throw the pipeline takes.
+          idempotencyClaim = claimResult;
+
+          if (idempotencyKey) {
+            await this.triggerRacepointSystem.waitForRacepoint({
+              racepoint: "idempotencyKey",
+              id: idempotencyKey,
+            });
+          }
 
-      if (idempotencyKeyConcernResult.isCached) {
-        return idempotencyKeyConcernResult;
-      }
+          const lockedToBackgroundWorker = body.options?.lockToVersion
+            ? await this.prisma.backgroundWorker.findFirst({
+              where: {
+                projectId: environment.projectId,
+                runtimeEnvironmentId: environment.id,
+                version: body.options?.lockToVersion,
+              },
+              select: {
+                id: true,
+                version: true,
+                sdkVersion: true,
+                cliVersion: true,
+              },
+            })
+            : undefined;
 
-      const { idempotencyKey, idempotencyKeyExpiresAt } = idempotencyKeyConcernResult;
+          const { queueName, lockedQueueId, taskTtl, taskKind } =
+            await this.queueConcern.resolveQueueProperties(
+              triggerRequest,
+              lockedToBackgroundWorker ?? undefined
+            );
 
-      if (idempotencyKey) {
-        await this.triggerRacepointSystem.waitForRacepoint({
-          racepoint: "idempotencyKey",
-          id: idempotencyKey,
-        });
-      }
+          // Resolve TTL with precedence: per-trigger > task-level > dev default
+          let ttl: string | undefined;
 
-      const lockedToBackgroundWorker = body.options?.lockToVersion
-        ? await this.prisma.backgroundWorker.findFirst({
-          where: {
-            projectId: environment.projectId,
-            runtimeEnvironmentId: environment.id,
-            version: body.options?.lockToVersion,
-          },
-          select: {
-            id: true,
-            version: true,
-            sdkVersion: true,
-            cliVersion: true,
-          },
-        })
-        : undefined;
-
-      const { queueName, lockedQueueId, taskTtl, taskKind } =
-        await this.queueConcern.resolveQueueProperties(
-          triggerRequest,
-          lockedToBackgroundWorker ?? undefined
-        );
-
-      // Resolve TTL with precedence: per-trigger > task-level > dev default
-      let ttl: string | undefined;
-
-      if (body.options?.ttl !== undefined) {
-        ttl =
-          typeof body.options.ttl === "number"
-            ? stringifyDuration(body.options.ttl)
-            : body.options.ttl;
-      } else {
-        ttl = taskTtl ?? (environment.type === "DEVELOPMENT" ? "10m" : undefined);
-      }
+          if (body.options?.ttl !== undefined) {
+            ttl =
+              typeof body.options.ttl === "number"
+                ? stringifyDuration(body.options.ttl)
+                : body.options.ttl;
+          } else {
+            ttl = taskTtl ?? (environment.type === "DEVELOPMENT" ? "10m" : undefined);
+          }
 
-      if (!options.skipChecks) {
-        const queueSizeGuard = await this.queueConcern.validateQueueLimits(
-          environment,
-          queueName
-        );
-
-        if (!queueSizeGuard.ok) {
-          throw new QueueSizeLimitExceededError(
-            `Cannot trigger ${taskId} as the queue size limit for this environment has been reached. The maximum size is ${queueSizeGuard.maximumSize}`,
-            queueSizeGuard.maximumSize ?? 0,
-            undefined,
-            "warn"
+          if (!options.skipChecks) {
+            const queueSizeGuard = await this.queueConcern.validateQueueLimits(
+              environment,
+              queueName
+            );
+
+            if (!queueSizeGuard.ok) {
+              throw new QueueSizeLimitExceededError(
+                `Cannot trigger ${taskId} as the queue size limit for this environment has been reached. The maximum size is ${queueSizeGuard.maximumSize}`,
+                queueSizeGuard.maximumSize ?? 0,
+                undefined,
+                "warn"
+              );
+            }
+          }
+
+          const metadataPacket = body.options?.metadata
+            ? handleMetadataPacket(
+              body.options?.metadata,
+              body.options?.metadataType ?? "application/json",
+              this.metadataMaximumSize
+            )
+            : undefined;
+
+          const tags = (
+            body.options?.tags
+              ? typeof body.options.tags === "string"
+                ? [body.options.tags]
+                : body.options.tags
+              : []
+          ).filter((tag) => tag.trim().length > 0);
+
+          const depth = parentRun ? parentRun.depth + 1 : 0;
+
+          const workerQueueResult = await this.queueConcern.getWorkerQueue(
+            environment,
+            body.options?.region
           );
-        }
-      }
+          const workerQueue = workerQueueResult?.masterQueue;
+          const enableFastPath = workerQueueResult?.enableFastPath ?? false;
+
+          // Build annotations for this run
+          const triggerSource = options.triggerSource ?? "api";
+          const triggerAction = options.triggerAction ?? "trigger";
+          const parentAnnotations = RunAnnotations.safeParse(parentRun?.annotations).data;
+          const annotations = {
+            triggerSource,
+            triggerAction,
+            rootTriggerSource: parentAnnotations?.rootTriggerSource ?? triggerSource,
+            rootScheduleId: parentAnnotations?.rootScheduleId || options.scheduleId || undefined,
+            taskKind: taskKind ?? "STANDARD",
+          };
+
+          try {
+            return await this.traceEventConcern.traceRun(
+              triggerRequest,
+              parentRun?.taskEventStore,
+              async (event, store) => {
+                event.setAttribute("queueName", queueName);
+                span.setAttribute("queueName", queueName);
+                event.setAttribute("runId", runFriendlyId);
+                span.setAttribute("runId", runFriendlyId);
+
+                // Short-circuit when mollifier is globally off (the default
+                // for every deployment that hasn't opted in). Avoids the
+                // GateInputs allocation, the deps spread inside `evaluateGate`,
+                // and the `mollifier.decisions{outcome=pass_through}` OTel
+                // increment on every trigger — `triggerTask` is the
+                // highest-throughput code path in the system. The check goes
+                // through a DI'd predicate so unit tests that inject a custom
+                // `evaluateGate` can also override the gate-on check (the
+                // default reads `env.TRIGGER_MOLLIFIER_ENABLED`, which is "0"
+                // in CI where no .env file is present).
+                //
+                // Batch items bypass the mollifier gate entirely.
+                //
+                // The mollify path returns a stripped run-shape `{ id,
+                // friendlyId, spanId }` with no PG row written. Batch
+                // tracking relies on `BatchTaskRunItem`, a join row whose
+                // `taskRunId` column has a NOT NULL FK to `TaskRun.id` —
+                // creating that join at trigger-time (in
+                // `batchTriggerV3.server.ts:871`) fails with FK violation
+                // for any mollified item, and skipping it at trigger-time
+                // would silently drop the batch↔run link forever because
+                // the drainer's materialise path doesn't (yet) create
+                // `BatchTaskRunItem`. Either side alone is wrong:
+                //   - skip at trigger-time only → batch progress
+                //     under-reports forever, `batchTriggerAndWait` parent
+                //     stays parked
+                //   - mollify at trigger-time only → FK violation, 500
+                //
+                // The proper end state is a drainer-side
+                // `BatchTaskRunItem` create-on-materialise (the snapshot
+                // already carries `batch: { id, index }` so the drainer
+                // has the info). That belongs in the drainer / replay PR,
+                // not here. Until that lands, batch triggers pass-through
+                // — they lose the burst-protection benefit, but the path
+                // works end-to-end.
+                const skipMollifierForBatch = !!options.batchId;
+                const mollifierOutcome: GateOutcome | null =
+                  this.isMollifierGloballyEnabled() && !skipMollifierForBatch
+                    ? await this.evaluateGate({
+                        envId: environment.id,
+                        orgId: environment.organizationId,
+                        taskId,
+                        orgFeatureFlags:
+                          (environment.organization.featureFlags as Record<string, unknown> | null) ??
+                          null,
+                        options: {
+                          debounce: body.options?.debounce,
+                          oneTimeUseToken: options.oneTimeUseToken,
+                          parentTaskRunId: body.options?.parentRunId,
+                          resumeParentOnCompletion: body.options?.resumeParentOnCompletion,
+                        },
+                      })
+                    : null;
+
+                // When the gate says mollify, write the engine.trigger input
+                // snapshot into the Redis buffer and return a synthesised
+                // TriggerTaskServiceResult. The customer never waits on
+                // Postgres; the drainer materialises the run later by replaying
+                // engine.trigger against the snapshot. The run span has already
+                // been opened by traceRun above (PARTIAL event in ClickHouse),
+                // so its traceId/spanId live in the snapshot and the drainer's
+                // `mollifier.drained` span parents on the same trace — buffered
+                // runs become visible in the dashboard's trace view immediately,
+                // not only after the drainer fires.
+                if (mollifierOutcome?.action === "mollify") {
+                  const mollifierBuffer = this.getMollifierBuffer();
+                  if (mollifierBuffer && !body.options?.debounce) {
+                    event.setAttribute("mollifier.reason", mollifierOutcome.decision.reason);
+                    event.setAttribute("mollifier.count", String(mollifierOutcome.decision.count));
+                    event.setAttribute(
+                      "mollifier.threshold",
+                      String(mollifierOutcome.decision.threshold)
+                    );
+                    event.setAttribute("taskRunId", runFriendlyId);
+
+                    const payloadPacket = await this.payloadProcessor.process(triggerRequest);
+
+                    const engineTriggerInput = this.#buildEngineTriggerInput({
+                      runFriendlyId,
+                      environment,
+                      idempotencyKey,
+                      idempotencyKeyExpiresAt,
+                      body,
+                      options,
+                      queueName,
+                      lockedQueueId,
+                      workerQueue,
+                      enableFastPath,
+                      lockedToBackgroundWorker: lockedToBackgroundWorker ?? undefined,
+                      delayUntil,
+                      ttl,
+                      metadataPacket,
+                      tags,
+                      depth,
+                      parentRun: parentRun ?? undefined,
+                      annotations,
+                      planType,
+                      taskId,
+                      payloadPacket,
+                      traceContext: this.#propagateExternalTraceContext(
+                        event.traceContext,
+                        parentRun?.traceContext,
+                        event.traceparent?.spanId
+                      ),
+                      traceId: event.traceId,
+                      spanId: event.spanId,
+                      parentSpanId:
+                        options.parentAsLinkType === "replay"
+                          ? undefined
+                          : event.traceparent?.spanId,
+                      taskEventStore: store,
+                    });
+
+                    const result = await mollifyTrigger({
+                      runFriendlyId,
+                      environmentId: environment.id,
+                      organizationId: environment.organizationId,
+                      engineTriggerInput,
+                      decision: mollifierOutcome.decision,
+                      buffer: mollifierBuffer,
+                      // Idempotency-key triple wires the buffer's SETNX into
+                      // the trigger-time dedup symmetric with PG.
+                      idempotencyKey,
+                      taskIdentifier: taskId,
+                    });
+
+                    logger.debug("mollifier.buffered", {
+                      runId: runFriendlyId,
+                      envId: environment.id,
+                      orgId: environment.organizationId,
+                      taskId,
+                      reason: mollifierOutcome.decision.reason,
+                    });
+
+                    // Synthetic result is structurally narrower than the full
+                    // TaskRun; the route handler only reads
+                    // `result.run.friendlyId`. traceRun flushes the PARTIAL
+                    // run-span event to ClickHouse on callback return.
+                    // `isMollified` flags the route to skip the request-
+                    // idempotency cache write — see the field's contract on
+                    // `TriggerTaskServiceResult`.
+                    return {
+                      ...(result as unknown as TriggerTaskServiceResult),
+                      isMollified: true,
+                    };
+                  }
+                  if (!mollifierBuffer) {
+                    logger.warn(
+                      "mollifier gate said mollify but buffer is null — falling through to pass-through"
+                    );
+                  }
+                }
 
-      const metadataPacket = body.options?.metadata
-        ? handleMetadataPacket(
-          body.options?.metadata,
-          body.options?.metadataType ?? "application/json",
-          this.metadataMaximumSize
-        )
-        : undefined;
-
-      const tags = (
-        body.options?.tags
-          ? typeof body.options.tags === "string"
-            ? [body.options.tags]
-            : body.options.tags
-          : []
-      ).filter((tag) => tag.trim().length > 0);
-
-      const depth = parentRun ? parentRun.depth + 1 : 0;
-
-      const workerQueueResult = await this.queueConcern.getWorkerQueue(
-        environment,
-        body.options?.region
-      );
-      const workerQueue = workerQueueResult?.masterQueue;
-      const enableFastPath = workerQueueResult?.enableFastPath ?? false;
-
-      // Build annotations for this run
-      const triggerSource = options.triggerSource ?? "api";
-      const triggerAction = options.triggerAction ?? "trigger";
-      const parentAnnotations = RunAnnotations.safeParse(parentRun?.annotations).data;
-      const annotations = {
-        triggerSource,
-        triggerAction,
-        rootTriggerSource: parentAnnotations?.rootTriggerSource ?? triggerSource,
-        rootScheduleId: parentAnnotations?.rootScheduleId || options.scheduleId || undefined,
-        taskKind: taskKind ?? "STANDARD",
-      };
-
-      // Short-circuit before the gate when mollifier is globally off (the
-      // default for every deployment that hasn't opted in). Avoids the
-      // GateInputs allocation, the deps spread inside `evaluateGate`, and
-      // the `mollifier.decisions{outcome=pass_through}` OTel increment on
-      // every trigger — `triggerTask` is the highest-throughput code path
-      // in the system. The check goes through a DI'd predicate so unit
-      // tests that inject a custom `evaluateGate` can also override the
-      // gate-on check (the default reads `env.TRIGGER_MOLLIFIER_ENABLED`,
-      // which is "0" in CI where no .env file is present).
-      const mollifierOutcome: GateOutcome | null = this.isMollifierGloballyEnabled()
-        ? await this.evaluateGate({
-            envId: environment.id,
-            orgId: environment.organizationId,
-            taskId,
-            orgFeatureFlags:
-              (environment.organization.featureFlags as Record<string, unknown> | null) ?? null,
-          })
-        : null;
-
-      try {
-        return await this.traceEventConcern.traceRun(
-          triggerRequest,
-          parentRun?.taskEventStore,
-          async (event, store) => {
-            event.setAttribute("queueName", queueName);
-            span.setAttribute("queueName", queueName);
-            event.setAttribute("runId", runFriendlyId);
-            span.setAttribute("runId", runFriendlyId);
-
-            const payloadPacket = await this.payloadProcessor.process(triggerRequest);
-
-            // Phase 1 dual-write: if the org has the mollifier feature flag
-            // enabled and the per-env trip evaluator says divert, write the
-            // canonical replay payload to the buffer AND continue through
-            // engine.trigger as normal. The buffer entry is an audit/preview
-            // copy; the drainer's no-op handler consumes it to prove the
-            // dequeue mechanism works. Phase 2 will replace engine.trigger
-            // (below) with a synthesised 200 response and rely on the
-            // drainer to perform the Postgres write via replay.
-            if (mollifierOutcome?.action === "mollify") {
-              const buffer = this.getMollifierBuffer();
-              if (buffer) {
-                const canonicalPayload = buildBufferedTriggerPayload({
+                const payloadPacket = await this.payloadProcessor.process(triggerRequest);
+
+                const baseEngineInput = this.#buildEngineTriggerInput({
                   runFriendlyId,
-                  taskId,
-                  envId: environment.id,
-                  envType: environment.type,
-                  envSlug: environment.slug,
-                  orgId: environment.organizationId,
-                  orgSlug: environment.organization.slug,
-                  projectId: environment.projectId,
-                  projectRef: environment.project.externalRef,
+                  environment,
+                  idempotencyKey,
+                  idempotencyKeyExpiresAt,
                   body,
-                  idempotencyKey: idempotencyKey ?? null,
-                  idempotencyKeyExpiresAt: idempotencyKey
-                    ? idempotencyKeyExpiresAt ?? null
-                    : null,
+                  options,
+                  queueName,
+                  lockedQueueId,
+                  workerQueue,
+                  enableFastPath,
+                  lockedToBackgroundWorker: lockedToBackgroundWorker ?? undefined,
+                  delayUntil,
+                  ttl,
+                  metadataPacket,
                   tags,
-                  parentRunFriendlyId: parentRun?.friendlyId ?? null,
-                  traceContext: event.traceContext,
-                  triggerSource,
-                  triggerAction,
-                  serviceOptions: options,
-                  createdAt: new Date(),
+                  depth,
+                  parentRun: parentRun ?? undefined,
+                  annotations,
+                  planType,
+                  taskId,
+                  payloadPacket,
+                  traceContext: this.#propagateExternalTraceContext(
+                    event.traceContext,
+                    parentRun?.traceContext,
+                    event.traceparent?.spanId
+                  ),
+                  traceId: event.traceId,
+                  spanId: event.spanId,
+                  parentSpanId:
+                    options.parentAsLinkType === "replay" ? undefined : event.traceparent?.spanId,
+                  taskEventStore: store,
                 });
 
-                try {
-                  const serialisedPayload = serialiseSnapshot(canonicalPayload);
-                  await buffer.accept({
-                    runId: runFriendlyId,
-                    envId: environment.id,
-                    orgId: environment.organizationId,
-                    payload: serialisedPayload,
-                  });
-                  // Light log on the hot path — keep this synchronous work
-                  // O(1) per trigger. The drainer computes the payload hash
-                  // off-path; operators correlate `mollifier.buffered` →
-                  // `mollifier.drained` by runId.
-                  logger.debug("mollifier.buffered", {
-                    runId: runFriendlyId,
-                    envId: environment.id,
-                    orgId: environment.organizationId,
-                    taskId,
-                    payloadBytes: serialisedPayload.length,
-                  });
-                } catch (err) {
-                  // Fail-open: buffer write must never block the customer's
-                  // trigger. engine.trigger below is the primary write path
-                  // in Phase 1 — the customer still gets a valid run.
-                  logger.error("mollifier.buffer_accept_failed", {
-                    runId: runFriendlyId,
-                    envId: environment.id,
-                    taskId,
-                    err: err instanceof Error ? err.message : String(err),
-                  });
+                const taskRun = await this.engine.trigger(
+                  {
+                    ...baseEngineInput,
+                    // onDebounced is a closure over webapp state (triggerRequest +
+                    // traceEventConcern) and can't be serialised into the mollifier
+                    // snapshot. The pass-through path attaches it here; the drainer
+                    // path replays without it. The debounce and triggerAndWait gate
+                    // bypasses ensure neither reaches the mollify branch.
+                    onDebounced:
+                      body.options?.debounce && body.options?.resumeParentOnCompletion
+                        ? async ({ existingRun, waitpoint, debounceKey }) => {
+                          return await this.traceEventConcern.traceDebouncedRun(
+                            triggerRequest,
+                            parentRun?.taskEventStore,
+                            {
+                              existingRun,
+                              debounceKey,
+                              incomplete: waitpoint.status === "PENDING",
+                              isError: waitpoint.outputIsError,
+                            },
+                            async (spanEvent) => {
+                              const spanId =
+                                options?.parentAsLinkType === "replay"
+                                  ? spanEvent.spanId
+                                  : spanEvent.traceparent?.spanId
+                                    ? `${spanEvent.traceparent.spanId}:${spanEvent.spanId}`
+                                    : spanEvent.spanId;
+                              return spanId;
+                            }
+                          );
+                        }
+                        : undefined,
+                  },
+                  this.prisma
+                );
+
+                // If the returned run has a different friendlyId, it was debounced.
+                // For triggerAndWait: stop the outer span since a replacement debounced span was created via onDebounced.
+                // For regular trigger: let the span complete normally - no replacement span needed since the
+                // original run already has its span from when it was first created.
+                if (
+                  taskRun.friendlyId !== runFriendlyId &&
+                  body.options?.debounce &&
+                  body.options?.resumeParentOnCompletion
+                ) {
+                  event.stop();
                 }
-              }
-            }
 
-            const taskRun = await this.engine.trigger(
-              {
-                friendlyId: runFriendlyId,
-                environment: environment,
-                idempotencyKey,
-                idempotencyKeyExpiresAt: idempotencyKey ? idempotencyKeyExpiresAt : undefined,
-                idempotencyKeyOptions: body.options?.idempotencyKeyOptions,
-                taskIdentifier: taskId,
-                payload: payloadPacket.data ?? "",
-                payloadType: payloadPacket.dataType,
-                context: body.context,
-                traceContext: this.#propagateExternalTraceContext(
-                  event.traceContext,
-                  parentRun?.traceContext,
-                  event.traceparent?.spanId
-                ),
-                traceId: event.traceId,
-                spanId: event.spanId,
-                parentSpanId:
-                  options.parentAsLinkType === "replay" ? undefined : event.traceparent?.spanId,
-                replayedFromTaskRunFriendlyId: options.replayedFromTaskRunFriendlyId,
-                lockedToVersionId: lockedToBackgroundWorker?.id,
-                taskVersion: lockedToBackgroundWorker?.version,
-                sdkVersion: lockedToBackgroundWorker?.sdkVersion,
-                cliVersion: lockedToBackgroundWorker?.cliVersion,
-                concurrencyKey: body.options?.concurrencyKey,
-                queue: queueName,
-                lockedQueueId,
-                workerQueue,
-                enableFastPath,
-                isTest: body.options?.test ?? false,
-                delayUntil,
-                queuedAt: delayUntil ? undefined : new Date(),
-                maxAttempts: body.options?.maxAttempts,
-                taskEventStore: store,
-                ttl,
-                tags,
-                oneTimeUseToken: options.oneTimeUseToken,
-                parentTaskRunId: parentRun?.id,
-                rootTaskRunId: parentRun?.rootTaskRunId ?? parentRun?.id,
-                batch: options?.batchId
-                  ? {
-                    id: options.batchId,
-                    index: options.batchIndex ?? 0,
-                  }
-                  : undefined,
-                resumeParentOnCompletion: body.options?.resumeParentOnCompletion,
-                depth,
-                metadata: metadataPacket?.data,
-                metadataType: metadataPacket?.dataType,
-                seedMetadata: metadataPacket?.data,
-                seedMetadataType: metadataPacket?.dataType,
-                maxDurationInSeconds: body.options?.maxDuration
-                  ? clampMaxDuration(body.options.maxDuration)
-                  : undefined,
-                machine: body.options?.machine,
-                priorityMs: body.options?.priority ? body.options.priority * 1_000 : undefined,
-                queueTimestamp:
-                  options.queueTimestamp ??
-                  (parentRun && body.options?.resumeParentOnCompletion
-                    ? parentRun.queueTimestamp ?? undefined
-                    : undefined),
-                scheduleId: options.scheduleId,
-                scheduleInstanceId: options.scheduleInstanceId,
-                createdAt: options.overrideCreatedAt,
-                bulkActionId: body.options?.bulkActionId,
-                planType,
-                realtimeStreamsVersion: options.realtimeStreamsVersion,
-                streamBasinName: environment.organization.streamBasinName,
-                debounce: body.options?.debounce,
-                annotations,
-                // When debouncing with triggerAndWait, create a span for the debounced trigger
-                onDebounced:
-                  body.options?.debounce && body.options?.resumeParentOnCompletion
-                    ? async ({ existingRun, waitpoint, debounceKey }) => {
-                      return await this.traceEventConcern.traceDebouncedRun(
-                        triggerRequest,
-                        parentRun?.taskEventStore,
-                        {
-                          existingRun,
-                          debounceKey,
-                          incomplete: waitpoint.status === "PENDING",
-                          isError: waitpoint.outputIsError,
-                        },
-                        async (spanEvent) => {
-                          const spanId =
-                            options?.parentAsLinkType === "replay"
-                              ? spanEvent.spanId
-                              : spanEvent.traceparent?.spanId
-                                ? `${spanEvent.traceparent.spanId}:${spanEvent.spanId}`
-                                : spanEvent.spanId;
-                          return spanId;
-                        }
-                      );
-                    }
-                    : undefined,
-              },
-              this.prisma
-            );
+                const error = taskRun.error ? TaskRunError.parse(taskRun.error) : undefined;
 
-            // If the returned run has a different friendlyId, it was debounced.
-            // For triggerAndWait: stop the outer span since a replacement debounced span was created via onDebounced.
-            // For regular trigger: let the span complete normally - no replacement span needed since the
-            // original run already has its span from when it was first created.
-            if (
-              taskRun.friendlyId !== runFriendlyId &&
-              body.options?.debounce &&
-              body.options?.resumeParentOnCompletion
-            ) {
-              event.stop();
-            }
+                if (error) {
+                  event.failWithError(error);
+                }
 
-            const error = taskRun.error ? TaskRunError.parse(taskRun.error) : undefined;
+                const result = { run: taskRun, error, isCached: false };
 
-            if (error) {
-              event.failWithError(error);
-            }
+                if (result?.error) {
+                  throw new ServiceValidationError(
+                    taskRunErrorToString(taskRunErrorEnhancer(result.error))
+                  );
+                }
 
-            const result = { run: taskRun, error, isCached: false };
+                return result;
+              }
+            );
+          } catch (error) {
+            if (error instanceof RunDuplicateIdempotencyKeyError) {
+              //retry calling this function, because this time it will return the idempotent run
+              return await this.call({
+                taskId,
+                environment,
+                body,
+                options: { ...options, runFriendlyId },
+                attempt: attempt + 1,
+              });
+            }
 
-            if (result?.error) {
+            if (error instanceof RunOneTimeUseTokenError) {
               throw new ServiceValidationError(
-                taskRunErrorToString(taskRunErrorEnhancer(result.error))
+                `Cannot trigger ${taskId} with a one-time use token as it has already been used.`
               );
             }
 
-            return result;
+            throw error;
           }
-        );
-      } catch (error) {
-        if (error instanceof RunDuplicateIdempotencyKeyError) {
-          //retry calling this function, because this time it will return the idempotent run
-          return await this.call({
-            taskId,
-            environment,
-            body,
-            options: { ...options, runFriendlyId },
-            attempt: attempt + 1,
-          });
-        }
-
-        if (error instanceof RunOneTimeUseTokenError) {
-          throw new ServiceValidationError(
-            `Cannot trigger ${taskId} with a one-time use token as it has already been used.`
-          );
-        }
-
-        throw error;
+        },
+      );
+      // Pipeline returned successfully — publish the claim if we held
+      // one. Waiters polling for our key resolve to this runId.
+      if (idempotencyClaim && result?.run?.friendlyId) {
+        await publishMollifierClaim({
+          envId: idempotencyClaim.envId,
+          taskIdentifier: idempotencyClaim.taskIdentifier,
+          idempotencyKey: idempotencyClaim.idempotencyKey,
+          token: idempotencyClaim.token,
+          runId: result.run.friendlyId,
+        });
+      }
+      return result;
+    } catch (err) {
+      // Pipeline threw — release the claim so the next claimant can
+      // retry. Re-throw so the caller sees the original error.
+      if (idempotencyClaim) {
+        await releaseMollifierClaim(idempotencyClaim);
       }
-    });
+      throw err;
+    }
+  }
+
+  // Builds the engine.trigger() input from the values gathered during
+  // this.call(), so the mollify path can construct the same input shape
+  // without re-entering the trace-run span. `onDebounced` is deliberately
+  // not part of the result: the pass-through path spreads this object and
+  // attaches it inline; the mollify path serialises it for drainer replay.
+  #buildEngineTriggerInput(args: {
+    runFriendlyId: string;
+    environment: AuthenticatedEnvironment;
+    idempotencyKey?: string;
+    idempotencyKeyExpiresAt?: Date;
+    body: TriggerTaskRequest["body"];
+    options: TriggerTaskServiceOptions;
+    queueName: string;
+    lockedQueueId?: string;
+    workerQueue?: string;
+    enableFastPath: boolean;
+    lockedToBackgroundWorker?: { id: string; version: string; sdkVersion: string; cliVersion: string };
+    delayUntil?: Date;
+    ttl?: string;
+    metadataPacket?: { data?: string; dataType: string };
+    tags: string[];
+    depth: number;
+    parentRun?: { id: string; rootTaskRunId?: string | null; queueTimestamp?: Date | null; taskEventStore?: string };
+    annotations: {
+      triggerSource: string;
+      triggerAction: string;
+      rootTriggerSource: string;
+      rootScheduleId?: string | undefined;
+    };
+    planType?: string;
+    taskId: string;
+    payloadPacket: { data?: string; dataType: string };
+    traceContext: TriggerTraceContext;
+    traceId: string;
+    spanId: string;
+    parentSpanId: string | undefined;
+    taskEventStore: string;
+  }) {
+    return {
+      friendlyId: args.runFriendlyId,
+      environment: args.environment,
+      idempotencyKey: args.idempotencyKey,
+      idempotencyKeyExpiresAt: args.idempotencyKey ? args.idempotencyKeyExpiresAt : undefined,
+      idempotencyKeyOptions: args.body.options?.idempotencyKeyOptions,
+      taskIdentifier: args.taskId,
+      payload: args.payloadPacket.data ?? "",
+      payloadType: args.payloadPacket.dataType,
+      context: args.body.context,
+      traceContext: args.traceContext,
+      traceId: args.traceId,
+      spanId: args.spanId,
+      parentSpanId: args.parentSpanId,
+      replayedFromTaskRunFriendlyId: args.options.replayedFromTaskRunFriendlyId,
+      lockedToVersionId: args.lockedToBackgroundWorker?.id,
+      taskVersion: args.lockedToBackgroundWorker?.version,
+      sdkVersion: args.lockedToBackgroundWorker?.sdkVersion,
+      cliVersion: args.lockedToBackgroundWorker?.cliVersion,
+      // Schema-level coercion now lands `body.options.concurrencyKey` as
+      // `string` on the API path, but the BatchQueue worker rebuilds
+      // body.options from Redis-stored items (Record<string, unknown>),
+      // which can still carry the pre-fix shape from in-flight batches.
+      concurrencyKey:
+        typeof args.body.options?.concurrencyKey === "number"
+          ? String(args.body.options.concurrencyKey)
+          : args.body.options?.concurrencyKey,
+      queue: args.queueName,
+      lockedQueueId: args.lockedQueueId,
+      workerQueue: args.workerQueue,
+      enableFastPath: args.enableFastPath,
+      isTest: args.body.options?.test ?? false,
+      delayUntil: args.delayUntil,
+      queuedAt: args.delayUntil ? undefined : new Date(), // left unset when the run is delayed
+      maxAttempts: args.body.options?.maxAttempts,
+      taskEventStore: args.taskEventStore,
+      ttl: args.ttl,
+      tags: args.tags,
+      oneTimeUseToken: args.options.oneTimeUseToken,
+      parentTaskRunId: args.parentRun?.id,
+      rootTaskRunId: args.parentRun?.rootTaskRunId ?? args.parentRun?.id, // parent is the root if it has none
+      batch: args.options?.batchId
+        ? { id: args.options.batchId, index: args.options.batchIndex ?? 0 }
+        : undefined,
+      resumeParentOnCompletion: args.body.options?.resumeParentOnCompletion,
+      depth: args.depth,
+      metadata: args.metadataPacket?.data,
+      metadataType: args.metadataPacket?.dataType,
+      seedMetadata: args.metadataPacket?.data, // seed* fields reuse the same packet as metadata*
+      seedMetadataType: args.metadataPacket?.dataType,
+      maxDurationInSeconds: args.body.options?.maxDuration
+        ? clampMaxDuration(args.body.options.maxDuration)
+        : undefined,
+      machine: args.body.options?.machine,
+      priorityMs: args.body.options?.priority ? args.body.options.priority * 1_000 : undefined,
+      queueTimestamp:
+        args.options.queueTimestamp ??
+        (args.parentRun && args.body.options?.resumeParentOnCompletion
+          ? args.parentRun.queueTimestamp ?? undefined
+          : undefined),
+      scheduleId: args.options.scheduleId,
+      scheduleInstanceId: args.options.scheduleInstanceId,
+      createdAt: args.options.overrideCreatedAt,
+      bulkActionId: args.body.options?.bulkActionId,
+      planType: args.planType,
+      realtimeStreamsVersion: args.options.realtimeStreamsVersion,
+      streamBasinName: args.environment.organization.streamBasinName,
+      debounce: args.body.options?.debounce,
+      annotations: args.annotations,
+    };
   }
 
   #propagateExternalTraceContext(
diff --git a/apps/webapp/app/v3/mollifier/applyMetadataMutation.server.ts b/apps/webapp/app/v3/mollifier/applyMetadataMutation.server.ts
new file mode 100644
index 00000000000..ee6419cc381
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/applyMetadataMutation.server.ts
@@ -0,0 +1,204 @@
+import { applyMetadataOperations } from "@trigger.dev/core/v3";
+import type { FlushedRunMetadata } from "@trigger.dev/core/v3/schemas";
+import { RunId } from "@trigger.dev/core/v3/isomorphic";
+import type { MollifierBuffer } from "@trigger.dev/redis-worker";
+import { logger } from "~/services/logger.server";
+import { getMollifierBuffer } from "./mollifierBuffer.server";
+
+// On `applied` we surface the parent/root friendlyIds captured during
+// the snapshot read. Callers that fan parent/root metadata operations
+// out to their respective runs can use these without a second
+// `findRunByIdWithMollifierFallback` round trip — and, more importantly,
+// without racing the drainer's terminal-failure path (which atomically
+// DELetes the entry hash). Without these on the outcome the second
+// read can come back null mid-route, silently dropping the caller's
+// parentOperations / rootOperations after the primary mutation already
+// landed on the snapshot.
+//
+// FriendlyIds (not internal cuids) because the consuming
+// `routeOperationsToRun` helper gates on the `run_…` prefix to decide
+// whether to attempt the buffer fallback; cuids would skip that path.
+// The snapshot's `parentTaskRunId` / `rootTaskRunId` are engine-side
+// cuids, so we convert via `RunId.toFriendlyId` here — identical to
+// what `readFallback.server.ts` does when assembling its SyntheticRun.
+export type ApplyMetadataMutationOutcome =
+  | {
+      kind: "applied"; // the CAS write landed
+      newMetadata: Record<string, unknown>; // the metadata object that was written
+      parentTaskRunFriendlyId: string | undefined;
+      rootTaskRunFriendlyId: string | undefined;
+    }
+  | { kind: "not_found" } // no buffer/entry, env or org mismatch, or CAS reported not_found
+  | { kind: "busy" } // entry not QUEUED or already materialised, or CAS reported busy
+  | { kind: "version_exhausted" } // every CAS attempt ended in a version conflict
+  // Mirrors the PG-side `MetadataTooLargeError` (status 413). Carries
+  // the limit + observed size so the route can produce a useful body.
+  | { kind: "metadata_too_large"; maximumSize: number; observedSize: number };
+
+// Apply a metadata PUT (body.metadata replace AND/OR body.operations
+// deltas) to a buffered run's snapshot. Mirrors the PG-side
+// `UpdateMetadataService.#updateRunMetadataWithOperations` retry loop:
+// read snapshot → apply operations in JS → CAS-write back with the
+// observed `metadataVersion`. Retries on conflict; bounded by
+// `maxRetries`. The Lua CAS is the atomicity primitive — concurrent
+// callers never lose an increment / append / set.
+export async function applyMetadataMutationToBufferedRun(input: {
+  runId: string;
+  // Env+org scoping closes a cross-environment write gap on the buffer
+  // path: the route's PG path is already env-scoped via Prisma filters,
+  // and this helper now enforces the same isolation before any buffer
+  // write so a caller authed in env A can't mutate a buffered run that
+  // belongs to env B.
+  environmentId: string;
+  organizationId: string;
+  // Byte-size cap on the resulting metadata payload, mirroring the
+  // PG-side `UpdateMetadataService.maximumSize` (sourced from
+  // `env.TASK_RUN_METADATA_MAXIMUM_SIZE`). Required so the buffer path
+  // doesn't silently allow writes the PG path would have rejected.
+  maximumSize: number;
+  body: Pick<FlushedRunMetadata, "metadata" | "operations">;
+  buffer?: MollifierBuffer | null;
+  maxRetries?: number;
+}): Promise<ApplyMetadataMutationOutcome> {
+  const buffer = input.buffer ?? getMollifierBuffer();
+  if (!buffer) return { kind: "not_found" };
+
+  // Default retry budget tuned for buffered-window concurrency. The
+  // PG-side `UpdateMetadataService` uses 3, which is fine when the only
+  // writer is the executing task itself. For a buffered run the writers
+  // are external API callers, and N parallel writers exhaust 3 retries
+  // quickly under contention. Bumping to 12 covers ~50-way concurrency
+  // with sub-percent failure probability; the cost is bounded (each
+  // retry is one Redis Lua call ~1ms).
+  const maxRetries = input.maxRetries ?? 12;
+  for (let attempt = 0; attempt <= maxRetries; attempt++) {
+    const entry = await buffer.getEntry(input.runId);
+    if (!entry) return { kind: "not_found" };
+    // Env+org check: an entry from a different env is treated as a
+    // miss (not 403) so existence in other envs doesn't leak.
+    if (entry.envId !== input.environmentId || entry.orgId !== input.organizationId) {
+      return { kind: "not_found" };
+    }
+    if (entry.status !== "QUEUED" || entry.materialised) {
+      return { kind: "busy" };
+    }
+
+    const snapshot = JSON.parse(entry.payload) as Record<string, unknown>;
+    const currentMetadataType =
+      typeof snapshot.metadataType === "string" ? snapshot.metadataType : "application/json";
+
+    // Capture parent/root ids during this read so the caller can fan
+    // parent/root operations out without a second buffer.getEntry. If the
+    // drainer's terminal-failure path runs between our CAS-write below and
+    // the route's follow-up, the entry hash would be deleted and a second
+    // read would return null — silently dropping the caller's
+    // `body.parentOperations` / `body.rootOperations`. The ids are immutable
+    // for a run, so capturing them on any loop iteration is fine.
+    const snapshotParentTaskRunInternalId =
+      typeof snapshot.parentTaskRunId === "string" ? snapshot.parentTaskRunId : undefined;
+    const snapshotParentTaskRunFriendlyId = snapshotParentTaskRunInternalId
+      ? RunId.toFriendlyId(snapshotParentTaskRunInternalId)
+      : undefined;
+    const snapshotRootTaskRunInternalId =
+      typeof snapshot.rootTaskRunId === "string" ? snapshot.rootTaskRunId : undefined;
+    const snapshotRootTaskRunFriendlyId = snapshotRootTaskRunInternalId
+      ? RunId.toFriendlyId(snapshotRootTaskRunInternalId)
+      : undefined;
+
+    // Match PG semantics: `body.operations` and `body.metadata` are
+    // mutually exclusive on a single request. The PG service
+    // (`UpdateMetadataService.#updateRunMetadata`) branches on
+    // `Array.isArray(body.operations)` — if operations are present it
+    // applies them on top of the EXISTING metadata and ignores
+    // `body.metadata` entirely; otherwise `body.metadata` is the new
+    // full value. Doing both here would make, with existing metadata
+    // `{a:1}`, a request like `{ metadata: {b:2}, operations: [set c=3] }`
+    // produce `{b:2,c:3}` on the buffer vs `{a:1,c:3}` on PG, silently
+    // changing semantics across the buffered/materialised boundary.
+    const parseSnapshotMetadata = (): Record<string, unknown> => {
+      if (typeof snapshot.metadata !== "string") return {};
+      try {
+        const parsed: unknown = JSON.parse(snapshot.metadata);
+        // Non-object JSON (null, array, primitive) is treated like a parse failure.
+        return typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)
+          ? (parsed as Record<string, unknown>)
+          : {};
+      } catch {
+        return {};
+      }
+    };
+
+    let metadataObject: Record<string, unknown>;
+    // Use `Array.isArray` (the PG service's predicate), not a truthy
+    // `.length` check. For `{ metadata, operations: [] }` PG sees
+    // Array.isArray([])=true and no-ops on existing metadata; a `.length`
+    // check would fall through to the `body.metadata` branch and replace
+    // metadata — exactly the cross-boundary drift described above.
+    if (Array.isArray(input.body.operations)) {
+      // Operations take precedence: apply on top of existing snapshot
+      // metadata; ignore `body.metadata` to match PG behaviour.
+      metadataObject = applyMetadataOperations(
+        parseSnapshotMetadata(),
+        input.body.operations,
+      ).newMetadata;
+    } else if (input.body.metadata !== undefined) {
+      // No operations — full replace.
+      metadataObject = input.body.metadata as Record<string, unknown>;
+    } else {
+      // Neither — write back existing snapshot metadata (no-op shape).
+      metadataObject = parseSnapshotMetadata();
+    }
+
+    const newMetadataStr = JSON.stringify(metadataObject);
+
+    // Size cap — match PG (`handleMetadataPacket` throws
+    // `MetadataTooLargeError` (413) when the JSON-encoded packet
+    // exceeds the configured cap). Reject in-loop, before CAS, so a
+    // single oversize write doesn't churn the retry budget.
+    const observedSize = Buffer.byteLength(newMetadataStr, "utf8");
+    if (observedSize > input.maximumSize) {
+      return {
+        kind: "metadata_too_large",
+        maximumSize: input.maximumSize,
+        observedSize,
+      };
+    }
+
+    const cas = await buffer.casSetMetadata({
+      runId: input.runId,
+      expectedVersion: entry.metadataVersion,
+      newMetadata: newMetadataStr,
+      newMetadataType: currentMetadataType,
+    });
+
+    if (cas.kind === "applied") {
+      return {
+        kind: "applied",
+        newMetadata: metadataObject,
+        parentTaskRunFriendlyId: snapshotParentTaskRunFriendlyId,
+        rootTaskRunFriendlyId: snapshotRootTaskRunFriendlyId,
+      };
+    }
+    if (cas.kind === "not_found") return { kind: "not_found" };
+    if (cas.kind === "busy") return { kind: "busy" };
+    // version_conflict — another caller wrote between our read + CAS.
+    // Budget spent: skip the backoff sleep, there is nothing left to retry.
+    if (attempt === maxRetries) break;
+    logger.debug("applyMetadataMutationToBufferedRun: version_conflict, retrying", {
+      runId: input.runId,
+      attempt,
+      observedVersion: entry.metadataVersion,
+      currentVersion: cas.currentVersion,
+    });
+    // Small jittered backoff so a thundering herd of N retriers doesn't
+    // all re-read + re-CAS at exactly the same moment.
+    const backoffMs = Math.floor(Math.random() * (5 + attempt * 5));
+    await new Promise((resolve) => setTimeout(resolve, backoffMs));
+  }
+
+  logger.warn("applyMetadataMutationToBufferedRun: retries exhausted", {
+    runId: input.runId,
+    maxRetries,
+  });
+  return { kind: "version_exhausted" };
+}
diff --git a/apps/webapp/app/v3/mollifier/bufferedTriggerPayload.server.ts b/apps/webapp/app/v3/mollifier/bufferedTriggerPayload.server.ts
index d251e9f98e8..287b5bf9bcb 100644
--- a/apps/webapp/app/v3/mollifier/bufferedTriggerPayload.server.ts
+++ b/apps/webapp/app/v3/mollifier/bufferedTriggerPayload.server.ts
@@ -2,17 +2,17 @@ import type { TriggerTaskRequestBody } from "@trigger.dev/core/v3";
 import type { TriggerTaskServiceOptions } from "~/v3/services/triggerTask.server";
 
 // Canonical payload shape written to the mollifier buffer when the gate
-// decides to mollify a trigger. Phase 1 ALSO calls engine.trigger directly
-// (dual-write) so this is currently an audit/preview record. Phase 2 will
-// make the buffer the primary write path: the drainer's handler will read
-// this payload and replay it through engine.trigger to create the run in
-// Postgres, and read-fallback endpoints will synthesise a Run view from it
-// while it is still QUEUED.
+// decides to mollify a trigger. At this stage the call site ALSO calls
+// engine.trigger directly (dual-write), so this is currently an
+// audit/preview record. A later change makes the buffer the primary write
+// path: the drainer's handler reads this payload and replays it through
+// engine.trigger to create the run in Postgres, and read-fallback
+// endpoints synthesise a Run view from it while it is still QUEUED.
 //
-// CONTRACT: this shape must contain everything needed for Phase 2's
-// drainer-replay to reconstruct an equivalent engine.trigger call. Phase 1
-// emits it to logs; Phase 2 will serialise it into Redis and rebuild it on
-// the drain side. Keep it serialisable — no functions, no class instances.
+// CONTRACT: this shape must contain everything the drainer-replay needs to
+// reconstruct an equivalent engine.trigger call. Today it is emitted to
+// logs; later it is serialised into Redis and rebuilt on the drain side.
+// Keep it serialisable — no functions, no class instances.
 export type BufferedTriggerPayload = {
   runFriendlyId: string;
 
diff --git a/apps/webapp/app/v3/mollifier/idempotencyClaim.server.ts b/apps/webapp/app/v3/mollifier/idempotencyClaim.server.ts
new file mode 100644
index 00000000000..47c9733c927
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/idempotencyClaim.server.ts
@@ -0,0 +1,218 @@
+import { randomUUID } from "node:crypto";
+import type {
+  IdempotencyClaimResult,
+  IdempotencyLookupInput,
+  MollifierBuffer,
+} from "@trigger.dev/redis-worker";
+import { logger } from "~/services/logger.server";
+import { getMollifierBuffer } from "./mollifierBuffer.server";
+
+// Tunables. Default TTL (seconds) on the claim key, bounded by typical
+// trigger-pipeline dwell: long enough that a slow PG insert doesn't expire
+// mid-flight, short enough that a crashed claimant unblocks waiters quickly.
+export const DEFAULT_CLAIM_TTL_SECONDS = 30;
+// Default `safetyNetMs`: caps how long a waiter blocks before returning
+// timed_out. Matches the mutateWithFallback safety net so SDK retry
+// policies don't have to special-case this path.
+export const DEFAULT_CLAIM_WAIT_MS = 5_000;
+export const DEFAULT_CLAIM_POLL_MS = 25; // default `pollStepMs`: sleep between polls while pending
+
+export type ClaimOrAwaitOutcome =
+  // We own the claim. `token` MUST be passed to publishClaim/releaseClaim
+  // so the buffer can compare-and-act against our ownership marker — a
+  // late release from a previous claimant whose TTL expired cannot erase
+  // our slot. `token` may be "" when no buffer is available (fail-open).
+  | { kind: "claimed"; token: string }
+  | { kind: "resolved"; runId: string } // someone else's runId; caller returns isCached:true
+  | { kind: "timed_out" }; // still pending when the safety net elapsed, or abortSignal fired
+
+export type ClaimOrAwaitInput = IdempotencyLookupInput & {
+  ttlSeconds?: number; // claim-key TTL; defaults to DEFAULT_CLAIM_TTL_SECONDS
+  safetyNetMs?: number; // max time to wait on a pending claim; defaults to DEFAULT_CLAIM_WAIT_MS
+  pollStepMs?: number; // sleep between polls; defaults to DEFAULT_CLAIM_POLL_MS
+  abortSignal?: AbortSignal; // checked at the top of each poll iteration; abort yields timed_out
+  // Test injection. Omit `buffer` to use getMollifierBuffer(); null forces fail-open.
+  buffer?: MollifierBuffer | null;
+  now?: () => number;
+  sleep?: (ms: number) => Promise<void>;
+  // Test override for the ownership-token generator. Defaults to
+  // `crypto.randomUUID()`. Tests pass a deterministic value so they
+  // can assert publish/release pass-through.
+  generateToken?: () => string;
+};
+
+// Pre-gate Redis claim. All same-key triggers serialise through here
+// before the trigger pipeline runs. Returning `resolved` short-circuits
+// the trigger entirely — the caller responds with the cached runId.
+// Returning `claimed` means we own the claim and MUST publish the
+// winning runId on success (`publishClaim`) or release the claim on
+// failure (`releaseClaim`). `timed_out` means we waited without resolving.
+//
+// Failure modes:
+// - Redis down at claim time: returns `claimed` (fail open, no
+//   coordination). Customer is no worse than today's race; the
+//   PG unique constraint is the eventual arbiter.
+// - Claimant crashes mid-pipeline: claim TTL expires, waiters
+//   eventually time out, SDK retries.
+// - PG/buffer publish failure: waiters time out and SDK retries; next
+//   attempt sees the eventual PG/buffer state via existing
+//   IdempotencyKeyConcern PG-first lookup.
+export async function claimOrAwait(input: ClaimOrAwaitInput): Promise<ClaimOrAwaitOutcome> {
+  const buffer = input.buffer === undefined ? getMollifierBuffer() : input.buffer;
+  if (!buffer) {
+    // Mollifier disabled / buffer construction failed. Fall open —
+    // caller proceeds with the trigger pipeline (PG unique constraint
+    // backstop). The token is never read in this case (publish/release
+    // are buffer-null no-ops downstream), so we skip the default
+    // `randomUUID()` to keep the mollifier-OFF hot path allocation-free
+    // for idempotency-keyed triggers — `triggerTask` is the
+    // highest-throughput code path in the system. A test-injected
+    // generator is still honoured for deterministic assertions.
+    return { kind: "claimed", token: input.generateToken ? input.generateToken() : "" };
+  }
+  const generateToken = input.generateToken ?? randomUUID;
+  // Generate the ownership token up front so the retry loop reuses it
+  // — we're the same logical claimant across attempts; only the slot
+  // owner changes between releases.
+  const token = generateToken();
+  const ttlSeconds = input.ttlSeconds ?? DEFAULT_CLAIM_TTL_SECONDS;
+  const safetyNetMs = input.safetyNetMs ?? DEFAULT_CLAIM_WAIT_MS;
+  const pollStepMs = input.pollStepMs ?? DEFAULT_CLAIM_POLL_MS;
+  const now = input.now ?? Date.now;
+  const sleep = input.sleep ?? defaultSleep;
+
+  const lookupInput: IdempotencyLookupInput = {
+    envId: input.envId,
+    taskIdentifier: input.taskIdentifier,
+    idempotencyKey: input.idempotencyKey,
+  };
+
+  // Initial claim attempt. Most production-path calls resolve here on
+  // the first call (either we win, or the key is already resolved from
+  // a prior burst).
+  let result: IdempotencyClaimResult;
+  try {
+    result = await buffer.claimIdempotency({ ...lookupInput, token, ttlSeconds });
+  } catch (err) {
+    logger.warn("idempotency claim failed (fail-open)", {
+      envId: input.envId,
+      taskIdentifier: input.taskIdentifier,
+      err: err instanceof Error ? err.message : String(err),
+    });
+    return { kind: "claimed", token }; // fail open: caller proceeds uncoordinated
+  }
+
+  if (result.kind === "claimed") return { kind: "claimed", token };
+  if (result.kind === "resolved") return result;
+
+  // result.kind === "pending" — wait/poll loop. May see the value flip
+  // to "resolved" (winner published), the key vanish (winner released
+  // on error → retry claim), or stay "pending" until the safety net.
+  const deadline = now() + safetyNetMs;
+  while (now() < deadline) {
+    if (input.abortSignal?.aborted) return { kind: "timed_out" }; // abort is reported as timed_out
+    await sleep(pollStepMs); // sleeps before re-checking, so the wait may overshoot by up to one step
+
+    let current: IdempotencyClaimResult | null;
+    try {
+      current = await buffer.readClaim(lookupInput);
+    } catch (err) {
+      // Transient read failure — keep polling until deadline.
+      logger.warn("idempotency claim read failed mid-poll", {
+        err: err instanceof Error ? err.message : String(err),
+      });
+      continue;
+    }
+
+    if (current === null) {
+      // Claimant released on error. Re-attempt the claim — one of the
+      // waiters will win, the rest see "pending" again. Reuse our token:
+      // we're still the same logical claimant, just contending for a
+      // freshly empty slot.
+      try {
+        const retry = await buffer.claimIdempotency({ ...lookupInput, token, ttlSeconds });
+        if (retry.kind === "claimed") return { kind: "claimed", token };
+        if (retry.kind === "resolved") return retry;
+        // "pending" again → keep polling.
+      } catch (err) {
+        logger.warn("idempotency claim retry failed", {
+          err: err instanceof Error ? err.message : String(err),
+        });
+        return { kind: "claimed", token }; // fail open, same as the initial claim attempt
+      }
+      continue;
+    }
+    if (current.kind === "resolved") return current;
+    // current.kind === "pending" → keep polling.
+  }
+  return { kind: "timed_out" };
+}
+
+// Publish the winning runId so waiters resolve. Best-effort: failure
+// here means waiters will time out and the SDK will retry, which will
+// then find the row via the existing IdempotencyKeyConcern PG-first
+// check. Never throws: buffer errors are logged and swallowed.
+export async function publishClaim(input: {
+  envId: string;
+  taskIdentifier: string;
+  idempotencyKey: string;
+  // Ownership token from the `claimed` outcome. Buffer compare-and-sets
+  // on this so a publish from a stale claimant (TTL expired, another
+  // claimant moved in) is a no-op rather than overwriting their claim.
+  token: string;
+  runId: string;
+  ttlSeconds?: number; // forwarded to the buffer; defaults to DEFAULT_CLAIM_TTL_SECONDS
+  buffer?: MollifierBuffer | null; // test injection; null makes this call a no-op
+}): Promise<void> {
+  const buffer = input.buffer === undefined ? getMollifierBuffer() : input.buffer;
+  if (!buffer) return; // no buffer available: nothing to publish
+  const ttlSeconds = input.ttlSeconds ?? DEFAULT_CLAIM_TTL_SECONDS;
+  try {
+    await buffer.publishClaim({
+      envId: input.envId,
+      taskIdentifier: input.taskIdentifier,
+      idempotencyKey: input.idempotencyKey,
+      token: input.token,
+      runId: input.runId,
+      ttlSeconds,
+    });
+  } catch (err) {
+    logger.warn("idempotency claim publish failed", {
+      envId: input.envId,
+      taskIdentifier: input.taskIdentifier,
+      err: err instanceof Error ? err.message : String(err),
+    });
+  }
+}
+
+// Release on pipeline failure. Best-effort and never throws. If the
+// release fails, the claim TTL is the safety net — waiters time out, SDK retries.
+export async function releaseClaim(input: {
+  envId: string;
+  taskIdentifier: string;
+  idempotencyKey: string;
+  // Ownership token from the `claimed` outcome. Buffer compare-and-
+  // deletes on this so a release from a stale claimant whose TTL
+  // expired can't wipe a new owner's claim.
+  token: string;
+  buffer?: MollifierBuffer | null; // test injection; null makes this call a no-op
+}): Promise<void> {
+  const buffer = input.buffer === undefined ? getMollifierBuffer() : input.buffer;
+  if (!buffer) return; // no buffer available: nothing to release
+  try {
+    await buffer.releaseClaim({
+      envId: input.envId,
+      taskIdentifier: input.taskIdentifier,
+      idempotencyKey: input.idempotencyKey,
+      token: input.token,
+    });
+  } catch (err) {
+    logger.warn("idempotency claim release failed", {
+      err: err instanceof Error ? err.message : String(err), // NOTE(review): no envId/taskIdentifier here, unlike publishClaim
+    });
+  }
+}
+
+function defaultSleep(ms: number): Promise<void> { // default `sleep` for claimOrAwait when none is injected
+  return new Promise((resolve) => setTimeout(resolve, ms)); // resolves after `ms` milliseconds; never rejects
+}
diff --git a/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts b/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts
index 9c8917623e4..09b52aa9da3 100644
--- a/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts
+++ b/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts
@@ -22,7 +22,6 @@ function initializeMollifierBuffer(): MollifierBuffer {
       enableAutoPipelining: true,
       ...(env.TRIGGER_MOLLIFIER_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} }),
     },
-    entryTtlSeconds: env.TRIGGER_MOLLIFIER_ENTRY_TTL_S,
   });
 }
 
diff --git a/apps/webapp/app/v3/mollifier/mollifierDrainer.server.ts b/apps/webapp/app/v3/mollifier/mollifierDrainer.server.ts
index 139aeaf9a6e..26ac60f180f 100644
--- a/apps/webapp/app/v3/mollifier/mollifierDrainer.server.ts
+++ b/apps/webapp/app/v3/mollifier/mollifierDrainer.server.ts
@@ -1,10 +1,16 @@
-import { createHash } from "node:crypto";
-import { MollifierDrainer, serialiseSnapshot } from "@trigger.dev/redis-worker";
+import { MollifierDrainer } from "@trigger.dev/redis-worker";
+import { prisma } from "~/db.server";
 import { env } from "~/env.server";
+import { engine as runEngine } from "~/v3/runEngine.server";
 import { logger } from "~/services/logger.server";
 import { singleton } from "~/utils/singleton";
 import { getMollifierBuffer } from "./mollifierBuffer.server";
-import type { BufferedTriggerPayload } from "./bufferedTriggerPayload.server";
+import {
+  createDrainerHandler,
+  createDrainerTerminalFailureHandler,
+  isRetryablePgError,
+} from "./mollifierDrainerHandler.server";
+import type { MollifierSnapshot } from "./mollifierSnapshot.server";
 
 // Distinct error class for the deterministic "fail loud at boot" throws
 // below. The bootstrap in `mollifierDrainerWorker.server.ts` catches
@@ -25,7 +31,7 @@ export class MollifierConfigurationError extends Error {
   }
 }
 
-function initializeMollifierDrainer(): MollifierDrainer<BufferedTriggerPayload> {
+function initializeMollifierDrainer(): MollifierDrainer<MollifierSnapshot> {
   const buffer = getMollifierBuffer();
   if (!buffer) {
     // Unreachable in normal config: getMollifierDrainer() gates on the
@@ -68,40 +74,14 @@ function initializeMollifierDrainer(): MollifierDrainer<BufferedTriggerPayload>
     maxAttempts: env.TRIGGER_MOLLIFIER_DRAIN_MAX_ATTEMPTS,
   });
 
-  // Phase 1 handler: no-op ack. The trigger has ALREADY been written to
-  // Postgres via engine.trigger (dual-write at the call site). Popping +
-  // acking here proves the dequeue mechanism works end-to-end without
-  // duplicating the work. Phase 2 will replace this with an engine.trigger
-  // replay that performs the actual Postgres write.
-  const drainer = new MollifierDrainer<BufferedTriggerPayload>({
+  const drainer = new MollifierDrainer<MollifierSnapshot>({
     buffer,
-    handler: async (input) => {
-      // Hash the (re-serialised, canonical) payload on the drain side rather
-      // than on the trigger hot path. Burst-time CPU stays with engine.trigger;
-      // the drainer is the natural place for the audit-equivalence checksum.
-      // Re-serialisation is identity for the BufferedTriggerPayload shape
-      // (only strings/numbers/plain objects), so this hash matches what the
-      // call site wrote into Redis.
-      const reserialised = serialiseSnapshot(input.payload);
-      const payloadHash = createHash("sha256").update(reserialised).digest("hex");
-      logger.info("mollifier.drained", {
-        runId: input.runId,
-        envId: input.envId,
-        orgId: input.orgId,
-        taskId: input.payload.taskId,
-        attempts: input.attempts,
-        ageMs: Date.now() - input.createdAt.getTime(),
-        payloadBytes: reserialised.length,
-        payloadHash,
-      });
-    },
+    handler: createDrainerHandler({ engine: runEngine, prisma }),
+    onTerminalFailure: createDrainerTerminalFailureHandler({ engine: runEngine, prisma }),
     concurrency: env.TRIGGER_MOLLIFIER_DRAIN_CONCURRENCY,
     maxAttempts: env.TRIGGER_MOLLIFIER_DRAIN_MAX_ATTEMPTS,
     maxOrgsPerTick: env.TRIGGER_MOLLIFIER_DRAIN_MAX_ORGS_PER_TICK,
-    // A no-op handler shouldn't throw, but if something does (e.g. an
-    // unexpected deserialise failure), don't loop — let it FAIL terminally
-    // so the entry is observable in metrics.
-    isRetryable: () => false,
+    isRetryable: isRetryablePgError,
   });
 
   return drainer;
@@ -114,7 +94,7 @@ function initializeMollifierDrainer(): MollifierDrainer<BufferedTriggerPayload>
 // handler registration, leaving a narrow window where a SIGTERM landing
 // between `start()` and `process.once("SIGTERM", ...)` would skip the
 // graceful stop. The split is intentional.
-export function getMollifierDrainer(): MollifierDrainer<BufferedTriggerPayload> | null {
+export function getMollifierDrainer(): MollifierDrainer<MollifierSnapshot> | null {
   if (env.TRIGGER_MOLLIFIER_ENABLED !== "1") return null;
   return singleton("mollifierDrainer", initializeMollifierDrainer);
 }
diff --git a/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts
new file mode 100644
index 00000000000..6e829baa575
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts
@@ -0,0 +1,403 @@
+import { context, trace, TraceFlags } from "@opentelemetry/api";
+import type { RunEngine } from "@internal/run-engine";
+import type { PrismaClientOrTransaction } from "@trigger.dev/database";
+import { RunId } from "@trigger.dev/core/v3/isomorphic";
+import type {
+  MollifierDrainerHandler,
+  MollifierDrainerTerminalFailureHandler,
+} from "@trigger.dev/redis-worker";
+import { logger } from "~/services/logger.server";
+import { recordRunDebugLog } from "~/v3/eventRepository/index.server";
+import { PerformTaskRunAlertsService } from "~/v3/services/alerts/performTaskRunAlerts.server";
+import { startSpan } from "~/v3/tracing.server";
+import type { MollifierSnapshot } from "./mollifierSnapshot.server";
+
+const tracer = trace.getTracer("mollifier-drainer");
+
+export function isRetryablePgError(err: unknown): boolean { // true only for the connectivity-style errors listed below
+  if (!(err instanceof Error)) return false; // non-Error throwables are classified non-retryable
+  const msg = err.message ?? "";
+  // Prisma surfaces P1001 ("Can't reach database server") via two
+  // different error classes — `PrismaClientKnownRequestError` exposes
+  // it as `err.code`, `PrismaClientInitializationError` exposes it as
+  // `err.errorCode`. Check both so reconnection-time errors retry
+  // regardless of which class fires.
+  const code = (err as { code?: string }).code;
+  const errorCode = (err as { errorCode?: string }).errorCode;
+  if (code === "P2024") return true; // Prisma: timed out fetching a connection from the pool
+  if (code === "P1001" || errorCode === "P1001") return true;
+  if (msg.includes("Can't reach database server")) return true; // message-text fallback for P1001
+  if (msg.includes("Connection lost")) return true;
+  if (msg.includes("ECONNRESET")) return true;
+  return false; // anything else is classified non-retryable
+}
+
+export function createDrainerHandler(deps: {
+  engine: RunEngine;
+  prisma: PrismaClientOrTransaction;
+}): MollifierDrainerHandler<MollifierSnapshot> {
+  return async (input) => {
+    const dwellMs = Date.now() - input.createdAt.getTime();
+
+    // Re-attach to the trace started by the caller's mollifier.queued span
+    // (its traceId + spanId were captured into the snapshot at buffer time).
+    // Without this the drainer would emit mollifier.drained in a brand-new
+    // trace and the engine.trigger instrumentation would inherit an empty
+    // active context — leaving the run-detail page with only the root span.
+    const snapshotTraceId =
+      typeof input.payload.traceId === "string" ? input.payload.traceId : undefined;
+    const snapshotSpanId =
+      typeof input.payload.spanId === "string" ? input.payload.spanId : undefined;
+
+    const parentContext =
+      snapshotTraceId && snapshotSpanId
+        ? trace.setSpanContext(context.active(), {
+            traceId: snapshotTraceId,
+            spanId: snapshotSpanId,
+            traceFlags: TraceFlags.SAMPLED,
+            isRemote: true,
+          })
+        : context.active();
+
+    // Cancel-wins-over-trigger. If a cancel API call landed on this
+    // entry while it was QUEUED, the snapshot carries `cancelledAt` +
+    // `cancelReason`. Skip the normal materialise path and write a
+    // CANCELED PG row directly. The `runCancelled` bus emit is
+    // suppressed here because a buffered-only run never had a primary
+    // trace event written for it — the runCancelled handler's
+    // `cancelRunEvent` lookup would fail and log noise per cancel.
+    const cancelledAtStr =
+      typeof input.payload.cancelledAt === "string" ? input.payload.cancelledAt : undefined;
+    if (cancelledAtStr) {
+      const cancelReason =
+        typeof input.payload.cancelReason === "string"
+          ? input.payload.cancelReason
+          : "Canceled by user";
+      await context.with(parentContext, async () => {
+        await startSpan(tracer, "mollifier.drained.cancelled", async (span) => {
+          span.setAttribute("mollifier.drained", true);
+          span.setAttribute("mollifier.dwell_ms", dwellMs);
+          span.setAttribute("mollifier.attempts", input.attempts);
+          span.setAttribute("mollifier.run_friendly_id", input.runId);
+          span.setAttribute("mollifier.cancel_bifurcation", true);
+          span.setAttribute("taskRunId", input.runId);
+          try {
+            await deps.engine.createCancelledRun(
+              {
+                snapshot: input.payload as any,
+                cancelledAt: new Date(cancelledAtStr), // NOTE(review): assumes a Date-parseable (ISO) string — confirm against the cancel writer
+                cancelReason,
+                emitRunCancelledEvent: false,
+              },
+              deps.prisma,
+            );
+          } catch (err) {
+            // createCancelledRun throws a conflict when the normal trigger
+            // replay path won the race and already materialised a live
+            // (non-CANCELED) row for this friendlyId. Its contract leaves
+            // the resolution to us: honour the cancel by actually
+            // cancelling the now-live run. Letting the conflict propagate
+            // would instead reach the drainer's terminal-failure path
+            // (isRetryablePgError() is false for it), buffer.fail() the
+            // entry, and silently lose the cancellation while the run
+            // keeps executing.
+            const isConflict =
+              err instanceof Error && err.message.startsWith("createCancelledRun conflict");
+            if (!isConflict) {
+              // Mirror the SYSTEM_FAILURE fallback the non-cancelled
+              // trigger path uses below. Without this branch, a
+              // non-retryable createCancelledRun failure rethrows, the
+              // drainer's onTerminalFailure handler skips because it
+              // gates on `cause === "max-attempts-exhausted"` (and the
+              // outer drainer classifies non-retryable failures with
+              // `cause: "non-retryable"`), and buffer.fail() deletes
+              // the entry — leaving NO PG row. The cancellation
+              // disappears silently from the customer's dashboard.
+              // Writing a SYSTEM_FAILURE row gives the run a terminal,
+              // visible state.
+              if (isRetryablePgError(err)) {
+                throw err;
+              }
+              span.setAttribute("mollifier.cancel_terminal_failure_reason",
+                err instanceof Error ? err.message : String(err));
+              try {
+                const wrote = await writeMollifierTerminalFailureRow(deps, {
+                  friendlyId: input.runId,
+                  snapshot: input.payload as Record<string, unknown>,
+                  reason: err instanceof Error ? err.message : String(err),
+                });
+                if (wrote) return; // failure row landed: resolve normally rather than rethrowing
+              } catch (writeErr) {
+                if (isRetryablePgError(writeErr)) {
+                  span.setAttribute("mollifier.cancel_terminal_write_retryable", true);
+                  throw writeErr;
+                }
+                span.setAttribute(
+                  "mollifier.cancel_terminal_write_error",
+                  writeErr instanceof Error ? writeErr.message : String(writeErr)
+                );
+              }
+              throw err; // no row written (null result or non-retryable write error): surface the original error
+            }
+            span.setAttribute("mollifier.cancel_conflict", true);
+            const friendlyId =
+              typeof input.payload.friendlyId === "string"
+                ? input.payload.friendlyId
+                : input.runId;
+            await deps.engine.cancelRun({
+              runId: RunId.fromFriendlyId(friendlyId),
+              completedAt: new Date(cancelledAtStr),
+              reason: cancelReason,
+            });
+          }
+        });
+      });
+      return;
+    }
+
+    await context.with(parentContext, async () => {
+      await startSpan(tracer, "mollifier.drained", async (span) => {
+        span.setAttribute("mollifier.drained", true);
+        span.setAttribute("mollifier.dwell_ms", dwellMs);
+        span.setAttribute("mollifier.attempts", input.attempts);
+        span.setAttribute("mollifier.run_friendly_id", input.runId);
+        span.setAttribute("taskRunId", input.runId);
+
+        let triggerSucceeded = false; // stays false when the SYSTEM_FAILURE fallback row was written instead
+        try {
+          await deps.engine.trigger(input.payload as any, deps.prisma);
+          triggerSucceeded = true;
+        } catch (err) {
+          // The retryable-PG class re-throws so the drainer's outer
+          // worker loop can `buffer.requeue` (handled in
+          // `MollifierDrainer.drainOne`). For non-retryable failures we
+          // write a terminal SYSTEM_FAILURE row to PG via the engine's
+          // existing `createFailedTaskRun` (used by batch-trigger for
+          // the same purpose) so the customer sees the run in their
+          // dashboard / SDK instead of silently losing it when the
+          // buffer entry TTLs out. If THAT insert also fails (PG truly
+          // unreachable), rethrow so the drainer's outer catch falls
+          // through to its existing `buffer.fail` terminal-marker path.
+          if (isRetryablePgError(err)) {
+            throw err;
+          }
+          const reason = err instanceof Error ? err.message : String(err);
+          span.setAttribute("mollifier.terminal_failure_reason", reason);
+          try {
+            const wrote = await writeMollifierTerminalFailureRow(deps, {
+              friendlyId: input.runId,
+              snapshot: input.payload as Record<string, unknown>,
+              reason,
+            });
+            if (!wrote) {
+              // Snapshot too malformed to even construct a TaskRun row. NOTE: this
+              // throw lands in `catch (writeErr)` below, which tags the span and rethrows `err`.
+              throw err;
+            }
+          } catch (writeErr) {
+            // The terminal SYSTEM_FAILURE write itself failed. If it
+            // failed because PG is transiently unreachable, rethrow the
+            // *write* error so the drainer requeues — buffer.fail()ing on
+            // the original non-retryable error would lose the run with no
+            // PG row ever landing. Once PG recovers the requeued entry
+            // writes its failure row and the customer sees it.
+            if (isRetryablePgError(writeErr)) {
+              span.setAttribute("mollifier.terminal_write_retryable", true);
+              throw writeErr;
+            }
+            // PG reachable but the write was rejected for another reason
+            // (genuinely bad snapshot). Rethrow the original trigger error
+            // so the drainer falls back to buffer.fail.
+            span.setAttribute(
+              "mollifier.terminal_write_error",
+              writeErr instanceof Error ? writeErr.message : String(writeErr)
+            );
+            throw err;
+          }
+        }
+
+        // Admin-only audit trail emitted once engine.trigger has
+        // landed a PG row. `recordRunDebugLog` flips this to the
+        // admin-gated debug kind (TaskEventKind.LOG in the PG store /
+        // DEBUG_EVENT in the ClickHouse store) which the trace view +
+        // logs download already strip for non-admins
+        // (`eventRepository.server.ts:108`,
+        // `resources.runs.$runParam.logs.download.ts:118`).
+        //
+        // Placement: emit as a zero-duration marker AT materialisation
+        // time, not as a back-dated bar spanning the buffered window.
+        // `engine.trigger` rewrites the run's root span at
+        // materialisation (it adopts the synth root via traceId/spanId
+        // carryover but updates start_time to "now"), so the trace
+        // renderer treats materialisation time as t=0. A back-dated
+        // event with startTime = bufferedAt would land before that t=0
+        // and get clipped from the tree. Same pattern as the
+        // `[engine] QUEUED` markers. The window itself is preserved
+        // in metadata so admins can read it off the span detail pane.
+        //
+        // Best-effort: `recordRunDebugLog` has its own try/catch and
+        // returns a result, so it never throws into the materialisation
+        // path. Failures are logged but not surfaced because the
+        // customer-visible run has already landed.
+        if (triggerSucceeded) {
+          const debugResult = await recordRunDebugLog(
+            RunId.fromFriendlyId(input.runId),
+            `Mollifier buffered ${dwellMs}ms before materialising`,
+            {
+              attributes: {
+                runId: input.runId,
+                metadata: {
+                  "mollifier.bufferedAt": input.createdAt.toISOString(),
+                  "mollifier.materialisedAt": new Date().toISOString(),
+                  "mollifier.dwellMs": dwellMs,
+                  "mollifier.attempts": input.attempts,
+                },
+              },
+              parentId: snapshotSpanId,
+            }
+          );
+          if (!debugResult.success && debugResult.code !== "RUN_NOT_FOUND") { // RUN_NOT_FOUND is deliberately not logged
+            logger.warn("mollifier drainer: failed to record admin debug log", {
+              runId: input.runId,
+              code: debugResult.code,
+            });
+          }
+        }
+      });
+    });
+  };
+}
+
+// Shared SYSTEM_FAILURE construction used by both terminal paths:
+//   - non-retryable failure inside the handler (above)
+//   - retryable failure after maxAttempts inside the drainer's
+//     `processEntry` (via `createDrainerTerminalFailureHandler`)
+//
+// Suppresses `runFailed` and enqueues the alert manually — the engine's
+// `runFailed` handler calls `completeFailedRunEvent`, which looks up
+// the run's primary span. Buffered-only runs never had a primary trace
+// event written (the mollifier gate intercepts BEFORE
+// `repository.traceEvent` runs), so the lookup always fails and the
+// handler logs a systematic `[runFailed] Failed to complete failed
+// run event` error per terminal failure. `TriggerFailedTaskService`
+// handles the identical situation the same way (see
+// triggerFailedTask.server.ts:212 and 324) — pass
+// `emitRunFailedEvent: false` to the engine and call
+// `PerformTaskRunAlertsService.enqueue(...)` directly so customers'
+// ERROR channels still fire. Alert enqueue is best-effort; an alert-side
+// failure is logged but does not bubble up (the SYSTEM_FAILURE row
+// landing is the load-bearing customer-visible outcome).
+//
+// Returns the new `TaskRun` on success or `null` when the snapshot was
+// so malformed it couldn't even produce an environment — caller decides
+// whether to escalate that to `buffer.fail` directly. Throws on any
+// other failure so the drainer's retryable/non-retryable disposition
+// logic can own the decision.
+async function writeMollifierTerminalFailureRow(
+  deps: { engine: RunEngine; prisma: PrismaClientOrTransaction },
+  args: { friendlyId: string; snapshot: Record<string, unknown>; reason: string },
+) {
+  const { snapshot } = args;
+  const env = snapshot.environment as
+    | {
+        id: string;
+        type: any;
+        project: { id: string };
+        organization: { id: string };
+      }
+    | undefined;
+  if (!env) return null; // no environment in the snapshot: nothing is written and no alert is enqueued
+  // Extract batch association from the snapshot if present. Without this
+  // a SYSTEM_FAILURE row for a buffered batch child won't be linked to
+  // its batch, and the batch parent's completion tracking can hang
+  // indefinitely waiting on a child that landed but isn't visible to
+  // the batch. Only a `{ id: string, index: number }` shape is accepted.
+  const rawBatch = snapshot.batch;
+  const batch =
+    rawBatch &&
+    typeof rawBatch === "object" &&
+    "id" in rawBatch &&
+    typeof (rawBatch as { id: unknown }).id === "string" &&
+    "index" in rawBatch &&
+    typeof (rawBatch as { index: unknown }).index === "number"
+      ? (rawBatch as { id: string; index: number })
+      : undefined;
+  const failedRun = await deps.engine.createFailedTaskRun({
+    friendlyId: args.friendlyId,
+    environment: env,
+    taskIdentifier: String(snapshot.taskIdentifier ?? ""),
+    payload: typeof snapshot.payload === "string" ? snapshot.payload : undefined,
+    payloadType: typeof snapshot.payloadType === "string" ? snapshot.payloadType : undefined,
+    error: {
+      type: "STRING_ERROR",
+      raw: `Mollifier drainer terminal failure: ${args.reason}`,
+    },
+    parentTaskRunId:
+      typeof snapshot.parentTaskRunId === "string" ? snapshot.parentTaskRunId : undefined,
+    rootTaskRunId:
+      typeof snapshot.rootTaskRunId === "string" ? snapshot.rootTaskRunId : undefined,
+    depth: typeof snapshot.depth === "number" ? snapshot.depth : 0,
+    resumeParentOnCompletion: snapshot.resumeParentOnCompletion === true,
+    batch,
+    traceId: typeof snapshot.traceId === "string" ? snapshot.traceId : undefined,
+    spanId: typeof snapshot.spanId === "string" ? snapshot.spanId : undefined,
+    taskEventStore:
+      typeof snapshot.taskEventStore === "string" ? snapshot.taskEventStore : undefined,
+    queue: typeof snapshot.queue === "string" ? snapshot.queue : undefined,
+    lockedQueueId:
+      typeof snapshot.lockedQueueId === "string" ? snapshot.lockedQueueId : undefined,
+    emitRunFailedEvent: false,
+  });
+  // Alerts side of `runFailed` — the engine emit was suppressed above
+  // so we don't create an orphan trace event; enqueue the alert
+  // directly so customers' ERROR channels still see the failure.
+  // Best-effort, mirroring TriggerFailedTaskService.
+  try {
+    await PerformTaskRunAlertsService.enqueue(failedRun.id);
+  } catch (alertsError) {
+    logger.warn("writeMollifierTerminalFailureRow: alert enqueue failed", {
+      friendlyId: args.friendlyId,
+      error: alertsError instanceof Error ? alertsError.message : String(alertsError),
+    });
+  }
+  return failedRun;
+}
+
+// Drainer-side terminal-failure callback. Fires from
+// `MollifierDrainer.processEntry` BEFORE `buffer.fail()` on any path
+// where the in-handler write didn't already land — currently the
+// `cause: "max-attempts-exhausted"` case for retryable PG errors. Writes
+// the same SYSTEM_FAILURE row the non-retryable handler path writes
+// inline (via the shared `writeMollifierTerminalFailureRow` helper) so
+// the customer-visible behaviour is identical regardless of how the
+// failure was classified.
+//
+// Any error from the write propagates to the drainer unchanged. Per the
+// drainer contract (not visible here — confirm), retryable PG errors are
+// requeued, and anything else falls through to buffer.fail to avoid an
+// infinite loop on a genuinely bad snapshot (the drainer logs it).
+export function createDrainerTerminalFailureHandler(deps: {
+  engine: RunEngine;
+  prisma: PrismaClientOrTransaction;
+}): MollifierDrainerTerminalFailureHandler<MollifierSnapshot> {
+  return async (input) => {
+    // The handler's own non-retryable terminal path has already attempted
+    // the SYSTEM_FAILURE write inline before rethrowing. Only the
+    // retryable-exhausted path reaches us with no write attempted yet — gate
+    // on `cause` to avoid double-writing for non-retryable failures.
+    if (input.cause !== "max-attempts-exhausted") return;
+    await startSpan(tracer, "mollifier.drained.terminal_failure", async (span) => {
+      span.setAttribute("mollifier.drained", false);
+      span.setAttribute("mollifier.attempts", input.attempts);
+      span.setAttribute("mollifier.run_friendly_id", input.runId);
+      span.setAttribute("mollifier.terminal_failure_cause", input.cause);
+      span.setAttribute("mollifier.terminal_failure_reason", input.error.message);
+      span.setAttribute("taskRunId", input.runId);
+      await writeMollifierTerminalFailureRow(deps, {
+        friendlyId: input.runId,
+        snapshot: input.payload as Record<string, unknown>,
+        reason: input.error.message,
+      }); // NOTE(review): a null result (snapshot without environment) is dropped silently here
+    });
+  };
+}
diff --git a/apps/webapp/app/v3/mollifier/mollifierGate.server.ts b/apps/webapp/app/v3/mollifier/mollifierGate.server.ts
index 28b0a7f88cf..63146b4c323 100644
--- a/apps/webapp/app/v3/mollifier/mollifierGate.server.ts
+++ b/apps/webapp/app/v3/mollifier/mollifierGate.server.ts
@@ -46,6 +46,17 @@ export type GateInputs = {
   // the pattern used by `canAccessAi`, `canAccessPrivateConnections`, and the
   // compute-template beta gate.
   orgFeatureFlags: Record<string, unknown> | null;
+  // Trigger options that drive the debounce / OTU / triggerAndWait
+  // bypasses. The mollify path can't
+  // serialise stateful callbacks (debounce), can't safely break OTU's
+  // synchronous-rejection contract, and shouldn't intercept single
+  // triggerAndWait (batchTriggerAndWait still funnels through per item).
+  options?: {
+    debounce?: unknown;
+    oneTimeUseToken?: string;
+    parentTaskRunId?: string;
+    resumeParentOnCompletion?: boolean;
+  };
 };
 
 export type TripEvaluator = (inputs: GateInputs) => Promise<TripDecision>;
@@ -73,7 +84,7 @@ export type GateDependencies = {
 };
 
 // `options` is a thunk so env reads happen per-evaluation, not at module load.
-// Don't "simplify" to a plain object — Phase 2 dynamic config relies on the
+// Don't "simplify" to a plain object — dynamic config relies on the
 // gate observing whichever env values are live at trigger time.
 const defaultEvaluator = createRealTripEvaluator({
   getBuffer: () => getMollifierBuffer(),
@@ -141,6 +152,28 @@ export async function evaluateGate(
 ): Promise<GateOutcome> {
   const d = { ...defaultGateDependencies, ...deps };
 
+  // Debounce bypass. onDebounced is a closure over webapp state and
+  // can't be snapshotted into the buffer for drainer replay. Skip before the
+  // trip evaluator so debounce traffic is never counted against the rate.
+  if (inputs.options?.debounce) {
+    d.recordDecision("pass_through");
+    return { action: "pass_through" };
+  }
+  // OneTimeUseToken bypass. OTU is a security feature on the PUBLIC_JWT
+  // auth path; its synchronous-rejection contract is materially worse to
+  // break than the idempotency-key contract.
+  if (inputs.options?.oneTimeUseToken) {
+    d.recordDecision("pass_through");
+    return { action: "pass_through" };
+  }
+  // Single triggerAndWait bypass. batchTriggerAndWait still funnels
+  // through TriggerTaskService.call per item so the dominant burst pattern
+  // remains covered.
+  if (inputs.options?.parentTaskRunId && inputs.options?.resumeParentOnCompletion) {
+    d.recordDecision("pass_through");
+    return { action: "pass_through" };
+  }
+
   if (!d.isMollifierEnabled()) {
     d.recordDecision("pass_through");
     return { action: "pass_through" };
diff --git a/apps/webapp/app/v3/mollifier/mollifierMollify.server.ts b/apps/webapp/app/v3/mollifier/mollifierMollify.server.ts
new file mode 100644
index 00000000000..a8b0b151115
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/mollifierMollify.server.ts
@@ -0,0 +1,98 @@
+import { RunId } from "@trigger.dev/core/v3/isomorphic";
+import type { MollifierBuffer } from "@trigger.dev/redis-worker";
+import { serialiseMollifierSnapshot, type MollifierSnapshot } from "./mollifierSnapshot.server";
+import type { TripDecision } from "./mollifierGate.server";
+
+export type MollifyNotice = {
+  code: "mollifier.queued"; // fixed machine-readable discriminator for this notice
+  message: string; // human-readable explanation
+  docs: string; // documentation link (the NOTICE constant sets a URL here)
+};
+
+export type MollifySyntheticResult = {
+  // `id` is the canonical TaskRun primary key derived from `friendlyId`
+  // via `RunId.fromFriendlyId`. Downstream consumers in the trigger
+  // route — notably `saveRequestIdempotency` — index the request-
+  // idempotency cache by this id. If it were missing, the cache would
+  // store `undefined`, and a retry's Prisma `findFirst({ where: { id:
+  // undefined } })` would drop the predicate and return an arbitrary
+  // TaskRun (potential cross-tenant leak). Always populated.
+  //
+  // `spanId` is the root-span id allocated at gate-accept time and
+  // stored in the snapshot. Callers like the dashboard's Test action
+  // use it to build a `v3RunSpanPath` URL that auto-opens the right
+  // details panel; when it is empty the buffered run lands on the
+  // run-detail page with no span selected (parity gap with PG runs).
+  run: { id: string; friendlyId: string; spanId: string };
+  error: undefined; // always undefined on this synthetic (success-shaped) result
+  // True on the race-loser path: accept's SETNX hit an existing
+  // buffered run with the same (env, task, idempotencyKey), so the
+  // response echoes the winner's runId. False for a fresh accept.
+  isCached: boolean;
+  // Attached only for the happy accept (isCached=false).
+  notice?: MollifyNotice;
+};
+
+const NOTICE: MollifyNotice = {
+  code: "mollifier.queued", // NOTICE is attached by mollifyTrigger to every non-cached accept
+  message:
+    "Trigger accepted into burst buffer. Consider batchTrigger for fan-outs of 100+.",
+  docs: "https://trigger.dev/docs/management/tasks/batch-trigger", // batch-trigger docs page
+};
+
+export async function mollifyTrigger(args: {
+  runFriendlyId: string;
+  environmentId: string;
+  organizationId: string;
+  engineTriggerInput: MollifierSnapshot;
+  decision: Extract<TripDecision, { divert: true }>;
+  buffer: MollifierBuffer;
+  // Optional idempotency context. Supplying both lets accept SETNX the
+  // lookup, so the buffered window takes part in trigger-time dedup
+  // symmetrically with PG.
+  idempotencyKey?: string;
+  taskIdentifier?: string;
+}): Promise<MollifySyntheticResult> {
+  const { runFriendlyId, engineTriggerInput, buffer, idempotencyKey, taskIdentifier } = args;
+  const accepted = await buffer.accept({
+    runId: runFriendlyId,
+    envId: args.environmentId,
+    orgId: args.organizationId,
+    payload: serialiseMollifierSnapshot(engineTriggerInput),
+    idempotencyKey,
+    taskIdentifier,
+  });
+
+  if (accepted.kind === "duplicate_idempotency") {
+    // Race loser: answer with the winner's runId so the SDK sees the same
+    // shape as a PG-side idempotency cache hit. Fetching the winner's
+    // spanId would need a second buffer read; an empty string makes
+    // `v3RunSpanPath` omit `?span=`, matching cached PG responses today.
+    const winnerFriendlyId = accepted.existingRunId;
+    return {
+      run: {
+        id: RunId.fromFriendlyId(winnerFriendlyId),
+        friendlyId: winnerFriendlyId,
+        spanId: "",
+      },
+      error: undefined,
+      isCached: true,
+    };
+  }
+
+  // "accepted" and "duplicate_run_id" share one customer-visible response:
+  // a buffered-trigger acknowledgement. Duplicate runIds should not occur
+  // in practice (they are server-generated and unique), and the buffer
+  // layer is silently idempotent for them either way.
+  const { spanId: snapshotSpanId } = engineTriggerInput;
+  return {
+    run: {
+      id: RunId.fromFriendlyId(runFriendlyId),
+      friendlyId: runFriendlyId,
+      spanId: typeof snapshotSpanId === "string" ? snapshotSpanId : "",
+    },
+    error: undefined,
+    isCached: false,
+    notice: NOTICE,
+  };
+}
diff --git a/apps/webapp/app/v3/mollifier/mollifierSnapshot.server.ts b/apps/webapp/app/v3/mollifier/mollifierSnapshot.server.ts
new file mode 100644
index 00000000000..a0732a3542e
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/mollifierSnapshot.server.ts
@@ -0,0 +1,16 @@
+import { serialiseSnapshot, deserialiseSnapshot } from "@trigger.dev/redis-worker";
+
+// JSON-serialisable form of the engine.trigger() input; the drainer deserialises and
+// replays it. Typed loosely as Record<string, unknown> so this directory does not depend
+// on @internal/run-engine internals — the engine.trigger call site casts to the typed input.
+export type MollifierSnapshot = Record<string, unknown>;
+
+// Encodes a snapshot to its string form by delegating to `serialiseSnapshot`.
+export function serialiseMollifierSnapshot(input: MollifierSnapshot): string {
+  return serialiseSnapshot(input);
+}
+
+// Decodes a string back into a snapshot by delegating to `deserialiseSnapshot`.
+export function deserialiseMollifierSnapshot(serialised: string): MollifierSnapshot {
+  return deserialiseSnapshot<MollifierSnapshot>(serialised);
+}
diff --git a/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts
new file mode 100644
index 00000000000..d135824032c
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts
@@ -0,0 +1,256 @@
+import type { MollifierBuffer } from "@trigger.dev/redis-worker";
+import { logger as defaultLogger } from "~/services/logger.server";
+import { getMollifierBuffer } from "./mollifierBuffer.server";
+import { MollifierStaleSweepState, type StaleSweepStateStore } from "./mollifierStaleSweepState.server";
+import {
+  recordStaleEntry as defaultRecordStaleEntry,
+  reportStaleEntrySnapshot as defaultReportStaleEntrySnapshot,
+} from "./mollifierTelemetry.server";
+
+// Default cap on entries read per env in one pass (used when
+// `maxEntriesPerEnv` is unset). Each pass scans a bounded slice of the
+// sweep's frozen org LIST, located by a durable cursor in Redis; bounding
+// the per-env scan too keeps one pathological env from extending the pass.
+const DEFAULT_MAX_ENTRIES_PER_ENV = 1000;
+// Default cap on orgs visited per tick (used when `maxOrgsPerPass` is
+// unset). Together with `maxEntriesPerEnv` this caps Redis traffic per
+// pass. One "cycle" (visiting every org once) takes `ceil(N_orgs / cap)`
+// ticks, after which the cursor wraps and a fresh org list is taken.
+const DEFAULT_MAX_ORGS_PER_PASS = 100;
+
+export type StaleSweepConfig = {
+  // Entries whose dwell strictly exceeds this threshold are flagged
+  // stale. Set it well below `entryTtlSeconds * 1000` so ops have lead
+  // time before TTL-induced silent loss; the default (half of
+  // entryTtlSeconds) matches the cadence in the plan doc.
+  staleThresholdMs: number;
+  maxEntriesPerEnv?: number; // per-env scan cap; defaults to DEFAULT_MAX_ENTRIES_PER_ENV
+  // Hard cap on orgs visited per tick. Bounds the per-pass Redis traffic
+  // and wall-time. Default 100 — at typical fleet sizes one or two
+  // ticks cover everyone; under incident-scale fan-out a full cycle
+  // takes a handful of ticks (~minutes) which is still well below the
+  // staleness signal latency that ops cares about.
+  maxOrgsPerPass?: number;
+};
+
+export type StaleSweepDeps = {
+  getBuffer?: () => MollifierBuffer | null; // defaults to getMollifierBuffer; null = no buffer
+  // Durable cursor + per-env counts hash. Required: the sweep is
+  // useless without persistent state across ticks. The webapp wires up
+  // a real `MollifierStaleSweepState`; tests pass one constructed
+  // against the test container.
+  state: StaleSweepStateStore;
+  // No `envId` arg — `envId` is a high-cardinality metric attribute and
+  // is intentionally not emitted as a metric label. The structured warn
+  // log emitted by the sweep carries envId for forensic drill-down.
+  recordStaleEntry?: () => void;
+  reportStaleEntrySnapshot?: (snapshot: Map<string, number>) => void; // envId -> stale count
+  logger?: { warn: (message: string, fields: Record<string, unknown>) => void };
+  now?: () => number; // epoch-ms clock; defaults to Date.now
+};
+
+export type StaleSweepResult = {
+  orgsScanned: number; // orgs in the slice consumed by this pass
+  envsScanned: number; // envs visited across those orgs
+  entriesScanned: number; // buffer entries examined (bounded per env)
+  staleCount: number; // entries whose dwell exceeded staleThresholdMs in this pass
+};
+
+// Walks a bounded slice of `orgs → envs → entries`, emitting an OTel
+// counter tick and a structured warning log for each buffer entry whose
+// dwell exceeds the stale threshold. Read-only on the buffer's own
+// state; writes only to the sweep's four dedicated keys
+// (`mollifier:stale_sweep:*`). The sweep does NOT remove or salvage
+// buffer entries; that decision is deferred to a separate retention-
+// policy change. The signal here exists so ops sees the drainer falling
+// behind well before TTL-induced loss kicks in.
+//
+// Sharding contract:
+// - Cursor starts at 0. On cursor=0 the org list is refreshed by
+//   snapshotting `buffer.listOrgs()` into the durable LIST — that is
+//   the cycle's frozen view of orgs to visit.
+// - Each tick consumes up to `maxOrgsPerPass` orgs from the LIST,
+//   advances the cursor, and persists.
+// - When the cursor reaches the end of the LIST it wraps to 0; the next
+//   tick rebuilds the org list, capturing any orgs that joined the
+//   buffer mid-cycle.
+// - The per-env counts HASH carries over across ticks: an env visited
+//   on tick N and not revisited until tick N+M keeps its last-known
+//   stale count in the gauge for that window. This is the price of
+//   sharding — accepted because the alternative (re-scan everything
+//   every tick) does not bound work.
+export async function runStaleSweepOnce(
+  config: StaleSweepConfig,
+  deps: StaleSweepDeps,
+): Promise<StaleSweepResult> {
+  const getBuffer = deps.getBuffer ?? getMollifierBuffer;
+  const recordStale = deps.recordStaleEntry ?? defaultRecordStaleEntry;
+  const reportSnapshot = deps.reportStaleEntrySnapshot ?? defaultReportStaleEntrySnapshot;
+  const log = deps.logger ?? defaultLogger;
+  // Sampled once so every entry in this pass is aged against the same instant.
+  const now = (deps.now ?? Date.now)();
+  const maxEntries = config.maxEntriesPerEnv ?? DEFAULT_MAX_ENTRIES_PER_ENV;
+  // Clamp the org cap to a positive integer. The Redis-backed state store
+  // reads `LRANGE start start+count-1`, so a cap of 0 at cursor 0 becomes
+  // `LRANGE 0 -1` (the whole list, i.e. an unbounded pass), and other
+  // non-positive or fractional caps yield nonsensical ranges or a cursor
+  // that never advances. Non-finite values fall back to the default.
+  const requestedOrgs = config.maxOrgsPerPass ?? DEFAULT_MAX_ORGS_PER_PASS;
+  const maxOrgsPerPass = Number.isFinite(requestedOrgs)
+    ? Math.max(1, Math.floor(requestedOrgs))
+    : DEFAULT_MAX_ORGS_PER_PASS;
+
+  const buffer = getBuffer();
+  if (!buffer) {
+    // No buffer (mollifier off). Clear the durable state so a re-enable
+    // starts from a clean slate instead of resuming on a stale cursor,
+    // and publish an empty snapshot so a previously-paging env doesn't
+    // stay latched.
+    await deps.state.clearAll();
+    reportSnapshot(new Map());
+    return { orgsScanned: 0, envsScanned: 0, entriesScanned: 0, staleCount: 0 };
+  }
+
+  const cursor = await deps.state.readCursor();
+  if (cursor === 0) {
+    // Fresh cycle — freeze the current org set into the durable LIST.
+    // Orgs that join after this snapshot wait for the next cycle, which
+    // is acceptable for an observational sweep: entries only count as
+    // stale after dwelling for `staleThresholdMs`, so they're not new.
+    const orgs = await buffer.listOrgs();
+    await deps.state.rebuildOrgList(orgs);
+  }
+
+  const { orgs: slice, total } = await deps.state.readOrgListSlice(cursor, maxOrgsPerPass);
+
+  // Per-pass tallies returned to the caller.
+  let envsScanned = 0;
+  let entriesScanned = 0;
+  let staleCount = 0;
+
+  for (const orgId of slice) {
+    const envs = await buffer.listEnvsForOrg(orgId);
+    for (const envId of envs) {
+      envsScanned += 1;
+      let envStale = 0;
+      const entries = await buffer.listEntriesForEnv(envId, maxEntries);
+      for (const entry of entries) {
+        entriesScanned += 1;
+        const dwellMs = now - entry.createdAt.getTime();
+        if (dwellMs > config.staleThresholdMs) {
+          recordStale();
+          log.warn("mollifier.stale_entry", {
+            runId: entry.runId,
+            envId,
+            orgId,
+            dwellMs,
+            staleThresholdMs: config.staleThresholdMs,
+          });
+          envStale += 1;
+        }
+      }
+      // Persist the per-env count to the durable hash (the store removes
+      // the field when the count is zero); the hash is the source of
+      // truth for the gauge snapshot below.
+      await deps.state.setEnvStaleCount(envId, envStale);
+      // Record the visit. The reconcile step at cycle wrap uses the
+      // visited set to drop hash fields for envs that fully drained
+      // mid-cycle: they disappear from listEnvsForOrg, so this loop never
+      // reaches them again and the gauge would otherwise stay elevated.
+      await deps.state.markEnvVisited(envId);
+      staleCount += envStale;
+    }
+  }
+
+  // Advance the cursor. If the slice consumed the end of the LIST, wrap
+  // to 0 so the next tick rebuilds the org list and starts a new cycle.
+  const advanced = cursor + slice.length;
+  const wrapped = advanced >= total;
+  await deps.state.writeCursor(wrapped ? 0 : advanced);
+
+  if (wrapped) {
+    // Cycle ended. Drop counts for envs that did not appear in any tick
+    // of the just-completed cycle and reset the visited set so the next
+    // cycle starts clean.
+    await deps.state.reconcileVisited();
+  }
+
+  // Emit the snapshot from the durable hash, which carries values for
+  // envs visited in earlier ticks too. This is what makes the gauge
+  // stable across ticks (and across webapp restarts).
+  const snapshot = await deps.state.readAllEnvStaleCounts();
+  reportSnapshot(snapshot);
+
+  return { orgsScanned: slice.length, envsScanned, entriesScanned, staleCount };
+}
+
+export type StaleSweepIntervalHandle = {
+  stop: () => Promise<void>; // clears the timer, drains any in-flight pass, closes the state store
+};
+
+// Production wrapper: schedule `runStaleSweepOnce` on a fixed interval.
+// One pass at a time — if a sweep is still running when the timer fires
+// the next tick is skipped (a backed-up Redis would otherwise queue
+// overlapping sweeps that all log the same stale entries).
+export function startStaleSweepInterval(
+  config: StaleSweepConfig & { intervalMs: number },
+  deps: StaleSweepDeps,
+): StaleSweepIntervalHandle {
+  // Flipped by `stop()`; once set, no further pass is started.
+  let stopped = false;
+  // The pass currently running, or null when idle. It serves both as the
+  // one-pass-at-a-time guard and as the promise `stop()` drains before
+  // closing the state store. Without that drain, a pass already beyond
+  // the `stopped` check would keep issuing `state.*` calls against a
+  // client that `stop()` has closed, and its own catch would log them as
+  // spurious `mollifier.stale_sweep.failed` warnings on every shutdown.
+  let currentTick: Promise<void> | null = null;
+
+  // Executes a single pass. Failures are logged here and never rethrown.
+  const sweepOnce = async (): Promise<void> => {
+    try {
+      await runStaleSweepOnce(config, deps);
+    } catch (err) {
+      const message = err instanceof Error ? err.message : String(err);
+      (deps.logger ?? defaultLogger).warn("mollifier.stale_sweep.failed", {
+        err: message,
+      });
+    } finally {
+      // Back to idle so the next timer fire may start a new pass.
+      currentTick = null;
+    }
+  };
+
+  // Timer callback: a no-op when stopped or while a previous pass is
+  // still running, so slow passes never overlap.
+  const tick = async (): Promise<void> => {
+    if (stopped || currentTick !== null) return;
+    currentTick = sweepOnce();
+    await currentTick;
+  };
+
+  // Fire-and-forget on a fixed cadence; `tick` handles its own errors.
+  const timer = setInterval(() => void tick(), config.intervalMs);
+
+  const stop = async (): Promise<void> => {
+    stopped = true;
+    clearInterval(timer);
+    // Drain any pass that started before `stopped` flipped. Its
+    // `state.*` calls must land before the store is closed.
+    const pending = currentTick;
+    if (pending !== null) {
+      // The pass has its own catch, so this await is only for ordering;
+      // a rejection here (already logged inside the pass) is ignored.
+      await pending.catch(() => undefined);
+    }
+    // Close the state's underlying resource. `close()` is part of the
+    // `StaleSweepStateStore` contract — production's
+    // `MollifierStaleSweepState` shuts down its ioredis client; fake
+    // test states implement a no-op.
+    await deps.state.close();
+  };
+
+  return {
+    stop,
+  };
+}
diff --git a/apps/webapp/app/v3/mollifier/mollifierStaleSweepState.server.ts b/apps/webapp/app/v3/mollifier/mollifierStaleSweepState.server.ts
new file mode 100644
index 00000000000..4fce5ad9ee4
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/mollifierStaleSweepState.server.ts
@@ -0,0 +1,188 @@
+import { createRedisClient, type Redis, type RedisOptions } from "@internal/redis";
+import { Logger } from "@trigger.dev/core/logger";
+
+// Durable per-tick state for the sharded stale sweep. Four Redis keys,
+// all in the `mollifier:` namespace alongside the buffer's own state:
+//
+//   mollifier:stale_sweep:cursor    STRING  next position in org_list (0 = fresh cycle)
+//   mollifier:stale_sweep:org_list  LIST    org IDs frozen at the start of the cycle
+//   mollifier:stale_sweep:counts    HASH    envId -> last-known stale count
+//   mollifier:stale_sweep:visited   SET     envIds visited during the current cycle
+//
+// The state survives webapp restarts: a restarted process picks up the
+// cursor where the previous one left off and re-emits the last-known
+// gauge values immediately, rather than blinking to zero until the next
+// cycle visits each env.
+//
+// The `visited` set exists to GC the `counts` hash at cycle wrap: an env
+// that drains completely between sweep ticks disappears from
+// `buffer.listEnvsForOrg`, so the sweep's inner loop never revisits it
+// and never HDELs its counts entry. Without the visited-set GC the
+// counts hash retains the env's last-known stale count forever and the
+// gauge stays permanently elevated. At cursor wrap we diff the hash
+// against the cycle's visited set and HDEL the difference.
+//
+// Storage is owned by this class rather than added to MollifierBuffer
+// because the keys are sweep-internal — the buffer abstracts the
+// drainer/queue state, this abstracts sweep state. They share a
+// namespace prefix but no API surface.
+
+export interface StaleSweepStateStore {
+  readCursor(): Promise<number>; // next position in the org list; 0 means a fresh cycle
+  writeCursor(value: number): Promise<void>; // persists the position for the next tick
+  /** Replaces the cycle's frozen org_list. Called at cursor=0. */
+  rebuildOrgList(orgs: string[]): Promise<void>;
+  /** Returns up to `count` org IDs starting at `start`, plus the LIST's total length. */
+  readOrgListSlice(start: number, count: number): Promise<{ orgs: string[]; total: number }>;
+  /** HSET when count > 0, HDEL when count === 0 (so the snapshot reflects current truth). */
+  setEnvStaleCount(envId: string, count: number): Promise<void>;
+  readAllEnvStaleCounts(): Promise<Map<string, number>>; // envId -> last-known stale count
+  /** SADD `envId` to the current cycle's visited set. Called once per env scanned per tick. */
+  markEnvVisited(envId: string): Promise<void>;
+  /**
+   * HDEL every env in the counts hash that is NOT in the visited set, then
+   * DEL the visited set. Called when the cursor wraps (cycle ends) so
+   * envs that fully drained mid-cycle get cleaned out of the gauge.
+   */
+  reconcileVisited(): Promise<void>;
+  clearAll(): Promise<void>; // drops all sweep state (used when the buffer is unavailable)
+  close(): Promise<void>; // releases the backing resource; called from the interval's stop()
+}
+
+const CURSOR_KEY = "mollifier:stale_sweep:cursor"; // STRING: next position in org_list
+const ORG_LIST_KEY = "mollifier:stale_sweep:org_list"; // LIST: org IDs frozen for the cycle
+const COUNTS_KEY = "mollifier:stale_sweep:counts"; // HASH: envId -> last-known stale count
+const VISITED_KEY = "mollifier:stale_sweep:visited"; // SET: envIds visited this cycle
+
+export class MollifierStaleSweepState implements StaleSweepStateStore {
+  private readonly redis: Redis;
+  private readonly logger: Logger;
+
+  constructor(options: { redisOptions: RedisOptions; logger?: Logger }) {
+    this.logger = options.logger ?? new Logger("MollifierStaleSweepState", "debug");
+    this.redis = createRedisClient(
+      { ...options.redisOptions, maxRetriesPerRequest: 20 },
+      {
+        onError: (error) => {
+          this.logger.error("MollifierStaleSweepState redis client error:", { error });
+        },
+      },
+    );
+  }
+
+  async readCursor(): Promise<number> {
+    const raw = await this.redis.get(CURSOR_KEY);
+    if (raw === null) return 0;
+    const n = Number.parseInt(raw, 10);
+    return Number.isFinite(n) && n >= 0 ? n : 0;
+  }
+
+  async writeCursor(value: number): Promise<void> {
+    await this.redis.set(CURSOR_KEY, String(value));
+  }
+
+  async rebuildOrgList(orgs: string[]): Promise<void> {
+    // DEL + RPUSH in one pipeline — close enough to atomic for an observational sweep
+    // (startStaleSweepInterval serialises passes; nothing else writes these keys).
+    const pipeline = this.redis.pipeline();
+    pipeline.del(ORG_LIST_KEY);
+    if (orgs.length > 0) pipeline.rpush(ORG_LIST_KEY, ...orgs);
+    // Command failures surface in exec()'s result tuples, not as a rejection. Throw on any:
+    // a silently empty or partial org list would end the cycle early and clear the gauge.
+    const failure = this.pipelineFailure("rebuildOrgList", await pipeline.exec());
+    if (failure) throw failure;
+  }
+
+  async readOrgListSlice(
+    start: number,
+    count: number,
+  ): Promise<{ orgs: string[]; total: number }> {
+    const pipeline = this.redis.pipeline();
+    pipeline.lrange(ORG_LIST_KEY, start, start + count - 1);
+    pipeline.llen(ORG_LIST_KEY);
+    const results = await pipeline.exec();
+    // A null result is the abort-on-broken-pipe path. Throw instead of returning an empty
+    // slice: `runStaleSweepOnce` cannot tell that apart from a genuinely empty org list, so
+    // it would write cursor=0, reconcile against nothing, and clear the stale-entry gauge.
+    if (!results) {
+      throw new Error("MollifierStaleSweepState.readOrgListSlice: pipeline.exec returned null");
+    }
+    const [lrangeErr, lrangeRes] = results[0] as [Error | null, string[] | null];
+    const [llenErr, llenRes] = results[1] as [Error | null, number | null];
+    if (lrangeErr || llenErr) {
+      this.logger.error("MollifierStaleSweepState.readOrgListSlice failed", {
+        lrangeErr: lrangeErr?.message,
+        llenErr: llenErr?.message,
+      });
+      // Propagate for the same reason; the durable cursor / counts hash stay untouched.
+      throw lrangeErr ?? llenErr ?? new Error("MollifierStaleSweepState.readOrgListSlice failed");
+    }
+    return { orgs: lrangeRes ?? [], total: llenRes ?? 0 };
+  }
+
+  async setEnvStaleCount(envId: string, count: number): Promise<void> {
+    if (count > 0) {
+      await this.redis.hset(COUNTS_KEY, envId, String(count));
+    } else {
+      await this.redis.hdel(COUNTS_KEY, envId);
+    }
+  }
+
+  async readAllEnvStaleCounts(): Promise<Map<string, number>> {
+    const raw = await this.redis.hgetall(COUNTS_KEY);
+    const out = new Map<string, number>();
+    for (const [envId, value] of Object.entries(raw)) {
+      const n = Number.parseInt(value, 10);
+      if (Number.isFinite(n)) out.set(envId, n);
+    }
+    return out;
+  }
+
+  async markEnvVisited(envId: string): Promise<void> {
+    await this.redis.sadd(VISITED_KEY, envId);
+  }
+
+  async reconcileVisited(): Promise<void> {
+    // Diff the counts hash against the visited set and HDEL the rest. Failures are logged only.
+    const pipeline = this.redis.pipeline();
+    pipeline.hkeys(COUNTS_KEY);
+    pipeline.smembers(VISITED_KEY);
+    const results = await pipeline.exec();
+    if (!results) {
+      this.logger.error("MollifierStaleSweepState.reconcileVisited: pipeline.exec returned null");
+      return;
+    }
+    const [hkeysErr, hkeysRes] = results[0] as [Error | null, string[] | null];
+    const [smembersErr, smembersRes] = results[1] as [Error | null, string[] | null];
+    if (hkeysErr || smembersErr) {
+      this.logger.error("MollifierStaleSweepState.reconcileVisited failed", {
+        hkeysErr: hkeysErr?.message,
+        smembersErr: smembersErr?.message,
+      });
+      return;
+    }
+    const visited = new Set(smembersRes ?? []);
+    const orphans = (hkeysRes ?? []).filter((envId) => !visited.has(envId));
+    const cleanup = this.redis.pipeline();
+    if (orphans.length > 0) cleanup.hdel(COUNTS_KEY, ...orphans);
+    cleanup.del(VISITED_KEY);
+    const error = this.pipelineFailure("reconcileVisited", await cleanup.exec());
+    if (error) {
+      this.logger.error("MollifierStaleSweepState.reconcileVisited cleanup failed", { error });
+    }
+  }
+
+  // First failure in an ioredis pipeline result: a null result or a command error; else null.
+  private pipelineFailure(op: string, results: [Error | null, unknown][] | null): Error | null {
+    if (!results) return new Error(`MollifierStaleSweepState.${op}: pipeline.exec returned null`);
+    return results.find(([err]) => err !== null)?.[0] ?? null;
+  }
+
+  async clearAll(): Promise<void> {
+    await this.redis.del(CURSOR_KEY, ORG_LIST_KEY, COUNTS_KEY, VISITED_KEY);
+  }
+
+  async close(): Promise<void> {
+    await this.redis.quit();
+  }
+}
diff --git a/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts b/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts
index 0fe302584ce..f9c7ca72f1f 100644
--- a/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts
+++ b/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts
@@ -15,3 +15,87 @@ export function recordDecision(outcome: DecisionOutcome, reason?: DecisionReason
     ...(reason ? { reason } : {}),
   });
 }
+
+// Counts subscriptions hitting `/realtime/v1/runs/<id>` for a run that
+// lives only in the mollifier buffer (no PG row yet). The route opens
+// the Electric stream anyway so the eventual drainer-INSERT propagates
+// to the client; this counter (ticked via `recordRealtimeBufferedSubscription`)
+// signals how often customers subscribe inside the buffered window.
+export const realtimeBufferedSubscriptionsCounter = meter.createCounter(
+  "mollifier.realtime_subscriptions.buffered",
+  {
+    description:
+      "Realtime subscriptions opened against a runId that exists only in the mollifier buffer",
+  },
+);
+
+// Adds one tick to the buffered-subscription counter. No `envId`
+// attribute — `envId` is a banned high-cardinality metric label per the
+// repo's OTel rules, so the metric stays an aggregate and per-request
+// detail belongs in structured logs. (The stale-sweep warn log in
+// `mollifierStaleSweep.server.ts` accompanies `recordStaleEntry`, not this.)
+export function recordRealtimeBufferedSubscription(): void {
+  realtimeBufferedSubscriptionsCounter.add(1);
+}
+
+// Counts buffer entries that have been waiting in the queue ZSET longer
+// than the configured stale threshold (ticked via `recordStaleEntry`).
+// Useful for historical "stale events over time" views, but not directly
+// alertable on its own — a single stuck entry observed by N sweep ticks
+// adds N to the counter, so `rate()` over an alerting window reflects
+// (entries × ticks), not "entries that are stale right now".
+export const staleEntriesCounter = meter.createCounter(
+  "mollifier.stale_entries",
+  {
+    description:
+      "Mollifier buffer entries whose dwell exceeds the stale threshold (per sweep pass)",
+  },
+);
+
+// One tick per stale entry. No `envId` label (high-cardinality); the sweep's warn log carries it.
+export function recordStaleEntry(): void {
+  staleEntriesCounter.add(1);
+}
+
+// Alertable signal: the total of the per-env stale counts most recently
+// passed to `reportStaleEntrySnapshot`. Because each report replaces the
+// previous total, the gauge drops back to 0 when the drainer catches up
+// instead of staying latched. Recommended alert:
+//   mollifier_stale_entries_current > 0 for 5m
+export const staleEntriesGauge = meter.createObservableGauge(
+  "mollifier.stale_entries.current",
+  {
+    description:
+      "Buffer entries whose dwell exceeds the stale threshold, as observed by the latest sweep pass",
+  },
+);
+
+let latestStaleTotal = 0;
+
+export function reportStaleEntrySnapshot(snapshot: Map<string, number>): void {
+  // Collapse the per-env counts into one total. Per-env breakdown is
+  // intentionally NOT emitted as a metric label (high-cardinality); the
+  // structured warn log lines from the sweep carry per-env detail for ops.
+  // The observable-gauge callback reads this value on its next collection.
+  latestStaleTotal = Array.from(snapshot.values()).reduce(
+    (runningTotal, envCount) => runningTotal + envCount,
+    0,
+  );
+}
+
+// Registers the gauge's observer: whenever the meter collects, it reports
+// the total stored by the most recent `reportStaleEntrySnapshot` call.
+meter.addBatchObservableCallback(
+  (result) => result.observe(staleEntriesGauge, latestStaleTotal),
+  [staleEntriesGauge],
+);
+
+// Electric SQL's shape-stream protocol adds a `handle=` query param on
+// every reconnect after the initial GET. Gating the realtime-buffered
+// log/counter on its absence keeps the signal at one tick per
+// subscription instead of one tick per ~20s live-poll iteration —
+// without it the counter would over-count by the long-poll factor.
+export function isInitialBufferedSubscriptionRequest(url: string | URL): boolean {
+  const parsed = url instanceof URL ? url : new URL(url);
+  return parsed.searchParams.get("handle") === null;
+}
diff --git a/apps/webapp/app/v3/mollifier/mollifierTripEvaluator.server.ts b/apps/webapp/app/v3/mollifier/mollifierTripEvaluator.server.ts
index 4bd9a34d412..9032467d200 100644
--- a/apps/webapp/app/v3/mollifier/mollifierTripEvaluator.server.ts
+++ b/apps/webapp/app/v3/mollifier/mollifierTripEvaluator.server.ts
@@ -35,8 +35,8 @@ export function createRealTripEvaluator(deps: CreateRealTripEvaluatorDeps): Trip
     } catch (err) {
       // Deliberate: no error counter here. Shadow mode means a silent miss is
       // harmless — fail-open is the safe direction. The error log + Sentry
-      // capture is sufficient operability for Phase 1. Revisit in Phase 2
-      // when buffer writes are the primary path and a missed evaluation has cost.
+      // capture is sufficient operability while this runs in shadow mode. Revisit
+      // once buffer writes are the primary path and a missed evaluation has cost.
       logger.error("mollifier trip evaluator: fail-open on error", {
         envId: inputs.envId,
         err: err instanceof Error ? err.message : String(err),
diff --git a/apps/webapp/app/v3/mollifier/mutateWithFallback.server.ts b/apps/webapp/app/v3/mollifier/mutateWithFallback.server.ts
new file mode 100644
index 00000000000..9de8f64b3e9
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/mutateWithFallback.server.ts
@@ -0,0 +1,246 @@
+import type {
+  BufferEntry,
+  MollifierBuffer,
+  MutateSnapshotResult,
+  SnapshotPatch,
+} from "@trigger.dev/redis-worker";
+import type { TaskRun } from "@trigger.dev/database";
+import { prisma, $replica } from "~/db.server";
+import { logger } from "~/services/logger.server";
+import { getMollifierBuffer } from "./mollifierBuffer.server";
+
+// Wait/retry knobs for the `busy` wait loop in mutateWithFallback. Exported for tests.
+export const DEFAULT_SAFETY_NET_MS = 2_000; // total wait budget before `timed_out`
+// Initial gap between buffer polls; grows by BACKOFF_FACTOR up to
+// DEFAULT_MAX_POLL_STEP_MS so a slow drain doesn't poll at a tight fixed
+// cadence for the whole safety-net budget.
+export const DEFAULT_POLL_STEP_MS = 20;
+export const DEFAULT_MAX_POLL_STEP_MS = 250; // caps the base step; jitter is added on top of it
+const BACKOFF_FACTOR = 1.7; // step multiplier applied after each sleep
+
+export type MutateWithFallbackInput<TResponse> = {
+  runId: string; // run friendlyId; used for the PG lookups and as the buffer entry key
+  environmentId: string; // scopes the PG lookups and the buffer-entry auth check
+  organizationId: string; // compared against the buffer entry's orgId
+  bufferPatch: SnapshotPatch;
+  // Called when a PG row exists (replica hit, writer recovery, or post-wait
+  // writer hit). Receives the full TaskRun and returns the customer-visible body.
+  pgMutation: (pgRow: TaskRun) => Promise<TResponse>;
+  // Called when the patch landed cleanly on the buffer snapshot. The
+  // drainer will see the patched payload on its next pop. Receives the
+  // pre-mutation snapshot entry (the one mutateWithFallback fetched for
+  // its env auth check) so the caller can compute response details that
+  // depend on the prior state — e.g. the tags route needs to dedup
+  // against the existing tags to report an accurate `newTags` count
+  // matching the PG path, without an extra Redis round-trip.
+  // `bufferEntry` is `null` in the rare race where the entry didn't
+  // exist at pre-check time but appeared before `mutateSnapshot`.
+  synthesisedResponse: (ctx: {
+    bufferEntry: BufferEntry | null;
+  }) => TResponse | Promise<TResponse>;
+  // Called when the buffer rejected the patch as invalid (e.g. an
+  // `append_tags` patch carrying `maxTags` would exceed the cap). Required
+  // only by callers that send a rejectable patch; the helper throws if the
+  // buffer reports a rejection and no builder was supplied. Receives the
+  // same `bufferEntry` context as `synthesisedResponse` so a rejection
+  // message can reference the prior state if useful.
+  rejectedResponse?: (ctx: {
+    bufferEntry: BufferEntry | null;
+  }) => TResponse | Promise<TResponse>;
+  abortSignal?: AbortSignal; // checked once per poll iteration; an abort yields `timed_out`
+  // Override defaults for tests.
+  safetyNetMs?: number;
+  pollStepMs?: number;
+  maxPollStepMs?: number;
+  // Test injection.
+  getBuffer?: () => MollifierBuffer | null;
+  prismaWriter?: TaskRunReader;
+  prismaReplica?: TaskRunReader;
+  sleep?: (ms: number) => Promise<void>;
+  now?: () => number;
+  // Jitter source; defaults to Math.random. Inject `() => 0` for
+  // deterministic poll timing in tests.
+  random?: () => number;
+};
+
+export type MutateWithFallbackOutcome<TResponse> =
+  | { kind: "pg"; response: TResponse } // pgMutation ran against a replica or writer row
+  | { kind: "snapshot"; response: TResponse } // patch applied to the buffered snapshot
+  | { kind: "rejected"; response: TResponse } // buffer answered `limit_exceeded`; nothing written
+  | { kind: "not_found" } // no run visible to this environment in PG or the buffer
+  | { kind: "timed_out" }; // busy-wait hit the deadline or the abort signal fired
+
+// PG-first → buffer mutateSnapshot → wait-and-bounce. The caller maps the
+// outcome to an HTTP response; this helper never throws Response objects, so
+// it stays route-agnostic and unit-testable. It does throw a plain Error if
+// the buffer reports `limit_exceeded` and no `rejectedResponse` was supplied.
+export async function mutateWithFallback<TResponse>(
+  input: MutateWithFallbackInput<TResponse>,
+): Promise<MutateWithFallbackOutcome<TResponse>> {
+  const replica = input.prismaReplica ?? $replica;
+  const writer = input.prismaWriter ?? prisma;
+  const buffer = (input.getBuffer ?? getMollifierBuffer)();
+  const sleep = input.sleep ?? defaultSleep;
+  const now = input.now ?? Date.now;
+
+  // Path 1 — PG is already canonical.
+  const replicaRow = await findRunInPg(replica, input.runId, input.environmentId);
+  if (replicaRow) {
+    const response = await input.pgMutation(replicaRow);
+    return { kind: "pg", response };
+  }
+
+  if (!buffer) {
+    // No buffer configured (mollifier disabled or boot-time error). The
+    // pre-PR mutation routes read from the writer directly, so a freshly-
+    // created PG row was always visible regardless of replication lag.
+    // Now that the first read goes to the replica (Path 1 above) for the
+    // offload, a `!buffer` short-circuit would regress: a real PG row +
+    // replica lag would return 404. Mirror the writer-disambiguation block
+    // in the `result === "not_found"` branch below so degraded mode
+    // (mollifier disabled) still matches pre-PR mutation behaviour.
+    const writerRow = await findRunInPg(writer, input.runId, input.environmentId);
+    if (writerRow) {
+      const response = await input.pgMutation(writerRow);
+      return { kind: "pg", response };
+    }
+    return { kind: "not_found" };
+  }
+
+  // Env-scoped authorization for the buffer path. The replica/writer
+  // lookups above are already env-scoped via findRunInPg; this closes
+  // the same gap on the buffer side so a caller authed in env A can't
+  // mutate a buffered run that belongs to env B (or a different org)
+  // by guessing its friendlyId. Non-atomic w.r.t. the mutateSnapshot
+  // call below, but the TOCTOU is benign: runIds are globally unique,
+  // so a cross-env entry can't suddenly appear after a same-env check.
+  // A genuinely-missing entry (entry === null) falls through and is
+  // handled by the existing not_found / writer-recovery path below.
+  const entryForAuth = await buffer.getEntry(input.runId);
+  if (
+    entryForAuth &&
+    (entryForAuth.envId !== input.environmentId ||
+      entryForAuth.orgId !== input.organizationId)
+  ) {
+    // Hide existence on env mismatch: return not_found, same shape as
+    // a true miss, rather than 403 which would leak that the runId
+    // exists in some other env.
+    return { kind: "not_found" };
+  }
+
+  // Path 2 — buffer snapshot mutation.
+  const result: MutateSnapshotResult = await buffer.mutateSnapshot(
+    input.runId,
+    input.bufferPatch,
+  );
+
+  if (result === "applied_to_snapshot") {
+    return {
+      kind: "snapshot",
+      response: await input.synthesisedResponse({ bufferEntry: entryForAuth }),
+    };
+  }
+
+  if (result === "limit_exceeded") {
+    // The buffer refused the patch (e.g. tag cap). Nothing was written.
+    // Surface the caller's rejection body; a missing builder means the
+    // caller sent a rejectable patch without handling the rejection.
+    if (!input.rejectedResponse) {
+      throw new Error(
+        "mutateWithFallback: buffer returned 'limit_exceeded' but no rejectedResponse was provided",
+      );
+    }
+    return {
+      kind: "rejected",
+      response: await input.rejectedResponse({ bufferEntry: entryForAuth }),
+    };
+  }
+
+  if (result === "not_found") {
+    // Disambiguate a genuine 404 from a replica-lag miss: ask the writer
+    // directly. If the row just appeared post-drain we route through the
+    // PG mutation path.
+    const writerRow = await findRunInPg(writer, input.runId, input.environmentId);
+    if (writerRow) {
+      const response = await input.pgMutation(writerRow);
+      return { kind: "pg", response };
+    }
+    return { kind: "not_found" };
+  }
+
+  // result === "busy" — the entry is mid-handoff (DRAINING) or already
+  // materialised. We do NOT poll the primary for the row to appear: that
+  // piles read load onto the writer at exactly the moment mollifier exists
+  // to shed it. Instead we watch the buffer entry itself (cheap Redis
+  // reads). The drainer writes the PG row BEFORE it acks (sets
+  // `materialised`) or fails (deletes the entry), so the entry's own state
+  // is an authoritative, already-in-Redis signal for "is the row in PG
+  // yet?". Only once it resolves do we touch the primary — exactly once,
+  // for the real mutation.
+  const safetyNetMs = input.safetyNetMs ?? DEFAULT_SAFETY_NET_MS;
+  const maxPollStepMs = input.maxPollStepMs ?? DEFAULT_MAX_POLL_STEP_MS;
+  const random = input.random ?? Math.random;
+  const deadline = now() + safetyNetMs;
+  let step = input.pollStepMs ?? DEFAULT_POLL_STEP_MS;
+
+  while (now() < deadline) {
+    if (input.abortSignal?.aborted) {
+      return { kind: "timed_out" }; // abort is reported as a timeout, without the warn log below
+    }
+
+    const entry = await buffer.getEntry(input.runId);
+    // Resolved when the entry is gone (`fail` deleted it after writing a
+    // terminal SYSTEM_FAILURE row) or materialised (`ack` after a
+    // successful trigger / cancel write). In both cases the PG row is now
+    // committed on the primary, so read it once and route through the
+    // canonical PG mutation path.
+    if (entry === null || entry.materialised === true) {
+      const row = await findRunInPg(writer, input.runId, input.environmentId);
+      if (row) {
+        const response = await input.pgMutation(row);
+        return { kind: "pg", response };
+      }
+      // Entry gone with no PG row: the drainer's terminal write itself
+      // failed (PG unreachable). Nothing to mutate.
+      return { kind: "not_found" };
+    }
+    // Still QUEUED (requeued after a retryable drain error) or DRAINING —
+    // the run hasn't reached PG. Back off with jitter so concurrent
+    // waiters on the same draining run don't requery in lockstep.
+    if (now() >= deadline) break;
+    const jittered = step + Math.floor(random() * step); // not clamped to the time left before deadline
+    await sleep(jittered);
+    step = Math.min(Math.ceil(step * BACKOFF_FACTOR), maxPollStepMs);
+  }
+
+  logger.warn("mollifier mutate-with-fallback: drainer resolution timed out", {
+    runId: input.runId,
+    safetyNetMs,
+  });
+  return { kind: "timed_out" };
+}
+
+// Structural reader interface — accepts both the writer (`prisma`) and the
+// replica (`$replica`), which differ slightly in their generated Prisma
+// types but share the findFirst surface used here.
+type TaskRunReader = {
+  taskRun: {
+    findFirst(args: {
+      where: { friendlyId: string; runtimeEnvironmentId: string }; // always env-scoped; no `select`
+    }): Promise<TaskRun | null>;
+  };
+};
+
+async function findRunInPg( // shared by the replica and writer lookups; resolves to null on a miss
+  client: TaskRunReader,
+  friendlyId: string,
+  environmentId: string,
+): Promise<TaskRun | null> {
+  return client.taskRun.findFirst({
+    where: { friendlyId, runtimeEnvironmentId: environmentId }, // a run in another env reads as a miss
+  });
+}
+
+function defaultSleep(ms: number): Promise<void> { // fallback for `input.sleep`; not abort-aware
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
diff --git a/apps/webapp/app/v3/mollifier/readFallback.server.ts b/apps/webapp/app/v3/mollifier/readFallback.server.ts
index 34a8b48f970..21dd6c23957 100644
--- a/apps/webapp/app/v3/mollifier/readFallback.server.ts
+++ b/apps/webapp/app/v3/mollifier/readFallback.server.ts
@@ -1,4 +1,10 @@
+import type { MollifierBuffer } from "@trigger.dev/redis-worker";
+import { RunId } from "@trigger.dev/core/v3/isomorphic";
+import { IdempotencyKeyOptionsSchema } from "@trigger.dev/core/v3/schemas";
+import type { z } from "zod";
 import { logger } from "~/services/logger.server";
+import { deserialiseMollifierSnapshot } from "./mollifierSnapshot.server";
+import { getMollifierBuffer } from "./mollifierBuffer.server";
 
 export type ReadFallbackInput = {
   runId: string;
@@ -6,11 +12,255 @@ export type ReadFallbackInput = {
   organizationId: string;
 };
 
+export type SyntheticRun = {
+  // Snapshot-derived TaskRun primary key. Used by ReplayTaskRunService
+  // for logging and by callers passing this object where a TaskRun is
+  // expected (cast). Derived deterministically from `friendlyId`.
+  id: string;
+  friendlyId: string;
+  status: "QUEUED" | "FAILED" | "CANCELED"; // CANCELED if cancelledAt is set, else FAILED/QUEUED by entry status
+  // Set when the customer cancelled the run via the dashboard or API
+  // while it was buffered. The drainer's cancel bifurcation reads this
+  // on next pop and writes a CANCELED PG row directly (skipping
+  // materialisation). Reflected back into the UI by the synthesised
+  // SpanRun so the run-detail page shows the cancelled state even before
+  // the drainer materialises it.
+  cancelledAt: Date | undefined;
+  cancelReason: string | undefined;
+  // Reschedule patch (`set_delay`) writes `delayUntil` into the snapshot.
+  // Surfacing it on SyntheticRun lets the retrieve-run shape reflect the
+  // pending delay before the drainer materialises the PG row.
+  delayUntil: Date | undefined;
+  taskIdentifier: string | undefined;
+  createdAt: Date; // taken from the buffer entry, not the snapshot payload
+
+  payload: unknown;
+  payloadType: string | undefined;
+  metadata: unknown;
+  metadataType: string | undefined;
+  // Seed-metadata mirrors what `triggerTask.server.ts` writes into the
+  // snapshot: the original metadataPacket data preserved separately from
+  // any later customer mutations. ReplayTaskRunService uses these to
+  // rebuild the replay's metadata.
+  seedMetadata: string | undefined;
+  seedMetadataType: string | undefined;
+
+  idempotencyKey: string | undefined;
+  // Surfaced for the cached-hit expiration check in IdempotencyKeyConcern.
+  // The PG-resident path enforces this (clears key, allows new run when
+  // expired). For buffered runs the snapshot carries the same field — we
+  // expose it here so the cached-hit branch can apply the same check
+  // rather than indefinitely returning the buffered run's id.
+  idempotencyKeyExpiresAt: Date | undefined;
+  // `{ key, scope }` object form, matching how the SDK serialises and PG
+  // stores it. Previously typed as `string[]` (legacy/incorrect — Prisma
+  // is `Json?` carrying the schema-shaped object). `getUserProvidedIdempotencyKey`
+  // and `extractIdempotencyKeyScope` both parse via the same Zod schema;
+  // they returned `undefined` for the array-shape, which silently
+  // demoted the response to surface the hash instead of the user-
+  // provided key for buffered runs — a contract divergence from
+  // PG-resident runs. See the regression test in `mollifierReadFallback.test.ts`.
+  idempotencyKeyOptions: z.infer<typeof IdempotencyKeyOptionsSchema> | undefined;
+  isTest: boolean;
+  depth: number;
+  ttl: string | undefined;
+  tags: string[];
+  // Mirror of `tags` under the PG field name. ReplayTaskRunService reads
+  // `existingTaskRun.runTags`; both names are kept here so a synthetic
+  // run can be passed wherever the PG-shape `runTags` is expected.
+  runTags: string[];
+  lockedToVersion: string | undefined; // read from the snapshot's `taskVersion`
+  resumeParentOnCompletion: boolean;
+  parentTaskRunId: string | undefined;
+
+  // Allocated at gate-accept time and embedded in the snapshot so the run's
+  // trace is continuous from QUEUED-in-buffer through executing post-drain.
+  traceId: string | undefined;
+  spanId: string | undefined;
+  parentSpanId: string | undefined;
+
+  // Replay-relevant fields populated from the engine-trigger snapshot.
+  // ReplayTaskRunService reads each of these from the existing TaskRun;
+  // when the original lives in the buffer we synthesise them here.
+  runtimeEnvironmentId: string | undefined;
+  engine: "V2";
+  workerQueue: string | undefined;
+  queue: string | undefined;
+  concurrencyKey: string | undefined;
+  machinePreset: string | undefined; // read from the snapshot's `machine`
+  realtimeStreamsVersion: string | undefined;
+
+  // Additional snapshot-sourced fields used when synthesising a SpanRun
+  // for the dashboard's right-side details panel. All optional because
+  // older snapshots may not carry them.
+  maxAttempts: number | undefined;
+  maxDurationInSeconds: number | undefined;
+  replayedFromTaskRunFriendlyId: string | undefined;
+  annotations: unknown;
+  traceContext: unknown;
+  scheduleId: string | undefined;
+  batchId: string | undefined;
+  parentTaskRunFriendlyId: string | undefined;
+  rootTaskRunFriendlyId: string | undefined;
+
+  error?: { code: string; message: string }; // the buffer entry's `lastError`, when present
+};
+
+export type ReadFallbackDeps = {
+  getBuffer?: () => MollifierBuffer | null; // override hook; defaults to getMollifierBuffer
+};
+
+function asString(value: unknown): string | undefined { // any non-string (incl. null) becomes undefined
+  return typeof value === "string" ? value : undefined;
+}
+
+function asStringArray(value: unknown): string[] { // all-or-nothing: one non-string element yields []
+  return Array.isArray(value) && value.every((v) => typeof v === "string") ? (value as string[]) : [];
+}
+
+function asDate(value: unknown): Date | undefined { // only string inputs are parsed
+  const raw = asString(value);
+  if (!raw) return undefined; // also rejects the empty string
+  const parsed = new Date(raw);
+  return Number.isNaN(parsed.getTime()) ? undefined : parsed; // unparseable string → undefined
+}
+
+// Snapshot ids are written by engine.trigger as INTERNAL ids (cuids); the
+// SyntheticRun contract exposes friendlyIds. The synthetic run's own `id`
+// goes the other way via `RunId.fromFriendlyId`; use `RunId.toFriendlyId`
+// for parent/root so consumers see the same shape as the PG path.
+function internalRunIdToFriendlyId(internalId: string | undefined): string | undefined {
+  if (!internalId) return undefined;
+  return RunId.toFriendlyId(internalId);
+}
+
 export async function findRunByIdWithMollifierFallback(
   input: ReadFallbackInput,
-): Promise<null> {
-  logger.debug("mollifier read-fallback called (phase 1 stub)", {
-    runId: input.runId,
-  });
-  return null;
+  deps: ReadFallbackDeps = {},
+): Promise<SyntheticRun | null> {
+  const buffer = (deps.getBuffer ?? getMollifierBuffer)();
+  if (!buffer) return null;
+
+  try {
+    const entry = await buffer.getEntry(input.runId);
+    if (!entry) return null;
+
+    if (entry.envId !== input.environmentId || entry.orgId !== input.organizationId) {
+      logger.warn("mollifier read-fallback auth mismatch", {
+        runId: input.runId,
+        callerEnvId: input.environmentId,
+        callerOrgId: input.organizationId,
+      });
+      return null; // same result as a genuine miss, so existence in another env is not revealed
+    }
+
+    const snapshot = deserialiseMollifierSnapshot(entry.payload);
+    // Parse via the canonical schema (`{ key: string, scope: "run" |
+    // "attempt" | "global" }`) rather than the legacy Array.isArray
+    // check. The SDK and Prisma both store this as an object; the array
+    // form never matches, so a buffered run's response previously fell
+    // back to the server-side hash in `getUserProvidedIdempotencyKey`
+    // instead of the customer-supplied key — diverging from how
+    // materialised runs render the same field.
+    const idempotencyKeyOptionsParsed = IdempotencyKeyOptionsSchema.safeParse(
+      snapshot.idempotencyKeyOptions,
+    );
+    const idempotencyKeyOptions = idempotencyKeyOptionsParsed.success
+      ? idempotencyKeyOptionsParsed.data
+      : undefined;
+
+    const tags = asStringArray(snapshot.tags);
+    const environment =
+      snapshot.environment && typeof snapshot.environment === "object"
+        ? (snapshot.environment as Record<string, unknown>)
+        : undefined;
+
+    const cancelledAt = asDate(snapshot.cancelledAt);
+    const cancelReason = asString(snapshot.cancelReason);
+    let status: SyntheticRun["status"] = "QUEUED"; // default; a cancellation wins over a FAILED entry
+    if (cancelledAt) {
+      status = "CANCELED";
+    } else if (entry.status === "FAILED") {
+      status = "FAILED";
+    }
+    const delayUntil = asDate(snapshot.delayUntil);
+
+    return {
+      id: RunId.fromFriendlyId(entry.runId), // internal id derived from the friendlyId
+      friendlyId: entry.runId,
+      status,
+      cancelledAt,
+      cancelReason,
+      delayUntil,
+      taskIdentifier: asString(snapshot.taskIdentifier),
+      createdAt: entry.createdAt,
+
+      payload: snapshot.payload,
+      payloadType: asString(snapshot.payloadType),
+      metadata: snapshot.metadata,
+      metadataType: asString(snapshot.metadataType),
+      seedMetadata: asString(snapshot.seedMetadata),
+      seedMetadataType: asString(snapshot.seedMetadataType),
+
+      idempotencyKey: asString(snapshot.idempotencyKey),
+      idempotencyKeyExpiresAt: asDate(snapshot.idempotencyKeyExpiresAt),
+      idempotencyKeyOptions,
+      isTest: snapshot.isTest === true,
+      depth: typeof snapshot.depth === "number" ? snapshot.depth : 0,
+      ttl: asString(snapshot.ttl),
+      tags,
+      runTags: tags,
+      lockedToVersion: asString(snapshot.taskVersion), // snapshot field name differs from the PG column
+      resumeParentOnCompletion: snapshot.resumeParentOnCompletion === true,
+      parentTaskRunId: asString(snapshot.parentTaskRunId),
+
+      traceId: asString(snapshot.traceId),
+      spanId: asString(snapshot.spanId),
+      parentSpanId: asString(snapshot.parentSpanId),
+
+      runtimeEnvironmentId:
+        asString(environment?.id) ?? entry.envId, // fall back to the entry's env when the snapshot has none
+      engine: "V2",
+      workerQueue: asString(snapshot.workerQueue),
+      queue: asString(snapshot.queue),
+      concurrencyKey: asString(snapshot.concurrencyKey),
+      machinePreset: asString(snapshot.machine), // snapshot field name differs from the PG column
+      realtimeStreamsVersion: asString(snapshot.realtimeStreamsVersion),
+
+      maxAttempts: typeof snapshot.maxAttempts === "number" ? snapshot.maxAttempts : undefined,
+      maxDurationInSeconds:
+        typeof snapshot.maxDurationInSeconds === "number"
+          ? snapshot.maxDurationInSeconds
+          : undefined,
+      replayedFromTaskRunFriendlyId: asString(snapshot.replayedFromTaskRunFriendlyId),
+      annotations: snapshot.annotations,
+      traceContext: snapshot.traceContext,
+      scheduleId: asString(snapshot.scheduleId),
+      // The engine.trigger input embeds the batch as `{ id, index }` (see
+      // triggerTask.server.ts #buildEngineTriggerInput), not as a flat
+      // `batchId`. The nested `id` is the batch's internal cuid — the same
+      // value PG stores in `TaskRun.batchId` — so callers reconstruct the
+      // friendly id via `BatchId.toFriendlyId` exactly as the PG path does.
+      batchId: asString((snapshot.batch as { id?: unknown } | undefined)?.id),
+      // The snapshot only carries the INTERNAL parent/root ids
+      // (`parentTaskRunId` / `rootTaskRunId` — what engine.trigger consumes),
+      // not the friendlyIds the SyntheticRun contract expects. Convert
+      // internal → friendly here so consumers don't have to special-case
+      // the buffered path.
+      parentTaskRunFriendlyId: internalRunIdToFriendlyId(
+        asString(snapshot.parentTaskRunId)
+      ),
+      rootTaskRunFriendlyId: internalRunIdToFriendlyId(
+        asString(snapshot.rootTaskRunId)
+      ),
+
+      error: entry.lastError,
+    };
+  } catch (err) {
+    logger.error("mollifier read-fallback errored — fail-open to null", {
+      runId: input.runId,
+      err: err instanceof Error ? err.message : String(err),
+    });
+    return null;
+  }
 }
diff --git a/apps/webapp/app/v3/mollifier/resolveRunForMutation.server.ts b/apps/webapp/app/v3/mollifier/resolveRunForMutation.server.ts
new file mode 100644
index 00000000000..b3db81368b9
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/resolveRunForMutation.server.ts
@@ -0,0 +1,82 @@
+import type { MollifierBuffer } from "@trigger.dev/redis-worker";
+import { $replica as defaultReplica, prisma as defaultWriter } from "~/db.server";
+import { getMollifierBuffer as defaultGetBuffer } from "./mollifierBuffer.server";
+
+// Discriminated-union resolver used by mutation routes' `findResource`.
+// The route builder treats a null return from `findResource` as a 404
+// BEFORE the action handler runs (see `apiBuilder.server.ts`), so we
+// must check BOTH the PG canonical store and the mollifier buffer here
+// — otherwise a buffered run can't be cancelled / mutated even though
+// the underlying mutateWithFallback flow would handle it correctly.
+//
+// (Regression: before extracting this helper the cancel route had
+// `findResource: async () => null`, which made every cancel 404 before
+// the action ran. The helper makes the lookup unit-testable.)
+export type ResolvedRunForMutation =
+  | { source: "pg"; friendlyId: string } // found on the replica or, as a last resort, the writer
+  | { source: "buffer"; friendlyId: string }; // found only as a same-env, same-org buffer entry
+
+type PrismaTaskRunFindFirst = {
+  taskRun: {
+    findFirst(args: {
+      where: { friendlyId: string; runtimeEnvironmentId: string };
+      select: { friendlyId: true }; // existence probe only; no other columns are requested
+    }): Promise<{ friendlyId: string } | null>;
+  };
+};
+
+export type ResolveRunForMutationDeps = {
+  prismaReplica?: PrismaTaskRunFindFirst; // defaults to `$replica`
+  prismaWriter?: PrismaTaskRunFindFirst; // defaults to `prisma`
+  getBuffer?: () => MollifierBuffer | null; // defaults to `getMollifierBuffer`
+};
+
+export async function resolveRunForMutation(input: {
+  runParam: string; // run friendlyId from the route; used for PG lookups and as the buffer key
+  environmentId: string;
+  organizationId: string;
+  deps?: ResolveRunForMutationDeps;
+}): Promise<ResolvedRunForMutation | null> {
+  const replica = input.deps?.prismaReplica ?? defaultReplica;
+  const writer = input.deps?.prismaWriter ?? defaultWriter;
+  const getBuffer = input.deps?.getBuffer ?? defaultGetBuffer;
+
+  const pgRun = await replica.taskRun.findFirst({
+    where: { friendlyId: input.runParam, runtimeEnvironmentId: input.environmentId },
+    select: { friendlyId: true },
+  });
+  if (pgRun) return { source: "pg", friendlyId: pgRun.friendlyId }; // replica hit: buffer and writer are skipped
+
+  const buffer = getBuffer();
+
+  if (buffer) {
+    const entry = await buffer.getEntry(input.runParam);
+    if (
+      entry &&
+      entry.envId === input.environmentId &&
+      entry.orgId === input.organizationId
+    ) {
+      return { source: "buffer", friendlyId: input.runParam };
+    }
+  }
+
+  // Replica + buffer both missed. Before declaring "not found" (which the
+  // route builder converts to a hard 404 *before* the action handler runs,
+  // so the downstream `mutateWithFallback` writer-recovery never gets a
+  // chance to fire), do one final probe against the writer. This catches
+  // two cases:
+  //   1. Replica lag on a freshly-created PG row.
+  //   2. A buffered run that materialised in the window between the
+  //      replica read and our buffer check (the entry was ack'd and the
+  //      hash is mid-grace-TTL but our getEntry returned null due to
+  //      lookup-by-friendlyId timing).
+  // Without this, the resolver returns null in degraded states that the
+  // downstream mutateWithFallback flow would otherwise handle correctly.
+  const writerRun = await writer.taskRun.findFirst({
+    where: { friendlyId: input.runParam, runtimeEnvironmentId: input.environmentId },
+    select: { friendlyId: true },
+  });
+  if (writerRun) return { source: "pg", friendlyId: writerRun.friendlyId };
+
+  return null; // also reached when a buffer entry exists but belongs to another env/org
+}
diff --git a/apps/webapp/app/v3/mollifier/syntheticApiResponses.server.ts b/apps/webapp/app/v3/mollifier/syntheticApiResponses.server.ts
new file mode 100644
index 00000000000..02c63fe91f1
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/syntheticApiResponses.server.ts
@@ -0,0 +1,73 @@
+import type { SyntheticRun } from "./readFallback.server";
+
+// Buffered runs have no execution data — the drainer hasn't materialised
+// the PG row and the worker hasn't started. The SDK-facing read routes
+// still need to return a span/trace shape that satisfies their response
+// schemas; these helpers build that minimal shape from the buffered
+// SyntheticRun.
+//
+// CANCELED and FAILED are terminal states: a FAILED buffered run is
+// errored (drainer exhausted retries or the gate rejected it) and must
+// not signal "still in progress." The flags below mirror
+// syntheticTrace.server.ts so the SDK contract stays consistent across
+// the three read paths (spans, trace, dashboard trace presenter).
+
+function deriveTerminalFlags(status: SyntheticRun["status"]): { // maps the status to span flags
+  isError: boolean;
+  isPartial: boolean;
+  isCancelled: boolean;
+} {
+  const isCancelled = status === "CANCELED";
+  const isFailed = status === "FAILED";
+  return {
+    isError: isFailed, // a cancelled run is not flagged as an error
+    isPartial: !isCancelled && !isFailed, // true only for the remaining status, QUEUED
+    isCancelled,
+  };
+}
+
+// Body for GET /api/v1/runs/:runId/spans/:spanId when the run is buffered
+// and `:spanId` has already been verified against `buffered.spanId` by the
+// route. Pure function so the route layer just authenticates, resolves
+// the run, validates the spanId, and forwards the buffered run here.
+export function buildSyntheticSpanDetailBody(buffered: SyntheticRun) {
+  const flags = deriveTerminalFlags(buffered.status);
+  return {
+    spanId: buffered.spanId, // not defaulted here; relies on the route's spanId check
+    parentId: buffered.parentSpanId ?? null,
+    runId: buffered.friendlyId,
+    message: buffered.taskIdentifier ?? "",
+    ...flags,
+    level: "TRACE" as const,
+    startTime: buffered.createdAt,
+    durationMs: 0, // hard-coded: a buffered run has no execution data
+  };
+}
+
+// Body for GET /api/v1/runs/:runId/trace when the run is buffered.
+// Returns the `{ trace: { traceId, rootSpan } }` envelope expected by the
+// SDK's RetrieveRunTraceResponseBody schema.
+export function buildSyntheticTraceBody(buffered: SyntheticRun) {
+  const flags = deriveTerminalFlags(buffered.status);
+  return {
+    trace: {
+      traceId: buffered.traceId ?? "", // empty string when the snapshot carried no traceId
+      rootSpan: {
+        id: buffered.spanId ?? "", // empty string when the snapshot carried no spanId
+        runId: buffered.friendlyId,
+        data: {
+          message: buffered.taskIdentifier ?? "",
+          taskSlug: buffered.taskIdentifier ?? undefined, // `?? undefined` is a no-op for string | undefined
+          events: [] as unknown[],
+          startTime: buffered.createdAt,
+          duration: 0, // hard-coded: a buffered run has no execution data
+          ...flags,
+          level: "TRACE" as const,
+          queueName: buffered.queue ?? undefined,
+          machinePreset: buffered.machinePreset ?? undefined,
+        },
+        children: [] as unknown[], // always a single-span trace
+      },
+    },
+  };
+}
diff --git a/apps/webapp/app/v3/mollifier/syntheticRedirectInfo.server.ts b/apps/webapp/app/v3/mollifier/syntheticRedirectInfo.server.ts
new file mode 100644
index 00000000000..e316846d708
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/syntheticRedirectInfo.server.ts
@@ -0,0 +1,119 @@
+import type { MollifierBuffer } from "@trigger.dev/redis-worker";
+import type { PrismaClientOrTransaction } from "@trigger.dev/database";
+import { z } from "zod";
+import { prisma } from "~/db.server";
+import { logger } from "~/services/logger.server";
+import { getMollifierBuffer } from "./mollifierBuffer.server";
+// Use the webapp-side wrapper (not `deserialiseSnapshot` from
+// @trigger.dev/redis-worker directly) so this file shares a single
+// deserialisation path with readFallback.server.ts. The two are
+// behaviourally identical today (both wrap `JSON.parse`), but pinning
+// the shared helper keeps the two read-side modules from drifting if
+// snapshot encoding ever changes.
+import { deserialiseMollifierSnapshot } from "./mollifierSnapshot.server";
+
+// Validated subset of a mollifier snapshot: only the fields needed to
+// rebuild a canonical run-detail URL for a buffered run (the optional
+// `spanId` plus the environment / project / organization slugs). Anything
+// else in the payload is ignored. `safeParse` against this schema replaces
+// hand-written `as Record<string, unknown>` + `typeof === "string"` checks;
+// missing or wrong-typed fields all land in `parsed.success === false`.
+const BufferedSnapshotSchema = z.object({
+  spanId: z.string().optional(),
+  environment: z.object({
+    slug: z.string(),
+    project: z.object({ slug: z.string() }),
+    organization: z.object({ slug: z.string() }),
+  }),
+});
+
+export type BufferedRunRedirectInfo = {
+  organizationSlug: string; // snapshot `environment.organization.slug`
+  projectSlug: string; // snapshot `environment.project.slug`
+  environmentSlug: string; // snapshot `environment.slug`
+  spanId: string | undefined; // undefined when the snapshot carries no spanId
+};
+
+export type FindBufferedRunRedirectInfoDeps = {
+  getBuffer?: () => MollifierBuffer | null; // defaults to `getMollifierBuffer`
+  prismaClient?: PrismaClientOrTransaction; // defaults to the shared `prisma` client
+};
+
+// Looks up the org/project/env slugs required to build the canonical
+// run-detail URL for a run that still lives in the mollifier buffer. The
+// short-URL redirect routes (`runs.$runParam`, `@.runs.$runParam`,
+// `projects.v3.$projectRef.runs.$runParam`) call this so a customer clicking
+// the trigger-API-returned run link doesn't 404 during the buffered window.
+//
+// Authorisation: unless `skipOrgMembershipCheck` is set, a PG query confirms
+// the requesting user belongs to the organisation the buffer entry says owns
+// the run; without it a known runId would leak slugs. Every miss yields null.
+export async function findBufferedRunRedirectInfo(
+  args: {
+    runFriendlyId: string;
+    userId: string;
+    // Admin impersonation paths bypass org-membership; mirrors the existing
+    // PG-side admin route behaviour (`@.runs.$runParam` doesn't filter by
+    // org membership in the PG query either).
+    skipOrgMembershipCheck?: boolean;
+  },
+  deps: FindBufferedRunRedirectInfoDeps = {},
+): Promise<BufferedRunRedirectInfo | null> {
+  const { runFriendlyId, userId, skipOrgMembershipCheck } = args;
+  const buffer = (deps.getBuffer ?? getMollifierBuffer)();
+  if (!buffer) return null;
+  const db = deps.prismaClient ?? prisma;
+  const errorText = (err: unknown) => (err instanceof Error ? err.message : String(err));
+
+  let entry;
+  try {
+    entry = await buffer.getEntry(runFriendlyId);
+  } catch (err) {
+    logger.warn("buffered redirect: buffer.getEntry failed", {
+      runFriendlyId,
+      err: errorText(err),
+    });
+    return null;
+  }
+  if (!entry) return null;
+
+  // Membership gate: only org members may learn the slugs (admins skip it).
+  if (!skipOrgMembershipCheck) {
+    const membership = await db.orgMember.findFirst({
+      where: { userId, organizationId: entry.orgId },
+      select: { id: true },
+    });
+    if (!membership) return null;
+  }
+
+  let snapshot: unknown;
+  try {
+    snapshot = deserialiseMollifierSnapshot(entry.payload);
+  } catch (err) {
+    logger.warn("buffered redirect: snapshot deserialise failed", {
+      runFriendlyId,
+      err: errorText(err),
+    });
+    return null;
+  }
+
+  const validation = BufferedSnapshotSchema.safeParse(snapshot);
+  if (validation.success) {
+    const { environment, spanId } = validation.data;
+    return {
+      organizationSlug: environment.organization.slug,
+      projectSlug: environment.project.slug,
+      environmentSlug: environment.slug,
+      spanId,
+    };
+  }
+
+  // Shape mismatch: either a writer that doesn't record environment slugs
+  // (so no URL can be built) or buffer-format drift. Logged at debug; the
+  // caller 404s with the standard not-found page rather than a 500.
+  logger.debug("buffered redirect: snapshot shape mismatch", {
+    runFriendlyId,
+    issues: validation.error.issues.map(({ path, code }) => ({ path: path.join("."), code })),
+  });
+  return null;
+}
diff --git a/apps/webapp/app/v3/mollifier/syntheticReplayTaskRun.server.ts b/apps/webapp/app/v3/mollifier/syntheticReplayTaskRun.server.ts
new file mode 100644
index 00000000000..01962cf7890
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/syntheticReplayTaskRun.server.ts
@@ -0,0 +1,51 @@
+import type { TaskRun } from "@trigger.dev/database";
+import type { SyntheticRun } from "./readFallback.server";
+
+export type SyntheticReplayTaskRun = TaskRun & {
+  project: { slug: string; organization: { slug: string } }; // project + owning org slugs
+  runtimeEnvironment: { slug: string }; // environment slug
+};
+
+// Adapts a buffered-run snapshot into the TaskRun-shaped input that
+// `ReplayTaskRunService.call` expects, attaching the project, organization
+// and environment slugs taken from `envRow`.
+//
+// ReplayTaskRunService builds the new run's traceparent as
+// `00-${existingTaskRun.traceId}-${existingTaskRun.spanId}-01` without
+// guarding for undefined, so a synthetic with a missing traceId or spanId
+// (older snapshots — both are documented optional on `SyntheticRun`) would
+// yield `00-undefined-undefined-01`, an invalid W3C traceparent that OTel
+// silently drops, severing the replay's trace link to the original run.
+//
+// Returns null in that case — the caller surfaces it as "Run not found" so
+// the customer retries once the drainer has materialised the PG row, where
+// traceId/spanId are guaranteed present.
+export function buildSyntheticReplayTaskRun(args: {
+  synthetic: SyntheticRun;
+  envRow: {
+    slug: string;
+    project: { slug: string; organization: { slug: string } };
+  };
+}): SyntheticReplayTaskRun | null {
+  const { synthetic, envRow } = args;
+  const hasTraceLink = Boolean(synthetic.traceId) && Boolean(synthetic.spanId);
+  if (!hasTraceLink) return null;
+
+  // The `as unknown as TaskRun` double cast is load-bearing — a direct
+  // `synthetic as TaskRun` won't compile. `SyntheticRun` carries only the
+  // fields `ReplayTaskRunService.call` reads (enumerated on the SyntheticRun
+  // type comment in readFallback.server.ts) and is not structurally
+  // assignable to the Prisma `TaskRun` row: optional vs required fields
+  // diverge, and several PG columns (number, batchId variants, status enum
+  // widening) are deliberately absent or narrower. Going via `unknown` marks
+  // the audited-subset intent; the guard above covers the one consumed field
+  // pair (traceId/spanId) that would corrupt the OTel traceparent if undefined.
+  const taskRunShaped = synthetic as unknown as TaskRun;
+  const { slug: projectSlug, organization } = envRow.project;
+
+  return {
+    ...taskRunShaped,
+    project: { slug: projectSlug, organization: { slug: organization.slug } },
+    runtimeEnvironment: { slug: envRow.slug },
+  };
+}
diff --git a/apps/webapp/app/v3/mollifier/syntheticRunHeader.server.ts b/apps/webapp/app/v3/mollifier/syntheticRunHeader.server.ts
new file mode 100644
index 00000000000..9b137f87fb3
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/syntheticRunHeader.server.ts
@@ -0,0 +1,75 @@
+import type { SyntheticRun } from "./readFallback.server";
+
+// Synthesises the run-detail page's `run` header shape (the NavBar, the
+// status badge and the Cancel-button gate) from a buffered run snapshot.
+// The shape matches `RunPresenter.getRun`'s `runData` — keep the two in
+// sync when fields are added there.
+//
+// CANCELED and FAILED are reflected back from `SyntheticRun.cancelledAt` /
+// `status`, so a terminal buffered run shows the correct status in the
+// NavBar and `isFinished: true` (which collapses the Cancel button in the
+// page header) before the drainer materialises the PG row. This mirrors
+// what `buildSyntheticSpanRun` does for the right-side details panel — the
+// SyntheticRun.cancelledAt contract comment in readFallback.server.ts names
+// this exact UI surface.
+//
+// FAILED maps to `SYSTEM_FAILURE` to match the drainer's non-retryable
+// terminal path, which is what `buildSyntheticSpanRun` uses too. Keeping
+// the header and the span-detail panel symmetric means an admin never
+// sees "Pending" and "FAILED" simultaneously on the same
+// run.
+export function buildSyntheticRunHeader(args: {
+  run: SyntheticRun;
+  environment: {
+    id: string;
+    organizationId: string;
+    type: "PRODUCTION" | "DEVELOPMENT" | "STAGING" | "PREVIEW";
+    slug: string;
+  };
+}) {
+  const { run, environment: env } = args;
+  const cancelled = run.status === "CANCELED";
+  const failed = run.status === "FAILED";
+
+  // FAILED surfaces as SYSTEM_FAILURE; anything non-terminal is PENDING.
+  let status: "CANCELED" | "SYSTEM_FAILURE" | "PENDING" = "PENDING";
+  if (cancelled) status = "CANCELED";
+  else if (failed) status = "SYSTEM_FAILURE";
+
+  // The run-detail route derives `isCompleted` from `completedAt !== null`
+  // and gates SSE live-reloading on it (`route.tsx:459`, `:551`); a FAILED
+  // run left with a null `completedAt` would keep live-reloading forever.
+  // PG-resident SYSTEM_FAILURE rows always have completedAt set, and the
+  // buffer entry has no separate failedAt, so createdAt is the closest proxy
+  // for when the terminal state landed. Symmetric with
+  // `buildSyntheticSpanRun` and the `ApiRetrieveRunPresenter` synth path.
+  const completedAt = run.cancelledAt ?? (failed ? run.createdAt : null);
+
+  return {
+    // `id` mirrors RunPresenter.getRun's runData (the PG path): the internal
+    // cuid, not the friendlyId. SyntheticRun.id is already the cuid
+    // (RunId.fromFriendlyId(entry.runId) in readFallback.server.ts), so the
+    // admin debug tooltip shows the same format for buffered + materialised
+    // runs.
+    id: run.id,
+    number: 1,
+    friendlyId: run.friendlyId,
+    traceId: run.traceId ?? "",
+    spanId: run.spanId ?? "",
+    status,
+    isFinished: cancelled || failed,
+    startedAt: null,
+    completedAt,
+    logsDeletedAt: null,
+    rootTaskRun: null,
+    parentTaskRun: null,
+    environment: {
+      id: env.id,
+      organizationId: env.organizationId,
+      type: env.type,
+      slug: env.slug,
+      userId: undefined,
+      userName: undefined,
+    },
+  };
+}
diff --git a/apps/webapp/app/v3/mollifier/syntheticSpanRun.server.ts b/apps/webapp/app/v3/mollifier/syntheticSpanRun.server.ts
new file mode 100644
index 00000000000..ae274aac3d5
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/syntheticSpanRun.server.ts
@@ -0,0 +1,197 @@
+import { prettyPrintPacket, RunAnnotations } from "@trigger.dev/core/v3";
+import { getMaxDuration } from "@trigger.dev/core/v3/isomorphic";
+import {
+  extractIdempotencyKeyScope,
+  getUserProvidedIdempotencyKey,
+} from "@trigger.dev/core/v3/serverOnly";
+import { MachinePresetName } from "@trigger.dev/core/v3/schemas";
+import type { SpanRun } from "~/presenters/v3/SpanPresenter.server";
+import type { SyntheticRun } from "./readFallback.server";
+
+// `SyntheticRun.machinePreset` arrives from the snapshot payload as a plain
+// string while `SpanRun.machinePreset` is the narrowed enum. Validate against
+// `MachinePresetName` so an unknown / stale preset collapses to undefined.
+function narrowMachinePreset(value: string | undefined): SpanRun["machinePreset"] {
+  if (typeof value === "undefined") return undefined;
+  const result = MachinePresetName.safeParse(value);
+  if (!result.success) return undefined;
+  return result.data;
+}
+
+// Synthesises a SpanRun-shaped object from a buffered run so the run-detail
+// page's right-side details panel renders it like a PG-resident run. The
+// shape matches `SpanPresenter.getRun`'s return value; fields that have no
+// meaning while buffered (output, attempts, schedule, session, region,
+// batch) get neutral defaults, while terminal state (CANCELED / FAILED) is
+// reflected into `status`, `isFinished`, `isError` and `error` so a finished
+// buffered run does not render as PENDING.
+//
+// Payload and metadata are pretty-printed the same way SpanPresenter does
+// so the UI receives the same shape. Buffered runs cannot use the
+// `application/store` packet path (no R2 object yet), so the raw snapshot
+// fields are treated as inline packets.
+export async function buildSyntheticSpanRun(args: {
+  run: SyntheticRun;
+  environment: { id: string; slug: string; type: "PRODUCTION" | "DEVELOPMENT" | "STAGING" | "PREVIEW" };
+}): Promise<SpanRun> {
+  const { run, environment } = args;
+
+  const payload =
+    typeof run.payload !== "undefined" && run.payload !== null
+      ? await prettyPrintPacket(run.payload, run.payloadType ?? undefined)
+      : undefined;
+
+  // Nullish check, not truthy — same test as the payload branch above, so
+  // an intentionally-empty packet (e.g. metadata: "") is still handed to
+  // `prettyPrintPacket` and renders consistently. A truthy check would
+  // drop the empty-string case and the two paths would diverge.
+  const metadata =
+    typeof run.metadata !== "undefined" && run.metadata !== null
+      ? await prettyPrintPacket(run.metadata, run.metadataType, {
+          filteredKeys: ["$$streams", "$$streamsVersion", "$$streamsBaseUrl"],
+        })
+      : undefined;
+
+  const idempotencyShape = {
+    idempotencyKey: run.idempotencyKey ?? null,
+    idempotencyKeyExpiresAt: null,
+    idempotencyKeyOptions: run.idempotencyKeyOptions ?? null,
+  };
+
+  const idempotencyKey = getUserProvidedIdempotencyKey(idempotencyShape);
+  const idempotencyKeyScope = extractIdempotencyKeyScope(idempotencyShape);
+  const idempotencyKeyStatus: SpanRun["idempotencyKeyStatus"] = idempotencyKey
+    ? "active"
+    : idempotencyKeyScope
+    ? "inactive"
+    : undefined;
+
+  const taskKind = RunAnnotations.safeParse(run.annotations).data?.taskKind;
+  const isAgentRun = taskKind === "AGENT";
+
+  const queueName = run.queue ?? "task/"; // no queue on the snapshot → "task/" placeholder (non-custom)
+  const isCancelled = run.status === "CANCELED";
+  const isFailed = run.status === "FAILED";
+
+  // The run-detail panel derives terminal/error state from `status`,
+  // `isFinished` and `isError` (SpanPresenter.getRun -> isFinalRunStatus /
+  // isFailedRunStatus). Buffered FAILED runs surface as SYSTEM_FAILURE to
+  // match ApiRetrieveRunPresenter.bufferedStatusToTaskRunStatus; both
+  // CANCELED and SYSTEM_FAILURE are final run statuses, and SYSTEM_FAILURE
+  // is also a failed status.
+  const status: SpanRun["status"] = isCancelled
+    ? "CANCELED"
+    : isFailed
+    ? "SYSTEM_FAILURE"
+    : "PENDING";
+
+  // Mirror ApiRetrieveRunPresenter's STRING_ERROR synthesis so the panel
+  // shows why a buffered run failed instead of an empty error block.
+  const error: SpanRun["error"] =
+    isFailed && run.error
+      ? { type: "STRING_ERROR", raw: `${run.error.code}: ${run.error.message}` }
+      : undefined;
+
+  return {
+    id: run.id,
+    friendlyId: run.friendlyId,
+    status,
+    statusReason: isCancelled
+      ? run.cancelReason ?? undefined
+      : isFailed
+      ? run.error?.message ?? undefined
+      : undefined,
+    createdAt: run.createdAt,
+    startedAt: null,
+    executedAt: null,
+    updatedAt: run.cancelledAt ?? run.createdAt,
+    delayUntil: run.delayUntil ?? null,
+    expiredAt: null,
+    // Symmetric with `ApiRetrieveRunPresenter` — FAILED buffered runs
+    // must surface a non-null `completedAt` so the run-detail panel
+    // (and any caller checking `isFinished && completedAt`) doesn't
+    // render a finished run with no completion timestamp. PG-resident
+    // SYSTEM_FAILURE rows always have completedAt set; the buffer
+    // entry has no separate failedAt, so we fall back to createdAt
+    // as the best proxy for when the terminal state landed.
+    completedAt: run.cancelledAt ?? (isFailed ? run.createdAt : null),
+    logsDeletedAt: null,
+    ttl: run.ttl ?? null,
+    taskIdentifier: run.taskIdentifier ?? "",
+    version: undefined,
+    sdkVersion: undefined,
+    runtime: undefined,
+    runtimeVersion: undefined,
+    isTest: run.isTest,
+    replayedFromTaskRunFriendlyId: run.replayedFromTaskRunFriendlyId ?? null,
+    environmentId: environment.id,
+    idempotencyKey,
+    idempotencyKeyExpiresAt: null,
+    idempotencyKeyScope,
+    idempotencyKeyStatus,
+    debounce: null,
+    schedule: undefined,
+    queue: {
+      name: queueName,
+      isCustomQueue: !queueName.startsWith("task/"),
+      concurrencyKey: run.concurrencyKey ?? null,
+    },
+    tags: run.runTags,
+    baseCostInCents: 0,
+    costInCents: 0,
+    totalCostInCents: 0,
+    usageDurationMs: 0,
+    isFinished: isCancelled || isFailed,
+    isRunning: false,
+    isError: isFailed,
+    isAgentRun,
+    payload,
+    payloadType: run.payloadType ?? "application/json",
+    output: undefined,
+    outputType: "application/json",
+    error,
+    // The snapshot only carries the root/parent friendly IDs, not the
+    // spanId or taskIdentifier that SpanPresenter sources from the joined
+    // PG rows. Emitting them with empty-string stubs renders a blank task
+    // name and a misleading `?span=` jump target, so we omit the
+    // relationships until the drainer materialises the row (a transient
+    // window). Top-level buffered runs have no relationships regardless.
+    relationships: {
+      root: undefined,
+      parent: undefined,
+    },
+    context: JSON.stringify(
+      {
+        task: {
+          id: run.taskIdentifier ?? "",
+        },
+        run: {
+          id: run.friendlyId,
+          createdAt: run.createdAt,
+          isTest: run.isTest,
+        },
+        environment: {
+          id: environment.id,
+          slug: environment.slug,
+          type: environment.type,
+        },
+      },
+      null,
+      2,
+    ),
+    metadata,
+    maxDurationInSeconds: getMaxDuration(run.maxDurationInSeconds),
+    batch: undefined,
+    session: undefined,
+    engine: "V2",
+    region: null,
+    workerQueue: run.workerQueue ?? "",
+    traceId: run.traceId ?? "",
+    spanId: run.spanId ?? "",
+    isCached: false,
+    isBuffered: true,
+    machinePreset: narrowMachinePreset(run.machinePreset),
+    taskEventStore: "taskEvent",
+    externalTraceId: undefined,
+  };
+}
diff --git a/apps/webapp/app/v3/mollifier/syntheticTrace.server.ts b/apps/webapp/app/v3/mollifier/syntheticTrace.server.ts
new file mode 100644
index 00000000000..ee0d518e2e7
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/syntheticTrace.server.ts
@@ -0,0 +1,76 @@
+import { millisecondsToNanoseconds } from "@trigger.dev/core/v3";
+import { createTreeFromFlatItems, flattenTree } from "~/components/primitives/TreeView/TreeView";
+import { createTimelineSpanEventsFromSpanEvents } from "~/utils/timelineSpanEvents";
+import type { SpanSummary } from "~/v3/eventRepository/eventRepository.types";
+import type { SyntheticRun } from "./readFallback.server";
+
+// Builds a single-span trace for a buffered run so the run-detail page can
+// render a meaningful timeline before the drainer materialises the row.
+// Mirrors the shape `RunPresenter` produces when its trace store lookup
+// returns no spans, so the dashboard consumer treats the buffered run
+// identically to a freshly enqueued PG run that hasn't emitted any events
+// yet.
+export function buildSyntheticTraceForBufferedRun(run: SyntheticRun) {
+  const rootSpanId = run.spanId ?? "";
+  const cancelled = run.status === "CANCELED";
+  const failed = run.status === "FAILED";
+  // CANCELED and FAILED are terminal; only a still-queued buffered run is
+  // partial. A partial failed span would otherwise render as "executing"
+  // forever in the timeline.
+  const stillQueued = !(cancelled || failed);
+
+  const rootSpan: SpanSummary = {
+    id: rootSpanId,
+    parentId: run.parentSpanId,
+    runId: run.friendlyId,
+    data: {
+      message: run.taskIdentifier ?? "Task",
+      style: { icon: "task", variant: "primary" },
+      events: [],
+      startTime: run.createdAt,
+      duration: 0,
+      isError: failed,
+      isPartial: stillQueued,
+      isCancelled: cancelled,
+      isDebug: false,
+      level: "TRACE",
+    },
+  };
+
+  const tree = createTreeFromFlatItems([rootSpan], rootSpanId);
+  const rootStartMs = tree?.data.startTime.getTime() ?? 0;
+  const totalDuration = Math.max(tree?.data.duration ?? 0, millisecondsToNanoseconds(1));
+
+  const nodes = tree ? flattenTree(tree) : [];
+  const events = nodes.map((node) => {
+    const { data } = node;
+    return {
+      ...node,
+      data: {
+        ...data,
+        timelineEvents: createTimelineSpanEventsFromSpanEvents(data.events, false, rootStartMs),
+        duration: data.isPartial ? null : data.duration,
+        offset: millisecondsToNanoseconds(data.startTime.getTime() - rootStartMs),
+        isRoot: node.id === rootSpanId,
+      },
+    };
+  });
+
+  // Matches RunPresenter's derivation: failed root span -> "failed",
+  // otherwise a terminal (non-partial) span -> "completed", else
+  // "executing". CANCELED is terminal-but-not-error, so "completed".
+  let rootSpanStatus: "executing" | "completed" | "failed" = "executing";
+  if (failed) rootSpanStatus = "failed";
+  else if (cancelled) rootSpanStatus = "completed";
+
+  return {
+    rootSpanStatus,
+    events,
+    duration: totalDuration,
+    rootStartedAt: tree?.data.startTime,
+    startedAt: null,
+    queuedDuration: undefined,
+    overridesBySpanId: undefined,
+    linkedRunIdBySpanId: {} as Record<string, string>,
+  };
+}
diff --git a/apps/webapp/app/v3/mollifierDrainerWorker.server.ts b/apps/webapp/app/v3/mollifierDrainerWorker.server.ts
index 313e9af6719..e571344141d 100644
--- a/apps/webapp/app/v3/mollifierDrainerWorker.server.ts
+++ b/apps/webapp/app/v3/mollifierDrainerWorker.server.ts
@@ -97,8 +97,8 @@ export function initMollifierDrainerWorker(
     // Deterministic misconfig (shutdown-timeout vs GRACEFUL_SHUTDOWN_TIMEOUT,
     // missing buffer client) is a deploy-time mistake the operator must
     // see immediately — rethrow so the process crashes, health checks
-    // fail, and the orchestrator rolls the deploy back. Phase 1 is
-    // monitoring-only and the silent-fallback was tempting, but Phase 2/3
+    // fail, and the orchestrator rolls the deploy back. The drainer is currently
+    // monitoring-only and the silent-fallback was tempting, but later phases
     // make the drainer the source of truth for diverted triggers, where a
     // silently-disabled drainer means data loss. Better to fail loud now
     // than retrofit later.
diff --git a/apps/webapp/app/v3/mollifierStaleSweepWorker.server.ts b/apps/webapp/app/v3/mollifierStaleSweepWorker.server.ts
new file mode 100644
index 00000000000..de05ab24671
--- /dev/null
+++ b/apps/webapp/app/v3/mollifierStaleSweepWorker.server.ts
@@ -0,0 +1,73 @@
+import { env } from "~/env.server";
+import { logger } from "~/services/logger.server";
+import { signalsEmitter } from "~/services/signals.server";
+import {
+  startStaleSweepInterval,
+  type StaleSweepIntervalHandle,
+} from "./mollifier/mollifierStaleSweep.server";
+import { MollifierStaleSweepState } from "./mollifier/mollifierStaleSweepState.server";
+
+declare global {
+  // eslint-disable-next-line no-var
+  var __mollifierStaleSweepRegistered__: boolean | undefined; // true once the sweep has been bootstrapped
+  // eslint-disable-next-line no-var
+  var __mollifierStaleSweepHandle__: StaleSweepIntervalHandle | undefined; // handle of the running sweep interval
+}
+
+/**
+ * Bootstraps the mollifier stale-entry sweep.
+ *
+ * Independent of the drainer — its purpose is to alert when entries are
+ * piling up despite the drainer being supposedly healthy, so it is gated
+ * by its own `TRIGGER_MOLLIFIER_STALE_SWEEP_ENABLED` flag, separately from
+ * `TRIGGER_MOLLIFIER_DRAINER_ENABLED`. The sweep is read-only: it
+ * counts and logs stale entries but does not remove or salvage them.
+ *
+ * The Remix dev server re-evaluates `entry.server.tsx` on every change,
+ * so the registration guard + handle cache make the bootstrap
+ * idempotent across hot reloads.
+ */
+export function initMollifierStaleSweepWorker(): void {
+  if (env.TRIGGER_MOLLIFIER_STALE_SWEEP_ENABLED !== "1") return;
+  if (global.__mollifierStaleSweepRegistered__) return;
+
+  const intervalMs = env.TRIGGER_MOLLIFIER_STALE_SWEEP_INTERVAL_MS;
+  const staleThresholdMs = env.TRIGGER_MOLLIFIER_STALE_SWEEP_THRESHOLD_MS;
+  logger.debug("Initializing mollifier stale-entry sweep", { intervalMs, staleThresholdMs });
+
+  // Construct the sweep's durable-state Redis client using the same
+  // mollifier-Redis credentials as the buffer. Keeping this client
+  // separate from the buffer's own client keeps state ownership clean:
+  // the buffer abstracts queue/entry state, this abstracts sweep state.
+  const state = new MollifierStaleSweepState({
+    redisOptions: {
+      keyPrefix: "",
+      host: env.TRIGGER_MOLLIFIER_REDIS_HOST,
+      port: env.TRIGGER_MOLLIFIER_REDIS_PORT,
+      username: env.TRIGGER_MOLLIFIER_REDIS_USERNAME,
+      password: env.TRIGGER_MOLLIFIER_REDIS_PASSWORD,
+      enableAutoPipelining: true,
+      ...(env.TRIGGER_MOLLIFIER_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} }),
+    },
+  });
+
+  const handle = startStaleSweepInterval(
+    { intervalMs, staleThresholdMs },
+    { state },
+  );
+
+  // `handle.stop` is async (it closes the Redis client) and is fired without
+  // being awaited, so attach a `.catch`: a failed close is logged rather
+  // than surfacing as an unhandled promise rejection during shutdown.
+  const onShutdown = (): void => {
+    void handle.stop().catch((err: unknown) => {
+      logger.error("Mollifier stale-entry sweep failed to stop cleanly", {
+        err: err instanceof Error ? err.message : String(err),
+      });
+    });
+  };
+  signalsEmitter.on("SIGTERM", onShutdown);
+  signalsEmitter.on("SIGINT", onShutdown);
+  global.__mollifierStaleSweepRegistered__ = true;
+  global.__mollifierStaleSweepHandle__ = handle;
+}
diff --git a/apps/webapp/app/v3/services/resetIdempotencyKey.server.ts b/apps/webapp/app/v3/services/resetIdempotencyKey.server.ts
index 95684999303..8273d8c9d97 100644
--- a/apps/webapp/app/v3/services/resetIdempotencyKey.server.ts
+++ b/apps/webapp/app/v3/services/resetIdempotencyKey.server.ts
@@ -1,6 +1,7 @@
 import type { AuthenticatedEnvironment } from "~/services/apiAuth.server";
 import { BaseService, ServiceValidationError } from "./baseService.server";
 import { logger } from "~/services/logger.server";
+import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server";
 
 export class ResetIdempotencyKeyService extends BaseService {
   public async call(
@@ -8,7 +9,7 @@ export class ResetIdempotencyKeyService extends BaseService {
     taskIdentifier: string,
     authenticatedEnv: AuthenticatedEnvironment
   ): Promise<{ id: string }> {
-    const { count } = await this._prisma.taskRun.updateMany({
+    const { count: pgCount } = await this._prisma.taskRun.updateMany({
       where: {
         idempotencyKey,
         taskIdentifier,
@@ -20,7 +21,77 @@ export class ResetIdempotencyKeyService extends BaseService {
       },
     });
 
-    if (count === 0) {
+    // Buffer-side reset: the key may belong to a buffered run that
+    // hasn't materialised yet. The PG updateMany above can't see it.
+    // resetIdempotency clears both the snapshot fields and the Redis
+    // lookup atomically. Returns null when nothing was bound there.
+    const buffer = getMollifierBuffer();
+    let bufferResetFailed = false;
+    const bufferResult = buffer
+      ? await buffer
+          .resetIdempotency({
+            envId: authenticatedEnv.id,
+            taskIdentifier,
+            idempotencyKey,
+          })
+          .catch((err) => {
+            // Don't drop a buffer outage on the floor. We log + flag so
+            // the 404 branch below can distinguish "no record anywhere"
+            // (legitimate not-found) from "PG cleared nothing AND we
+            // couldn't see the buffer" (partial outage — caller should
+            // retry, not be told "doesn't exist").
+            bufferResetFailed = true;
+            logger.error("ResetIdempotencyKeyService: buffer reset failed", {
+              idempotencyKey,
+              taskIdentifier,
+              err: err instanceof Error ? err.message : String(err),
+            });
+            return { clearedRunId: null };
+          })
+      : { clearedRunId: null };
+
+    const totalCount = pgCount + (bufferResult.clearedRunId ? 1 : 0);
+
+    if (pgCount === 0 && bufferResetFailed) {
+      // PG saw nothing AND the buffer is unreachable. We can't truthfully
+      // say "not found" — there may be a buffered run we can't observe.
+      // Surface as 503 so the caller retries instead of being misled.
+      throw new ServiceValidationError(
+        "Unable to verify buffered idempotency state right now; please retry",
+        503
+      );
+    }
+
+    if (totalCount === 0) {
+      // PG↔buffer handoff re-check. Between the initial `pg.updateMany`
+      // and the buffer reset above, a buffered run can materialise into
+      // PG: the drainer's `engine.trigger` writes the row with the
+      // original idempotencyKey, then `buffer.ack` clears the Redis
+      // idempotency lookup (per ack's contract on
+      // `packages/redis-worker/src/mollifier/buffer.ts`). Both surfaces
+      // now report "nothing", but the key still lives on the freshly-
+      // materialised PG row. One more conditional updateMany catches
+      // that row before we 404 the customer. Cost: a single indexed
+      // lookup against the writer when there's nothing to find;
+      // otherwise the exact write the customer asked for (i.e., not
+      // duplicative — without it the reset is silently lost).
+      const { count: handoffPgCount } = await this._prisma.taskRun.updateMany({
+        where: {
+          idempotencyKey,
+          taskIdentifier,
+          runtimeEnvironmentId: authenticatedEnv.id,
+        },
+        data: {
+          idempotencyKey: null,
+          idempotencyKeyExpiresAt: null,
+        },
+      });
+      if (handoffPgCount > 0) {
+        logger.info(
+          `Reset idempotency key via handoff re-check: ${idempotencyKey} for task: ${taskIdentifier} in env: ${authenticatedEnv.id}, affected ${handoffPgCount} run(s)`
+        );
+        return { id: idempotencyKey };
+      }
       throw new ServiceValidationError(
         `No runs found with idempotency key: ${idempotencyKey} and task: ${taskIdentifier}`,
         404
@@ -28,7 +99,7 @@ export class ResetIdempotencyKeyService extends BaseService {
     }
 
     logger.info(
-      `Reset idempotency key: ${idempotencyKey} for task: ${taskIdentifier} in env: ${authenticatedEnv.id}, affected ${count} run(s)`
+      `Reset idempotency key: ${idempotencyKey} for task: ${taskIdentifier} in env: ${authenticatedEnv.id}, affected ${totalCount} run(s) (pg=${pgCount}, buffered=${bufferResult.clearedRunId ? 1 : 0})`
     );
 
     return { id: idempotencyKey };
diff --git a/apps/webapp/app/v3/services/triggerTask.server.ts b/apps/webapp/app/v3/services/triggerTask.server.ts
index 96712c36cc4..7bbaa0dd99b 100644
--- a/apps/webapp/app/v3/services/triggerTask.server.ts
+++ b/apps/webapp/app/v3/services/triggerTask.server.ts
@@ -46,6 +46,14 @@ export class OutOfEntitlementError extends Error {
 export type TriggerTaskServiceResult = {
   run: TaskRun;
   isCached: boolean;
+  // True when the mollifier gate diverted the trigger to the Redis
+  // buffer and `run` is a synthesised record (no PG row exists yet).
+  // The trigger route reads this to skip `saveRequestIdempotency` —
+  // caching the synth runId would mean a lost-response SDK retry hits
+  // a PG-miss in `handleRequestIdempotency` and falls through to a
+  // fresh trigger, producing a duplicate buffer entry for trigger
+  // calls that don't carry a task-level idempotency key.
+  isMollified?: boolean;
 };
 
 export const MAX_ATTEMPTS = 2;
diff --git a/apps/webapp/test/engine/triggerTask.test.ts b/apps/webapp/test/engine/triggerTask.test.ts
index d07909d2907..8613ba429c3 100644
--- a/apps/webapp/test/engine/triggerTask.test.ts
+++ b/apps/webapp/test/engine/triggerTask.test.ts
@@ -68,17 +68,31 @@ class MockTriggerTaskValidator implements TriggerTaskValidator {
   }
 }
 
+// Mirror the production ClickhouseEventRepository.traceEvent shape so
+// callers that read `event.traceContext.traceparent` (e.g. the
+// mollifier branch seeding the snapshot) get the same W3C-formatted
+// value they'd get against a real event repository.
+const MOCK_TRACE_ID = "0123456789abcdef0123456789abcdef";
+const MOCK_SPAN_ID = "fedcba9876543210";
+const MOCK_TRACEPARENT = `00-${MOCK_TRACE_ID}-${MOCK_SPAN_ID}-01`;
+
 class MockTraceEventConcern implements TraceEventConcern {
+  // Records the start time of the most recent traceRun callback entry.
+  // Used by ordering assertions that verify traceRun fires before
+  // downstream side effects (e.g. mollifier buffer writes).
+  public traceRunEnteredAt: number | undefined;
+
   async traceRun<T>(
     request: TriggerTaskRequest,
     parentStore: string | undefined,
     callback: (span: TracedEventSpan, store: string) => Promise<T>
   ): Promise<T> {
+    this.traceRunEnteredAt = Date.now();
     return await callback(
       {
-        traceId: "test",
-        spanId: "test",
-        traceContext: {},
+        traceId: MOCK_TRACE_ID,
+        spanId: MOCK_SPAN_ID,
+        traceContext: { traceparent: MOCK_TRACEPARENT },
         traceparent: undefined,
         setAttribute: () => { },
         failWithError: () => { },
@@ -253,6 +267,76 @@ describe("RunEngineTriggerTaskService", () => {
     await engine.quit();
   });
 
+  // The BatchQueue worker rebuilds body.options from Redis-stored items
+  // (Record<string, unknown>), so the Phase-2 schema coercion doesn't apply
+  // to in-flight items enqueued before the schema fix. The defensive
+  // `typeof === "number"` coercion at the engine.trigger call site is what
+  // prevents these from failing at prisma.taskRun.create with
+  // "Argument concurrencyKey: Expected String or Null, provided Int".
+  containerTest(
+    "coerces a numeric concurrencyKey to a string at the engine.trigger boundary",
+    async ({ prisma, redisOptions }) => {
+      const engine = new RunEngine({
+        prisma,
+        worker: {
+          redis: redisOptions,
+          workers: 1,
+          tasksPerWorker: 10,
+          pollIntervalMs: 100,
+        },
+        queue: { redis: redisOptions },
+        runLock: { redis: redisOptions },
+        machines: {
+          defaultMachine: "small-1x",
+          machines: {
+            "small-1x": {
+              name: "small-1x" as const,
+              cpu: 0.5,
+              memory: 0.5,
+              centsPerMs: 0.0001,
+            },
+          },
+          baseCostInCents: 0.0005,
+        },
+        tracer: trace.getTracer("test", "0.0.0"),
+      });
+
+      const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+      const taskIdentifier = "test-task";
+      await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier);
+
+      const triggerTaskService = new RunEngineTriggerTaskService({
+        engine,
+        prisma,
+        payloadProcessor: new MockPayloadProcessor(),
+        queueConcern: new DefaultQueueManager(prisma, engine),
+        idempotencyKeyConcern: new IdempotencyKeyConcern(
+          prisma,
+          engine,
+          new MockTraceEventConcern()
+        ),
+        validator: new MockTriggerTaskValidator(),
+        traceEventConcern: new MockTraceEventConcern(),
+        tracer: trace.getTracer("test", "0.0.0"),
+        metadataMaximumSize: 1024 * 1024 * 1,
+      });
+
+      const result = await triggerTaskService.call({
+        taskId: taskIdentifier,
+        environment: authenticatedEnvironment,
+        // Cast through `any` to simulate the in-flight Redis batch-item shape
+        // (Record<string, unknown>) that bypasses the BatchItemNDJSON schema.
+        body: { payload: { userId: 51262 }, options: { concurrencyKey: 51262 as any } },
+      });
+
+      expect(result).toBeDefined();
+      const run = await prisma.taskRun.findUnique({ where: { id: result!.run.id } });
+      expect(run?.concurrencyKey).toBe("51262");
+
+      await engine.quit();
+    }
+  );
+
   containerTest("should handle idempotency keys correctly", async ({ prisma, redisOptions }) => {
     const engine = new RunEngine({
       prisma,
@@ -1269,8 +1353,17 @@ describe("RunEngineTriggerTaskService", () => {
   );
 
   containerTest(
-    "mollifier · mollify action triggers dual-write (buffer.accept + engine.trigger)",
+    "mollifier · mollify action writes to buffer and returns synthetic result (no Postgres row)",
     async ({ prisma, redisOptions }) => {
+      // When the gate decides mollify, the call site
+      // invokes `mollifyTrigger` which writes the engine.trigger snapshot
+      // to the buffer and returns a synthesised `MollifySyntheticResult`
+      // (run.friendlyId + notice + isCached:false). `engine.trigger` is
+      // NEVER invoked on this path — the run materialises in Postgres
+      // later, when the drainer replays the snapshot. The replay is
+      // covered by `mollifierDrainerHandler.test.ts`; this test pins the
+      // call-site integration: synthetic result + buffer write + no
+      // Postgres side effect.
       const engine = new RunEngine({
         prisma,
         worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 },
@@ -1288,7 +1381,24 @@ describe("RunEngineTriggerTaskService", () => {
       const taskIdentifier = "test-task";
       await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier);
 
-      const buffer = new CapturingMollifierBuffer();
+      // Buffer override records the time of the accept call so we can
+      // assert that traceRun was entered no later than the buffer was
+      // touched (Date.now() is ms-resolution, hence `<=`). If a future
+      // change re-introduces the "skip traceRun on mollify" shortcut,
+      // traceRunEnteredAt stays undefined and the assertions below fail.
+      class TimestampedBuffer extends CapturingMollifierBuffer {
+        public acceptedAt: number | undefined;
+        override async accept(input: {
+          runId: string;
+          envId: string;
+          orgId: string;
+          payload: string;
+        }) {
+          this.acceptedAt = Date.now();
+          return await super.accept(input);
+        }
+      }
+      const buffer = new TimestampedBuffer();
       const trippedDecision = {
         divert: true as const,
         reason: "per_env_rate" as const,
@@ -1297,6 +1407,7 @@ describe("RunEngineTriggerTaskService", () => {
         windowMs: 200,
         holdMs: 500,
       };
+      const traceConcern = new MockTraceEventConcern();
 
       const triggerTaskService = new RunEngineTriggerTaskService({
         engine,
@@ -1305,7 +1416,7 @@ describe("RunEngineTriggerTaskService", () => {
         queueConcern: new DefaultQueueManager(prisma, engine),
         idempotencyKeyConcern: new IdempotencyKeyConcern(prisma, engine, new MockTraceEventConcern()),
         validator: new MockTriggerTaskValidator(),
-        traceEventConcern: new MockTraceEventConcern(),
+        traceEventConcern: traceConcern,
         tracer: trace.getTracer("test", "0.0.0"),
         metadataMaximumSize: 1024 * 1024,
         evaluateGate: async () => ({ action: "mollify", decision: trippedDecision }),
@@ -1319,25 +1430,93 @@ describe("RunEngineTriggerTaskService", () => {
         body: { payload: { hello: "world" } },
       });
 
-      // engine.trigger ran — Postgres has the run
+      // Pre-mollifier span creation: traceRun must run BEFORE the buffer
+      // is touched. Customer-visible effect — the run span lands in
+      // ClickHouse from the moment the trigger returns, even when the
+      // drainer is offline, so buffered runs are visible in the trace
+      // view immediately rather than only after drain.
+      expect(traceConcern.traceRunEnteredAt).toBeDefined();
+      expect(buffer.acceptedAt).toBeDefined();
+      expect(traceConcern.traceRunEnteredAt!).toBeLessThanOrEqual(buffer.acceptedAt!);
+
+      // Synthetic result is returned with the `mollifier.queued` notice
+      // (the call-site casts the synthetic shape to `TriggerTaskServiceResult`;
+      // at runtime the `notice` and `isCached: false` fields are present
+      // and read by the api.v1.tasks.$taskId.trigger.ts route handler).
       expect(result).toBeDefined();
       expect(result?.run.friendlyId).toBeDefined();
-      const pgRun = await prisma.taskRun.findFirst({ where: { id: result!.run.id } });
-      expect(pgRun).not.toBeNull();
-      expect(pgRun!.friendlyId).toBe(result!.run.friendlyId);
-
-      // buffer.accept ran — Redis has the audit copy under the same friendlyId
+      const synthetic = result as unknown as {
+        run: { friendlyId: string };
+        isCached: false;
+        notice: { code: string; message: string; docs: string };
+      };
+      expect(synthetic.isCached).toBe(false);
+      expect(synthetic.notice.code).toBe("mollifier.queued");
+      expect(synthetic.notice.message).toBeTypeOf("string");
+      expect(synthetic.notice.docs).toBeTypeOf("string");
+
+      // The mollify branch must flag `isMollified: true` on the result so
+      // the trigger route can skip `saveRequestIdempotency`. Caching the
+      // synthetic runId in the request-idempotency table would mean a
+      // lost-response SDK retry (same `x-trigger-request-idempotency-key`
+      // header) hits a PG miss in `handleRequestIdempotency` and falls
+      // through to a fresh trigger — producing a duplicate buffer entry
+      // for trigger calls without a task-level idempotency key. The
+      // bounded behaviour (accept retry-as-fresh-trigger during the
+      // buffer window) is the deliberate choice; a stale-cache lookup
+      // returning null is not.
+      expect(result?.isMollified).toBe(true);
+
+      // buffer.accept ran — Redis has the canonical engine.trigger snapshot
+      // under the synthesised friendlyId. The drainer will read this and
+      // replay it through engine.trigger to materialise the run.
       expect(buffer.accepted).toHaveLength(1);
       expect(buffer.accepted[0]!.runId).toBe(result!.run.friendlyId);
       expect(buffer.accepted[0]!.envId).toBe(authenticatedEnvironment.id);
       expect(buffer.accepted[0]!.orgId).toBe(authenticatedEnvironment.organizationId);
+      // Payload is a JSON-serialised MollifierSnapshot (the engine.trigger
+      // input). Schema is internal to the engine, so we only assert that
+      // it parses and carries the trace fields checked below — anything
+      // more specific would couple this test to engine-layer fields.
+      const snapshot = JSON.parse(buffer.accepted[0]!.payload) as {
+        traceId?: string;
+        spanId?: string;
+        traceContext?: { traceparent?: string };
+      };
 
-      // payload is the canonical replay shape
-      const payload = JSON.parse(buffer.accepted[0]!.payload);
-      expect(payload.runFriendlyId).toBe(result!.run.friendlyId);
-      expect(payload.taskId).toBe(taskIdentifier);
-      expect(payload.envId).toBe(authenticatedEnvironment.id);
-      expect(payload.body).toEqual({ payload: { hello: "world" } });
+      // Regression guard for the dashboard trace-tree bug: the mollifier
+      // snapshot MUST carry a W3C `traceparent` in `traceContext`,
+      // seeded from the same span traceRun opened. Without it, the
+      // drainer replays through engine.trigger with empty traceContext
+      // and every downstream `recordRunDebugLog`
+      // (QUEUED/EXECUTING/FINISHED/run:notify…) gets a fresh traceId +
+      // null parentId — the run-detail page can only show the root
+      // span. Both the mollify and pass-through paths now flow through
+      // `traceEventConcern.traceRun`; this assertion pins the
+      // seeding-from-the-run-span contract.
+      expect(snapshot.traceContext?.traceparent).toMatch(
+        /^00-[0-9a-f]{32}-[0-9a-f]{16}-[0-9a-f]{2}$/
+      );
+      expect(snapshot.traceContext!.traceparent).toContain(snapshot.traceId);
+      expect(snapshot.traceContext!.traceparent).toContain(snapshot.spanId);
+      // The snapshot inherits the *run span's* traceId/spanId (from the
+      // event handed in by traceRun), not a separately-generated OTel
+      // span. This is what lets the drainer's `mollifier.drained` span
+      // and downstream engine.trigger materialisation parent on the
+      // same ClickHouse trace the customer sees from the moment trigger
+      // returns.
+      expect(snapshot.traceId).toBe(MOCK_TRACE_ID);
+      expect(snapshot.spanId).toBe(MOCK_SPAN_ID);
+
+      // Postgres has NOT been written: engine.trigger was never called on
+      // the mollify path. The run materialises only when the drainer
+      // replays the snapshot. Regression intent: if a future change makes
+      // the mollify branch fall through to engine.trigger (re-introducing
+      // phase-1 dual-write), this assertion fails loudly.
+      const pgRun = await prisma.taskRun.findFirst({
+        where: { friendlyId: result!.run.friendlyId },
+      });
+      expect(pgRun).toBeNull();
 
       await engine.quit();
     },
@@ -1393,108 +1572,12 @@ describe("RunEngineTriggerTaskService", () => {
       // getMollifierBuffer must not be called either — the call site short-circuits
       // before touching the singleton when the gate says pass_through.
       expect(getBufferSpy).not.toHaveBeenCalled();
-
-      await engine.quit();
-    },
-  );
-
-  containerTest(
-    "mollifier · engine.trigger throwing AFTER buffer.accept leaves an orphan entry (documented behaviour)",
-    async ({ prisma, redisOptions }) => {
-      // SCENARIO: dual-write where buffer.accept succeeds but engine.trigger
-      // throws. The throw propagates to the caller (correct: customer sees
-      // the same 4xx as today), and the buffer entry remains as an "orphan"
-      // — Phase 1's no-op drainer will pop+ack it on its next poll, so the
-      // orphan is bounded (~drainer pollIntervalMs) but observable in the
-      // audit trail (mollifier.buffered with no matching TaskRun).
-      //
-      // Why engine.trigger can throw post-buffer:
-      //   - RunDuplicateIdempotencyKeyError (Prisma P2002 on idempotencyKey):
-      //     a concurrent non-mollified trigger with the same idempotencyKey
-      //     wins the DB UNIQUE constraint between IdempotencyKeyConcern's
-      //     pre-check and engine.trigger's INSERT.
-      //   - RunOneTimeUseTokenError (Prisma P2002 on oneTimeUseToken).
-      //   - Transient Prisma errors (FK constraint, connection drop, etc.).
-      //
-      // Why we don't "fix" this race in Phase 1:
-      //   The customer correctly gets the error. State eventually converges
-      //   (drainer pops the orphan). The audit-trail explicitly surfaces
-      //   "buffered without TaskRun" entries to operators. A real fix is
-      //   Phase 2's responsibility once the buffer becomes the primary write
-      //   — at that point we add the mollifier-specific idempotency index.
-      //
-      // This test pins the current ordering: buffer.accept fires synchronously
-      // BEFORE engine.trigger, and engine.trigger failure does NOT roll back
-      // the buffer write. Any future change that reverses the order or adds
-      // a silent rollback will fail this assertion and force a design
-      // decision rather than a silent behaviour change.
-
-      const engine = new RunEngine({
-        prisma,
-        worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 },
-        queue: { redis: redisOptions },
-        runLock: { redis: redisOptions },
-        machines: {
-          defaultMachine: "small-1x",
-          machines: { "small-1x": { name: "small-1x" as const, cpu: 0.5, memory: 0.5, centsPerMs: 0.0001 } },
-          baseCostInCents: 0.0005,
-        },
-        tracer: trace.getTracer("test", "0.0.0"),
-      });
-
-      const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
-      const taskIdentifier = "test-task";
-      await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier);
-
-      const buffer = new CapturingMollifierBuffer();
-
-      // Force engine.trigger to throw on this single call. We spy AFTER
-      // setupBackgroundWorker so the worker setup still uses the real
-      // engine.trigger (which has its own engine.trigger-ish calls for
-      // worker bootstrap — though in practice setupBackgroundWorker doesn't
-      // call trigger).
-      const simulatedFailure = new Error("simulated engine.trigger failure post-buffer");
-      vi.spyOn(engine, "trigger").mockRejectedValueOnce(simulatedFailure);
-
-      const triggerTaskService = new RunEngineTriggerTaskService({
-        engine,
-        prisma,
-        payloadProcessor: new MockPayloadProcessor(),
-        queueConcern: new DefaultQueueManager(prisma, engine),
-        idempotencyKeyConcern: new IdempotencyKeyConcern(prisma, engine, new MockTraceEventConcern()),
-        validator: new MockTriggerTaskValidator(),
-        traceEventConcern: new MockTraceEventConcern(),
-        tracer: trace.getTracer("test", "0.0.0"),
-        metadataMaximumSize: 1024 * 1024,
-        evaluateGate: async () => ({
-          action: "mollify",
-          decision: {
-            divert: true,
-            reason: "per_env_rate",
-            count: 150,
-            threshold: 100,
-            windowMs: 200,
-            holdMs: 500,
-          },
-        }),
-        getMollifierBuffer: () => buffer as never,
-        isMollifierGloballyEnabled: () => true,
-      });
-
-      await expect(
-        triggerTaskService.call({
-          taskId: taskIdentifier,
-          environment: authenticatedEnvironment,
-          body: { payload: { test: "x" } },
-        }),
-      ).rejects.toThrow(/simulated engine.trigger failure post-buffer/);
-
-      // The buffer write happened BEFORE engine.trigger threw. The orphan
-      // remains; the audit-trail will surface it (mollifier.buffered with
-      // no matching TaskRun row). Phase 1's no-op drainer cleans it up.
-      expect(buffer.accepted).toHaveLength(1);
-      const orphanPayload = JSON.parse(buffer.accepted[0]!.payload);
-      expect(orphanPayload.taskId).toBe(taskIdentifier);
+      // Pass-through must NOT set `isMollified` — `result.run` is a real
+      // PG row, and the trigger route's `saveRequestIdempotency` is
+      // safe to call. Setting the flag here would silently skip the
+      // request-idempotency cache for every non-mollified trigger on a
+      // mollifier-enabled org, breaking lost-response retry dedup.
+      expect(result?.isMollified).toBeFalsy();
 
       await engine.quit();
     },
@@ -1607,143 +1690,6 @@ describe("RunEngineTriggerTaskService", () => {
     },
   );
 
-  containerTest(
-    "mollifier · debounce match produces an orphan buffer entry (documented behaviour)",
-    async ({ prisma, redisOptions }) => {
-      // SCENARIO: a trigger with a debounce key arrives while a matching
-      // debounced run already exists. `debounceSystem.handleDebounce` runs
-      // INSIDE `engine.trigger` (line ~514 of run-engine/src/engine/index.ts),
-      // AFTER buffer.accept has already written the new friendlyId. The
-      // service correctly returns the existing run id to the customer, but
-      // the buffer is left with an orphan entry for the new friendlyId.
-      //
-      // Why this is acceptable in Phase 1:
-      //   - Customer-facing behaviour is unchanged from today: they receive
-      //     the existing run id, same as the non-mollified path.
-      //   - The orphan is bounded — the drainer's no-op-ack handler pops
-      //     and acks it on its next poll.
-      //   - The audit-trail surfaces it: a `mollifier.buffered` log line
-      //     with `runId` that has no matching TaskRun in Postgres.
-      //
-      // Why Phase 2 cares:
-      //   - When the buffer becomes the primary write path, debounce can
-      //     no longer be allowed to run AFTER buffer.accept. The drainer's
-      //     engine.trigger replay would observe "existing" and skip the
-      //     persist — the customer's synthesised 200 (with the new
-      //     friendlyId) would never get a TaskRun, and the audit-trail
-      //     divergence becomes a real data-loss bug.
-      //   - Phase 2 must lift `handleDebounce` into the call site BEFORE
-      //     buffer.accept:
-      //       1. handleDebounce → if existing, return existing run; do NOT
-      //          touch the buffer.
-      //       2. Otherwise, accept with `claimId` threaded into the
-      //          canonical payload so the drainer's replay can
-      //          `registerDebouncedRun` after persisting.
-      //
-      // This test pins the current ordering. A future change that "fixes"
-      // it by lifting handleDebounce upfront will fail the orphan
-      // assertion below and force an explicit choice (update the test,
-      // remove this scenario, or stage the lift behind a flag).
-
-      const engine = new RunEngine({
-        prisma,
-        worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 },
-        queue: { redis: redisOptions },
-        runLock: { redis: redisOptions },
-        machines: {
-          defaultMachine: "small-1x",
-          machines: { "small-1x": { name: "small-1x" as const, cpu: 0.5, memory: 0.5, centsPerMs: 0.0001 } },
-          baseCostInCents: 0.0005,
-        },
-        tracer: trace.getTracer("test", "0.0.0"),
-      });
-
-      const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
-      const taskIdentifier = "test-task";
-      await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier);
-
-      const idempotencyKeyConcern = new IdempotencyKeyConcern(
-        prisma,
-        engine,
-        new MockTraceEventConcern(),
-      );
-
-      // Setup: trigger with debounce — creates the existing run + Redis claim.
-      const baseline = new RunEngineTriggerTaskService({
-        engine,
-        prisma,
-        payloadProcessor: new MockPayloadProcessor(),
-        queueConcern: new DefaultQueueManager(prisma, engine),
-        idempotencyKeyConcern,
-        validator: new MockTriggerTaskValidator(),
-        traceEventConcern: new MockTraceEventConcern(),
-        tracer: trace.getTracer("test", "0.0.0"),
-        metadataMaximumSize: 1024 * 1024,
-      });
-      const first = await baseline.call({
-        taskId: taskIdentifier,
-        environment: authenticatedEnvironment,
-        body: {
-          payload: { test: "x" },
-          options: { debounce: { key: "regression-debounce-6", delay: "30s" } },
-        },
-      });
-      expect(first?.run.friendlyId).toBeDefined();
-
-      // Action: same debounce key, mollify-stub gate.
-      const buffer = new CapturingMollifierBuffer();
-      const mollifierService = new RunEngineTriggerTaskService({
-        engine,
-        prisma,
-        payloadProcessor: new MockPayloadProcessor(),
-        queueConcern: new DefaultQueueManager(prisma, engine),
-        idempotencyKeyConcern,
-        validator: new MockTriggerTaskValidator(),
-        traceEventConcern: new MockTraceEventConcern(),
-        tracer: trace.getTracer("test", "0.0.0"),
-        metadataMaximumSize: 1024 * 1024,
-        evaluateGate: async () => ({
-          action: "mollify",
-          decision: {
-            divert: true,
-            reason: "per_env_rate",
-            count: 150,
-            threshold: 100,
-            windowMs: 200,
-            holdMs: 500,
-          },
-        }),
-        getMollifierBuffer: () => buffer as never,
-        isMollifierGloballyEnabled: () => true,
-      });
-
-      const debounced = await mollifierService.call({
-        taskId: taskIdentifier,
-        environment: authenticatedEnvironment,
-        body: {
-          payload: { test: "x" },
-          options: { debounce: { key: "regression-debounce-6", delay: "30s" } },
-        },
-      });
-
-      // Customer-facing behaviour: the existing run is returned (correct).
-      expect(debounced).toBeDefined();
-      expect(debounced?.run.friendlyId).toBe(first?.run.friendlyId);
-
-      // Orphan: buffer.accept fired with the new friendlyId we generated
-      // upfront, and that friendlyId has no matching TaskRun in Postgres
-      // because engine.trigger returned the existing run via debounce.
-      expect(buffer.accepted).toHaveLength(1);
-      expect(buffer.accepted[0]!.runId).not.toBe(first?.run.friendlyId);
-      const orphanFriendlyId = buffer.accepted[0]!.runId;
-      const orphanRow = await prisma.taskRun.findFirst({
-        where: { friendlyId: orphanFriendlyId },
-      });
-      expect(orphanRow).toBeNull();
-
-      await engine.quit();
-    },
-  );
 });
 
 describe("DefaultQueueManager task metadata cache", () => {
diff --git a/apps/webapp/test/metadataRouteOperationsLogging.test.ts b/apps/webapp/test/metadataRouteOperationsLogging.test.ts
new file mode 100644
index 00000000000..ab96c9b9b23
--- /dev/null
+++ b/apps/webapp/test/metadataRouteOperationsLogging.test.ts
@@ -0,0 +1,132 @@
+import { describe, expect, it, vi } from "vitest";
+
+// `vi.mock` factories are hoisted above regular top-level `const`s, so
+// any cross-references between the spy/mock fns and the factories have
+// to live inside `vi.hoisted`. See `mollifierDrainerHandler.test.ts`
+// for the same pattern.
+const { warnSpy, applyMetadataMutationToBufferedRunMock } = vi.hoisted(() => ({
+  warnSpy: vi.fn(),
+  applyMetadataMutationToBufferedRunMock: vi.fn(),
+}));
+
+// The route module's import graph (createActionApiRoute, the env, the
+// services singleton) is heavier than the helper actually needs. Stub
+// the leaf modules so only the helper under test executes; the route's
+// top-level `createActionApiRoute(...)` call runs against the stubbed
+// builder and never touches platform.v3.server / prisma.
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+vi.mock("~/env.server", () => ({
+  env: { TASK_RUN_METADATA_MAXIMUM_SIZE: 256 * 1024 },
+}));
+vi.mock("~/services/routeBuilders/apiBuilder.server", () => ({
+  createActionApiRoute: () => ({ action: vi.fn() }),
+}));
+vi.mock("~/services/apiAuth.server", () => ({
+  authenticateApiRequest: vi.fn(),
+}));
+vi.mock("~/v3/services/common.server", () => ({
+  ServiceValidationError: class extends Error {
+    constructor(public override message: string, public status?: number) {
+      super(message);
+    }
+  },
+}));
+vi.mock("~/services/metadata/updateMetadataInstance.server", () => ({
+  updateMetadataService: { call: vi.fn(async () => undefined) },
+}));
+vi.mock("~/v3/mollifier/applyMetadataMutation.server", () => ({
+  applyMetadataMutationToBufferedRun: applyMetadataMutationToBufferedRunMock,
+}));
+vi.mock("~/v3/mollifier/readFallback.server", () => ({
+  findRunByIdWithMollifierFallback: vi.fn(),
+}));
+vi.mock("~/services/logger.server", () => ({
+  logger: {
+    warn: warnSpy,
+    info: vi.fn(),
+    error: vi.fn(),
+    debug: vi.fn(),
+  },
+}));
+
+import { routeOperationsToRun } from "~/routes/api.v1.runs.$runId.metadata";
+import type { AuthenticatedEnvironment } from "~/services/apiAuth.server";
+
+const env = {
+  id: "env_a",
+  organizationId: "org_1",
+} as unknown as AuthenticatedEnvironment;
+
+const opsFixture = [{ type: "set", key: "k", value: "v" }] as Parameters<
+  typeof routeOperationsToRun
+>[1];
+
+describe("routeOperationsToRun — non-throw buffer outcome logging", () => {
+  // Each non-success outcome `applyMetadataMutationToBufferedRun` can
+  // return (`not_found`, `busy`, `version_exhausted`, `metadata_too_large`)
+  // must produce a warn log so ops can trace silent drops. Without this
+  // branch the parent/root operation would disappear with no record —
+  // `tryCatch` only catches throws, and the outcome object was
+  // previously ignored.
+  for (const kind of ["not_found", "busy", "version_exhausted", "metadata_too_large"] as const) {
+    it(`warn-logs when buffer outcome is { kind: "${kind}" }`, async () => {
+      warnSpy.mockClear();
+      applyMetadataMutationToBufferedRunMock.mockResolvedValueOnce({ kind });
+
+      await routeOperationsToRun("run_buffered_1", opsFixture, env);
+
+      expect(warnSpy).toHaveBeenCalledWith(
+        "metadata route: parent/root buffer op did not apply",
+        expect.objectContaining({ targetRunId: "run_buffered_1", kind }),
+      );
+    });
+  }
+
+  it("does NOT warn on the happy path (kind: 'applied')", async () => {
+    warnSpy.mockClear();
+    applyMetadataMutationToBufferedRunMock.mockResolvedValueOnce({
+      kind: "applied",
+      newMetadata: { k: "v" },
+      parentTaskRunFriendlyId: undefined,
+      rootTaskRunFriendlyId: undefined,
+    });
+
+    await routeOperationsToRun("run_buffered_1", opsFixture, env);
+
+    expect(warnSpy).not.toHaveBeenCalledWith(
+      "metadata route: parent/root buffer op did not apply",
+      expect.anything(),
+    );
+  });
+
+  it("warn-logs once when the helper throws (the pre-existing throw branch keeps working)", async () => {
+    warnSpy.mockClear();
+    applyMetadataMutationToBufferedRunMock.mockRejectedValueOnce(new Error("ECONNRESET"));
+
+    await routeOperationsToRun("run_buffered_1", opsFixture, env);
+
+    // Pre-existing branch — the catch logs `buffer fallback for parent/root
+    // op failed`. The new non-throw branch must NOT also fire (we return
+    // early on bufferError).
+    expect(warnSpy).toHaveBeenCalledWith(
+      "metadata route: buffer fallback for parent/root op failed",
+      expect.objectContaining({ targetRunId: "run_buffered_1" }),
+    );
+    expect(warnSpy).not.toHaveBeenCalledWith(
+      "metadata route: parent/root buffer op did not apply",
+      expect.anything(),
+    );
+  });
+
+  it("skips both PG and buffer when targetRunId is missing or operations is empty", async () => {
+    warnSpy.mockClear();
+    applyMetadataMutationToBufferedRunMock.mockClear();
+
+    await routeOperationsToRun(undefined, opsFixture, env);
+    await routeOperationsToRun("run_x", undefined, env);
+    await routeOperationsToRun("run_x", [], env);
+
+    expect(applyMetadataMutationToBufferedRunMock).not.toHaveBeenCalled();
+    expect(warnSpy).not.toHaveBeenCalled();
+  });
+});
diff --git a/apps/webapp/test/mollifierApplyMetadataMutation.test.ts b/apps/webapp/test/mollifierApplyMetadataMutation.test.ts
new file mode 100644
index 00000000000..5995f6969f3
--- /dev/null
+++ b/apps/webapp/test/mollifierApplyMetadataMutation.test.ts
@@ -0,0 +1,352 @@
+import { describe, expect, it, vi } from "vitest";
+
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+
+import { applyMetadataMutationToBufferedRun } from "~/v3/mollifier/applyMetadataMutation.server";
+import type { BufferEntry, MollifierBuffer, CasSetMetadataResult } from "@trigger.dev/redis-worker";
+import { RunId } from "@trigger.dev/core/v3/isomorphic";
+
+// Regression for a CAS retry-exhaustion bug: the default `maxRetries`
+// was 3, matching the PG-side service, but that exhausts fast when N
+// external API writers race the same buffered run's metadata. Bumped
+// to 12 + jittered backoff. These tests simulate version_conflict
+// races and assert (a) every delta lands and (b) the retry budget is
+// sized for realistic concurrency.
+
+const NOW = new Date("2026-05-21T10:00:00Z");
+
+type BufferStub = {
+  buffer: MollifierBuffer;
+  state: {
+    version: number;
+    metadata: Record<string, unknown>;
+    pendingConflictsForNextN: number;
+  };
+};
+
+// Build a stub MollifierBuffer that mimics the buffer's CAS behaviour
+// in memory. While `pendingConflictsForNextN` is positive, each
+// casSetMetadata call bumps the version and reports version_conflict;
+// afterwards a call applies only if `expectedVersion` matches.
+function makeBufferStub(initialPayload: Record<string, unknown> = {}): BufferStub {
+  const state = {
+    version: 0,
+    metadata: initialPayload.metadata
+      ? (JSON.parse(initialPayload.metadata as string) as Record<string, unknown>)
+      : {},
+    pendingConflictsForNextN: 0,
+  };
+  const entryTemplate: Omit<BufferEntry, "payload"> = {
+    runId: "run_1",
+    envId: "env_a",
+    orgId: "org_1",
+    status: "QUEUED",
+    attempts: 0,
+    createdAt: NOW,
+    createdAtMicros: 1747044000000000,
+    materialised: false,
+    idempotencyLookupKey: "",
+    metadataVersion: 0,
+  };
+
+  const buffer: MollifierBuffer = {
+    getEntry: vi.fn(async (): Promise<BufferEntry> => ({
+      ...entryTemplate,
+      metadataVersion: state.version,
+      payload: JSON.stringify({ ...initialPayload, metadata: JSON.stringify(state.metadata) }),
+    })),
+    casSetMetadata: vi.fn(
+      async (input: {
+        runId: string;
+        expectedVersion: number;
+        newMetadata: string;
+        newMetadataType: string;
+      }): Promise<CasSetMetadataResult> => {
+        // Forced conflict: consume one injected conflict before any version check.
+        if (state.pendingConflictsForNextN > 0) {
+          state.pendingConflictsForNextN -= 1;
+          // Advance the version first, as though a competing writer had just landed.
+          state.version += 1;
+          return { kind: "version_conflict", currentVersion: state.version };
+        }
+        if (input.expectedVersion !== state.version) {
+          return { kind: "version_conflict", currentVersion: state.version };
+        }
+        state.metadata = JSON.parse(input.newMetadata) as Record<string, unknown>;
+        state.version += 1;
+        return { kind: "applied", newVersion: state.version };
+      },
+    ),
+  } as unknown as MollifierBuffer;
+
+  return { buffer, state };
+}
+
+describe("applyMetadataMutationToBufferedRun — retry behaviour", () => {
+  it("succeeds when CAS lands on the first try (no contention)", async () => {
+    // No conflicts are injected, so a single CAS write should land.
+    const stub = makeBufferStub();
+    const outcome = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: { metadata: { counter: 1 } },
+      buffer: stub.buffer,
+    });
+    expect(outcome.kind).toBe("applied");
+    expect([stub.state.metadata, stub.state.version]).toEqual([{ counter: 1 }, 1]);
+  });
+
+  it("succeeds after 5 version conflicts (default budget = 12)", async () => {
+    const stub = makeBufferStub();
+    // Force the first five CAS attempts to report a conflict.
+    stub.state.pendingConflictsForNextN = 5;
+    const outcome = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: { operations: [{ type: "increment", key: "counter", value: 1 }] },
+      buffer: stub.buffer,
+    });
+    expect(outcome.kind).toBe("applied");
+    if (outcome.kind !== "applied") return;
+    expect(outcome.newMetadata.counter).toBe(1);
+  });
+
+  it("succeeds after 11 version conflicts (one under the default budget)", async () => {
+    // A single stub owns both the buffer and its mutable state, so the
+    // injected conflicts are consumed by the same buffer the helper is
+    // handed. The first 11 casSetMetadata calls report version_conflict
+    // (each bumping the version as if a competing writer had landed);
+    // the next attempt must apply.
+    const stub = makeBufferStub();
+    stub.state.pendingConflictsForNextN = 11;
+    const result = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: { operations: [{ type: "increment", key: "counter", value: 1 }] },
+      buffer: stub.buffer,
+    });
+    expect(result.kind).toBe("applied");
+    if (result.kind === "applied") {
+      // The increment is applied exactly once despite the retries.
+      expect(result.newMetadata.counter).toBe(1);
+    }
+    // Every injected conflict was consumed by this stub.
+    expect(stub.state.pendingConflictsForNextN).toBe(0);
+  });
+
+  it("returns version_exhausted after retries are spent", async () => {
+    const { buffer, state } = makeBufferStub();
+    // Inject far more conflicts (99) than the explicit retry budget of
+    // 12 allows, so every attempt loses and the helper must give up.
+    state.pendingConflictsForNextN = 99;
+    const outcome = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: { operations: [{ type: "increment", key: "counter", value: 1 }] },
+      buffer,
+      maxRetries: 12,
+    });
+    expect(outcome.kind).toBe("version_exhausted");
+  });
+
+  it("regression: 3 retries are NOT enough under 50-way concurrency simulation", async () => {
+    // Contention is modelled as 8 injected conflicts, not 50 real
+    // writers. With the OLD budget (maxRetries: 3) the helper runs out
+    // of attempts first, confirming that budget was too small.
+    const stub = makeBufferStub();
+    stub.state.pendingConflictsForNextN = 8;
+    const result = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: { operations: [{ type: "increment", key: "counter", value: 1 }] },
+      buffer: stub.buffer,
+      maxRetries: 3,
+    });
+    expect(result.kind).toBe("version_exhausted");
+  });
+
+  it("matches PG semantics when body has both metadata + operations: ops on top of EXISTING, body.metadata ignored", async () => {
+    // When `operations` is an array, the PG service
+    // (UpdateMetadataService.#updateRunMetadata) applies the ops to the
+    // stored metadata and disregards `body.metadata`. The buffer helper
+    // previously replaced with `body.metadata` and then applied the ops,
+    // so results diverged across the buffered/materialised boundary.
+    // This test pins the buffer helper to the PG behaviour.
+    const { buffer } = makeBufferStub({ metadata: JSON.stringify({ a: 1 }) });
+    const outcome = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: {
+        // `operations` is present too, so this replacement must be ignored.
+        metadata: { b: 2 },
+        operations: [{ type: "set", key: "c", value: 3 }],
+      },
+      buffer,
+    });
+    expect(outcome.kind).toBe("applied");
+    if (outcome.kind === "applied") {
+      // PG would produce {a:1, c:3}; previously the buffer produced {b:2, c:3}.
+      expect(outcome.newMetadata).toEqual({ a: 1, c: 3 });
+      expect(outcome.newMetadata).not.toHaveProperty("b");
+    }
+  });
+
+  it("returns metadata_too_large when the resulting payload exceeds maximumSize (mirrors PG 413)", async () => {
+    // The PG-side `UpdateMetadataService` enforces
+    // TASK_RUN_METADATA_MAXIMUM_SIZE (default 256KB) through
+    // `handleMetadataPacket`, throwing `MetadataTooLargeError` (413) on
+    // overflow. The buffer helper applies the same cap, and the
+    // rejection has to happen BEFORE any casSetMetadata call.
+    const { buffer, state } = makeBufferStub();
+    const oversized = "x".repeat(2048); // 2 KB string value
+    const outcome = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024, // 1 KB cap — strictly less than the payload
+      body: { metadata: { big: oversized } },
+      buffer,
+    });
+    expect(outcome.kind).toBe("metadata_too_large");
+    if (outcome.kind === "metadata_too_large") {
+      expect(outcome.maximumSize).toBe(1024);
+      expect(outcome.observedSize).toBeGreaterThan(1024);
+    }
+    // No CAS write should have been attempted.
+    expect(buffer.casSetMetadata).not.toHaveBeenCalled();
+    expect(state.version).toBe(0);
+  });
+
+  it("returns not_found when the buffered entry belongs to a different env (cross-env auth gate)", async () => {
+    // The request is a normal apply except that the caller's
+    // environmentId differs from the entry's envId. The helper has to
+    // refuse with not_found (so existence isn't leaked) and must NOT
+    // reach casSetMetadata.
+    const { buffer, state } = makeBufferStub();
+    const outcome = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_OTHER",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: { metadata: { counter: 1 } },
+      buffer,
+    });
+    expect(outcome.kind).toBe("not_found");
+    expect(buffer.casSetMetadata).not.toHaveBeenCalled();
+    expect(state.version).toBe(0);
+  });
+
+  it("returns not_found when the buffered entry belongs to a different org (cross-org auth gate)", async () => {
+    const { buffer } = makeBufferStub();
+    const outcome = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_OTHER",
+      maximumSize: 1024 * 1024,
+      body: { metadata: { counter: 1 } },
+      buffer,
+    });
+    expect(outcome.kind).toBe("not_found");
+    expect(buffer.casSetMetadata).not.toHaveBeenCalled();
+  });
+
+  it("surfaces parent/root friendlyIds on `applied` so the route can fan parent/root ops without a second buffer read", async () => {
+    // Regression: after the primary CAS the metadata route used to
+    // issue a SECOND `findRunByIdWithMollifierFallback` purely to learn
+    // the parent/root friendlyIds for `routeOperationsToRun`. When the
+    // drainer's terminal-failure path ran between the CAS and that
+    // second read, the entry hash had been DELd and the read returned
+    // null, so the route silently skipped the whole parent/root
+    // fan-out: `body.parentOperations` / `body.rootOperations` were
+    // dropped even though the primary mutation had already landed.
+    // The helper now captures the ids inside its own read loop and
+    // returns them on the `applied` outcome, so no second round trip
+    // is needed.
+    //
+    // The engine-side snapshot stores internal cuids; the helper is
+    // expected to convert them via `RunId.toFriendlyId` so the outcome
+    // matches what `readFallback.server.ts` would have produced.
+    const parentFriendlyId = RunId.generate().friendlyId;
+    const rootFriendlyId = RunId.generate().friendlyId;
+    // Seed the snapshot with the INTERNAL ids; the assertions below
+    // compare against the friendly forms.
+    const { buffer } = makeBufferStub({
+      parentTaskRunId: RunId.fromFriendlyId(parentFriendlyId),
+      rootTaskRunId: RunId.fromFriendlyId(rootFriendlyId),
+    });
+    const outcome = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: { metadata: { counter: 1 } },
+      buffer,
+    });
+    expect(outcome.kind).toBe("applied");
+    if (outcome.kind === "applied") {
+      expect(outcome.parentTaskRunFriendlyId).toBe(parentFriendlyId);
+      expect(outcome.rootTaskRunFriendlyId).toBe(rootFriendlyId);
+    }
+  });
+
+  it("`applied` parent/root ids are undefined when the snapshot carries neither (top-level run)", async () => {
+    // A top-level run has neither parentTaskRunId nor rootTaskRunId in
+    // its engine-trigger snapshot. Both ids must then come back
+    // undefined on the outcome so the route's `?? runId` self-fallback
+    // fires, matching the PG service's `taskRun.parentTaskRun?.id ??
+    // taskRun.id` semantics.
+    const { buffer } = makeBufferStub({});
+    const outcome = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: { metadata: { counter: 1 } },
+      buffer,
+    });
+    expect(outcome.kind).toBe("applied");
+    if (outcome.kind === "applied") {
+      expect(outcome.parentTaskRunFriendlyId).toBeUndefined();
+      expect(outcome.rootTaskRunFriendlyId).toBeUndefined();
+    }
+  });
+
+  it("N-way concurrent applies all converge under default budget", async () => {
+    // Simulate N parallel writers against one shared stub. Each writer
+    // increments `counter`; the stub's CAS rejects a write whose
+    // expectedVersion is stale, which the test expects to be retried.
+    const N = 30;
+    const sharedStub = makeBufferStub();
+    // No conflicts are pre-injected (the stub already defaults to 0), so
+    // every version_conflict here arises organically from the stub's
+    // `expectedVersion !== state.version` check.
+    sharedStub.state.pendingConflictsForNextN = 0;
+
+    const calls = Array.from({ length: N }, () =>
+      applyMetadataMutationToBufferedRun({
+        runId: "run_1",
+        environmentId: "env_a",
+        organizationId: "org_1",
+        maximumSize: 1024 * 1024,
+        body: { operations: [{ type: "increment", key: "counter", value: 1 }] },
+        buffer: sharedStub.buffer,
+      }),
+    );
+    const results = await Promise.all(calls);
+    const applied = results.filter((r) => r.kind === "applied").length;
+    expect(applied).toBe(N);
+    expect(sharedStub.state.metadata.counter).toBe(N);
+  });
+});
diff --git a/apps/webapp/test/mollifierClaimResolution.test.ts b/apps/webapp/test/mollifierClaimResolution.test.ts
new file mode 100644
index 00000000000..f61cda0d04e
--- /dev/null
+++ b/apps/webapp/test/mollifierClaimResolution.test.ts
@@ -0,0 +1,143 @@
+import { describe, expect, it, vi } from "vitest";
+
+// Stub `~/db.server` before importing the concern — the real module
+// eagerly calls `prisma.$connect()` at singleton construction, which
+// would fail without a database. The concern under test receives its
+// prisma via the constructor, so the stub is never used by the code path.
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+
+// The IdempotencyKeyConcern resolves the pre-gate claim through the
+// global mollifier buffer (`getMollifierBuffer`), shared by both
+// `claimOrAwait` and `findBufferedRunWithIdempotency`. Control it via a
+// hoisted handle so each test can script the claim/lookup responses.
+const h = vi.hoisted(() => ({ buffer: null as unknown, orgFlag: true }));
+vi.mock("~/v3/mollifier/mollifierBuffer.server", () => ({
+  getMollifierBuffer: () => h.buffer,
+}));
+// Stub `mollifierGate.server` so loading the concern doesn't drag in
+// `env.server` (which fails to parse without a populated environment in
+// CI). The concern only uses `makeResolveMollifierFlag` to gate the
+// claim; tests flip `h.orgFlag` to cover both opted-in and opted-out
+// orgs without touching real env or feature-flag wiring.
+vi.mock("~/v3/mollifier/mollifierGate.server", () => ({
+  makeResolveMollifierFlag: () => async () => h.orgFlag,
+}));
+
+import type { MollifierBuffer } from "@trigger.dev/redis-worker";
+import { IdempotencyKeyConcern } from "~/runEngine/concerns/idempotencyKeys.server";
+import type { TriggerTaskRequest } from "~/runEngine/types";
+
+function makeConcern(prisma: { findFirst: () => Promise<unknown> }) {
+  // Only `taskRun.findFirst` is supplied on the prisma double; the
+  // engine and traceEventConcern arguments are unused on this path.
+  const { findFirst } = prisma;
+  const prismaDouble = { taskRun: { findFirst } } as never;
+  return new IdempotencyKeyConcern(prismaDouble, {} as never, {} as never);
+}
+
+function makeRequest(): TriggerTaskRequest {
+  // The concern gates the pre-gate claim on the per-org mollifier flag
+  // (mirroring evaluateGate) so non-opted-in orgs skip the Redis SETNX.
+  // This fake org opts in; otherwise claimOrAwait is skipped and the
+  // resolution branches under test never run.
+  const environment = {
+    id: "env_a",
+    organizationId: "org_1",
+    organization: { featureFlags: { mollifierEnabled: true } },
+  };
+  return {
+    taskId: "my-task",
+    environment,
+    options: {},
+    body: { options: { idempotencyKey: "k-1" } },
+  } as unknown as TriggerTaskRequest;
+}
+
+describe("IdempotencyKeyConcern · claim resolution", () => {
+  it("resolved-but-unfindable falls through to a fresh trigger (no cached run, no claim held)", async () => {
+    // Regression guard for the resolved-but-unfindable terminal case:
+    // the claim slot names a runId that neither store can produce (PG
+    // findFirst and the buffer lookup both miss). The concern must fall
+    // through to a fresh trigger — not throw, not return a bogus cached
+    // run, and not claim ownership it doesn't hold.
+    const bufferLookup = vi.fn(async () => null);
+    const staleClaim = vi.fn(async () => ({ kind: "resolved", runId: "run_gone" }));
+    h.buffer = {
+      claimIdempotency: staleClaim,
+      lookupIdempotency: bufferLookup,
+    } as unknown as MollifierBuffer;
+    // PG misses on every call.
+    const concern = makeConcern({ findFirst: vi.fn(async () => null) });
+
+    const outcome = await concern.handleTriggerRequest(makeRequest(), undefined);
+
+    expect(outcome.isCached).toBe(false);
+    if (outcome.isCached === false) {
+      // We only resolved someone else's (stale) claim and never won one
+      // ourselves, so the caller must NOT publish/release on our behalf.
+      expect(outcome.claim).toBeUndefined();
+      expect(outcome.idempotencyKey).toBe("k-1");
+    }
+    // The buffer fallback was attempted before giving up.
+    expect(bufferLookup).toHaveBeenCalled();
+  });
+
+  it("resolved-and-findable returns the existing run as a cached hit", async () => {
+    // Happy resolved path: the claim points at a runId that PG can find
+    // on the writer-side re-resolve, so the concern must hand that run
+    // back as a cached hit instead of falling through.
+    const winner = { id: "run_winner", friendlyId: "run_winner" };
+
+    h.buffer = {
+      claimIdempotency: vi.fn(async () => ({ kind: "resolved", runId: "run_winner" })),
+      lookupIdempotency: vi.fn(async () => null),
+    } as unknown as MollifierBuffer;
+
+    // Only the first findFirst (the initial existingRun check) misses,
+    // which sends the concern down the claim path; every later call
+    // (the writer-side re-resolve) returns the winner.
+    let remainingMisses = 1;
+    const findFirst = vi.fn(async () => (remainingMisses-- > 0 ? null : winner));
+    const concern = makeConcern({ findFirst });
+
+    const outcome = await concern.handleTriggerRequest(makeRequest(), undefined);
+
+    expect(outcome.isCached).toBe(true);
+    if (outcome.isCached === true) {
+      expect(outcome.run).toBe(winner);
+    }
+  });
+
+  it("non-opted-in org skips claimOrAwait entirely (no buffer round-trip, no claim held)", async () => {
+    // Regression guard for per-org gating: with the org flag resolved
+    // to false the concern must never call `claimIdempotency`, even
+    // though a buffer is installed (as when `TRIGGER_MOLLIFIER_ENABLED=1`
+    // globally). That keeps the claim's Redis SETNX off the hot path for
+    // orgs that haven't opted in; PG's unique constraint already
+    // deduplicates same-key races on the pass-through path.
+    h.orgFlag = false;
+    const claimIdempotency = vi.fn(async () => ({ kind: "claimed" as const }));
+    const lookupIdempotency = vi.fn(async () => null);
+    h.buffer = {
+      claimIdempotency,
+      lookupIdempotency,
+    } as unknown as MollifierBuffer;
+
+    const findFirst = vi.fn(async () => null);
+    const concern = makeConcern({ findFirst });
+
+    try {
+      const result = await concern.handleTriggerRequest(makeRequest(), undefined);
+      expect(result.isCached).toBe(false);
+      if (result.isCached === false) {
+        // No claim is handed back, so the caller must NOT publish/release.
+        expect(result.claim).toBeUndefined();
+        expect(result.idempotencyKey).toBe("k-1");
+      }
+      // Headline guarantee: `claimIdempotency` was never invoked for this org.
+      expect(claimIdempotency).not.toHaveBeenCalled();
+    } finally {
+      h.orgFlag = true; // restore the shared flag for later tests in this file
+    }
+  });
+});
diff --git a/apps/webapp/test/mollifierDrainerHandler.test.ts b/apps/webapp/test/mollifierDrainerHandler.test.ts
new file mode 100644
index 00000000000..085fab6418b
--- /dev/null
+++ b/apps/webapp/test/mollifierDrainerHandler.test.ts
@@ -0,0 +1,574 @@
+import { describe, expect, it, vi } from "vitest";
+import { trace } from "@opentelemetry/api";
+import { RunId } from "@trigger.dev/core/v3/isomorphic";
+
+vi.mock("~/db.server", () => ({
+  prisma: {},
+  $replica: {},
+}));
+
+// `writeMollifierTerminalFailureRow` enqueues a PerformTaskRunAlertsService
+// after writing the SYSTEM_FAILURE row (mirrors TriggerFailedTaskService).
+// In production that enqueues into the alerts redis-worker; the test
+// environment has no redis-worker, so the real call hangs the tick out
+// to its 5s vitest timeout. Stub `enqueue` to a resolved no-op so the
+// handler's best-effort try/catch sees a clean success path.
+vi.mock("~/v3/services/alerts/performTaskRunAlerts.server", () => ({
+  PerformTaskRunAlertsService: {
+    enqueue: vi.fn(async () => undefined),
+  },
+}));
+
+// The drainer calls `recordRunDebugLog` after a successful engine.trigger
+// to emit an admin-only LOG-kind event encoding the buffered window.
+// The real implementation imports the configured event repository (prisma
+// + clickhouse + env), which has heavy side-effects on first import.
+// Stub it to a vi.fn so the unit tests can assert call shape without
+// dragging the whole eventRepository graph into webapp test setup.
+// `vi.hoisted` is required because `vi.mock` factories are hoisted above
+// regular `const`s — referencing a top-level variable from inside the
+// factory otherwise fires `Cannot access 'X' before initialization`.
+const { recordRunDebugLogMock } = vi.hoisted(() => ({
+  recordRunDebugLogMock: vi.fn(async () => ({ success: true as const })),
+}));
+vi.mock("~/v3/eventRepository/index.server", () => ({
+  recordRunDebugLog: recordRunDebugLogMock,
+}));
+
+import {
+  createDrainerHandler,
+  isRetryablePgError,
+} from "~/v3/mollifier/mollifierDrainerHandler.server";
+
+describe("isRetryablePgError", () => {
+  it("returns true for P2024 (connection pool timeout)", () => {
+    // An Error tagged with the `code` named in the test title.
+    const poolTimeout = new Error("Timed out fetching a new connection");
+    Object.assign(poolTimeout, { code: "P2024" });
+    expect(isRetryablePgError(poolTimeout)).toBe(true);
+  });
+
+  it("returns true for generic connection-lost messages", () => {
+    const messages = ["Connection lost", "Can't reach database server"];
+    expect(messages.map((m) => isRetryablePgError(new Error(m)))).toEqual([true, true]);
+  });
+
+  it("returns false for validation errors", () => {
+    expect(isRetryablePgError(new Error("Invalid payload"))).toBe(false);
+  });
+
+  it("returns false for non-Error inputs", () => {
+    const inputs: unknown[] = ["string error", { message: "object" }];
+    expect(inputs.map((input) => isRetryablePgError(input))).toEqual([false, false]);
+  });
+});
+
+describe("createDrainerHandler", () => {
+  it("invokes engine.trigger with the deserialised snapshot", async () => {
+    const trigger = vi.fn(async () => ({ friendlyId: "run_x" }));
+    const handler = createDrainerHandler({
+      engine: { trigger } as any,
+      prisma: {} as any,
+    });
+
+    await handler({
+      runId: "run_x",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: { taskIdentifier: "t", payload: "{}" },
+      attempts: 0,
+      createdAt: new Date(),
+    } as any);
+
+    expect(trigger).toHaveBeenCalledOnce();
+    const callArg = trigger.mock.calls[0][0] as { taskIdentifier: string };
+    expect(callArg.taskIdentifier).toBe("t");
+  });
+
+  it("re-attaches the snapshot's traceId so engine.trigger inherits the original trace", async () => {
+    // Captures the active traceId at the moment engine.trigger is invoked.
+    // Without context propagation it would be a fresh traceId, leaving the
+    // run-detail page with only the root span.
+    let observedTraceId: string | undefined;
+    const trigger = vi.fn(async () => {
+      observedTraceId = trace.getActiveSpan()?.spanContext().traceId;
+      return { friendlyId: "run_x" };
+    });
+
+    const handler = createDrainerHandler({
+      engine: { trigger } as any,
+      prisma: {} as any,
+    });
+
+    const snapshotTraceId = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+    const snapshotSpanId = "bbbbbbbbbbbbbbbb";
+
+    await handler({
+      runId: "run_x",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: {
+        taskIdentifier: "t",
+        traceId: snapshotTraceId,
+        spanId: snapshotSpanId,
+      },
+      attempts: 0,
+      createdAt: new Date(),
+    } as any);
+
+    expect(observedTraceId).toBe(snapshotTraceId);
+  });
+
+  it("rethrows retryable PG errors so MollifierDrainer requeues the entry", async () => {
+    const err = new Error("Can't reach database server");
+    const trigger = vi.fn(async () => {
+      throw err;
+    });
+    const createFailedTaskRun = vi.fn();
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: "run_x",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: { taskIdentifier: "t" },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any),
+    ).rejects.toThrow("Can't reach database server");
+    // Retryable: we do NOT write a SYSTEM_FAILURE row, the entry should
+    // be requeued for another shot.
+    expect(createFailedTaskRun).not.toHaveBeenCalled();
+  });
+
+  const envFixture = {
+    id: "env_a",
+    type: "DEVELOPMENT",
+    project: { id: "proj_1" },
+    organization: { id: "org_1" },
+  };
+
+  it("writes a SYSTEM_FAILURE PG row when engine.trigger fails non-retryably", async () => {
+    const trigger = vi.fn(async () => {
+      throw new Error("validation failed: payload too large");
+    });
+    const createFailedTaskRun = vi.fn(async () => ({
+      id: "internal",
+      friendlyId: "run_x",
+    }));
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: "run_x",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: { taskIdentifier: "t", environment: envFixture },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any),
+    ).resolves.toBeUndefined();
+
+    expect(trigger).toHaveBeenCalledOnce();
+    expect(createFailedTaskRun).toHaveBeenCalledOnce();
+    const arg = createFailedTaskRun.mock.calls[0][0] as { error: { raw: string } };
+    expect(arg.error.raw).toContain("validation failed");
+  });
+
+  it("propagates the batch association into createFailedTaskRun (so batch parents don't hang on missing children)", async () => {
+    // Devin's ANALYSIS report on PR #3754: the terminal-failure path
+    // extracts most snapshot fields (parentTaskRunId, rootTaskRunId,
+    // depth, etc.) but dropped `batch`. If the original trigger was
+    // part of a batch, the SYSTEM_FAILURE row isn't associated with
+    // the batch, so the batch parent's completion-tracking can hang
+    // indefinitely waiting on a child that landed but isn't linked.
+    const trigger = vi.fn(async () => {
+      throw new Error("validation failed: payload too large");
+    });
+    const createFailedTaskRun = vi.fn(async () => ({
+      id: "internal",
+      friendlyId: "run_x",
+    }));
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: "run_batched",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: {
+          taskIdentifier: "t",
+          environment: envFixture,
+          batch: { id: "batch_xyz", index: 7 },
+        },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any),
+    ).resolves.toBeUndefined();
+
+    expect(createFailedTaskRun).toHaveBeenCalledOnce();
+    const arg = createFailedTaskRun.mock.calls[0][0] as {
+      batch?: { id: string; index: number };
+    };
+    expect(arg.batch).toEqual({ id: "batch_xyz", index: 7 });
+  });
+
+  it("rethrows the original error when createFailedTaskRun also fails (PG genuinely unreachable)", async () => {
+    const triggerErr = new Error("engine rejected the snapshot");
+    const trigger = vi.fn(async () => {
+      throw triggerErr;
+    });
+    const createFailedTaskRun = vi.fn(async () => {
+      throw new Error("connection refused");
+    });
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: "run_x",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: { taskIdentifier: "t", environment: envFixture },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any),
+    ).rejects.toThrow("engine rejected the snapshot");
+    // Drainer's outer drainOne loop now decides retry vs buffer.fail.
+    expect(createFailedTaskRun).toHaveBeenCalledOnce();
+  });
+
+  it("calls createCancelledRun with emitRunCancelledEvent: false (suppresses orphan trace-event log noise)", async () => {
+    // Buffered-only runs never had a primary trace event written for
+    // them — the mollifier gate skipped `repository.traceEvent` since
+    // the run hadn't materialised in PG yet. The `runCancelled` handler
+    // would log `[runCancelled] Failed to cancel run event` for every
+    // cancelled buffered run if we let the emit fire. Suppress it.
+    const friendlyId = RunId.generate().friendlyId;
+    const createCancelledRun = vi.fn(async () => ({
+      id: "internal",
+      friendlyId,
+      status: "CANCELED",
+    }));
+    const handler = createDrainerHandler({
+      engine: { createCancelledRun } as any,
+      prisma: {} as any,
+    });
+
+    await handler({
+      runId: friendlyId,
+      envId: "env_a",
+      orgId: "org_1",
+      payload: {
+        friendlyId,
+        taskIdentifier: "t",
+        environment: envFixture,
+        cancelledAt: new Date().toISOString(),
+        cancelReason: "Canceled by user",
+      },
+      attempts: 0,
+      createdAt: new Date(),
+    } as any);
+
+    expect(createCancelledRun).toHaveBeenCalledOnce();
+    const arg = createCancelledRun.mock.calls[0][0] as {
+      emitRunCancelledEvent?: boolean;
+    };
+    expect(arg.emitRunCancelledEvent).toBe(false);
+  });
+
+  it("honours the cancel when a buffered cancel races a materialised non-CANCELED row", async () => {
+    // Cancel wins over trigger. If the normal trigger replay path
+    // materialised a live PENDING row before the cancel bifurcation
+    // drained, engine.createCancelledRun throws a conflict —
+    // its documented contract is that "the caller must decide between
+    // engine.cancelRun() and skipping". The drainer handler must honour
+    // the cancel intent by actually cancelling the now-live run; otherwise
+    // the conflict propagates, isRetryablePgError() returns false, and the
+    // drainer buffer.fail()s the entry — silently losing the cancellation
+    // while the run keeps executing.
+    const friendlyId = RunId.generate().friendlyId;
+    const createCancelledRun = vi.fn(async () => {
+      throw new Error(
+        `createCancelledRun conflict: existing run ${friendlyId} has status PENDING`
+      );
+    });
+    const cancelRun = vi.fn(async () => ({ alreadyFinished: false }));
+    const handler = createDrainerHandler({
+      engine: { createCancelledRun, cancelRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: friendlyId,
+        envId: "env_a",
+        orgId: "org_1",
+        payload: {
+          friendlyId,
+          taskIdentifier: "t",
+          environment: envFixture,
+          cancelledAt: new Date().toISOString(),
+          cancelReason: "Canceled by user",
+        },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any)
+    ).resolves.toBeUndefined();
+
+    // The live run is actually cancelled, by its internal id.
+    expect(cancelRun).toHaveBeenCalledOnce();
+    expect(cancelRun.mock.calls[0][0].runId).toBe(RunId.fromFriendlyId(friendlyId));
+  });
+
+  it("requeues on a transient PG outage during the SYSTEM_FAILURE fallback write", async () => {
+    // engine.trigger failed non-retryably, so we try to write a terminal
+    // SYSTEM_FAILURE row. If THAT write fails because PG is transiently
+    // unreachable, rethrowing the *original* non-retryable error makes the
+    // drainer buffer.fail() the entry — losing the run with no PG row ever
+    // landing. Rethrow the retryable write error instead so the drainer
+    // requeues; once PG recovers the failure row lands and the customer
+    // sees it.
+    const trigger = vi.fn(async () => {
+      throw new Error("validation failed: payload too large");
+    });
+    const createFailedTaskRun = vi.fn(async () => {
+      throw new Error("Can't reach database server");
+    });
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: "run_x",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: { taskIdentifier: "t", environment: envFixture },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any)
+    ).rejects.toThrow("Can't reach database server");
+  });
+
+  it("writes a SYSTEM_FAILURE row when createCancelledRun fails non-retryably (cancel bifurcation)", async () => {
+    // Without this guard a non-conflict, non-retryable failure from
+    // createCancelledRun rethrows out of the handler. The drainer's
+    // onTerminalFailure gates on cause==="max-attempts-exhausted" and
+    // skips "non-retryable", so buffer.fail() deletes the entry with
+    // no PG row written — the cancellation disappears silently.
+    // Mirror the non-cancel path's SYSTEM_FAILURE fallback so the
+    // customer always sees a terminal row.
+    const friendlyId = RunId.generate().friendlyId;
+    const cancelErr = new Error("validation failed: bad cancel snapshot");
+    const createCancelledRun = vi.fn(async () => {
+      throw cancelErr;
+    });
+    const createFailedTaskRun = vi.fn(async () => ({ id: "internal_x" }));
+    const handler = createDrainerHandler({
+      engine: { createCancelledRun, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await handler({
+      runId: friendlyId,
+      envId: "env_a",
+      orgId: "org_1",
+      payload: {
+        friendlyId,
+        taskIdentifier: "t",
+        environment: envFixture,
+        cancelledAt: new Date().toISOString(),
+        cancelReason: "Canceled by user",
+      },
+      attempts: 0,
+      createdAt: new Date(),
+    } as any);
+
+    // SYSTEM_FAILURE row was written via the shared helper. Handler
+    // returns cleanly so the drainer ACKs the entry instead of
+    // buffer.fail()ing it.
+    expect(createFailedTaskRun).toHaveBeenCalledOnce();
+    expect(createFailedTaskRun.mock.calls[0][0].friendlyId).toBe(friendlyId);
+    expect(createFailedTaskRun.mock.calls[0][0].error.raw).toContain(
+      "validation failed: bad cancel snapshot"
+    );
+  });
+
+  it("requeues when createCancelledRun fails with a retryable PG error (cancel bifurcation)", async () => {
+    // Retryable PG failures must rethrow so the drainer requeues the
+    // entry — writing a SYSTEM_FAILURE row when PG is transiently
+    // unreachable would still fail. The drainer's existing retry loop
+    // handles the requeue.
+    const friendlyId = RunId.generate().friendlyId;
+    const cancelErr = new Error("Can't reach database server");
+    const createCancelledRun = vi.fn(async () => {
+      throw cancelErr;
+    });
+    const createFailedTaskRun = vi.fn();
+    const handler = createDrainerHandler({
+      engine: { createCancelledRun, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: friendlyId,
+        envId: "env_a",
+        orgId: "org_1",
+        payload: {
+          friendlyId,
+          taskIdentifier: "t",
+          environment: envFixture,
+          cancelledAt: new Date().toISOString(),
+          cancelReason: "Canceled by user",
+        },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any)
+    ).rejects.toThrow("Can't reach database server");
+    expect(createFailedTaskRun).not.toHaveBeenCalled();
+  });
+
+  it("rethrows the original error when the snapshot lacks an environment block", async () => {
+    const triggerErr = new Error("engine rejected the snapshot");
+    const trigger = vi.fn(async () => {
+      throw triggerErr;
+    });
+    const createFailedTaskRun = vi.fn();
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: "run_x",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: { taskIdentifier: "t" /* no environment */ },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any),
+    ).rejects.toThrow("engine rejected the snapshot");
+    expect(createFailedTaskRun).not.toHaveBeenCalled();
+  });
+
+  it("emits an admin-only LOG-kind event with the buffered window after engine.trigger succeeds", async () => {
+    // The drainer's audit trail rides the existing TaskEventKind.LOG
+    // filter pattern (`eventRepository.server.ts:108` + `logs.download.ts:118`)
+    // — admins see the buffered window in the trace; non-admins don't.
+    recordRunDebugLogMock.mockClear();
+    const trigger = vi.fn(async () => ({ friendlyId: "run_z" }));
+    const handler = createDrainerHandler({
+      engine: { trigger } as any,
+      prisma: {} as any,
+    });
+
+    const bufferedAt = new Date(Date.now() - 4_000);
+    await handler({
+      runId: "run_z",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: { taskIdentifier: "t", spanId: "snapspan", traceId: "snaptrace" },
+      attempts: 2,
+      createdAt: bufferedAt,
+    } as any);
+
+    expect(recordRunDebugLogMock).toHaveBeenCalledOnce();
+    const [callRunId, message, options] = recordRunDebugLogMock.mock.calls[0] as [
+      string,
+      string,
+      any,
+    ];
+    // Internal cuid derived from the friendlyId, mirroring what
+    // `findRunForEventCreation` queries on.
+    expect(callRunId).toBe("z");
+    expect(message).toMatch(/Mollifier buffered \d+ms before materialising/);
+    // Emitted as a marker at materialisation time (no `startTime` /
+    // `duration` overrides) — engine.trigger has just rewritten the
+    // root span's start_time to "now", so back-dating the event would
+    // clip it off-screen in the trace renderer. The historical window
+    // is preserved in metadata so admins can still read it.
+    expect(options.startTime).toBeUndefined();
+    expect(options.duration).toBeUndefined();
+    expect(options.parentId).toBe("snapspan");
+    expect(options.attributes.metadata["mollifier.bufferedAt"]).toBe(bufferedAt.toISOString());
+    expect(options.attributes.metadata["mollifier.attempts"]).toBe(2);
+    expect(options.attributes.metadata["mollifier.dwellMs"]).toBeGreaterThan(0);
+  });
+
+  it("does NOT emit the admin LOG event when engine.trigger fails non-retryably", async () => {
+    // The audit trail is for runs that actually materialised. On a
+    // terminal SYSTEM_FAILURE path the customer-visible outcome is the
+    // failure row; emitting a "buffered for Xms" event next to it would
+    // imply the buffered window completed normally.
+    recordRunDebugLogMock.mockClear();
+    const trigger = vi.fn(async () => {
+      throw new Error("engine rejected the snapshot");
+    });
+    const createFailedTaskRun = vi.fn(async () => ({ id: "internal" }));
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await handler({
+      runId: "run_z",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: { taskIdentifier: "t", environment: envFixture },
+      attempts: 0,
+      createdAt: new Date(),
+    } as any);
+
+    expect(recordRunDebugLogMock).not.toHaveBeenCalled();
+  });
+
+  it("does NOT emit the admin LOG event on the cancel-bifurcation path", async () => {
+    // Cancel-bifurcation writes a CANCELED row directly without calling
+    // engine.trigger. There's no buffered-then-materialised window to
+    // describe — the run never ran.
+    recordRunDebugLogMock.mockClear();
+    const friendlyId = RunId.generate().friendlyId;
+    const createCancelledRun = vi.fn(async () => ({
+      id: "internal",
+      friendlyId,
+      status: "CANCELED",
+    }));
+    const handler = createDrainerHandler({
+      engine: { createCancelledRun } as any,
+      prisma: {} as any,
+    });
+
+    await handler({
+      runId: friendlyId,
+      envId: "env_a",
+      orgId: "org_1",
+      payload: {
+        friendlyId,
+        taskIdentifier: "t",
+        environment: envFixture,
+        cancelledAt: new Date().toISOString(),
+        cancelReason: "Canceled by user",
+      },
+      attempts: 0,
+      createdAt: new Date(),
+    } as any);
+
+    expect(recordRunDebugLogMock).not.toHaveBeenCalled();
+  });
+});
diff --git a/apps/webapp/test/mollifierDrainerWorker.test.ts b/apps/webapp/test/mollifierDrainerWorker.test.ts
index e5f38229d8f..0d4e931fd83 100644
--- a/apps/webapp/test/mollifierDrainerWorker.test.ts
+++ b/apps/webapp/test/mollifierDrainerWorker.test.ts
@@ -1,4 +1,17 @@
-import { describe, expect, it } from "vitest";
+import { describe, expect, it, vi } from "vitest";
+
+// Importing `~/v3/mollifier/mollifierDrainer.server` (below) transitively
+// loads `~/v3/runEngine.server`, whose top-level `singleton(...)` call
+// eagerly constructs a RunEngine. That spins up Prisma + Redis workers
+// that try to connect to localhost — which in CI (no PG, no Redis)
+// produces an unhandled `PrismaClientInitializationError` that fails
+// the test run even though the assertions all pass. Mocking the
+// runEngine module short-circuits the singleton so no worker starts.
+vi.mock("~/v3/runEngine.server", () => ({ engine: {} }));
+// Same problem: `~/db.server`'s top-level singleton tries to open a
+// PG client. The test never makes a query; an empty stub is enough.
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+
 import { MollifierConfigurationError } from "~/v3/mollifier/mollifierDrainer.server";
 import { initMollifierDrainerWorker } from "~/v3/mollifierDrainerWorker.server";
 
diff --git a/apps/webapp/test/mollifierGate.test.ts b/apps/webapp/test/mollifierGate.test.ts
index b81df7f0c5b..e40a29b2481 100644
--- a/apps/webapp/test/mollifierGate.test.ts
+++ b/apps/webapp/test/mollifierGate.test.ts
@@ -432,3 +432,82 @@ describe("evaluateGate — per-org isolation via Organization.featureFlags", ()
     expect(unrelatedDeps.spies.evaluatorCalls).toBe(0);
   });
 });
+
+// Bypasses: the three categories of trigger that the mollifier never
+// intercepts, regardless of the per-org flag or the trip-evaluator decision.
+describe("evaluateGate — debounce / OTU / triggerAndWait bypasses", () => {
+  it("debounce triggers pass through without invoking the evaluator", async () => {
+    const { deps, spies } = makeDeps({
+      enabled: true,
+      shadow: false,
+      flag: true,
+      decision: trippedDecision,
+    });
+    const outcome = await evaluateGate(
+      { ...inputs, options: { debounce: { key: "k" } } },
+      deps,
+    );
+    expect(outcome).toEqual({ action: "pass_through" });
+    expect(spies.evaluatorCalls).toBe(0);
+  });
+
+  it("oneTimeUseToken triggers pass through without invoking the evaluator", async () => {
+    const { deps, spies } = makeDeps({
+      enabled: true,
+      shadow: false,
+      flag: true,
+      decision: trippedDecision,
+    });
+    const outcome = await evaluateGate(
+      { ...inputs, options: { oneTimeUseToken: "jwt-otu" } },
+      deps,
+    );
+    expect(outcome).toEqual({ action: "pass_through" });
+    expect(spies.evaluatorCalls).toBe(0);
+  });
+
+  it("single triggerAndWait (parentTaskRunId + resumeParentOnCompletion) passes through", async () => {
+    const { deps, spies } = makeDeps({
+      enabled: true,
+      shadow: false,
+      flag: true,
+      decision: trippedDecision,
+    });
+    const outcome = await evaluateGate(
+      {
+        ...inputs,
+        options: { parentTaskRunId: "run_parent", resumeParentOnCompletion: true },
+      },
+      deps,
+    );
+    expect(outcome).toEqual({ action: "pass_through" });
+    expect(spies.evaluatorCalls).toBe(0);
+  });
+
+  it("parentTaskRunId alone (no resumeParentOnCompletion) does NOT bypass — must be both", async () => {
+    const { deps, spies } = makeDeps({
+      enabled: true,
+      shadow: false,
+      flag: true,
+      decision: trippedDecision,
+    });
+    const outcome = await evaluateGate(
+      { ...inputs, options: { parentTaskRunId: "run_parent" } },
+      deps,
+    );
+    expect(outcome.action).toBe("mollify");
+    expect(spies.evaluatorCalls).toBe(1);
+  });
+
+  it("bypass records pass_through decision (so observability counters stay accurate)", async () => {
+    const { deps, spies } = makeDeps({
+      enabled: true,
+      shadow: false,
+      flag: true,
+      decision: trippedDecision,
+    });
+    await evaluateGate({ ...inputs, options: { debounce: { key: "k" } } }, deps);
+    expect(spies.recordDecisionCalls).toHaveLength(1);
+    expect(spies.recordDecisionCalls[0].outcome).toBe("pass_through");
+  });
+});
diff --git a/apps/webapp/test/mollifierIdempotencyClaim.test.ts b/apps/webapp/test/mollifierIdempotencyClaim.test.ts
new file mode 100644
index 00000000000..87c009cb1f7
--- /dev/null
+++ b/apps/webapp/test/mollifierIdempotencyClaim.test.ts
@@ -0,0 +1,268 @@
+import { describe, expect, it, vi } from "vitest";
+
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+
+import {
+  claimOrAwait,
+  publishClaim,
+  releaseClaim,
+} from "~/v3/mollifier/idempotencyClaim.server";
+import type {
+  IdempotencyClaimResult,
+  MollifierBuffer,
+} from "@trigger.dev/redis-worker";
+
+type ClaimState = {
+  value: string | null;
+  // Scripted return sequence for claimIdempotency calls. When set,
+  // overrides the default behaviour of returning based on `value`.
+  scriptedClaims?: IdempotencyClaimResult[];
+};
+
+function makeBuffer(initial: ClaimState = { value: null }): {
+  buffer: MollifierBuffer;
+  state: ClaimState;
+} {
+  // Shallow copy: the fake reads and writes `state`, never `initial`.
+  const state: ClaimState = { ...initial };
+  // What a reader sees for the slot right now; null means "no key".
+  const snapshot = (): IdempotencyClaimResult | null => {
+    const held = state.value;
+    if (held === null) return null;
+    return held === "pending" ? { kind: "pending" } : { kind: "resolved", runId: held };
+  };
+  const claimIdempotency = vi.fn(async (): Promise<IdempotencyClaimResult> => {
+    // A scripted sequence, while non-empty, overrides the value-based default.
+    const scripted = state.scriptedClaims?.shift();
+    if (scripted !== undefined) return scripted;
+    const current = snapshot();
+    if (current !== null) return current;
+    state.value = "pending"; // empty slot: this caller wins the claim
+    return { kind: "claimed" };
+  });
+  const readClaim = vi.fn(async () => snapshot());
+  const publishClaim = vi.fn(async ({ runId }: { runId: string }) => {
+    state.value = runId;
+  });
+  const releaseClaim = vi.fn(async () => {
+    state.value = null;
+  });
+  const fake = { claimIdempotency, readClaim, publishClaim, releaseClaim };
+  return { buffer: fake as unknown as MollifierBuffer, state };
+}
+
+const baseInput = {
+  envId: "env_a",
+  taskIdentifier: "my-task",
+  idempotencyKey: "k-1",
+};
+
+describe("claimOrAwait", () => {
+  it("returns 'claimed' for the first caller — empty key wins SETNX", async () => {
+    const { buffer } = makeBuffer({ value: null });
+    const outcome = await claimOrAwait({
+      ...baseInput,
+      buffer,
+      generateToken: () => "token-1",
+    });
+    expect(outcome).toEqual({ kind: "claimed", token: "token-1" });
+  });
+
+  it("returns 'resolved' immediately when the key already holds a runId", async () => {
+    const { buffer } = makeBuffer({ value: "run_X" });
+    const outcome = await claimOrAwait({ ...baseInput, buffer });
+    expect(outcome).toEqual({ kind: "resolved", runId: "run_X" });
+  });
+
+  it("polls a pending key, then resolves when the runId is published", async () => {
+    const { buffer, state } = makeBuffer({ value: "pending" });
+    let nowValue = 0;
+    let pollCount = 0;
+    const outcome = await claimOrAwait({
+      ...baseInput,
+      buffer,
+      now: () => nowValue,
+      sleep: async (ms) => {
+        nowValue += ms;
+        pollCount += 1;
+        if (pollCount === 3) state.value = "run_X";
+      },
+      safetyNetMs: 1000,
+      pollStepMs: 25,
+    });
+    expect(outcome).toEqual({ kind: "resolved", runId: "run_X" });
+  });
+
+  it("returns 'timed_out' when the key stays pending past safetyNetMs", async () => {
+    const { buffer } = makeBuffer({ value: "pending" });
+    let nowValue = 0;
+    const outcome = await claimOrAwait({
+      ...baseInput,
+      buffer,
+      now: () => nowValue,
+      sleep: async (ms) => {
+        nowValue += ms;
+      },
+      safetyNetMs: 50,
+      pollStepMs: 25,
+    });
+    expect(outcome).toEqual({ kind: "timed_out" });
+  });
+
+  it("retries the claim when a polled key vanishes (claimant released)", async () => {
+    const { buffer, state } = makeBuffer({ value: "pending" });
+    let nowValue = 0;
+    let pollCount = 0;
+    // Scripted retry: on the second `claimIdempotency` call we win.
+    state.scriptedClaims = [
+      { kind: "pending" }, // first call (initial)
+      { kind: "claimed" }, // second call (retry after release)
+    ];
+    const outcome = await claimOrAwait({
+      ...baseInput,
+      buffer,
+      generateToken: () => "token-retry",
+      now: () => nowValue,
+      sleep: async (ms) => {
+        nowValue += ms;
+        pollCount += 1;
+        // First poll cycle: key vanishes (release).
+        if (pollCount === 1) state.value = null;
+      },
+      safetyNetMs: 1000,
+      pollStepMs: 25,
+    });
+    expect(outcome).toEqual({ kind: "claimed", token: "token-retry" });
+  });
+
+  it("fails open with 'claimed' when buffer is null (mollifier disabled)", async () => {
+    const outcome = await claimOrAwait({
+      ...baseInput,
+      buffer: null,
+      generateToken: () => "token-fallopen-null",
+    });
+    expect(outcome).toEqual({ kind: "claimed", token: "token-fallopen-null" });
+  });
+
+  it("fails open with 'claimed' if buffer.claimIdempotency throws (Redis down)", async () => {
+    const buffer = {
+      claimIdempotency: vi.fn(async () => {
+        throw new Error("ECONNREFUSED");
+      }),
+    } as unknown as MollifierBuffer;
+    const outcome = await claimOrAwait({
+      ...baseInput,
+      buffer,
+      generateToken: () => "token-fallopen-throw",
+    });
+    expect(outcome).toEqual({ kind: "claimed", token: "token-fallopen-throw" });
+  });
+
+  it("respects an aborted signal during the wait loop", async () => {
+    const { buffer } = makeBuffer({ value: "pending" });
+    const controller = new AbortController();
+    let nowValue = 0;
+    let pollCount = 0;
+    const outcome = await claimOrAwait({
+      ...baseInput,
+      buffer,
+      now: () => nowValue,
+      sleep: async (ms) => {
+        nowValue += ms;
+        pollCount += 1;
+        if (pollCount === 1) controller.abort();
+      },
+      abortSignal: controller.signal,
+      safetyNetMs: 5000,
+      pollStepMs: 25,
+    });
+    expect(outcome).toEqual({ kind: "timed_out" });
+  });
+});
+
+describe("publishClaim", () => {
+  it("writes the runId to the claim key", async () => {
+    const { buffer, state } = makeBuffer({ value: "pending" });
+    await publishClaim({ ...baseInput, token: "owner-token", runId: "run_X", buffer });
+    expect(state.value).toBe("run_X");
+    expect(buffer.publishClaim).toHaveBeenCalledOnce();
+  });
+
+  it("no-op when buffer is null", async () => {
+    await expect(
+      publishClaim({ ...baseInput, token: "owner-token", runId: "run_X", buffer: null }),
+    ).resolves.toBeUndefined();
+  });
+
+  it("swallows errors so trigger pipeline isn't broken by Redis hiccups", async () => {
+    const buffer = {
+      publishClaim: vi.fn(async () => {
+        throw new Error("ECONNREFUSED");
+      }),
+    } as unknown as MollifierBuffer;
+    await expect(
+      publishClaim({ ...baseInput, token: "owner-token", runId: "run_X", buffer }),
+    ).resolves.toBeUndefined();
+  });
+});
+
+describe("releaseClaim", () => {
+  it("DELs the claim so waiters can re-acquire", async () => {
+    const { buffer, state } = makeBuffer({ value: "pending" });
+    await releaseClaim({ ...baseInput, token: "owner-token", buffer });
+    expect(state.value).toBeNull();
+  });
+
+  it("no-op when buffer is null", async () => {
+    await expect(releaseClaim({ ...baseInput, token: "owner-token", buffer: null })).resolves.toBeUndefined();
+  });
+});
+
+// End-to-end: the token from `claimOrAwait`'s `claimed` outcome must
+// reach `buffer.claimIdempotency` and round-trip through publishClaim /
+// releaseClaim. Without this the compare-and-act ownership protection
+// in the buffer is bypassed and the stale-claimant hazard returns.
+describe("claim ownership token wiring", () => {
+  it("threads the token from claimOrAwait into buffer.claimIdempotency", async () => {
+    const { buffer } = makeBuffer({ value: null });
+    const outcome = await claimOrAwait({
+      ...baseInput,
+      buffer,
+      generateToken: () => "owner-token-xyz",
+    });
+    expect(outcome).toEqual({ kind: "claimed", token: "owner-token-xyz" });
+    expect(buffer.claimIdempotency).toHaveBeenCalledWith({
+      ...baseInput,
+      token: "owner-token-xyz",
+      ttlSeconds: 30,
+    });
+  });
+
+  it("threads the token from publishClaim into buffer.publishClaim", async () => {
+    const { buffer } = makeBuffer({ value: "pending" });
+    await publishClaim({
+      ...baseInput,
+      token: "owner-token-xyz",
+      runId: "run_X",
+      buffer,
+    });
+    expect(buffer.publishClaim).toHaveBeenCalledWith(
+      expect.objectContaining({
+        token: "owner-token-xyz",
+        runId: "run_X",
+      }),
+    );
+  });
+
+  it("threads the token from releaseClaim into buffer.releaseClaim", async () => {
+    const { buffer } = makeBuffer({ value: "pending" });
+    await releaseClaim({
+      ...baseInput,
+      token: "owner-token-xyz",
+      buffer,
+    });
+    expect(buffer.releaseClaim).toHaveBeenCalledWith(
+      expect.objectContaining({ token: "owner-token-xyz" }),
+    );
+  });
+});
diff --git a/apps/webapp/test/mollifierMollify.test.ts b/apps/webapp/test/mollifierMollify.test.ts
new file mode 100644
index 00000000000..ec7a30b49c2
--- /dev/null
+++ b/apps/webapp/test/mollifierMollify.test.ts
@@ -0,0 +1,133 @@
+import { describe, expect, it, vi } from "vitest";
+
+vi.mock("~/db.server", () => ({
+  prisma: {},
+  $replica: {},
+}));
+
+import { mollifyTrigger } from "~/v3/mollifier/mollifierMollify.server";
+import { RunId } from "@trigger.dev/core/v3/isomorphic";
+import type { MollifierBuffer } from "@trigger.dev/redis-worker";
+
+// Fake buffer whose `accept` spy resolves to `acceptResult`; the spy is also returned.
+function fakeBuffer(
+  acceptResult: Awaited<ReturnType<MollifierBuffer["accept"]>> = { kind: "accepted" },
+): { buffer: MollifierBuffer; accept: ReturnType<typeof vi.fn> } {
+  const accept = vi.fn(() => Promise.resolve(acceptResult));
+  // Only `accept` is stubbed, hence the double cast.
+  const buffer = { accept } as unknown as MollifierBuffer;
+  return { buffer, accept };
+}
+
+describe("mollifyTrigger", () => {
+  it("writes the snapshot to buffer and returns synthesised result", async () => {
+    const { buffer, accept } = fakeBuffer();
+    const result = await mollifyTrigger({
+      runFriendlyId: "run_abc123def456",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      engineTriggerInput: { taskIdentifier: "my-task", payload: '{"x":1}' },
+      decision: {
+        divert: true,
+        reason: "per_env_rate",
+        count: 150,
+        threshold: 100,
+      },
+      buffer,
+    });
+
+    expect(accept).toHaveBeenCalledOnce();
+    expect(accept).toHaveBeenCalledWith({
+      runId: "run_abc123def456",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: expect.any(String),
+      idempotencyKey: undefined,
+      taskIdentifier: undefined,
+    });
+    expect(result.run.friendlyId).toBe("run_abc123def456");
+    expect(result.error).toBeUndefined();
+    expect(result.isCached).toBe(false);
+    expect(result.notice).toEqual({
+      code: "mollifier.queued",
+      message: expect.stringContaining("burst buffer"),
+      docs: expect.stringContaining("trigger.dev/docs"),
+    });
+  });
+
+  it("echoes the winner's runId with isCached=true on duplicate_idempotency", async () => {
+    const { buffer } = fakeBuffer({
+      kind: "duplicate_idempotency",
+      existingRunId: "run_winner12345",
+    });
+    const result = await mollifyTrigger({
+      runFriendlyId: "run_loser56789a",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      engineTriggerInput: { taskIdentifier: "t", payload: "{}" },
+      decision: { divert: true, reason: "per_env_rate", count: 1, threshold: 1 },
+      buffer,
+      idempotencyKey: "key",
+      taskIdentifier: "t",
+    });
+    expect(result.run.friendlyId).toBe("run_winner12345");
+    expect(result.isCached).toBe(true);
+    expect(result.notice).toBeUndefined();
+  });
+
+  // Regression: the synthetic result MUST carry a populated `run.id`
+  // derived from the friendlyId. Without it, the route handler's
+  // `saveRequestIdempotency(…, result.run.id)` stores `undefined` as
+  // the cached entity id, and on SDK retry Prisma's
+  // `findFirst({ where: { id: undefined } })` silently drops the
+  // predicate and returns an arbitrary TaskRun — a cross-tenant leak
+  // path. (See Devin review on PR #3753.)
+  it("populates run.id from friendlyId on the happy-accept path", async () => {
+    const { buffer } = fakeBuffer();
+    const result = await mollifyTrigger({
+      runFriendlyId: "run_pri456789ab",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      engineTriggerInput: { taskIdentifier: "t", payload: "{}" },
+      decision: { divert: true, reason: "per_env_rate", count: 1, threshold: 1 },
+      buffer,
+    });
+    expect(result.run.id).toBe(RunId.fromFriendlyId("run_pri456789ab"));
+    expect(result.run.id).toMatch(/^[a-z0-9]+$/); // non-undefined, non-empty
+  });
+
+  it("populates run.id from the WINNER's friendlyId on duplicate_idempotency", async () => {
+    const { buffer } = fakeBuffer({
+      kind: "duplicate_idempotency",
+      existingRunId: "run_winnerdup12",
+    });
+    const result = await mollifyTrigger({
+      runFriendlyId: "run_loser56789a",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      engineTriggerInput: { taskIdentifier: "t", payload: "{}" },
+      decision: { divert: true, reason: "per_env_rate", count: 1, threshold: 1 },
+      buffer,
+      idempotencyKey: "key",
+      taskIdentifier: "t",
+    });
+    expect(result.run.id).toBe(RunId.fromFriendlyId("run_winnerdup12"));
+    expect(result.run.id).not.toBe(RunId.fromFriendlyId("run_loser56789a"));
+  });
+
+  it("snapshot is round-trippable: payload field is parseable JSON of engineTriggerInput", async () => {
+    const { buffer, accept } = fakeBuffer();
+    const engineInput = { taskIdentifier: "t", payload: "{}", tags: ["a", "b"] };
+    await mollifyTrigger({
+      runFriendlyId: "run_xabcde12345",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      engineTriggerInput: engineInput,
+      decision: { divert: true, reason: "per_env_rate", count: 1, threshold: 1 },
+      buffer,
+    });
+
+    const callArg = accept.mock.calls[0][0] as { payload: string };
+    expect(JSON.parse(callArg.payload)).toEqual(engineInput);
+  });
+});
diff --git a/apps/webapp/test/mollifierMutateWithFallback.test.ts b/apps/webapp/test/mollifierMutateWithFallback.test.ts
new file mode 100644
index 00000000000..1102229f568
--- /dev/null
+++ b/apps/webapp/test/mollifierMutateWithFallback.test.ts
@@ -0,0 +1,481 @@
+import { describe, expect, it, vi } from "vitest";
+
+vi.mock("~/db.server", () => ({
+  prisma: { taskRun: { findFirst: vi.fn(async () => null) } }, // writer stub: always misses
+  $replica: { taskRun: { findFirst: vi.fn(async () => null) } }, // replica stub: always misses
+}));
+
+import { mutateWithFallback } from "~/v3/mollifier/mutateWithFallback.server";
+import type {
+  BufferEntry,
+  MollifierBuffer,
+  MutateSnapshotResult,
+} from "@trigger.dev/redis-worker";
+import type { TaskRun } from "@trigger.dev/database";
+
+type FindFirst = ReturnType<typeof vi.fn>; // a vitest mock function
+type PrismaStub = { taskRun: { findFirst: FindFirst } }; // only surface these tests stub
+
+function fakePrisma(rows: Array<TaskRun | null>): PrismaStub {
+  const findFirst = vi.fn();
+  rows.forEach((row) => findFirst.mockResolvedValueOnce(row)); // scripted reads, in order
+  findFirst.mockResolvedValue(null); // once the script runs out, every read misses
+  return { taskRun: { findFirst } };
+}
+
+// Entry served to the env pre-check: the getEntry read mutateWithFallback
+// issues before any buffer write (cross-env auth gate). Its envId/orgId
+// equal `baseInput`'s, so the gate passes and mutateSnapshot is reached.
+function preCheckEntry(): BufferEntry {
+  return {
+    envId: "env_a",
+    orgId: "org_1",
+    status: "QUEUED",
+    materialised: false,
+  } as unknown as BufferEntry;
+}
+
+function bufferReturning(result: MutateSnapshotResult): MollifierBuffer {
+  // mutateSnapshot resolves to `result`; getEntry serves the pre-check entry.
+  const getEntry = vi.fn(async () => preCheckEntry());
+  const mutateSnapshot = vi.fn(async () => result);
+  const stub = { mutateSnapshot, getEntry };
+  return stub as unknown as MollifierBuffer;
+}
+
+// Buffer stub for the "busy" path. mutateSnapshot always resolves to
+// "busy"; getEntry replays a script. Read #1 is the env-authorization
+// pre-check (issued before the busy-wait loop) and is served a QUEUED,
+// env-matching entry so the gate passes and the wait-loop is entered.
+// The wait-loop reads then consume `entries` in order, and the last
+// element keeps repeating once the script is exhausted.
+function bufferBusy(entries: Array<BufferEntry | null>): MollifierBuffer {
+  const getEntry = vi.fn();
+  const script: Array<BufferEntry | null> = [preCheckEntry(), ...entries];
+  script.forEach((scripted) => getEntry.mockResolvedValueOnce(scripted));
+  // Steady state after the script: the final scripted entry, or null
+  // when `entries` is empty.
+  const steadyState = entries.length ? entries[entries.length - 1] : null;
+  getEntry.mockResolvedValue(steadyState);
+  const mutateSnapshot = vi.fn(async () => "busy" as const);
+  const stub = { mutateSnapshot, getEntry };
+  return stub as unknown as MollifierBuffer;
+}
+
+// Poll result for an entry that is DRAINING and not yet materialised.
+// The busy-wait tests treat it as "still in flight": while it repeats,
+// the writer must not be read.
+const entryDraining = (): BufferEntry => {
+  const fields = { envId: "env_a", orgId: "org_1", status: "DRAINING", materialised: false };
+  return fields as unknown as BufferEntry;
+};
+// Poll result for an entry that is QUEUED again (e.g. requeued after a
+// retryable drain error). Not materialised, so the busy-wait tests
+// expect polling to continue rather than resolve.
+const entryQueued = (): BufferEntry => {
+  const fields = { envId: "env_a", orgId: "org_1", status: "QUEUED", materialised: false };
+  return fields as unknown as BufferEntry;
+};
+// Poll result for an entry flagged materialised while still DRAINING.
+// The busy-wait tests expect this state to end the wait and trigger
+// the single writer read.
+const entryMaterialised = (): BufferEntry => {
+  const fields = { envId: "env_a", orgId: "org_1", status: "DRAINING", materialised: true };
+  return fields as unknown as BufferEntry;
+};
+
+// Minimal TaskRun row for the PG stubs. The defaults line up with
+// `baseInput` (friendlyId "run_1", environment "env_a"); any key in
+// `overrides` replaces the default of the same name.
+const fakeRun = (overrides: Partial<TaskRun> = {}): TaskRun => {
+  const defaults = { id: "pg_id", friendlyId: "run_1", runtimeEnvironmentId: "env_a" };
+  return { ...defaults, ...overrides } as TaskRun;
+};
+
+const baseInput = {
+  runId: "run_1", // same id as fakeRun()'s default friendlyId
+  environmentId: "env_a", // equals envId on the env-matching buffer entries
+  organizationId: "org_1", // equals orgId on the env-matching buffer entries
+  bufferPatch: { type: "append_tags" as const, tags: ["x"] },
+};
+
+describe("mutateWithFallback", () => {
+  it("hits replica → calls pgMutation, returns pg outcome", async () => {
+    const row = fakeRun(); // the row the replica stub returns on its first read
+    const pgMutation = vi.fn(async () => "pg-response");
+    const synthesisedResponse = vi.fn(() => "snapshot-response");
+
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation,
+      synthesisedResponse,
+      prismaReplica: fakePrisma([row]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => bufferReturning("applied_to_snapshot"), // must not win over the PG hit
+    });
+
+    expect(result).toEqual({ kind: "pg", response: "pg-response" });
+    expect(pgMutation).toHaveBeenCalledWith(row);
+    expect(synthesisedResponse).not.toHaveBeenCalled();
+  });
+
+  it("replica miss + buffer applied_to_snapshot → synthesisedResponse", async () => {
+    const pgMutation = vi.fn(async () => "pg"); // spied only to prove it is never called
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation,
+      synthesisedResponse: () => "snap",
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => bufferReturning("applied_to_snapshot"),
+    });
+    expect(result).toEqual({ kind: "snapshot", response: "snap" });
+    expect(pgMutation).not.toHaveBeenCalled();
+  });
+
+  it("applied_to_snapshot forwards the pre-mutation entry to synthesisedResponse (lets callers dedup)", async () => {
+    // The tags route uses this to compute the same post-dedup count
+    // the PG path reports, without an extra Redis round-trip.
+    const synthesised = vi.fn(({ bufferEntry }: { bufferEntry: BufferEntry | null }) => {
+      // The builder is handed the entry, so it can shape its response
+      // from bufferEntry.payload (or any other field) of the prior state.
+      return bufferEntry ? "snap-with-entry" : "snap-without-entry";
+    });
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation: async () => "pg",
+      synthesisedResponse: synthesised,
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => bufferReturning("applied_to_snapshot"),
+    });
+    expect(result).toEqual({ kind: "snapshot", response: "snap-with-entry" });
+    expect(synthesised).toHaveBeenCalledTimes(1);
+    const ctx = synthesised.mock.calls[0]?.[0]; // argument of the only call
+    expect(ctx?.bufferEntry).not.toBeNull();
+    // bufferReturning()'s getEntry always serves preCheckEntry(), so the
+    // forwarded entry must carry its env/org ids.
+    expect(ctx?.bufferEntry?.envId).toBe("env_a");
+    expect(ctx?.bufferEntry?.orgId).toBe("org_1");
+  });
+
+  // Symmetric writer fallback in the `!buffer` short-circuit. Without
+  // it, mollifier-disabled deployments (or boot-time buffer init
+  // failures) would regress the pre-PR mutation routes: those read
+  // from the writer directly, so a fresh PG row was always visible.
+  // The replica offload introduced here moves the read to the lagging
+  // follower; if the buffer isn't available to disambiguate, we still
+  // probe the writer before returning 404.
+  it("replica miss + !buffer + writer hit → pgMutation (mollifier-disabled mode recovery)", async () => {
+    const row = fakeRun({ friendlyId: "run_1" }); // visible on the writer only
+    const pgMutation = vi.fn(async () => "pg-recovered-no-buffer");
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation,
+      synthesisedResponse: () => "snap",
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: fakePrisma([row]) as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => null, // mollifier disabled: no buffer to consult
+    });
+    expect(result).toEqual({ kind: "pg", response: "pg-recovered-no-buffer" });
+    expect(pgMutation).toHaveBeenCalledWith(row);
+  });
+
+  it("replica miss + !buffer + writer miss → not_found (genuine 404 in mollifier-disabled mode)", async () => {
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation: async () => "pg",
+      synthesisedResponse: () => "snap",
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: fakePrisma([null]) as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => null, // no buffer, and neither PG stub has the row
+    });
+    expect(result).toEqual({ kind: "not_found" });
+  });
+
+  it("replica miss + buffer not_found + writer miss → not_found", async () => {
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation: async () => "pg",
+      synthesisedResponse: () => "snap",
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: fakePrisma([null]) as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => bufferReturning("not_found"), // mutateSnapshot reports not_found
+    });
+    expect(result).toEqual({ kind: "not_found" });
+  });
+
+  it("replica miss + buffer not_found + writer hit → pgMutation (replica-lag recovery)", async () => {
+    const row = fakeRun({ friendlyId: "run_1" }); // on the writer, not yet on the replica
+    const pgMutation = vi.fn(async () => "pg-recovered");
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation,
+      synthesisedResponse: () => "snap",
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: fakePrisma([row]) as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => bufferReturning("not_found"),
+    });
+    expect(result).toEqual({ kind: "pg", response: "pg-recovered" });
+    expect(pgMutation).toHaveBeenCalledWith(row);
+  });
+
+  it("busy → watches buffer through DRAINING, materialises, hits primary exactly once", async () => {
+    const row = fakeRun();
+    const pgMutation = vi.fn(async () => "pg-after-wait");
+    // Writer is read ONCE, only after the buffer reports materialised.
+    const writer = fakePrisma([row]);
+    const buffer = bufferBusy([entryDraining(), entryDraining(), entryMaterialised()]);
+    let nowValue = 0; // virtual clock, advanced only by the injected sleep
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation,
+      synthesisedResponse: () => "snap",
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: writer as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => buffer,
+      sleep: async (ms) => {
+        nowValue += ms;
+      },
+      now: () => nowValue,
+      safetyNetMs: 2000,
+      pollStepMs: 20,
+      random: () => 0, // fixed value keeps the run deterministic
+    });
+    expect(result).toEqual({ kind: "pg", response: "pg-after-wait" });
+    expect(pgMutation).toHaveBeenCalledWith(row);
+    // One env-pre-check call + 3 busy-wait polls = 4 getEntry reads;
+    // primary read exactly once.
+    expect(buffer.getEntry).toHaveBeenCalledTimes(4);
+    expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(1);
+  });
+
+  it("busy → entry deleted by terminal fail, writer finds SYSTEM_FAILURE row → pgMutation", async () => {
+    const row = fakeRun();
+    const pgMutation = vi.fn(async () => "pg-failed-row");
+    const writer = fakePrisma([row]);
+    const buffer = bufferBusy([entryDraining(), null]); // 2nd poll: the entry is gone
+    let nowValue = 0; // virtual clock, advanced only by the injected sleep
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation,
+      synthesisedResponse: () => "snap",
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: writer as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => buffer,
+      sleep: async (ms) => {
+        nowValue += ms;
+      },
+      now: () => nowValue,
+      safetyNetMs: 2000,
+      pollStepMs: 20,
+      random: () => 0,
+    });
+    expect(result).toEqual({ kind: "pg", response: "pg-failed-row" });
+    expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(1);
+  });
+
+  it("busy → entry deleted but no PG row (terminal write failed) → not_found", async () => {
+    const buffer = bufferBusy([null]); // the first busy-wait poll already finds no entry
+    const writer = fakePrisma([null]); // ...and the writer has no row either
+    let nowValue = 0;
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation: async () => "pg",
+      synthesisedResponse: () => "snap",
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: writer as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => buffer,
+      sleep: async (ms) => {
+        nowValue += ms;
+      },
+      now: () => nowValue,
+      safetyNetMs: 2000,
+      pollStepMs: 20,
+      random: () => 0,
+    });
+    expect(result).toEqual({ kind: "not_found" });
+    expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(1);
+  });
+
+  it("busy → requeued (back to QUEUED) then materialises; doesn't resolve early", async () => {
+    const row = fakeRun();
+    const pgMutation = vi.fn(async () => "pg-after-requeue");
+    const writer = fakePrisma([row]);
+    // QUEUED (requeued after a retryable drain error) must NOT be treated
+    // as "done": the run hasn't reached PG. Only the later materialise does.
+    const buffer = bufferBusy([entryQueued(), entryDraining(), entryMaterialised()]);
+    let nowValue = 0; // virtual clock, advanced only by the injected sleep
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation,
+      synthesisedResponse: () => "snap",
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: writer as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => buffer,
+      sleep: async (ms) => {
+        nowValue += ms;
+      },
+      now: () => nowValue,
+      safetyNetMs: 2000,
+      pollStepMs: 20,
+      random: () => 0,
+    });
+    expect(result).toEqual({ kind: "pg", response: "pg-after-requeue" });
+    // One env-pre-check + 3 busy-wait polls (QUEUED, DRAINING, materialised).
+    expect(buffer.getEntry).toHaveBeenCalledTimes(4);
+    expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(1);
+  });
+
+  it("busy → drainer never resolves (stays DRAINING) → timed_out, primary never touched", async () => {
+    const writer = fakePrisma([]);
+    const buffer = bufferBusy([entryDraining()]); // single entry repeats: DRAINING forever
+    let nowValue = 0;
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation: async () => "pg",
+      synthesisedResponse: () => "snap",
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: writer as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => buffer,
+      sleep: async (ms) => {
+        nowValue += ms;
+      },
+      now: () => nowValue,
+      safetyNetMs: 100, // presumably the overall wait budget; kept small so it expires fast
+      pollStepMs: 20,
+      random: () => 0,
+    });
+    expect(result).toEqual({ kind: "timed_out" });
+    // The whole point: while the run is still draining we never read the primary.
+    expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(0);
+  });
+
+  it("abort signal during wait → timed_out without further polls", async () => {
+    const writer = fakePrisma([]);
+    const buffer = bufferBusy([entryDraining(), entryDraining()]);
+    const controller = new AbortController();
+    let nowValue = 0;
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation: async () => "pg",
+      synthesisedResponse: () => "snap",
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: writer as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => buffer,
+      sleep: async (ms) => {
+        nowValue += ms;
+        controller.abort(); // the abort lands during the very first sleep
+      },
+      now: () => nowValue,
+      safetyNetMs: 2000,
+      pollStepMs: 20,
+      random: () => 0,
+      abortSignal: controller.signal,
+    });
+    expect(result).toEqual({ kind: "timed_out" });
+    // One env-pre-check + one busy-wait poll before sleep+abort; primary untouched.
+    expect(buffer.getEntry).toHaveBeenCalledTimes(2);
+    expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(0);
+  });
+
+  it("replica miss + buffer limit_exceeded → rejected via rejectedResponse builder", async () => {
+    const pgMutation = vi.fn(async () => "pg");
+    const synthesisedResponse = vi.fn(() => "snap");
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation,
+      synthesisedResponse,
+      rejectedResponse: () => "too-many-tags", // its return value becomes the response
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => bufferReturning("limit_exceeded"),
+    });
+    expect(result).toEqual({ kind: "rejected", response: "too-many-tags" });
+    expect(pgMutation).not.toHaveBeenCalled();
+    expect(synthesisedResponse).not.toHaveBeenCalled();
+  });
+
+  it("buffer limit_exceeded without a rejectedResponse builder → throws (programmer error)", async () => {
+    await expect(
+      mutateWithFallback({
+        ...baseInput,
+        pgMutation: async () => "pg",
+        synthesisedResponse: () => "snap", // note: no rejectedResponse is supplied
+        prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+        prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma,
+        getBuffer: () => bufferReturning("limit_exceeded"),
+      })
+    ).rejects.toThrow(/limit_exceeded/); // the error message must name the result kind
+  });
+
+  it("replica miss + buffer entry belongs to a different env → not_found (cross-env auth gate)", async () => {
+    // Same flow as the applied_to_snapshot test, except the entry's
+    // envId doesn't match input.environmentId. mutateWithFallback must
+    // refuse the write and return not_found (without leaking that the
+    // runId exists in another env), and must NOT call mutateSnapshot.
+    const crossEnvEntry: BufferEntry = {
+      envId: "env_OTHER", // baseInput.environmentId is "env_a"
+      orgId: "org_1",
+      status: "QUEUED",
+      materialised: false,
+    } as unknown as BufferEntry;
+    const mutateSnapshot = vi.fn(async () => "applied_to_snapshot" as const);
+    const buffer = {
+      mutateSnapshot,
+      getEntry: vi.fn(async () => crossEnvEntry),
+    } as unknown as MollifierBuffer;
+
+    const pgMutation = vi.fn(async () => "pg");
+    const synthesisedResponse = vi.fn(() => "snap");
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation,
+      synthesisedResponse,
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => buffer,
+    });
+    expect(result).toEqual({ kind: "not_found" });
+    expect(mutateSnapshot).not.toHaveBeenCalled();
+    expect(pgMutation).not.toHaveBeenCalled();
+    expect(synthesisedResponse).not.toHaveBeenCalled();
+  });
+
+  it("replica miss + buffer entry belongs to a different org → not_found (cross-org auth gate)", async () => {
+    const crossOrgEntry: BufferEntry = {
+      envId: "env_a",
+      orgId: "org_OTHER", // baseInput.organizationId is "org_1"
+      status: "QUEUED",
+      materialised: false,
+    } as unknown as BufferEntry;
+    const mutateSnapshot = vi.fn(async () => "applied_to_snapshot" as const);
+    const buffer = {
+      mutateSnapshot,
+      getEntry: vi.fn(async () => crossOrgEntry),
+    } as unknown as MollifierBuffer;
+
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation: async () => "pg",
+      synthesisedResponse: () => "snap",
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => buffer,
+    });
+    expect(result).toEqual({ kind: "not_found" });
+    expect(mutateSnapshot).not.toHaveBeenCalled(); // the write is refused before it starts
+  });
+
+  it("buffer is null (mollifier disabled) → not_found after replica miss", async () => {
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation: async () => "pg",
+      synthesisedResponse: () => "snap",
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma, // empty script still resolves null
+      getBuffer: () => null, // same outcome as the "!buffer + writer miss" case
+    });
+    expect(result).toEqual({ kind: "not_found" });
+  });
+});
diff --git a/apps/webapp/test/mollifierReadFallback.test.ts b/apps/webapp/test/mollifierReadFallback.test.ts
new file mode 100644
index 00000000000..feef6a420ad
--- /dev/null
+++ b/apps/webapp/test/mollifierReadFallback.test.ts
@@ -0,0 +1,535 @@
+import { describe, expect, it, vi } from "vitest";
+
+vi.mock("~/db.server", () => ({
+  prisma: {}, // empty stub: no test in this file configures a PG client
+  $replica: {}, // empty stub, same reason
+}));
+
+import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server";
+import type { MollifierBuffer, BufferEntry } from "@trigger.dev/redis-worker";
+import { RunId } from "@trigger.dev/core/v3/isomorphic";
+
+function fakeBuffer(entry: BufferEntry | null): MollifierBuffer {
+  // Only getEntry is stubbed; every call resolves to the same `entry`.
+  const getEntry = vi.fn(async () => entry);
+  return { getEntry } as unknown as MollifierBuffer;
+}
+
+const NOW = new Date("2026-05-11T12:00:00Z"); // fixed createdAt shared by the fixtures
+
+describe("findRunByIdWithMollifierFallback", () => {
+  it("returns null when buffer is unavailable (mollifier disabled)", async () => {
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => null }, // no buffer instance at all
+    );
+    expect(result).toBeNull();
+  });
+
+  it("returns null when no buffer entry exists", async () => {
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(null) },
+    );
+    expect(result).toBeNull();
+  });
+
+  it("returns null when buffer entry envId does not match caller (auth mismatch)", async () => {
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_OTHER", // differs from the caller's environmentId below
+      orgId: "org_1",
+      payload: JSON.stringify({ taskIdentifier: "t" }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result).toBeNull();
+  });
+
+  it("returns null when buffer entry orgId does not match caller (auth mismatch)", async () => {
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_OTHER", // differs from the caller's organizationId below
+      payload: JSON.stringify({ taskIdentifier: "t" }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result).toBeNull();
+  });
+
+  it("returns synthesised QUEUED run when entry exists with matching auth", async () => {
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({ taskIdentifier: "my-task" }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result).not.toBeNull();
+    expect(result!.friendlyId).toBe("run_1");
+    expect(result!.status).toBe("QUEUED");
+    expect(result!.taskIdentifier).toBe("my-task");
+    expect(result!.createdAt).toEqual(NOW);
+  });
+
+  it("returns synthesised QUEUED for DRAINING (internal state same externally)", async () => {
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({ taskIdentifier: "t" }),
+      status: "DRAINING", // buffer-internal state; the synthesised run must still say QUEUED
+      attempts: 1,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result!.status).toBe("QUEUED");
+  });
+
+  it("returns FAILED state with structured error for FAILED entries", async () => {
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({ taskIdentifier: "t" }),
+      status: "FAILED",
+      attempts: 3,
+      createdAt: NOW,
+      lastError: { code: "VALIDATION", message: "task not found" }, // expected back as `error`
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result!.status).toBe("FAILED");
+    expect(result!.error).toEqual({ code: "VALIDATION", message: "task not found" });
+  });
+
+  it("extracts snapshot-derived fields from the buffered payload", async () => {
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({
+        taskIdentifier: "my-task",
+        payload: '{"foo":"bar"}',
+        payloadType: "application/json",
+        metadata: '{"customer":"acme"}',
+        metadataType: "application/json",
+        idempotencyKey: "client-abc",
+        idempotencyKeyOptions: { key: "client-abc", scope: "run" },
+        isTest: true,
+        depth: 2,
+        ttl: "1h",
+        tags: ["tag-a", "tag-b"],
+        // The engine.trigger snapshot stores the locked version string under
+        // `taskVersion` (see triggerTask.server.ts#buildEngineTriggerInput).
+        taskVersion: "20260511.1",
+        resumeParentOnCompletion: false,
+        parentTaskRunId: "run_parent",
+      }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result).not.toBeNull();
+    expect(result!.payloadType).toBe("application/json");
+    expect(result!.metadata).toBe('{"customer":"acme"}');
+    expect(result!.metadataType).toBe("application/json");
+    expect(result!.idempotencyKey).toBe("client-abc");
+    expect(result!.idempotencyKeyOptions).toEqual({ key: "client-abc", scope: "run" });
+    expect(result!.isTest).toBe(true);
+    expect(result!.depth).toBe(2);
+    expect(result!.ttl).toBe("1h");
+    expect(result!.tags).toEqual(["tag-a", "tag-b"]);
+    expect(result!.lockedToVersion).toBe("20260511.1");
+    expect(result!.resumeParentOnCompletion).toBe(false);
+    expect(result!.parentTaskRunId).toBe("run_parent");
+  });
+
+  it("extracts gate-allocated trace context from the snapshot", async () => {
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({
+        taskIdentifier: "t",
+        traceId: "trace_abc",
+        spanId: "span_xyz",
+        parentSpanId: "span_parent",
+      }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result!.traceId).toBe("trace_abc");
+    expect(result!.spanId).toBe("span_xyz");
+    expect(result!.parentSpanId).toBe("span_parent");
+  });
+
+  it("parses idempotencyKeyOptions in the canonical { key, scope } object shape (regression for the buffered-vs-PG API contract divergence)", async () => {
+    // Regression for the bug where `readFallback` parsed
+    // `idempotencyKeyOptions` via Array.isArray and rejected the
+    // canonical object shape. The SDK and Prisma both serialise this
+    // as `{ key, scope }`; the legacy array check would reject it,
+    // returning `undefined` here, which downstream demoted the API's
+    // `idempotencyKey` field to surface the *hash* (server-side
+    // generated) instead of the user-supplied key — diverging from
+    // how materialised runs render the same field, and creating a
+    // silent contract flip at the drainer-materialisation boundary.
+    // Pin the schema-parse path so the buffered response matches
+    // PG-resident behaviour from the moment the run is buffered.
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({
+        taskIdentifier: "t",
+        idempotencyKey: "<hashed>",
+        idempotencyKeyOptions: { key: "user-supplied-key", scope: "global" },
+      }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result).not.toBeNull();
+    expect(result!.idempotencyKeyOptions).toEqual({
+      key: "user-supplied-key",
+      scope: "global",
+    });
+  });
+
+  it("returns undefined for idempotencyKeyOptions when the snapshot carries a legacy/invalid shape", async () => {
+    // The Zod schema parse rejects:
+    //   - array shape (the legacy bug we just fixed)
+    //   - object without required fields
+    //   - missing field entirely
+    // and leaves the field `undefined` in each case; only the array
+    // shape is exercised here. Downstream `getUserProvidedIdempotencyKey`
+    // then falls back to the `idempotencyKey` field, matching how
+    // PG-resident runs handle malformed/missing options.
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({
+        taskIdentifier: "t",
+        idempotencyKey: "<hashed>",
+        // Legacy array shape: must NOT be accepted.
+        idempotencyKeyOptions: ["payload"],
+      }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result).not.toBeNull();
+    expect(result!.idempotencyKeyOptions).toBeUndefined();
+  });
+
+  it("defaults snapshot-derived fields to safe values when absent", async () => {
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({ taskIdentifier: "t" }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result!.payloadType).toBeUndefined();
+    expect(result!.metadata).toBeUndefined();
+    expect(result!.idempotencyKey).toBeUndefined();
+    expect(result!.isTest).toBe(false);
+    expect(result!.depth).toBe(0);
+    expect(result!.tags).toEqual([]);
+    expect(result!.resumeParentOnCompletion).toBe(false);
+    expect(result!.traceId).toBeUndefined();
+    expect(result!.spanId).toBeUndefined();
+  });
+
+  it("populates replay-relevant fields from the snapshot", async () => {
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({
+        taskIdentifier: "my-task",
+        environment: { id: "env_a" },
+        workerQueue: "default",
+        queue: "task/my-task",
+        concurrencyKey: "tenant-42",
+        machine: "medium-1x",
+        realtimeStreamsVersion: "v2",
+        seedMetadata: '{"k":"v"}',
+        seedMetadataType: "application/json",
+        tags: ["t1", "t2"],
+      }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result).not.toBeNull();
+    expect(result!.id).toBeTypeOf("string"); // exact value is not pinned, only its type
+    expect(result!.id.length).toBeGreaterThan(0);
+    expect(result!.engine).toBe("V2"); // no `engine` key in the snapshot above
+    expect(result!.runtimeEnvironmentId).toBe("env_a"); // snapshot key: environment.id
+    expect(result!.workerQueue).toBe("default");
+    expect(result!.queue).toBe("task/my-task");
+    expect(result!.concurrencyKey).toBe("tenant-42");
+    expect(result!.machinePreset).toBe("medium-1x"); // snapshot key: machine
+    expect(result!.realtimeStreamsVersion).toBe("v2");
+    expect(result!.seedMetadata).toBe('{"k":"v"}');
+    expect(result!.seedMetadataType).toBe("application/json");
+    expect(result!.runTags).toEqual(["t1", "t2"]); // snapshot key: tags
+  });
+
+  it("extracts batchId from the snapshot's nested batch object (engine.trigger shape)", async () => {
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({
+        taskIdentifier: "t",
+        // The engine.trigger input nests the batch as `{ id, index }`,
+        // where `id` is the batch's internal cuid (not a flat `batchId`).
+        batch: { id: "batch_internal_cuid", index: 3 },
+      }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result!.batchId).toBe("batch_internal_cuid");
+  });
+
+  it("leaves batchId undefined when the snapshot has no batch (non-batched run)", async () => {
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({ taskIdentifier: "t" }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result!.batchId).toBeUndefined();
+  });
+
+  it("treats invalid date strings as undefined and does not mis-classify status as CANCELED", async () => {
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({
+        taskIdentifier: "t",
+        cancelledAt: "not-a-date",
+        cancelReason: "user requested",
+        delayUntil: "also-not-a-date",
+      }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result).not.toBeNull();
+    expect(result!.status).toBe("QUEUED");
+    expect(result!.cancelledAt).toBeUndefined();
+    expect(result!.delayUntil).toBeUndefined();
+  });
+
+  it("parses valid ISO date strings on cancelledAt and delayUntil", async () => {
+    const cancelledAtIso = "2026-05-11T13:00:00.000Z";
+    const delayUntilIso = "2026-05-11T14:00:00.000Z";
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({
+        taskIdentifier: "t",
+        cancelledAt: cancelledAtIso,
+        cancelReason: "user requested",
+        delayUntil: delayUntilIso,
+      }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result!.status).toBe("CANCELED");
+    expect(result!.cancelledAt).toEqual(new Date(cancelledAtIso));
+    expect(result!.cancelReason).toBe("user requested");
+    expect(result!.delayUntil).toEqual(new Date(delayUntilIso));
+  });
+
+  it("falls back to entry.envId for runtimeEnvironmentId when snapshot lacks environment.id", async () => {
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({ taskIdentifier: "t" }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result!.runtimeEnvironmentId).toBe("env_a");
+    expect(result!.workerQueue).toBeUndefined();
+    expect(result!.queue).toBeUndefined();
+  });
+
+  it("extracts batchId from the nested snapshot.batch object (not the flat key)", async () => {
+    // Regression (field-name mismatch flagged by Devin): per that report,
+    // #buildEngineTriggerInput writes batch info as `batch: { id, index }`,
+    // never as a flat `batchId`, so readFallback must read the nested key
+    // or SyntheticRun.batchId stays undefined for buffered runs.
+    // NOTE(review): overlaps the earlier "engine.trigger shape" batchId test.
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({
+        taskIdentifier: "t",
+        batch: { id: "batch_internal_xyz", index: 3 },
+      }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result!.batchId).toBe("batch_internal_xyz");
+  });
+
+  it("does NOT read a flat `batchId` key — only the nested batch.id", async () => {
+    // Belt-and-braces: a payload with the wrong-shaped flat key should
+    // resolve to undefined, not silently pick up the bogus value.
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({
+        taskIdentifier: "t",
+        batchId: "should-be-ignored",
+      }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result!.batchId).toBeUndefined();
+  });
+
+  it("converts internal parent/root IDs in the snapshot to friendlyIds", async () => {
+    // Regression for Devin's "structurally unfillable" finding: the
+    // snapshot only carries INTERNAL parent/root ids (engine.trigger
+    // consumes the internal shape), while SyntheticRun exposes friendlyIds.
+    // The fallback must convert them (deterministically, via
+    // RunId.toFriendlyId) so consumers needn't special-case the buffered
+    // path. `RunId.generate()` gives a matching internal+friendly pair:
+    // we snapshot `.id` and expect `.friendlyId` back on the result.
+    const parent = RunId.generate();
+    const root = RunId.generate();
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({
+        taskIdentifier: "t",
+        parentTaskRunId: parent.id,
+        rootTaskRunId: root.id,
+      }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result!.parentTaskRunFriendlyId).toBe(parent.friendlyId);
+    expect(result!.rootTaskRunFriendlyId).toBe(root.friendlyId);
+  });
+
+  it("leaves parent/root friendlyIds undefined when the snapshot carries no parent context", async () => {
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({ taskIdentifier: "t" }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const result = await findRunByIdWithMollifierFallback(
+      { runId: "run_1", environmentId: "env_a", organizationId: "org_1" },
+      { getBuffer: () => fakeBuffer(entry) },
+    );
+    expect(result!.parentTaskRunFriendlyId).toBeUndefined();
+    expect(result!.rootTaskRunFriendlyId).toBeUndefined();
+  });
+});
diff --git a/apps/webapp/test/mollifierReplayPayloadShape.test.ts b/apps/webapp/test/mollifierReplayPayloadShape.test.ts
new file mode 100644
index 00000000000..d2f098d7086
--- /dev/null
+++ b/apps/webapp/test/mollifierReplayPayloadShape.test.ts
@@ -0,0 +1,99 @@
+import { describe, expect, it } from "vitest";
+import {
+  serialiseMollifierSnapshot,
+  deserialiseMollifierSnapshot,
+} from "~/v3/mollifier/mollifierSnapshot.server";
+import { prettyPrintPacket } from "@trigger.dev/core/v3";
+
+// Regression test for the Devin "Buffered replay loader passes
+// non-string payload to prettyPrintPacket" finding on PR #3757.
+//
+// Devin's claim is that the snapshot codec double-unwraps the
+// payload: `engine.trigger` carries it pre-serialised, then the
+// snapshot serialise/deserialise round-trip would JSON.parse it a
+// second time, leaving `buffered.payload` as a *parsed* object —
+// which `prettyPrintPacket` then mis-handles, producing malformed
+// payload display in the Replay dialog.
+//
+// This test pins the actual contract: the snapshot codec is a single
+// JSON.stringify / JSON.parse layer. The payload field stored on the
+// engine trigger input is a string (the SDK-serialised payload from
+// `payloadPacket.data`). A string round-trips through
+// JSON.stringify/JSON.parse unchanged — it does NOT get a second
+// unwrap. Therefore `buffered.payload` reaches the replay loader as
+// a string, exactly the shape `prettyPrintPacket` expects.
+describe("mollifier replay payload shape", () => {
+  it("serialise/deserialise preserves the payload as a string", () => {
+    // Shape mirrors what `triggerTask.server.ts:#buildEngineTriggerInput`
+    // produces — `payload` is `args.payloadPacket.data`, already a JSON
+    // string from the SDK's packet serialisation.
+    const triggerInput = {
+      friendlyId: "run_x",
+      taskIdentifier: "hello-world",
+      payload: JSON.stringify({ hello: "world", n: 42 }),
+      payloadType: "application/json",
+      traceId: "trace_x",
+      spanId: "span_x",
+    };
+
+    const serialised = serialiseMollifierSnapshot(triggerInput);
+    const roundTripped = deserialiseMollifierSnapshot(serialised);
+
+    expect(typeof roundTripped.payload).toBe("string");
+    expect(roundTripped.payload).toBe(triggerInput.payload);
+    expect(roundTripped.payloadType).toBe("application/json");
+  });
+
+  it("prettyPrintPacket on the round-tripped payload produces the expected pretty JSON", async () => {
+    const original = { hello: "world", nested: { count: 3 } };
+    const triggerInput = {
+      payload: JSON.stringify(original),
+      payloadType: "application/json",
+    };
+
+    const roundTripped = deserialiseMollifierSnapshot(
+      serialiseMollifierSnapshot(triggerInput),
+    );
+
+    // This is exactly the call the replay loader makes:
+    //   prettyPrintPacket(run.payload, run.payloadType)
+    // If Devin were right, the payload here would be a parsed object
+    // and prettyPrintPacket would either double-encode or skip
+    // formatting. In reality it's a string, so we get correct pretty
+    // JSON.
+    const pretty = await prettyPrintPacket(
+      roundTripped.payload,
+      roundTripped.payloadType as string,
+    );
+
+    expect(pretty).toBe(JSON.stringify(original, null, 2));
+  });
+
+  it("string payload survives the buffer-codec round-trip even with snapshot fields around it", () => {
+    // Replicate the realistic snapshot shape (the engine.trigger input
+    // has many sibling fields). Confirms there's no field-shape
+    // interaction that would mutate payload.
+    const triggerInput = {
+      friendlyId: "run_x",
+      environment: {
+        id: "env",
+        type: "DEVELOPMENT",
+        project: { id: "p" },
+        organization: { id: "o" },
+      },
+      taskIdentifier: "t",
+      payload: '{"a":1}',
+      payloadType: "application/json",
+      context: { run: { id: "x" } },
+      traceContext: { traceparent: "00-...-..." },
+      traceId: "abc",
+      spanId: "def",
+      tags: ["one", "two"],
+      depth: 2,
+      isTest: false,
+    };
+    const out = deserialiseMollifierSnapshot(serialiseMollifierSnapshot(triggerInput));
+    expect(typeof out.payload).toBe("string");
+    expect(out.payload).toBe('{"a":1}');
+  });
+});
diff --git a/apps/webapp/test/mollifierResetIdempotencyKey.test.ts b/apps/webapp/test/mollifierResetIdempotencyKey.test.ts
new file mode 100644
index 00000000000..4909087d70c
--- /dev/null
+++ b/apps/webapp/test/mollifierResetIdempotencyKey.test.ts
@@ -0,0 +1,158 @@
+import { describe, expect, it, vi } from "vitest";
+
+// Mock the db module so the BaseService default prisma doesn't try to
+// open a real connection at module load. Each test wires its own
+// prisma stub.
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+// Prevent the runEngine singleton from instantiating and spinning up
+// PG/Redis workers at module load — without this CI fails with
+// unhandled `PrismaClientInitializationError`s even though the
+// assertions all pass (see `mollifierDrainerWorker.test.ts`).
+vi.mock("~/v3/runEngine.server", () => ({ engine: {} }));
+
+// Mutable mock state, read lazily by the mocked getter below, so we can
+// swap the buffer per test without re-importing modules.
+const bufferMock: { current: unknown } = { current: null };
+vi.mock("~/v3/mollifier/mollifierBuffer.server", () => ({
+  getMollifierBuffer: () => bufferMock.current,
+}));
+
+import { ResetIdempotencyKeyService } from "~/v3/services/resetIdempotencyKey.server";
+import { ServiceValidationError } from "~/v3/services/baseService.server";
+
+type FakePrisma = {
+  taskRun: { updateMany: (...args: unknown[]) => Promise<{ count: number }> };
+};
+
+// Prisma stub: every `taskRun.updateMany` call resolves to
+// `{ count: pgCount }`, i.e. PG reports `pgCount` rows updated.
+function makePrisma(pgCount: number): FakePrisma {
+  const updateMany = vi.fn(() => Promise.resolve({ count: pgCount }));
+  const taskRun = { updateMany };
+  return { taskRun };
+}
+
+const env = {
+  id: "env_a",
+  organizationId: "org_1",
+} as unknown as Parameters<ResetIdempotencyKeyService["call"]>[2];
+
+describe("ResetIdempotencyKeyService — buffer-outage handling", () => {
+  it("returns success when PG cleared >=1 run, even if the buffer reset throws", async () => {
+    bufferMock.current = {
+      resetIdempotency: vi.fn(async () => {
+        throw new Error("ECONNREFUSED");
+      }),
+    };
+    const prisma = makePrisma(1);
+    const service = new ResetIdempotencyKeyService(prisma as never);
+
+    const result = await service.call("ikey", "task", env);
+    expect(result).toEqual({ id: "ikey" });
+  });
+
+  it("returns success when PG cleared nothing but the buffer cleared a run", async () => {
+    bufferMock.current = {
+      resetIdempotency: vi.fn(async () => ({ clearedRunId: "run_x" })),
+    };
+    const prisma = makePrisma(0);
+    const service = new ResetIdempotencyKeyService(prisma as never);
+
+    const result = await service.call("ikey", "task", env);
+    expect(result).toEqual({ id: "ikey" });
+  });
+
+  it("404s when PG and buffer both legitimately report 'nothing to clear'", async () => {
+    bufferMock.current = {
+      resetIdempotency: vi.fn(async () => ({ clearedRunId: null })),
+    };
+    const prisma = makePrisma(0);
+    const service = new ResetIdempotencyKeyService(prisma as never);
+
+    await expect(service.call("ikey", "task", env)).rejects.toMatchObject({
+      status: 404,
+    });
+  });
+
+  // Regression for the silent-not-found hazard CodeRabbit flagged: if PG
+  // sees nothing AND we can't read the buffer (Redis outage), the
+  // previous behaviour was to 404 — masking a partial outage and
+  // leaving a buffered key effectively un-reset while the caller was
+  // told "doesn't exist." We now surface 503 so the caller retries.
+  it("503s when PG cleared nothing AND the buffer reset failed (partial outage)", async () => {
+    bufferMock.current = {
+      resetIdempotency: vi.fn(async () => {
+        throw new Error("ECONNREFUSED");
+      }),
+    };
+    const prisma = makePrisma(0);
+    const service = new ResetIdempotencyKeyService(prisma as never);
+
+    const error = await service.call("ikey", "task", env).then(
+      () => null,
+      (err) => err,
+    );
+    expect(error).toBeInstanceOf(ServiceValidationError);
+    expect(error.status).toBe(503);
+    expect(error.message).toMatch(/retry/i);
+  });
+
+  it("404s normally when buffer is null (mollifier disabled) and PG cleared nothing", async () => {
+    bufferMock.current = null;
+    const prisma = makePrisma(0);
+    const service = new ResetIdempotencyKeyService(prisma as never);
+
+    await expect(service.call("ikey", "task", env)).rejects.toMatchObject({
+      status: 404,
+    });
+  });
+
+  // Regression for the PG↔buffer handoff race CodeRabbit flagged on PR #3756.
+  //
+  // Sequence the test models (deterministic, by setup):
+  //   1. ResetIdempotencyKeyService.call begins while the run is still
+  //      buffered. The initial pg.updateMany sees no PG row → count=0.
+  //   2. Between that update and the buffer reset, the drainer materialises
+  //      the buffered run into PG (engine.trigger writes the row with the
+  //      original idempotencyKey intact) AND `buffer.ack` clears the
+  //      associated Redis idempotency lookup — that's part of ack's
+  //      atomic contract (see `buffer.ts:493` comment).
+  //   3. buffer.resetIdempotency runs after ack → returns
+  //      `{ clearedRunId: null }` because the lookup is gone.
+  //
+  // Without the handoff re-check, totalCount = 0 + 0 = 0 → the service
+  // throws 404 for a key that genuinely still exists on the now-
+  // materialised PG row. The customer's reset is silently lost.
+  //
+  // Correct behaviour: the service must discover the materialised row
+  // and clear its key, returning success. This test pins that contract.
+  it("succeeds when a buffered run materialises into PG between the initial pgUpdate and the buffer reset (handoff race)", async () => {
+    let updateManyCalls = 0;
+    const prisma: FakePrisma = {
+      taskRun: {
+        // First call: pre-materialisation, no PG row yet → 0.
+        // Second call (the fix's re-check after both surfaces report
+        // nothing): post-materialisation, drainer wrote the row → 1.
+        updateMany: vi.fn(async () => {
+          updateManyCalls += 1;
+          return updateManyCalls === 1 ? { count: 0 } : { count: 1 };
+        }),
+      },
+    };
+    const resetIdempotency = vi.fn(async () => ({ clearedRunId: null as string | null }));
+    bufferMock.current = { resetIdempotency };
+
+    const service = new ResetIdempotencyKeyService(prisma as never);
+
+    const result = await service.call("ikey", "task", env);
+    expect(result).toEqual({ id: "ikey" });
+
+    // Load-bearing pieces of the fix:
+    //   - The buffer path was consulted (we didn't bypass the normal
+    //     handoff window check), and
+    //   - A second pg.updateMany fired AFTER the buffer's null result,
+    //     catching the now-materialised row.
+    expect(resetIdempotency).toHaveBeenCalledOnce();
+    expect(updateManyCalls).toBe(2);
+  });
+});
diff --git a/apps/webapp/test/mollifierResolveRunForMutation.test.ts b/apps/webapp/test/mollifierResolveRunForMutation.test.ts
new file mode 100644
index 00000000000..b50d8ad9400
--- /dev/null
+++ b/apps/webapp/test/mollifierResolveRunForMutation.test.ts
@@ -0,0 +1,229 @@
+import { describe, expect, it, vi } from "vitest";
+
+vi.mock("~/db.server", () => ({
+  // Both default clients return null. Individual tests inject their
+  // own fakes via `deps` when they want non-default behaviour.
+  prisma: { taskRun: { findFirst: vi.fn(async () => null) } },
+  $replica: { taskRun: { findFirst: vi.fn(async () => null) } },
+}));
+
+import { resolveRunForMutation } from "~/v3/mollifier/resolveRunForMutation.server";
+import type { BufferEntry, MollifierBuffer } from "@trigger.dev/redis-worker";
+
+// Regression coverage for the cancel-route 404 bug (commit b490afe23).
+// Before the fix the route had `findResource: async () => null`, which
+// caused the route builder to 404 every cancel — including for valid
+// PG-row runs — BEFORE the action handler could run. The helper
+// resolveRunForMutation has to return a non-null discriminated value
+// whenever the run exists in either store.
+
+const NOW = new Date("2026-05-21T10:00:00Z");
+
+function fakeReplica(row: { friendlyId: string } | null) {
+  return { taskRun: { findFirst: vi.fn(() => Promise.resolve(row)) } }; // read-replica stub
+}
+function fakeWriter(row: { friendlyId: string } | null) {
+  return { taskRun: { findFirst: vi.fn(() => Promise.resolve(row)) } }; // writer (primary) stub
+}
+
+function fakeBuffer(entry: BufferEntry | null): MollifierBuffer {
+  // Only `getEntry` is stubbed, hence the cast through `unknown`.
+  const getEntry = vi.fn(() => Promise.resolve(entry));
+  return { getEntry } as unknown as MollifierBuffer;
+}
+
+const baseInput = {
+  runParam: "run_1",
+  environmentId: "env_a",
+  organizationId: "org_1",
+};
+
+describe("resolveRunForMutation", () => {
+  it("returns { source: 'pg' } when the PG row exists", async () => {
+    const result = await resolveRunForMutation({
+      ...baseInput,
+      deps: {
+        prismaReplica: fakeReplica({ friendlyId: "run_1" }),
+        getBuffer: () => null,
+      },
+    });
+    expect(result).toEqual({ source: "pg", friendlyId: "run_1" });
+  });
+
+  it("returns { source: 'buffer' } when PG misses and the buffer entry matches env+org", async () => {
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a", // matches baseInput.environmentId
+      orgId: "org_1", // matches baseInput.organizationId
+      payload: "{}",
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+      createdAtMicros: NOW.getTime() * 1000, // same instant as createdAt, in microseconds
+      materialised: false,
+      idempotencyLookupKey: "",
+      metadataVersion: 0,
+    };
+    const result = await resolveRunForMutation({
+      ...baseInput,
+      deps: {
+        prismaReplica: fakeReplica(null), // replica miss
+        getBuffer: () => fakeBuffer(entry),
+      },
+    });
+    expect(result).toEqual({ source: "buffer", friendlyId: "run_1" });
+  });
+
+  it("returns null when PG misses and the buffer entry env doesn't match", async () => {
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_OTHER", // differs from baseInput.environmentId
+      orgId: "org_1",
+      payload: "{}",
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+      createdAtMicros: NOW.getTime() * 1000, // same instant as createdAt, in microseconds
+      materialised: false,
+      idempotencyLookupKey: "",
+      metadataVersion: 0,
+    };
+    const result = await resolveRunForMutation({
+      ...baseInput,
+      deps: {
+        prismaReplica: fakeReplica(null), // replica miss
+        getBuffer: () => fakeBuffer(entry),
+      },
+    });
+    expect(result).toBeNull();
+  });
+
+  it("returns null when PG misses and the buffer entry org doesn't match", async () => {
+    const entry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_OTHER", // differs from baseInput.organizationId
+      payload: "{}",
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+      createdAtMicros: NOW.getTime() * 1000, // same instant as createdAt, in microseconds
+      materialised: false,
+      idempotencyLookupKey: "",
+      metadataVersion: 0,
+    };
+    const result = await resolveRunForMutation({
+      ...baseInput,
+      deps: {
+        prismaReplica: fakeReplica(null), // replica miss
+        getBuffer: () => fakeBuffer(entry),
+      },
+    });
+    expect(result).toBeNull();
+  });
+
+  it("returns null when both PG and buffer miss", async () => {
+    const result = await resolveRunForMutation({
+      ...baseInput,
+      deps: {
+        prismaReplica: fakeReplica(null),
+        getBuffer: () => fakeBuffer(null),
+      },
+    });
+    expect(result).toBeNull();
+  });
+
+  it("returns null when buffer is unavailable (mollifier disabled) and PG misses", async () => {
+    const result = await resolveRunForMutation({
+      ...baseInput,
+      deps: {
+        prismaReplica: fakeReplica(null),
+        getBuffer: () => null,
+      },
+    });
+    expect(result).toBeNull();
+  });
+
+  it("PG-hit short-circuits before consulting the buffer", async () => {
+    const buffer = fakeBuffer(null);
+    const result = await resolveRunForMutation({
+      ...baseInput,
+      deps: {
+        prismaReplica: fakeReplica({ friendlyId: "run_1" }),
+        getBuffer: () => buffer,
+      },
+    });
+    expect(result?.source).toBe("pg");
+    expect(buffer.getEntry).not.toHaveBeenCalled();
+  });
+
+  // Regressions for the degraded-mode false-404 CodeRabbit flagged.
+  //
+  // Pre-PR the mutation routes read from the writer directly, so any
+  // PG row was visible regardless of replication lag. This helper
+  // moved the read to the replica for offload purposes. The route
+  // builder treats a null return as a hard 404 BEFORE the action
+  // handler runs, so any path where replica misses and the writer has
+  // the row needs to be reachable here — otherwise mutateWithFallback's
+  // own writer recovery never gets a chance to fire.
+  it("falls back to the writer when both replica and buffer miss, returning the writer row as 'pg' source", async () => {
+    const result = await resolveRunForMutation({
+      ...baseInput,
+      deps: {
+        prismaReplica: fakeReplica(null),
+        prismaWriter: fakeWriter({ friendlyId: "run_1" }),
+        getBuffer: () => fakeBuffer(null),
+      },
+    });
+    expect(result?.source).toBe("pg");
+    expect(result?.friendlyId).toBe("run_1");
+  });
+
+  it("falls back to the writer when the buffer is unavailable (mollifier disabled) and replica misses", async () => {
+    const result = await resolveRunForMutation({
+      ...baseInput,
+      deps: {
+        prismaReplica: fakeReplica(null),
+        prismaWriter: fakeWriter({ friendlyId: "run_1" }),
+        getBuffer: () => null,
+      },
+    });
+    expect(result?.source).toBe("pg");
+    expect(result?.friendlyId).toBe("run_1");
+  });
+
+  it("still returns null when replica, buffer, AND writer all miss (legitimate not-found)", async () => {
+    const writer = fakeWriter(null);
+    const result = await resolveRunForMutation({
+      ...baseInput,
+      deps: {
+        prismaReplica: fakeReplica(null),
+        prismaWriter: writer,
+        getBuffer: () => fakeBuffer(null),
+      },
+    });
+    expect(result).toBeNull();
+    // The writer was probed, and only once: the fallback fires a single
+    // time on the miss path rather than piling up retries.
+    expect(writer.taskRun.findFirst).toHaveBeenCalledOnce();
+  });
+
+  it("PG-hit short-circuits before consulting either the buffer OR the writer", async () => {
+    const buffer = fakeBuffer(null);
+    const writer = fakeWriter({ friendlyId: "should-not-be-read" });
+    const result = await resolveRunForMutation({
+      ...baseInput,
+      deps: {
+        prismaReplica: fakeReplica({ friendlyId: "run_1" }),
+        prismaWriter: writer,
+        getBuffer: () => buffer,
+      },
+    });
+    expect(result?.source).toBe("pg");
+    expect(result?.friendlyId).toBe("run_1");
+    expect(buffer.getEntry).not.toHaveBeenCalled();
+    // Writer must NOT fire when the replica already had the row —
+    // otherwise we'd negate the whole replica-offload purpose.
+    expect(writer.taskRun.findFirst).not.toHaveBeenCalled();
+  });
+});
diff --git a/apps/webapp/test/mollifierStaleSweep.test.ts b/apps/webapp/test/mollifierStaleSweep.test.ts
new file mode 100644
index 00000000000..94928611119
--- /dev/null
+++ b/apps/webapp/test/mollifierStaleSweep.test.ts
@@ -0,0 +1,976 @@
+import { describe, expect, it, vi } from "vitest";
+import { redisTest } from "@internal/testcontainers";
+import { MollifierBuffer } from "@trigger.dev/redis-worker";
+
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+
+import {
+  runStaleSweepOnce,
+  startStaleSweepInterval,
+} from "~/v3/mollifier/mollifierStaleSweep.server";
+import { MollifierStaleSweepState } from "~/v3/mollifier/mollifierStaleSweepState.server";
+
+const SNAPSHOT = {
+  taskIdentifier: "hello-world",
+  payload: '{"x":1}',
+  payloadType: "application/json",
+  traceContext: {},
+};
+
+// In-memory stand-in for MollifierStaleSweepState, for unit tests that run
+// without a Redis container (the testcontainer tests use the real class).
+// Non-positive counts delete the env's entry; close() is a no-op.
+function makeFakeState() {
+  let cursor = 0;
+  let orgList: string[] = [];
+  const counts = new Map<string, number>();
+  let visited = new Set<string>();
+  return {
+    readCursor: async () => cursor,
+    writeCursor: async (v: number) => {
+      cursor = v;
+    },
+    rebuildOrgList: async (orgs: string[]) => {
+      orgList = [...orgs];
+    },
+    readOrgListSlice: async (start: number, count: number) => ({
+      orgs: orgList.slice(start, start + count),
+      total: orgList.length,
+    }),
+    setEnvStaleCount: async (envId: string, count: number) => {
+      if (count > 0) counts.set(envId, count);
+      else counts.delete(envId);
+    },
+    readAllEnvStaleCounts: async () => new Map(counts),
+    markEnvVisited: async (envId: string) => {
+      visited.add(envId);
+    },
+    reconcileVisited: async () => {
+      for (const envId of [...counts.keys()]) {
+        if (!visited.has(envId)) counts.delete(envId);
+      }
+      visited = new Set();
+    },
+    clearAll: async () => {
+      cursor = 0;
+      orgList = [];
+      counts.clear();
+      visited = new Set();
+    },
+    close: async () => {},
+  };
+}
+
+function spyDeps() {
+  // The stale-entry metric carries no `envId` label (high-cardinality), so
+  // its spy is a bare call counter, read live through the getter below.
+  // Per-env detail lives on the structured warn log and the snapshot map.
+  let staleEntryCount = 0;
+  const snapshots: Array<Map<string, number>> = [];
+  const warnings: Array<{ message: string; fields: Record<string, unknown> }> = [];
+  return {
+    get staleEntryCount() {
+      return staleEntryCount;
+    },
+    snapshots,
+    warnings,
+    deps: {
+      recordStaleEntry: () => {
+        staleEntryCount += 1;
+      },
+      reportStaleEntrySnapshot: (snapshot: Map<string, number>) => {
+        // Store a copy so post-sweep assertions see exactly what was
+        // reported at this call, not whatever later passes mutate the
+        // source map into.
+        snapshots.push(new Map(snapshot));
+      },
+      logger: {
+        warn: (message: string, fields: Record<string, unknown>) => {
+          warnings.push({ message, fields });
+        },
+      },
+    },
+  };
+}
+
+describe("runStaleSweepOnce — unit", () => {
+  it("returns zeros when the buffer is null", async () => {
+    // Mirrors the prod gate: if TRIGGER_MOLLIFIER_ENABLED=0 the buffer
+    // singleton is null and the sweep is a no-op. We don't want it to
+    // emit a metric (or throw) just because mollifier is disabled.
+    const spies = spyDeps();
+    const result = await runStaleSweepOnce(
+      { staleThresholdMs: 1000 },
+      { ...spies.deps, getBuffer: () => null, state: makeFakeState() },
+    );
+    expect(result).toEqual({
+      orgsScanned: 0,
+      envsScanned: 0,
+      entriesScanned: 0,
+      staleCount: 0,
+    });
+    expect(spies.staleEntryCount).toBe(0);
+    expect(spies.warnings).toEqual([]);
+    const snapshots = spies.snapshots;
+    // An empty snapshot is still reported so any previously-paging env
+    // (from a prior sweep before mollifier was disabled) clears.
+    expect(snapshots).toHaveLength(1);
+    expect(snapshots[0].size).toBe(0);
+  });
+
+  it("surfaces readOrgListSlice failures and leaves durable state untouched", async () => {
+    // Regression: previously a Redis read failure inside
+    // `readOrgListSlice` returned `{ orgs: [], total: 0 }` and the
+    // sweep treated that as a clean empty cycle — writing cursor=0,
+    // reconciling visited envs against the empty result, and CLEARING
+    // the stale-entry gauge. That silenced the very alerts the sweep
+    // exists to raise. The fix re-throws; the caller (this function
+    // and the interval wrapper above it) must NOT mutate cursor or
+    // counts when readOrgListSlice fails.
+    const state = makeFakeState();
+    // Seed durable state so we can assert it isn't touched on failure.
+    await state.writeCursor(42);
+    await state.setEnvStaleCount("env_seed", 7);
+    await state.rebuildOrgList(["org_pre"]);
+    // Inject a failure on the very next slice read.
+    const readErr = new Error("Redis read failed");
+    let readAttempts = 0;
+    const failingState = {
+      ...state,
+      readOrgListSlice: async (start: number, count: number) => {
+        readAttempts += 1;
+        throw readErr;
+      },
+    };
+    const spies = spyDeps();
+    const buffer = {
+      listOrgs: async () => ["org_pre"],
+      listEnvsForOrg: async () => [],
+      listEntriesForEnv: async () => [],
+    } as unknown as MollifierBuffer;
+
+    await expect(
+      runStaleSweepOnce(
+        { staleThresholdMs: 60_000, maxOrgsPerPass: 10 },
+        {
+          ...spies.deps,
+          state: failingState,
+          getBuffer: () => buffer,
+          now: () => Date.now(),
+        },
+      ),
+    ).rejects.toThrow("Redis read failed");
+
+    expect(readAttempts).toBe(1);
+    // Cursor untouched (still the seeded 42, not reset to 0).
+    expect(await state.readCursor()).toBe(42);
+    // Counts hash untouched — the seeded env's count survives the
+    // failed cycle so the gauge keeps reporting its last-known value.
+    const counts = await state.readAllEnvStaleCounts();
+    expect(counts.get("env_seed")).toBe(7);
+    // No snapshot was reported because the function threw before
+    // reaching reportStaleEntrySnapshot.
+    expect(spies.snapshots).toHaveLength(0);
+    expect(spies.staleEntryCount).toBe(0);
+  });
+});
+
+describe("runStaleSweepOnce — testcontainers", () => {
+  redisTest(
+    "flags every entry whose dwell exceeds the stale threshold",
+    { timeout: 20_000 },
+    async ({ redisOptions }) => {
+      const buffer = new MollifierBuffer({ redisOptions });
+      try {
+        // Three entries across two envs in the same org. The sweep below
+        // runs against a `now` advanced by 5 minutes, so all three have
+        // dwell ~5min and ALL THREE are stale against a 1-minute
+        // threshold — there is no "fresh" entry in this scenario. The
+        // assertions below pin the all-three-stale shape.
+        await buffer.accept({
+          runId: "run_stale_a",
+          envId: "env_a",
+          orgId: "org_1",
+          payload: JSON.stringify(SNAPSHOT),
+        });
+        await buffer.accept({
+          runId: "run_stale_b",
+          envId: "env_b",
+          orgId: "org_1",
+          payload: JSON.stringify(SNAPSHOT),
+        });
+        await buffer.accept({
+          runId: "run_stale_c",
+          envId: "env_a",
+          orgId: "org_1",
+          payload: JSON.stringify(SNAPSHOT),
+        });
+        // Give the sweep a `now` 5 minutes ahead of the real clock — way
+        // past the threshold below. The system clock itself is untouched;
+        // the `now` deps seam avoids actually waiting in real time.
+        const futureNow = Date.now() + 5 * 60 * 1000;
+
+        const spies = spyDeps();
+        const state = new MollifierStaleSweepState({ redisOptions });
+        try {
+          const result = await runStaleSweepOnce(
+            { staleThresholdMs: 60 * 1000 },
+            {
+              ...spies.deps,
+              getBuffer: () => buffer,
+              state,
+              now: () => futureNow,
+            },
+          );
+
+          expect(result.envsScanned).toBe(2);
+          expect(result.entriesScanned).toBe(3);
+          expect(result.staleCount).toBe(3);
+          // All three entries exceed the threshold; each emits one
+          // counter tick + one warning.
+          expect(spies.staleEntryCount).toBe(3);
+          expect(spies.warnings).toHaveLength(3);
+          for (const w of spies.warnings) {
+            expect(w.message).toBe("mollifier.stale_entry");
+            expect(w.fields.staleThresholdMs).toBe(60 * 1000);
+            expect(w.fields.dwellMs).toBeGreaterThan(60 * 1000);
+          }
+          // Snapshot drives the alertable gauge — env_a has 2 stale
+          // entries, env_b has 1. Per-env detail is still passed to
+          // `reportStaleEntrySnapshot` for forensic value even though the
+          // gauge itself aggregates the total.
+          expect(spies.snapshots).toHaveLength(1);
+          expect(Object.fromEntries(spies.snapshots[0])).toEqual({
+            env_a: 2,
+            env_b: 1,
+          });
+        } finally {
+          await state.close();
+        }
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "snapshot omits envs that have entries but none stale (durable hash HDEL's zeros)",
+    { timeout: 20_000 },
+    async ({ redisOptions }) => {
+      // Critical for alert behaviour: once the drainer catches up, an
+      // env must drop out of the durable counts hash so the gauge falls
+      // below the alert threshold instead of staying latched at its
+      // last stale value. With the sharded design the snapshot is
+      // sourced from the HASH directly — visiting an env with zero
+      // stale entries HDEL's it, so it's simply absent from the
+      // snapshot (telemetry sums values, so absence is equivalent to
+      // zero for the gauge). NOTE(review): this test runs a single sweep
+      // with no prior stale count seeded, so it pins absence only.
+      const buffer = new MollifierBuffer({ redisOptions });
+      const state = new MollifierStaleSweepState({ redisOptions });
+      try {
+        await buffer.accept({
+          runId: "run_just_arrived",
+          envId: "env_a",
+          orgId: "org_1",
+          payload: JSON.stringify(SNAPSHOT),
+        });
+        const spies = spyDeps();
+        await runStaleSweepOnce(
+          { staleThresholdMs: 60 * 1000 },
+          { ...spies.deps, getBuffer: () => buffer, state },
+        );
+        expect(spies.snapshots).toHaveLength(1);
+        // env_a has entries but none stale → not in the snapshot.
+        expect(spies.snapshots[0].has("env_a")).toBe(false);
+      } finally {
+        await state.close();
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "leaves fresh entries alone (dwell below threshold)",
+    { timeout: 20_000 },
+    async ({ redisOptions }) => {
+      // Regression guard for the inequality direction: with no `now`
+      // override the entry's dwell is far below the 60s threshold, so an
+      // inverted comparison (`dwellMs < threshold`) would flag it and page
+      // on every burst. The exact `>` vs `>=` boundary is not exercised.
+      const buffer = new MollifierBuffer({ redisOptions });
+      const state = new MollifierStaleSweepState({ redisOptions });
+      try {
+        await buffer.accept({
+          runId: "run_fresh_only",
+          envId: "env_a",
+          orgId: "org_1",
+          payload: JSON.stringify(SNAPSHOT),
+        });
+        const spies = spyDeps();
+        const result = await runStaleSweepOnce(
+          { staleThresholdMs: 60 * 1000 },
+          { ...spies.deps, getBuffer: () => buffer, state },
+        );
+        expect(result.staleCount).toBe(0);
+        expect(spies.staleEntryCount).toBe(0);
+        expect(spies.warnings).toEqual([]);
+      } finally {
+        await state.close();
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "shards work across ticks: cursor advances by maxOrgsPerPass and wraps after a full cycle",
+    { timeout: 30_000 },
+    async ({ redisOptions }) => {
+      // Without sharding the sweep walks every org/env every tick — at
+      // any meaningful backlog that runs longer than the tick interval
+      // and the next tick gets dropped by the inFlight guard. Sharding
+      // splits the work: each tick visits at most `maxOrgsPerPass` orgs,
+      // advances a durable cursor, and resumes from there next tick.
+      // Over `ceil(N / cap)` ticks the cycle covers every org.
+      const buffer = new MollifierBuffer({ redisOptions });
+      const state = new MollifierStaleSweepState({ redisOptions });
+      try {
+        for (let i = 0; i < 5; i++) {
+          await buffer.accept({
+            runId: `run_shard_${i}`,
+            envId: `env_shard_${i}`,
+            orgId: `org_shard_${i}`,
+            payload: JSON.stringify(SNAPSHOT),
+          });
+        }
+        const futureNow = Date.now() + 5 * 60 * 1000;
+        const spies = spyDeps();
+        const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 2 };
+        const baseDeps = {
+          ...spies.deps,
+          getBuffer: () => buffer,
+          state,
+          now: () => futureNow,
+        };
+
+        // Tick 1: cursor starts at 0, scans 2 orgs.
+        const r1 = await runStaleSweepOnce(cfg, baseDeps);
+        expect(r1.orgsScanned).toBe(2);
+        expect(spies.snapshots[0].size).toBe(2);
+
+        // Tick 2: cursor was 2, scans 2 more orgs.
+        const r2 = await runStaleSweepOnce(cfg, baseDeps);
+        expect(r2.orgsScanned).toBe(2);
+        // Snapshot is the durable HASH — accumulates across ticks.
+        expect(spies.snapshots[1].size).toBe(4);
+
+        // Tick 3: cursor was 4, scans the last 1 org and wraps to 0.
+        const r3 = await runStaleSweepOnce(cfg, baseDeps);
+        expect(r3.orgsScanned).toBe(1);
+        expect(spies.snapshots[2].size).toBe(5);
+
+        // Tick 4: cycle complete, cursor is back at 0 — starts over.
+        const r4 = await runStaleSweepOnce(cfg, baseDeps);
+        expect(r4.orgsScanned).toBe(2);
+      } finally {
+        await state.close();
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "clears an env from the durable snapshot on revisit when it has entries but none currently stale",
+    { timeout: 30_000 },
+    async ({ redisOptions }) => {
+      // Stale state in the durable hash must be HDEL'd, not just left
+      // stale, when a previously-flagged env no longer has any entries
+      // whose dwell exceeds the threshold (drainer caught up, alert
+      // condition cleared). The same `entry` flips from stale to
+      // not-stale between two sweep ticks by varying the sweep's `now`
+      // — tick 1 uses a future clock so the entry is flagged stale;
+      // tick 2 uses real time so the same entry has near-zero dwell and
+      // is no longer stale. The env stays in the active set throughout
+      // (queue still has an entry), so the cursor revisits it and the
+      // hash field is cleared.
+      const buffer = new MollifierBuffer({ redisOptions });
+      const state = new MollifierStaleSweepState({ redisOptions });
+      try {
+        await buffer.accept({
+          runId: "run_drain",
+          envId: "env_drain",
+          orgId: "org_drain",
+          payload: JSON.stringify(SNAPSHOT),
+        });
+        const futureNow = Date.now() + 5 * 60 * 1000;
+        const spies = spyDeps();
+        const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 10 };
+
+        // Tick 1 with future clock: entry's dwell is 5min vs 1min
+        // threshold → flagged stale.
+        await runStaleSweepOnce(cfg, {
+          ...spies.deps,
+          getBuffer: () => buffer,
+          state,
+          now: () => futureNow,
+        });
+        expect(spies.snapshots[0].get("env_drain")).toBe(1);
+
+        // Tick 2 with real time: same entry, but its dwell is now ~ms
+        // vs the same 1min threshold → not stale. The env is revisited
+        // (cursor wrapped to 0 after tick 1, only 1 org in the list),
+        // setEnvStaleCount called with 0 → HDEL.
+        await runStaleSweepOnce(cfg, {
+          ...spies.deps,
+          getBuffer: () => buffer,
+          state,
+        });
+        expect(spies.snapshots[1].has("env_drain")).toBe(false);
+      } finally {
+        await state.close();
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "evicts fully-drained envs from the counts hash at cycle wrap (no permanent alert)",
+    { timeout: 30_000 },
+    async ({ redisOptions }) => {
+      // Devin's BUG report on PR #3754: an env that drains completely
+      // between sweep ticks disappears from `mollifier:org-envs:${orgId}`
+      // entirely, so the inner loop at runStaleSweepOnce never visits it
+      // and `setEnvStaleCount(envId, 0)` (which HDELs the field) is
+      // never called. The counts hash retains the env's last-known
+      // stale count forever, the gauge stays elevated, and the
+      // recommended alert `> 0 for 5m` fires indefinitely.
+      //
+      // Fix: at cycle wrap (cursor returned to 0) HDEL any env in the
+      // counts hash that wasn't visited during the just-completed cycle.
+      // Verified here by:
+      //   1. Flagging env_will_drain stale, confirming it's in the hash
+      //   2. Draining its only entry — now invisible to listEnvsForOrg
+      //   3. Running a sweep tick that triggers cycle wrap
+      //   4. Asserting the env is no longer in the snapshot
+      const buffer = new MollifierBuffer({ redisOptions });
+      const state = new MollifierStaleSweepState({ redisOptions });
+      try {
+        await buffer.accept({
+          runId: "run_will_drain",
+          envId: "env_will_drain",
+          orgId: "org_will_drain",
+          payload: JSON.stringify(SNAPSHOT),
+        });
+        const futureNow = Date.now() + 5 * 60 * 1000;
+        const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 10 };
+        const spies = spyDeps();
+
+        // Tick 1: env_will_drain is flagged stale → enters counts hash.
+        // Cursor wraps to 0 (only 1 org in the list).
+        await runStaleSweepOnce(cfg, {
+          ...spies.deps,
+          getBuffer: () => buffer,
+          state,
+          now: () => futureNow,
+        });
+        expect(spies.snapshots[0].get("env_will_drain")).toBe(1);
+
+        // Drain the only entry. mollifier:queue:env_will_drain is now
+        // empty, and the buffer's atomic Lua removes env_will_drain
+        // from `mollifier:org-envs:org_will_drain` (and removes the org
+        // from `mollifier:orgs` since it has no other envs). The env is
+        // now invisible to listEnvsForOrg.
+        const popped = await buffer.pop("env_will_drain");
+        expect(popped?.runId).toBe("run_will_drain");
+
+        // Tick 2: cursor was 0 after tick 1's wrap, so this rebuilds
+        // the org list (now empty) and immediately wraps again. The
+        // wrap-handler must HDEL env_will_drain from the counts hash
+        // because it wasn't in the visited set for this cycle.
+        await runStaleSweepOnce(cfg, {
+          ...spies.deps,
+          getBuffer: () => buffer,
+          state,
+          now: () => futureNow,
+        });
+        expect(spies.snapshots[1].has("env_will_drain")).toBe(false);
+        // And the durable hash is genuinely empty, not just absent from
+        // this snapshot.
+        expect((await state.readAllEnvStaleCounts()).size).toBe(0);
+      } finally {
+        await state.close();
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "scans across multiple orgs",
+    { timeout: 20_000 },
+    async ({ redisOptions }) => {
+      // The drainer pops with org-level fairness, so the sweep must
+      // walk every org/env to surface stale entries across all of them
+      // — not just stop at the first env it finds. If a future refactor
+      // collapsed listOrgs/listEnvsForOrg into a single env-flat list,
+      // this test catches a regression there.
+      const buffer = new MollifierBuffer({ redisOptions });
+      const state = new MollifierStaleSweepState({ redisOptions });
+      try {
+        await buffer.accept({
+          runId: "run_x",
+          envId: "env_x",
+          orgId: "org_x",
+          payload: JSON.stringify(SNAPSHOT),
+        });
+        await buffer.accept({
+          runId: "run_y",
+          envId: "env_y",
+          orgId: "org_y",
+          payload: JSON.stringify(SNAPSHOT),
+        });
+        const futureNow = Date.now() + 5 * 60 * 1000;
+        const spies = spyDeps();
+        const result = await runStaleSweepOnce(
+          { staleThresholdMs: 60 * 1000 },
+          { ...spies.deps, getBuffer: () => buffer, state, now: () => futureNow },
+        );
+        expect(result.orgsScanned).toBe(2);
+        expect(result.envsScanned).toBe(2);
+        expect(result.staleCount).toBe(2);
+      } finally {
+        await state.close();
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "state survives process restart: a second state instance picks up the cursor and counts",
+    { timeout: 30_000 },
+    async ({ redisOptions }) => {
+      // This is the headline reason the sweep state is durable in Redis
+      // instead of process-local — a webapp restart mid-cycle must not
+      // re-emit the gauge as fresh-zero for previously-flagged envs nor
+      // restart the cursor walk from scratch. Simulated here by closing
+      // state1 (its Redis client quits cleanly) and constructing state2
+      // against the same Redis. The cursor + counts that state1 wrote
+      // are visible to state2 on its first tick.
+      const buffer = new MollifierBuffer({ redisOptions });
+      try {
+        const futureNow = Date.now() + 5 * 60 * 1000; // shared by both ticks
+        const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 1 };
+        const state1 = new MollifierStaleSweepState({ redisOptions });
+        try {
+          await buffer.accept({
+            runId: "run_a",
+            envId: "env_a",
+            orgId: "org_a",
+            payload: JSON.stringify(SNAPSHOT),
+          });
+          await buffer.accept({
+            runId: "run_b",
+            envId: "env_b",
+            orgId: "org_b",
+            payload: JSON.stringify(SNAPSHOT),
+          });
+          const spies1 = spyDeps();
+          // Tick 1 with state1: visits 1 of 2 orgs.
+          await runStaleSweepOnce(cfg, {
+            ...spies1.deps,
+            getBuffer: () => buffer,
+            state: state1,
+            now: () => futureNow,
+          });
+          expect(spies1.snapshots[0].size).toBe(1);
+        } finally {
+          // Simulate webapp restart: state1's Redis client closes cleanly.
+          await state1.close();
+        }
+
+        // New process boots, constructs a fresh state pointing at the
+        // same Redis. The cycle's frozen org_list, the cursor, and the
+        // counts hash are all preserved — state2 picks up at the second
+        // org of the cycle.
+        const state2 = new MollifierStaleSweepState({ redisOptions });
+        try {
+          const spies2 = spyDeps();
+          await runStaleSweepOnce(cfg, {
+            ...spies2.deps,
+            getBuffer: () => buffer,
+            state: state2,
+            now: () => futureNow,
+          });
+          // Snapshot now has BOTH envs: the one tick 1 flagged (still in
+          // the counts hash from state1) plus the one tick 2 just flagged.
+          // A non-durable design would show only the second.
+          expect(spies2.snapshots[0].size).toBe(2);
+        } finally {
+          await state2.close();
+        }
+      } finally {
+        // Outer finally: the buffer is released even if a phase above throws.
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "cycle wrap rebuilds the org list, so orgs that joined mid-cycle get visited on the next cycle",
+    { timeout: 30_000 },
+    async ({ redisOptions }) => {
+      // The docstring promises "orgs joining mid-cycle wait until the
+      // next cycle to be visited." The mechanism is rebuildOrgList at
+      // cursor=0: a fresh snapshot of buffer.listOrgs() replaces the
+      // previous frozen LIST. Verified here by adding a third org
+      // between cycles and asserting it shows up only in the next
+      // cycle's snapshot.
+      const buffer = new MollifierBuffer({ redisOptions });
+      const state = new MollifierStaleSweepState({ redisOptions });
+      try {
+        await buffer.accept({
+          runId: "run_init_a",
+          envId: "env_init_a",
+          orgId: "org_init_a",
+          payload: JSON.stringify(SNAPSHOT),
+        });
+        await buffer.accept({
+          runId: "run_init_b",
+          envId: "env_init_b",
+          orgId: "org_init_b",
+          payload: JSON.stringify(SNAPSHOT),
+        });
+        const futureNow = Date.now() + 5 * 60 * 1000;
+        const spies = spyDeps();
+        const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 10 };
+        const baseDeps = {
+          ...spies.deps,
+          getBuffer: () => buffer,
+          state,
+          now: () => futureNow,
+        };
+
+        // Tick 1: cycle 1. Visits both initial orgs; cursor wraps to 0.
+        await runStaleSweepOnce(cfg, baseDeps);
+        expect(spies.snapshots[0].size).toBe(2);
+
+        // A third org joins the buffer after cycle 1 has already wrapped,
+        // so it cannot have been part of cycle 1's frozen LIST.
+        await buffer.accept({
+          runId: "run_mid",
+          envId: "env_mid",
+          orgId: "org_mid",
+          payload: JSON.stringify(SNAPSHOT),
+        });
+
+        // Tick 2: cycle 2 begins (cursor was 0 after tick 1's wrap).
+        // rebuildOrgList captures all 3 orgs; this tick visits all 3.
+        const r2 = await runStaleSweepOnce(cfg, baseDeps);
+        expect(r2.orgsScanned).toBe(3);
+        expect(spies.snapshots[1].size).toBe(3);
+        expect(spies.snapshots[1].has("env_mid")).toBe(true);
+      } finally {
+        await state.close();
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "empty buffer (no orgs) advances cleanly with zero work and an empty snapshot",
+    { timeout: 30_000 },
+    async ({ redisOptions }) => {
+      // `mollifier:orgs` is empty (no entries ever accepted, or every
+      // entry has been drained). The sweep must handle the boundary:
+      // rebuildOrgList with [], readOrgListSlice returns total=0,
+      // the org loop is skipped, and the cursor stays at 0 instead of
+      // tripping the wrap math.
+      const buffer = new MollifierBuffer({ redisOptions });
+      const state = new MollifierStaleSweepState({ redisOptions });
+      try {
+        const spies = spyDeps();
+        const result = await runStaleSweepOnce(
+          { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 10 },
+          { ...spies.deps, getBuffer: () => buffer, state },
+        );
+        expect(result).toEqual({
+          orgsScanned: 0,
+          envsScanned: 0,
+          entriesScanned: 0,
+          staleCount: 0,
+        });
+        expect(spies.snapshots).toHaveLength(1);
+        expect(spies.snapshots[0].size).toBe(0);
+        // Cursor stayed at 0 — nothing to advance through.
+        expect(await state.readCursor()).toBe(0);
+      } finally {
+        await state.close();
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "buffer-null branch wipes the durable state so a re-enable starts fresh",
+    { timeout: 30_000 },
+    async ({ redisOptions }) => {
+      // The unit test above asserts the snapshot is empty when the
+      // buffer is null, but doesn't verify the durable state was
+      // actually cleared. Without clearAll the next re-enable would
+      // resume on a stale cursor + carry over a stale counts hash.
+      const buffer = new MollifierBuffer({ redisOptions });
+      const state = new MollifierStaleSweepState({ redisOptions });
+      try {
+        await buffer.accept({
+          runId: "run_seed",
+          envId: "env_seed",
+          orgId: "org_seed",
+          payload: JSON.stringify(SNAPSHOT),
+        });
+        const futureNow = Date.now() + 5 * 60 * 1000;
+        const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 10 };
+        const spies = spyDeps();
+
+        // Tick 1: populate state.
+        await runStaleSweepOnce(cfg, {
+          ...spies.deps,
+          getBuffer: () => buffer,
+          state,
+          now: () => futureNow,
+        });
+        expect(spies.snapshots[0].size).toBe(1);
+        expect((await state.readAllEnvStaleCounts()).size).toBe(1);
+
+        // Tick 2: mollifier flips OFF — getBuffer returns null. The
+        // sweep must clear the durable state.
+        await runStaleSweepOnce(cfg, {
+          ...spies.deps,
+          getBuffer: () => null,
+          state,
+        });
+        expect(spies.snapshots[1].size).toBe(0);
+        expect((await state.readAllEnvStaleCounts()).size).toBe(0);
+        expect(await state.readCursor()).toBe(0);
+      } finally {
+        await state.close();
+        await buffer.close();
+      }
+    },
+  );
+});
+
+describe("MollifierStaleSweepState — direct unit tests", () => {
+  redisTest("readCursor returns 0 when the key is absent", { timeout: 20_000 }, async ({ redisOptions }) => {
+    const state = new MollifierStaleSweepState({ redisOptions });
+    try {
+      expect(await state.readCursor()).toBe(0);
+    } finally {
+      await state.close();
+    }
+  });
+
+  redisTest(
+    "writeCursor + readCursor round-trip; readCursor parses a non-numeric value as 0",
+    { timeout: 20_000 },
+    async ({ redisOptions }) => {
+      const state = new MollifierStaleSweepState({ redisOptions });
+      try {
+        await state.writeCursor(42);
+        expect(await state.readCursor()).toBe(42);
+
+        // Defensive: a corrupted/garbage value must not throw or
+        // propagate NaN into the sweep's cursor arithmetic.
+        await state["redis"].set("mollifier:stale_sweep:cursor", "not-a-number");
+        expect(await state.readCursor()).toBe(0);
+      } finally {
+        await state.close();
+      }
+    },
+  );
+
+  redisTest(
+    "rebuildOrgList replaces the previous list (DEL + RPUSH, in order)",
+    { timeout: 20_000 },
+    async ({ redisOptions }) => {
+      const state = new MollifierStaleSweepState({ redisOptions });
+      try {
+        await state.rebuildOrgList(["org_a", "org_b", "org_c"]);
+        let slice = await state.readOrgListSlice(0, 10);
+        expect(slice.total).toBe(3);
+        expect(slice.orgs).toEqual(["org_a", "org_b", "org_c"]);
+
+        // Replacement, not append.
+        await state.rebuildOrgList(["org_x"]);
+        slice = await state.readOrgListSlice(0, 10);
+        expect(slice.total).toBe(1);
+        expect(slice.orgs).toEqual(["org_x"]);
+
+        // Empty rebuild leaves the list empty (DEL fires, no RPUSH).
+        await state.rebuildOrgList([]);
+        slice = await state.readOrgListSlice(0, 10);
+        expect(slice.total).toBe(0);
+        expect(slice.orgs).toEqual([]);
+      } finally {
+        await state.close();
+      }
+    },
+  );
+
+  redisTest(
+    "setEnvStaleCount HSETs when count > 0 and HDELs when count === 0",
+    { timeout: 20_000 },
+    async ({ redisOptions }) => {
+      const sweepState = new MollifierStaleSweepState({ redisOptions });
+      try {
+        await sweepState.setEnvStaleCount("env_a", 3);
+        await sweepState.setEnvStaleCount("env_b", 1);
+        const seeded = await sweepState.readAllEnvStaleCounts();
+        expect(Object.fromEntries(seeded)).toEqual({ env_a: 3, env_b: 1 });
+
+        // Writing a zero count removes the field (HDEL) instead of persisting 0.
+        await sweepState.setEnvStaleCount("env_a", 0);
+        const afterZero = await sweepState.readAllEnvStaleCounts();
+        expect(Object.fromEntries(afterZero)).toEqual({ env_b: 1 });
+        expect(afterZero.has("env_a")).toBe(false);
+      } finally {
+        await sweepState.close();
+      }
+    },
+  );
+
+  redisTest(
+    "clearAll DELs cursor, org_list, and counts in one call",
+    { timeout: 20_000 },
+    async ({ redisOptions }) => {
+      const sweepState = new MollifierStaleSweepState({ redisOptions });
+      try {
+        await sweepState.writeCursor(7);
+        await sweepState.rebuildOrgList(["org_a", "org_b"]);
+        await sweepState.setEnvStaleCount("env_a", 5);
+
+        await sweepState.clearAll();
+        // After the wipe, every reader must report its empty/default value.
+        expect(await sweepState.readCursor()).toBe(0);
+        expect((await sweepState.readOrgListSlice(0, 10)).total).toBe(0);
+        expect((await sweepState.readAllEnvStaleCounts()).size).toBe(0);
+      } finally {
+        await sweepState.close();
+      }
+    },
+  );
+});
+
+describe("startStaleSweepInterval — lifecycle", () => {
+  it("stop() waits for an in-flight tick to finish before closing the state", async () => {
+    // Regression (Devin's BUG report on PR #3754): `stop()` used to call
+    // `deps.state.close()` immediately after `clearInterval`, but the
+    // `tick` function only checks `stopped` at entry. A tick that was
+    // already past that check would keep making `state.*` Redis calls
+    // against a now-closed ioredis client, throw, get caught by tick's
+    // own try/catch, and log a `mollifier.stale_sweep.failed` warning
+    // on every graceful shutdown.
+    //
+    // The fix tracks the current tick promise so `stop()` can await it
+    // before closing. This test pins that order by gating the tick's
+    // `readCursor` call on a manually-resolved promise (`gate`): until
+    // it resolves the tick can't progress, and `stop()` must stay pending.
+    let resolveGate: () => void = () => {}; // placeholder; replaced by the executor below
+    const gate = new Promise<void>((r) => {
+      resolveGate = r;
+    });
+
+    const callOrder: string[] = []; // ordered log of every state call the tick/stop makes
+    let closeCalled = false;
+    const state = {
+      readCursor: async () => {
+        callOrder.push("readCursor:start");
+        await gate; // parks the tick here until the test calls resolveGate()
+        callOrder.push("readCursor:end");
+        return 0;
+      },
+      writeCursor: async () => {
+        callOrder.push("writeCursor");
+      },
+      rebuildOrgList: async () => {
+        callOrder.push("rebuildOrgList");
+      },
+      readOrgListSlice: async () => {
+        callOrder.push("readOrgListSlice");
+        // Return zero orgs so the org loop is a no-op — we only care
+        // about ordering of state calls vs close, not the work.
+        return { orgs: [] as string[], total: 0 };
+      },
+      setEnvStaleCount: async () => {
+        callOrder.push("setEnvStaleCount");
+      },
+      readAllEnvStaleCounts: async () => {
+        callOrder.push("readAllEnvStaleCounts");
+        return new Map<string, number>();
+      },
+      markEnvVisited: async () => {
+        callOrder.push("markEnvVisited");
+      },
+      reconcileVisited: async () => {
+        callOrder.push("reconcileVisited");
+      },
+      clearAll: async () => {
+        callOrder.push("clearAll");
+      },
+      close: async () => {
+        callOrder.push("close");
+        closeCalled = true;
+      },
+    };
+
+    const fakeBuffer = {
+      listOrgs: async () => [],
+      listEnvsForOrg: async () => [],
+      listEntriesForEnv: async () => [],
+    } as any; // partial stub: only these three list* methods are provided
+
+    const handle = startStaleSweepInterval(
+      {
+        intervalMs: 20, // short enough for a tick to fire within the 80ms wait below
+        staleThresholdMs: 60_000,
+        maxOrgsPerPass: 10,
+      },
+      {
+        state,
+        getBuffer: () => fakeBuffer,
+        recordStaleEntry: () => {},
+        reportStaleEntrySnapshot: () => {},
+        logger: { warn: () => {} },
+        now: () => Date.now(),
+      },
+    );
+
+    // Give the interval time to fire one tick. That tick starts, calls
+    // readCursor, and then blocks on `gate`.
+    await new Promise((r) => setTimeout(r, 80));
+    expect(callOrder).toContain("readCursor:start");
+    expect(closeCalled).toBe(false);
+
+    // Start stop() without awaiting it — its promise MUST NOT resolve
+    // while the tick is still mid-flight.
+    let stopResolved = false;
+    const stopPromise = handle.stop().then(() => {
+      stopResolved = true;
+    });
+    await new Promise((r) => setTimeout(r, 50));
+    expect(stopResolved).toBe(false);
+    expect(closeCalled).toBe(false);
+
+    // Release the gate. The tick can now finish, and only then should
+    // stop() resolve and close the state.
+    resolveGate();
+    await stopPromise;
+    expect(stopResolved).toBe(true);
+    expect(closeCalled).toBe(true);
+
+    // The tick's readCursor:end MUST appear before the close — otherwise
+    // we closed the Redis client out from under an in-flight tick.
+    expect(callOrder.indexOf("readCursor:end")).toBeGreaterThan(-1);
+    expect(callOrder.indexOf("close")).toBeGreaterThan(
+      callOrder.indexOf("readCursor:end"),
+    );
+  });
+});
diff --git a/apps/webapp/test/mollifierSynthesiseFoundRun.test.ts b/apps/webapp/test/mollifierSynthesiseFoundRun.test.ts
new file mode 100644
index 00000000000..4e2d6a61632
--- /dev/null
+++ b/apps/webapp/test/mollifierSynthesiseFoundRun.test.ts
@@ -0,0 +1,216 @@
+import { describe, expect, it, vi } from "vitest";
+
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+
+import {
+  synthesiseFoundRunFromBuffer,
+  type FoundRun,
+} from "~/presenters/v3/ApiRetrieveRunPresenter.server";
+import type { SyntheticRun } from "~/v3/mollifier/readFallback.server";
+
+const NOW = new Date("2026-05-24T10:00:00Z");
+
+function makeSyntheticRun(overrides: Partial<SyntheticRun> = {}): SyntheticRun {
+  return { // baseline QUEUED snapshot fixture; unset fields are spelled out as undefined
+    id: "run_internal_1",
+    friendlyId: "run_friendly_1",
+    status: "QUEUED",
+    cancelledAt: undefined,
+    cancelReason: undefined,
+    delayUntil: undefined,
+    taskIdentifier: "hello-world",
+    createdAt: NOW,
+    payload: '{"hello":"world"}', // JSON-encoded string, paired with payloadType below
+    payloadType: "application/json",
+    metadata: undefined,
+    metadataType: undefined,
+    seedMetadata: undefined,
+    seedMetadataType: undefined,
+    idempotencyKey: undefined,
+    idempotencyKeyOptions: undefined,
+    isTest: false,
+    depth: 0,
+    ttl: undefined,
+    tags: ["alpha", "beta"],
+    runTags: ["alpha", "beta"], // same as `tags` by default; the runTags test overrides both
+    lockedToVersion: undefined,
+    resumeParentOnCompletion: false,
+    parentTaskRunId: undefined,
+    traceId: "trace_1",
+    spanId: "span_1",
+    parentSpanId: undefined,
+    runtimeEnvironmentId: "env_a",
+    engine: "V2",
+    workerQueue: undefined,
+    queue: undefined,
+    concurrencyKey: undefined,
+    machinePreset: undefined,
+    realtimeStreamsVersion: undefined,
+    maxAttempts: undefined,
+    maxDurationInSeconds: undefined,
+    replayedFromTaskRunFriendlyId: undefined,
+    annotations: undefined,
+    traceContext: undefined,
+    scheduleId: undefined,
+    batchId: undefined,
+    parentTaskRunFriendlyId: undefined,
+    rootTaskRunFriendlyId: undefined,
+    ...overrides, // spread last so per-test overrides win over the defaults above
+  };
+}
+
+describe("synthesiseFoundRunFromBuffer", () => {
+  it("populates internal id and friendlyId so downstream logging keys off the cuid", () => {
+    const { id, friendlyId }: FoundRun = synthesiseFoundRunFromBuffer(makeSyntheticRun());
+    expect(id).toBe("run_internal_1");
+    expect(friendlyId).toBe("run_friendly_1");
+  });
+
+  it("marks the synth as isBuffered=true so callers like the events route can short-circuit ClickHouse lookups", () => {
+    // `findRun`'s Postgres path reports `isBuffered: false`; the buffer
+    // path is built by `synthesiseFoundRunFromBuffer` and has to report
+    // `isBuffered: true`. Consumers such as the events endpoint rely on
+    // the flag to skip queries that can only come back empty for a
+    // buffered run, instead of inferring it from surrogate signals
+    // like `traceId === ""`.
+    const { isBuffered }: FoundRun = synthesiseFoundRunFromBuffer(makeSyntheticRun());
+    expect(isBuffered).toBe(true);
+  });
+
+  it("forwards scheduleId from the snapshot so resolveSchedule can hydrate the schedule field", () => {
+    // Regression: scheduleId used to be hardcoded to null, which dropped
+    // the schedule metadata for buffered scheduled runs even though the
+    // snapshot carries it (readFallback.server.ts extracts
+    // snapshot.scheduleId).
+    const scheduled = makeSyntheticRun({ scheduleId: "schedule_internal_42" });
+    const { scheduleId } = synthesiseFoundRunFromBuffer(scheduled);
+    expect(scheduleId).toBe("schedule_internal_42");
+  });
+
+  it("leaves scheduleId null when the snapshot has no scheduleId (non-scheduled trigger)", () => {
+    const { scheduleId } = synthesiseFoundRunFromBuffer(makeSyntheticRun());
+    expect(scheduleId).toBeNull();
+  });
+
+  it("reconstructs batch.friendlyId from snapshot.batchId so batch-scoped JWTs authorise", () => {
+    // Regression: batch used to be hardcoded to null. The
+    // route-authorization callbacks read run.batch?.friendlyId, so they
+    // never pushed the batch resource and a batch-scoped JWT got a 403
+    // on buffered batched runs.
+    const batchId = "abcdefghijklmnopqrstuvwx";
+    // BatchId.toFriendlyId encodes the internal id with a "batch_" prefix.
+    const { batch } = synthesiseFoundRunFromBuffer(makeSyntheticRun({ batchId }));
+
+    expect(batch).not.toBeNull();
+    expect(batch?.id).toBe(batchId);
+    expect(batch?.friendlyId).toMatch(/^batch_/);
+  });
+
+  it("leaves batch null when the snapshot has no batchId (non-batched run)", () => {
+    const { batch } = synthesiseFoundRunFromBuffer(makeSyntheticRun());
+    expect(batch).toBeNull();
+  });
+
+  it("defaults workerQueue to '' so createCommonRunStructure coerces region to undefined", () => {
+    // Regression: the default used to be "main". That value flowed
+    // through `run.workerQueue || undefined` into the API response's
+    // `region`, advertising a region that had not been assigned yet.
+    const synth = synthesiseFoundRunFromBuffer(makeSyntheticRun({ workerQueue: undefined }));
+    expect(synth.workerQueue).toBe("");
+  });
+
+  it("passes through an explicit workerQueue from the snapshot unchanged", () => {
+    const explicitQueue = "us-east-1";
+    const snapshot = makeSyntheticRun({ workerQueue: explicitQueue });
+    const { workerQueue } = synthesiseFoundRunFromBuffer(snapshot);
+    expect(workerQueue).toBe(explicitQueue);
+  });
+
+  it("maps buffered FAILED to SYSTEM_FAILURE so the API surfaces the failure", () => {
+    const failedSnapshot = makeSyntheticRun({
+      status: "FAILED",
+      error: { code: "GATE_REJECTED", message: "buffer rejected the run" },
+    });
+    const { status, error } = synthesiseFoundRunFromBuffer(failedSnapshot);
+    // The buffer's { code, message } pair is expected as one "code: message" string.
+    expect(status).toBe("SYSTEM_FAILURE");
+    expect(error).toEqual({
+      type: "STRING_ERROR",
+      raw: "GATE_REJECTED: buffer rejected the run",
+    });
+  });
+
+  it("maps buffered CANCELED to CANCELED with completedAt populated from cancelledAt", () => {
+    const cancelledAt = new Date("2026-05-24T10:05:00Z");
+    const cancelled = makeSyntheticRun({ status: "CANCELED", cancelledAt });
+    const { status, completedAt } = synthesiseFoundRunFromBuffer(cancelled);
+    // completedAt is expected to mirror the snapshot's cancelledAt.
+    expect(status).toBe("CANCELED");
+    expect(completedAt).toEqual(cancelledAt);
+  });
+
+  it("maps buffered QUEUED to PENDING with no error and no completedAt", () => {
+    const queued = synthesiseFoundRunFromBuffer(makeSyntheticRun({ status: "QUEUED" }));
+    expect(queued.status).toBe("PENDING");
+    expect(queued.error).toBeNull();
+    expect(queued.completedAt).toBeNull();
+  });
+
+  it("passes through a string snapshot.metadata unchanged", () => {
+    const rawMetadata = '{"customer":"acme"}';
+    const snapshot = makeSyntheticRun({ metadata: rawMetadata });
+    const { metadata } = synthesiseFoundRunFromBuffer(snapshot);
+    expect(metadata).toBe(rawMetadata);
+  });
+
+  it("defensively coerces a non-string snapshot.metadata to a JSON string instead of dropping it silently", () => {
+    // Production never writes non-string metadata. Should the snapshot
+    // shape ever drift, though, surfacing the value (with a warn log)
+    // is preferable to having it vanish, so the object is expected to
+    // come back JSON-stringified.
+    const drifted = makeSyntheticRun({ metadata: { customer: "acme" } });
+    const { metadata } = synthesiseFoundRunFromBuffer(drifted);
+    expect(metadata).toBe('{"customer":"acme"}');
+  });
+
+  it("defaults idempotencyKey / idempotencyKeyOptions to null when absent", () => {
+    const synth = synthesiseFoundRunFromBuffer(makeSyntheticRun());
+    expect(synth.idempotencyKey).toBeNull();
+    expect(synth.idempotencyKeyOptions).toBeNull();
+  });
+
+  it("zeroes execution-state fields that aren't meaningful for a buffered run", () => {
+    const synth = synthesiseFoundRunFromBuffer(makeSyntheticRun());
+    expect(synth.startedAt).toBeNull();
+    expect(synth.attempts).toEqual([]);
+    expect(synth.attemptNumber).toBeNull();
+    expect(synth.parentTaskRun).toBeNull();
+    expect(synth.rootTaskRun).toBeNull();
+    expect(synth.childRuns).toEqual([]);
+    expect(synth.output).toBeNull();
+    expect(synth.costInCents).toBe(0);
+    expect(synth.baseCostInCents).toBe(0);
+    expect(synth.usageDurationMs).toBe(0);
+  });
+
+  it("forwards runTags from the snapshot tags array", () => {
+    // `tags` and `runTags` are deliberately given different values so
+    // the assertion pins which snapshot field feeds the result's
+    // `runTags`. The fixture defaults both to `["alpha", "beta"]`, so
+    // relying on the defaults would let this test pass even if
+    // synthesiseFoundRunFromBuffer accidentally read the snapshot's
+    // `runTags` instead of its `tags`.
+    const snapshot = makeSyntheticRun({
+      tags: ["from-tags"],
+      runTags: ["stale-run-tags"],
+    });
+    const { runTags } = synthesiseFoundRunFromBuffer(snapshot);
+    expect(runTags).toEqual(["from-tags"]);
+  });
+
+  it("pins engine to V2 and taskEventStore to taskEvent (only valid values for a buffered run)", () => {
+    const { engine, taskEventStore } = synthesiseFoundRunFromBuffer(makeSyntheticRun());
+    expect(engine).toBe("V2");
+    expect(taskEventStore).toBe("taskEvent");
+  });
+});
diff --git a/apps/webapp/test/mollifierSyntheticApiResponses.test.ts b/apps/webapp/test/mollifierSyntheticApiResponses.test.ts
new file mode 100644
index 00000000000..94ee67c8584
--- /dev/null
+++ b/apps/webapp/test/mollifierSyntheticApiResponses.test.ts
@@ -0,0 +1,164 @@
+import { describe, expect, it, vi } from "vitest";
+
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+
+import {
+  buildSyntheticSpanDetailBody,
+  buildSyntheticTraceBody,
+} from "~/v3/mollifier/syntheticApiResponses.server";
+import type { SyntheticRun } from "~/v3/mollifier/readFallback.server";
+
+const NOW = new Date("2026-05-23T10:00:00Z");
+
+function makeSyntheticRun(overrides: Partial<SyntheticRun> = {}): SyntheticRun {
+  return { // baseline QUEUED snapshot fixture; unset fields are spelled out as undefined
+    id: "run_internal_1",
+    friendlyId: "run_friendly_1",
+    status: "QUEUED",
+    cancelledAt: undefined,
+    cancelReason: undefined,
+    delayUntil: undefined,
+    taskIdentifier: "hello-world",
+    createdAt: NOW,
+    payload: undefined,
+    payloadType: undefined,
+    metadata: undefined,
+    metadataType: undefined,
+    seedMetadata: undefined,
+    seedMetadataType: undefined,
+    idempotencyKey: undefined,
+    idempotencyKeyOptions: undefined,
+    isTest: false,
+    depth: 0,
+    ttl: undefined,
+    tags: [],
+    runTags: [],
+    lockedToVersion: undefined,
+    resumeParentOnCompletion: false,
+    parentTaskRunId: undefined,
+    traceId: "trace_1",
+    spanId: "span_1",
+    parentSpanId: "span_parent", // asserted as the span's parentId in the identity test
+    runtimeEnvironmentId: "env_a",
+    engine: "V2",
+    workerQueue: undefined,
+    queue: "task/hello-world", // asserted as rootSpan.data.queueName in the trace tests
+    concurrencyKey: undefined,
+    machinePreset: "small-1x", // asserted as rootSpan.data.machinePreset in the trace tests
+    realtimeStreamsVersion: undefined,
+    maxAttempts: undefined,
+    maxDurationInSeconds: undefined,
+    replayedFromTaskRunFriendlyId: undefined,
+    annotations: undefined,
+    traceContext: undefined,
+    scheduleId: undefined,
+    batchId: undefined,
+    parentTaskRunFriendlyId: undefined,
+    rootTaskRunFriendlyId: undefined,
+    ...overrides, // spread last so per-test overrides win over the defaults above
+  };
+}
+
+describe("buildSyntheticSpanDetailBody", () => {
+  it("populates identity fields from the buffered run", () => {
+    const span = buildSyntheticSpanDetailBody(makeSyntheticRun());
+    expect(span.spanId).toBe("span_1");
+    expect(span.parentId).toBe("span_parent");
+    expect(span.runId).toBe("run_friendly_1");
+    expect(span.message).toBe("hello-world");
+    expect(span.level).toBe("TRACE");
+    expect(span.startTime).toEqual(NOW);
+    expect(span.durationMs).toBe(0);
+  });
+
+  it("defaults parentId to null when the buffered run has no parentSpanId", () => {
+    const span = buildSyntheticSpanDetailBody(makeSyntheticRun({ parentSpanId: undefined }));
+    expect(span.parentId).toBeNull();
+  });
+
+  it("defaults message to '' when the buffered run has no taskIdentifier", () => {
+    const unnamed = makeSyntheticRun({ taskIdentifier: undefined });
+    const { message } = buildSyntheticSpanDetailBody(unnamed);
+    // With no task identifier the message falls back to the empty string.
+    expect(message).toBe("");
+  });
+
+  it("renders a QUEUED buffered run as a still-partial, non-error, non-cancelled span", () => {
+    const span = buildSyntheticSpanDetailBody(makeSyntheticRun({ status: "QUEUED" }));
+    expect(span.isPartial).toBe(true);
+    expect(span.isError).toBe(false);
+    expect(span.isCancelled).toBe(false);
+  });
+
+  it("renders a CANCELED buffered run as a non-partial, non-error, cancelled span", () => {
+    const span = buildSyntheticSpanDetailBody(makeSyntheticRun({ status: "CANCELED" }));
+    expect(span.isPartial).toBe(false);
+    expect(span.isError).toBe(false);
+    expect(span.isCancelled).toBe(true);
+  });
+
+  it("renders a FAILED buffered run as a non-partial, errored, non-cancelled span", () => {
+    // Regression: FAILED buffered runs once came through as
+    // `isPartial: true, isError: false`, which told SDK pollers the run
+    // was still executing.
+    const span = buildSyntheticSpanDetailBody(makeSyntheticRun({ status: "FAILED" }));
+    expect(span.isPartial).toBe(false);
+    expect(span.isError).toBe(true);
+    expect(span.isCancelled).toBe(false);
+  });
+});
+
+describe("buildSyntheticTraceBody", () => {
+  it("envelopes the synthesised root span under `trace.rootSpan` with the buffered traceId", () => {
+    const { trace } = buildSyntheticTraceBody(makeSyntheticRun());
+    expect(trace.traceId).toBe("trace_1");
+    expect(trace.rootSpan.id).toBe("span_1");
+    expect(trace.rootSpan.runId).toBe("run_friendly_1");
+    expect(trace.rootSpan.children).toEqual([]);
+    expect(trace.rootSpan.data.events).toEqual([]);
+  });
+
+  it("falls back to empty strings when traceId / spanId are absent from the snapshot", () => {
+    const untraced = makeSyntheticRun({ traceId: undefined, spanId: undefined });
+    const { trace } = buildSyntheticTraceBody(untraced);
+    // Both identifiers are expected to degrade to "" rather than stay undefined.
+    expect(trace.traceId).toBe("");
+    expect(trace.rootSpan.id).toBe("");
+  });
+
+  it("passes through queueName and machinePreset from the snapshot", () => {
+    const { data } = buildSyntheticTraceBody(makeSyntheticRun()).trace.rootSpan;
+    expect(data.queueName).toBe("task/hello-world");
+    expect(data.machinePreset).toBe("small-1x");
+  });
+
+  it("defaults taskSlug to undefined when the buffered run has no taskIdentifier", () => {
+    const { trace } = buildSyntheticTraceBody(makeSyntheticRun({ taskIdentifier: undefined }));
+    expect(trace.rootSpan.data.taskSlug).toBeUndefined();
+    expect(trace.rootSpan.data.message).toBe("");
+  });
+
+  it("renders a QUEUED buffered run as a partial, non-error, non-cancelled root span", () => {
+    const { trace } = buildSyntheticTraceBody(makeSyntheticRun({ status: "QUEUED" }));
+    expect(trace.rootSpan.data.isPartial).toBe(true);
+    expect(trace.rootSpan.data.isError).toBe(false);
+    expect(trace.rootSpan.data.isCancelled).toBe(false);
+  });
+
+  it("renders a CANCELED buffered run as a non-partial, non-error, cancelled root span", () => {
+    const { trace } = buildSyntheticTraceBody(makeSyntheticRun({ status: "CANCELED" }));
+    expect(trace.rootSpan.data.isPartial).toBe(false);
+    expect(trace.rootSpan.data.isError).toBe(false);
+    expect(trace.rootSpan.data.isCancelled).toBe(true);
+  });
+
+  it("renders a FAILED buffered run as a non-partial, errored, non-cancelled root span", () => {
+    // Regression: FAILED buffered runs once rendered as
+    // `isPartial: true, isError: false`, hiding the failure from SDK
+    // consumers.
+    const { trace } = buildSyntheticTraceBody(makeSyntheticRun({ status: "FAILED" }));
+    expect(trace.rootSpan.data.isPartial).toBe(false);
+    expect(trace.rootSpan.data.isError).toBe(true);
+    expect(trace.rootSpan.data.isCancelled).toBe(false);
+  });
+});
diff --git a/apps/webapp/test/mollifierSyntheticRedirectInfo.test.ts b/apps/webapp/test/mollifierSyntheticRedirectInfo.test.ts
new file mode 100644
index 00000000000..a996b9de693
--- /dev/null
+++ b/apps/webapp/test/mollifierSyntheticRedirectInfo.test.ts
@@ -0,0 +1,197 @@
+import { describe, expect, vi } from "vitest";
+import { redisTest } from "@internal/testcontainers";
+import { MollifierBuffer } from "@trigger.dev/redis-worker";
+
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+
+import { findBufferedRunRedirectInfo } from "~/v3/mollifier/syntheticRedirectInfo.server";
+
+const SNAPSHOT = { // well-formed snapshot payload; serialised into the buffer by the tests below
+  spanId: "span_1",
+  environment: {
+    slug: "dev",
+    project: { slug: "hello-world-bN7m" },
+    organization: { slug: "references-6120" },
+  },
+};
+
+function fakePrisma(member: { id: string } | null) {
+  const findFirst = vi.fn(() => Promise.resolve(member)); // always resolves to `member`
+  type PrismaDep = Parameters<typeof findBufferedRunRedirectInfo>[1]["prismaClient"];
+  return { orgMember: { findFirst } } as unknown as PrismaDep;
+}
+
+describe("findBufferedRunRedirectInfo (testcontainers)", () => {
+  redisTest("returns slugs + spanId for a real buffer entry when user is a member", async ({ redisOptions }) => {
+    const mollifier = new MollifierBuffer({ redisOptions });
+    try {
+      await mollifier.accept({
+        runId: "run_real_1",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: JSON.stringify(SNAPSHOT),
+      });
+      const redirect = await findBufferedRunRedirectInfo(
+        { runFriendlyId: "run_real_1", userId: "user_1" },
+        { getBuffer: () => mollifier, prismaClient: fakePrisma({ id: "member_1" }) },
+      );
+      expect(redirect).toEqual({
+        organizationSlug: "references-6120",
+        projectSlug: "hello-world-bN7m",
+        environmentSlug: "dev",
+        spanId: "span_1",
+      });
+    } finally {
+      await mollifier.close();
+    }
+  });
+
+  redisTest("returns null when no buffer entry exists for the runId", async ({ redisOptions }) => {
+    const mollifier = new MollifierBuffer({ redisOptions });
+    try {
+      const redirect = await findBufferedRunRedirectInfo(
+        { runFriendlyId: "run_missing", userId: "user_1" },
+        { getBuffer: () => mollifier, prismaClient: fakePrisma({ id: "member_1" }) },
+      );
+      expect(redirect).toBeNull();
+    } finally {
+      await mollifier.close();
+    }
+  });
+
+  redisTest("returns null when the user is not an org member (default check enforced)", async ({ redisOptions }) => {
+    const mollifier = new MollifierBuffer({ redisOptions });
+    try {
+      await mollifier.accept({
+        runId: "run_real_2",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: JSON.stringify(SNAPSHOT),
+      });
+      const redirect = await findBufferedRunRedirectInfo(
+        { runFriendlyId: "run_real_2", userId: "user_other" },
+        { getBuffer: () => mollifier, prismaClient: fakePrisma(null) },
+      );
+      expect(redirect).toBeNull();
+    } finally {
+      await mollifier.close();
+    }
+  });
+
+  redisTest("skips the org-membership check when skipOrgMembershipCheck is set (admin path)", async ({ redisOptions }) => {
+    const mollifier = new MollifierBuffer({ redisOptions });
+    try {
+      await mollifier.accept({
+        runId: "run_real_3",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: JSON.stringify(SNAPSHOT),
+      });
+      const membershipLookup = vi.fn();
+      const redirect = await findBufferedRunRedirectInfo(
+        { runFriendlyId: "run_real_3", userId: "user_admin", skipOrgMembershipCheck: true },
+        {
+          getBuffer: () => mollifier,
+          prismaClient: { orgMember: { findFirst: membershipLookup } } as unknown as Parameters<typeof findBufferedRunRedirectInfo>[1]["prismaClient"],
+        },
+      );
+      expect(redirect?.organizationSlug).toBe("references-6120");
+      expect(membershipLookup).not.toHaveBeenCalled();
+    } finally {
+      await mollifier.close();
+    }
+  });
+
+  redisTest("returns null when snapshot is malformed JSON", async ({ redisOptions }) => {
+    const mollifier = new MollifierBuffer({ redisOptions });
+    try {
+      await mollifier.accept({
+        runId: "run_real_4",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: "{not-json",
+      });
+      const redirect = await findBufferedRunRedirectInfo(
+        { runFriendlyId: "run_real_4", userId: "user_1" },
+        { getBuffer: () => mollifier, prismaClient: fakePrisma({ id: "member_1" }) },
+      );
+      expect(redirect).toBeNull();
+    } finally {
+      await mollifier.close();
+    }
+  });
+
+  redisTest("returns null when snapshot lacks org/project slugs", async ({ redisOptions }) => {
+    const mollifier = new MollifierBuffer({ redisOptions });
+    try {
+      await mollifier.accept({
+        runId: "run_real_5",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: JSON.stringify({ spanId: "s", environment: { slug: "dev" } }),
+      });
+      const redirect = await findBufferedRunRedirectInfo(
+        { runFriendlyId: "run_real_5", userId: "user_1" },
+        { getBuffer: () => mollifier, prismaClient: fakePrisma({ id: "member_1" }) },
+      );
+      expect(redirect).toBeNull();
+    } finally {
+      await mollifier.close();
+    }
+  });
+
+  redisTest("returns info with undefined spanId when snapshot has no spanId", async ({ redisOptions }) => {
+    const mollifier = new MollifierBuffer({ redisOptions });
+    try {
+      await mollifier.accept({
+        runId: "run_real_6",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: JSON.stringify({ environment: SNAPSHOT.environment }),
+      });
+      const redirect = await findBufferedRunRedirectInfo(
+        { runFriendlyId: "run_real_6", userId: "user_1" },
+        { getBuffer: () => mollifier, prismaClient: fakePrisma({ id: "member_1" }) },
+      );
+      expect(redirect?.spanId).toBeUndefined();
+      expect(redirect?.environmentSlug).toBe("dev");
+    } finally {
+      await mollifier.close();
+    }
+  });
+
+  redisTest(
+    "rejects snapshots where a slug is the wrong type (Zod guard, not just typeof)",
+    async ({ redisOptions }) => {
+      // Regression guard for the Zod-validated snapshot shape. Before
+      // Zod the slugs were only checked with `typeof slug !== "string"`;
+      // the schema now validates the whole structure in one place, so
+      // a wrongly-typed field such as `slug: 42` (number) lands in the
+      // parse-fail branch like any other shape mismatch. The helper
+      // must then return null instead of leaking a half-built
+      // redirect URL.
+      const mollifier = new MollifierBuffer({ redisOptions });
+      try {
+        await mollifier.accept({
+          runId: "run_real_7",
+          envId: "env_a",
+          orgId: "org_1",
+          payload: JSON.stringify({
+            environment: {
+              slug: 42,
+              project: { slug: "p" },
+              organization: { slug: "o" },
+            },
+          }),
+        });
+        const redirect = await findBufferedRunRedirectInfo(
+          { runFriendlyId: "run_real_7", userId: "user_1" },
+          { getBuffer: () => mollifier, prismaClient: fakePrisma({ id: "member_1" }) },
+        );
+        expect(redirect).toBeNull();
+      } finally {
+        await mollifier.close();
+      }
+    },
+  );
+});
diff --git a/apps/webapp/test/mollifierSyntheticReplayTaskRun.test.ts b/apps/webapp/test/mollifierSyntheticReplayTaskRun.test.ts
new file mode 100644
index 00000000000..6df2d92dde4
--- /dev/null
+++ b/apps/webapp/test/mollifierSyntheticReplayTaskRun.test.ts
@@ -0,0 +1,106 @@
+import { describe, expect, it, vi } from "vitest";
+
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+
+import { buildSyntheticReplayTaskRun } from "~/v3/mollifier/syntheticReplayTaskRun.server";
+import type { SyntheticRun } from "~/v3/mollifier/readFallback.server";
+
+const NOW = new Date("2026-05-21T10:00:00Z");
+
+function makeSyntheticRun(overrides: Partial<SyntheticRun> = {}): SyntheticRun {
+  return { // baseline QUEUED snapshot fixture; unset fields are spelled out as undefined
+    id: "run_internal_1",
+    friendlyId: "run_friendly_1",
+    status: "QUEUED",
+    cancelledAt: undefined,
+    cancelReason: undefined,
+    delayUntil: undefined,
+    taskIdentifier: "hello-world",
+    createdAt: NOW,
+    payload: { message: "hi" },
+    payloadType: "application/json",
+    metadata: undefined,
+    metadataType: undefined,
+    seedMetadata: undefined,
+    seedMetadataType: undefined,
+    idempotencyKey: undefined,
+    idempotencyKeyOptions: undefined,
+    isTest: false,
+    depth: 0,
+    ttl: "10m",
+    tags: [],
+    runTags: [],
+    lockedToVersion: undefined,
+    resumeParentOnCompletion: false,
+    parentTaskRunId: undefined,
+    traceId: "trace_1", // tests below override this with undefined to hit the null-return path
+    spanId: "span_1", // likewise overridden with undefined in the null-return tests
+    parentSpanId: undefined,
+    runtimeEnvironmentId: "env_a",
+    engine: "V2",
+    workerQueue: "worker-queue-1",
+    queue: "task/hello-world",
+    concurrencyKey: undefined,
+    machinePreset: "small-1x",
+    realtimeStreamsVersion: "v1",
+    maxAttempts: 3,
+    maxDurationInSeconds: 3600,
+    replayedFromTaskRunFriendlyId: undefined,
+    annotations: undefined,
+    traceContext: undefined,
+    scheduleId: undefined,
+    batchId: undefined,
+    parentTaskRunFriendlyId: undefined,
+    rootTaskRunFriendlyId: undefined,
+    ...overrides, // spread last so per-test overrides win over the defaults above
+  };
+}
+
+const ENV_ROW = { // passed as `envRow`; the first test asserts these three slugs on the result
+  slug: "dev",
+  project: { slug: "hello-world", organization: { slug: "references" } },
+};
+
+describe("buildSyntheticReplayTaskRun", () => {
+  it("returns the adapted TaskRun shape when traceId and spanId are present", () => {
+    const replayable = buildSyntheticReplayTaskRun({
+      envRow: ENV_ROW,
+      synthetic: makeSyntheticRun(),
+    });
+    expect(replayable).not.toBeNull();
+    expect(replayable?.traceId).toBe("trace_1");
+    expect(replayable?.spanId).toBe("span_1");
+    expect(replayable?.project.slug).toBe("hello-world");
+    expect(replayable?.project.organization.slug).toBe("references");
+    expect(replayable?.runtimeEnvironment.slug).toBe("dev");
+  });
+
+  it("returns null when the snapshot has no traceId", () => {
+    // ReplayTaskRunService assembles `00-${traceId}-${spanId}-01` with no
+    // undefined guard. Letting a missing traceId through would produce
+    // `00-undefined-...-01`, an invalid W3C traceparent that OTel drops
+    // silently, severing the replayed run's trace link to the original.
+    // The helper therefore has to refuse instead of degrading quietly.
+    const result = buildSyntheticReplayTaskRun({
+      envRow: ENV_ROW,
+      synthetic: makeSyntheticRun({ traceId: undefined }),
+    });
+    expect(result).toBeNull();
+  });
+
+  it("returns null when the snapshot has no spanId", () => {
+    const result = buildSyntheticReplayTaskRun({
+      envRow: ENV_ROW,
+      synthetic: makeSyntheticRun({ spanId: undefined }),
+    });
+    expect(result).toBeNull();
+  });
+
+  it("returns null when both traceId and spanId are missing", () => {
+    const result = buildSyntheticReplayTaskRun({
+      envRow: ENV_ROW,
+      synthetic: makeSyntheticRun({ traceId: undefined, spanId: undefined }),
+    });
+    expect(result).toBeNull();
+  });
+});
diff --git a/apps/webapp/test/mollifierSyntheticRunHeader.test.ts b/apps/webapp/test/mollifierSyntheticRunHeader.test.ts
new file mode 100644
index 00000000000..0d9f7c7e13f
--- /dev/null
+++ b/apps/webapp/test/mollifierSyntheticRunHeader.test.ts
@@ -0,0 +1,130 @@
+import { describe, expect, it, vi } from "vitest";
+
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+
+import { buildSyntheticRunHeader } from "~/v3/mollifier/syntheticRunHeader.server";
+import type { SyntheticRun } from "~/v3/mollifier/readFallback.server";
+
+const NOW = new Date("2026-05-21T10:00:00Z"); // fixed createdAt of the default fixture run
+const CANCELLED_AT = new Date("2026-05-21T10:00:30Z"); // 30s after NOW; passed as cancelledAt in the CANCELED test
+
+function makeSyntheticRun(overrides: Partial<SyntheticRun> = {}): SyntheticRun {
+  const defaults: SyntheticRun = { // baseline QUEUED run; each test overrides only what it needs
+    id: "run_internal_1",
+    friendlyId: "run_friendly_1",
+    status: "QUEUED",
+    cancelledAt: undefined,
+    cancelReason: undefined,
+    delayUntil: undefined,
+    taskIdentifier: "hello-world",
+    createdAt: NOW,
+    payload: { message: "hi" },
+    payloadType: "application/json",
+    metadata: undefined,
+    metadataType: undefined,
+    seedMetadata: undefined,
+    seedMetadataType: undefined,
+    idempotencyKey: undefined,
+    idempotencyKeyOptions: undefined,
+    isTest: false,
+    depth: 0,
+    ttl: "10m",
+    tags: [],
+    runTags: [],
+    lockedToVersion: undefined,
+    resumeParentOnCompletion: false,
+    parentTaskRunId: undefined,
+    traceId: "trace_1",
+    spanId: "span_1",
+    parentSpanId: undefined,
+    runtimeEnvironmentId: "env_a",
+    engine: "V2",
+    workerQueue: "worker-queue-1",
+    queue: "task/hello-world",
+    concurrencyKey: undefined,
+    machinePreset: "small-1x",
+    realtimeStreamsVersion: "v1",
+    maxAttempts: 3,
+    maxDurationInSeconds: 3600,
+    replayedFromTaskRunFriendlyId: undefined,
+    annotations: undefined,
+    traceContext: undefined,
+    scheduleId: undefined,
+    batchId: undefined,
+    parentTaskRunFriendlyId: undefined,
+    rootTaskRunFriendlyId: undefined,
+  };
+  return { ...defaults, ...overrides };
+}
+
+const ENV = { // environment argument passed to buildSyntheticRunHeader in every test below
+  id: "env_a", // same value as the fixture run's runtimeEnvironmentId
+  organizationId: "org_a",
+  type: "DEVELOPMENT" as const, // `as const` keeps the string-literal type
+  slug: "dev",
+};
+
+describe("buildSyntheticRunHeader", () => {
+  it("returns PENDING / non-final state for a queued buffered run", () => {
+    const queued = buildSyntheticRunHeader({ environment: ENV, run: makeSyntheticRun() });
+    expect(queued.status).toBe("PENDING");
+    expect(queued.isFinished).toBe(false);
+    expect(queued.completedAt).toBeNull();
+  });
+
+  it("reflects CANCELED state from the snapshot so the NavBar and Cancel-button gate update before the drainer materialises", () => {
+    const cancelled = buildSyntheticRunHeader({
+      environment: ENV,
+      run: makeSyntheticRun({ status: "CANCELED", cancelledAt: CANCELLED_AT }),
+    });
+    // route.tsx gates the Cancel button on `!run.isFinished`, and the status
+    // badge is driven by `run.status`. A buffered cancel has to flip both;
+    // otherwise a run already cancelled in the snapshot would still show a
+    // "Pending" badge alongside a Cancel button.
+    expect(cancelled.status).toBe("CANCELED");
+    expect(cancelled.isFinished).toBe(true);
+    expect(cancelled.completedAt).toEqual(CANCELLED_AT);
+  });
+
+  it("populates completedAt for FAILED runs so the route stops live-reloading and renders as completed", () => {
+    // `isCompleted` on the run-detail route is computed as
+    // `run.completedAt !== null`, and SSE live-reloading is gated on that
+    // flag (`route.tsx:459`, `:551`). A FAILED buffered run with a null
+    // completedAt would therefore live-reload forever even though its
+    // badge already reads SYSTEM_FAILURE. Matches the behaviour of
+    // buildSyntheticSpanRun + ApiRetrieveRunPresenter.
+    const failed = buildSyntheticRunHeader({
+      environment: ENV,
+      run: makeSyntheticRun({ status: "FAILED" }),
+    });
+    expect(failed.status).toBe("SYSTEM_FAILURE");
+    expect(failed.isFinished).toBe(true);
+    expect(failed.completedAt).toEqual(NOW);
+  });
+
+  it("forwards identity and environment fields from the snapshot", () => {
+    const forwarded = buildSyntheticRunHeader({ environment: ENV, run: makeSyntheticRun() });
+    expect(forwarded.friendlyId).toBe("run_friendly_1");
+    // On the PG path RunPresenter.getRun places the internal cuid in
+    // `id`. SyntheticRun.id holds that cuid, so the header has to expose
+    // it here rather than the friendlyId.
+    expect(forwarded.id).toBe("run_internal_1");
+    expect(forwarded.traceId).toBe("trace_1");
+    expect(forwarded.spanId).toBe("span_1");
+    expect(forwarded.environment).toMatchObject({
+      id: "env_a",
+      organizationId: "org_a",
+      type: "DEVELOPMENT",
+      slug: "dev",
+    });
+  });
+
+  it("falls back to empty strings when the snapshot has no trace/span ids", () => {
+    const { traceId, spanId } = buildSyntheticRunHeader({
+      environment: ENV,
+      run: makeSyntheticRun({ traceId: undefined, spanId: undefined }),
+    });
+    expect(traceId).toBe("");
+    expect(spanId).toBe("");
+  });
+});
diff --git a/apps/webapp/test/mollifierSyntheticSpanRun.test.ts b/apps/webapp/test/mollifierSyntheticSpanRun.test.ts
new file mode 100644
index 00000000000..3a89046e8cb
--- /dev/null
+++ b/apps/webapp/test/mollifierSyntheticSpanRun.test.ts
@@ -0,0 +1,197 @@
+import { describe, expect, it, vi } from "vitest";
+
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+
+import { buildSyntheticSpanRun } from "~/v3/mollifier/syntheticSpanRun.server";
+import type { SyntheticRun } from "~/v3/mollifier/readFallback.server";
+
+const NOW = new Date("2026-05-21T10:00:00Z"); // fixed createdAt of the fixture; also reused as cancelledAt in the CANCELED test
+
+function makeSyntheticRun(overrides: Partial<SyntheticRun> = {}): SyntheticRun {
+  const baseline: SyntheticRun = { // QUEUED run with two tags; tests override per-case fields
+    id: "run_internal_1",
+    friendlyId: "run_friendly_1",
+    status: "QUEUED",
+    taskIdentifier: "hello-world",
+    createdAt: NOW,
+    payload: { message: "hi" },
+    payloadType: "application/json",
+    metadata: undefined,
+    metadataType: undefined,
+    seedMetadata: undefined,
+    seedMetadataType: undefined,
+    idempotencyKey: undefined,
+    idempotencyKeyOptions: undefined,
+    isTest: false,
+    depth: 0,
+    ttl: "10m",
+    tags: ["a", "b"],
+    runTags: ["a", "b"],
+    lockedToVersion: undefined,
+    resumeParentOnCompletion: false,
+    parentTaskRunId: undefined,
+    traceId: "trace_1",
+    spanId: "span_1",
+    parentSpanId: undefined,
+    runtimeEnvironmentId: "env_a",
+    engine: "V2",
+    workerQueue: "worker-queue-1",
+    queue: "task/hello-world",
+    concurrencyKey: undefined,
+    machinePreset: "small-1x",
+    realtimeStreamsVersion: "v1",
+    maxAttempts: 3,
+    maxDurationInSeconds: 3600,
+    replayedFromTaskRunFriendlyId: undefined,
+    annotations: undefined,
+    traceContext: undefined,
+    scheduleId: undefined,
+    batchId: undefined,
+    parentTaskRunFriendlyId: undefined,
+    rootTaskRunFriendlyId: undefined,
+  };
+  return { ...baseline, ...overrides };
+}
+
+const ENV = { // environment argument passed to buildSyntheticSpanRun in every test below
+  id: "env_a", // asserted against synth.environmentId in the identity test
+  slug: "dev",
+  type: "DEVELOPMENT" as const, // `as const` keeps the string-literal type
+};
+
+describe("buildSyntheticSpanRun", () => {
+  it("populates the core identity fields from the snapshot", async () => {
+    const spanRun = await buildSyntheticSpanRun({ environment: ENV, run: makeSyntheticRun() });
+    expect(spanRun.id).toBe("run_internal_1");
+    expect(spanRun.friendlyId).toBe("run_friendly_1");
+    expect(spanRun.taskIdentifier).toBe("hello-world");
+    expect(spanRun.traceId).toBe("trace_1");
+    expect(spanRun.spanId).toBe("span_1");
+    expect(spanRun.environmentId).toBe("env_a");
+    expect(spanRun.engine).toBe("V2");
+    expect(spanRun.workerQueue).toBe("worker-queue-1");
+  });
+
+  it("reports PENDING status and the non-final flags", async () => {
+    const pending = await buildSyntheticSpanRun({ environment: ENV, run: makeSyntheticRun() });
+    expect(pending.status).toBe("PENDING");
+    expect(pending.isFinished).toBe(false);
+    expect(pending.isRunning).toBe(false);
+    expect(pending.isError).toBe(false);
+    expect(pending.startedAt).toBeNull();
+    expect(pending.completedAt).toBeNull();
+  });
+
+  it("pretty-prints the JSON payload from the snapshot", async () => {
+    const { payload, payloadType } = await buildSyntheticSpanRun({
+      environment: ENV,
+      run: makeSyntheticRun({ payload: { message: "hi" }, payloadType: "application/json" }),
+    });
+    // prettyPrintPacket re-serialises the JSON using a 2-space indent.
+    expect(payload).toContain('"message": "hi"');
+    expect(payloadType).toBe("application/json");
+  });
+
+  it("forwards runTags onto `tags` exactly", async () => {
+    const { tags } = await buildSyntheticSpanRun({
+      environment: ENV,
+      run: makeSyntheticRun({ runTags: ["alpha", "beta"] }),
+    });
+    expect(tags).toEqual(["alpha", "beta"]);
+  });
+
+  it("classifies the queue name as custom when it does not start with 'task/'", async () => {
+    const onTaskQueue = await buildSyntheticSpanRun({
+      environment: ENV,
+      run: makeSyntheticRun({ queue: "task/hello-world" }),
+    });
+    expect(onTaskQueue.queue.isCustomQueue).toBe(false);
+
+    const onCustomQueue = await buildSyntheticSpanRun({
+      environment: ENV,
+      run: makeSyntheticRun({ queue: "my-custom" }),
+    });
+    expect(onCustomQueue.queue.isCustomQueue).toBe(true);
+  });
+
+  it("derives idempotency status from the snapshot key/options", async () => {
+    const keyed = await buildSyntheticSpanRun({
+      environment: ENV,
+      run: makeSyntheticRun({ idempotencyKey: "abc", idempotencyKeyOptions: ["scope"] }),
+    });
+    expect(keyed.idempotencyKey).toBe("abc");
+    expect(keyed.idempotencyKeyStatus).toBe("active");
+
+    const unkeyed = await buildSyntheticSpanRun({
+      environment: ENV,
+      run: makeSyntheticRun({ idempotencyKey: undefined, idempotencyKeyOptions: undefined }),
+    });
+    expect(unkeyed.idempotencyKeyStatus).toBeUndefined();
+  });
+
+  it("omits relationships even when parent/root friendlyIds are present, since the snapshot lacks their spanId/taskIdentifier", async () => {
+    const { relationships } = await buildSyntheticSpanRun({
+      environment: ENV,
+      run: makeSyntheticRun({
+        parentTaskRunFriendlyId: "run_parent",
+        rootTaskRunFriendlyId: "run_root",
+      }),
+    });
+    expect(relationships.parent).toBeUndefined();
+    expect(relationships.root).toBeUndefined();
+  });
+
+  it("returns no relationship objects when the snapshot has no parent/root", async () => {
+    const { relationships } = await buildSyntheticSpanRun({
+      environment: ENV,
+      run: makeSyntheticRun(),
+    });
+    expect(relationships.parent).toBeUndefined();
+    expect(relationships.root).toBeUndefined();
+  });
+
+  it("reflects a buffered CANCELED run as a finished, cancelled terminal state", async () => {
+    const cancelled = await buildSyntheticSpanRun({
+      environment: ENV,
+      run: makeSyntheticRun({
+        status: "CANCELED",
+        cancelledAt: NOW,
+        cancelReason: "cancelled by user",
+      }),
+    });
+    expect(cancelled.status).toBe("CANCELED");
+    expect(cancelled.statusReason).toBe("cancelled by user");
+    expect(cancelled.isFinished).toBe(true);
+    expect(cancelled.isError).toBe(false);
+    expect(cancelled.completedAt).toEqual(NOW);
+  });
+
+  it("reflects a buffered FAILED run as a finished, errored SYSTEM_FAILURE", async () => {
+    const failed = await buildSyntheticSpanRun({
+      environment: ENV,
+      run: makeSyntheticRun({
+        status: "FAILED",
+        error: { code: "GATE_REJECTED", message: "buffer rejected the run" },
+      }),
+    });
+    expect(failed.status).toBe("SYSTEM_FAILURE");
+    expect(failed.isFinished).toBe(true);
+    expect(failed.isError).toBe(true);
+    expect(failed.statusReason).toBe("buffer rejected the run");
+    expect(failed.error).toEqual({
+      type: "STRING_ERROR",
+      raw: "GATE_REJECTED: buffer rejected the run",
+    });
+    // SYSTEM_FAILURE rows that live in PG always carry a completedAt, so
+    // the synthetic path mirrors that; otherwise callers testing
+    // `isFinished && completedAt` would render a finished run without a
+    // completion timestamp. Buffer entries have no dedicated failedAt,
+    // which makes createdAt the best-available stand-in.
+    expect(failed.completedAt).toEqual(NOW);
+  });
+
+  it("flags the synthetic run as 'not cached' since cache lookup did not match it", async () => {
+    const { isCached } = await buildSyntheticSpanRun({ environment: ENV, run: makeSyntheticRun() });
+    expect(isCached).toBe(false);
+  });
+});
diff --git a/apps/webapp/test/mollifierSyntheticTrace.test.ts b/apps/webapp/test/mollifierSyntheticTrace.test.ts
new file mode 100644
index 00000000000..ac7425a8fe9
--- /dev/null
+++ b/apps/webapp/test/mollifierSyntheticTrace.test.ts
@@ -0,0 +1,149 @@
+import { describe, expect, it, vi } from "vitest";
+
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+
+import { buildSyntheticTraceForBufferedRun } from "~/v3/mollifier/syntheticTrace.server";
+import type { SyntheticRun } from "~/v3/mollifier/readFallback.server";
+
+const NOW = new Date("2026-05-22T10:00:00Z"); // fixed createdAt of the fixture; also reused as cancelledAt in the CANCELED test
+const ONE_MS_IN_NS = 1_000_000; // nanoseconds per millisecond; the expected floor for trace.duration
+
+function makeSyntheticRun(overrides: Partial<SyntheticRun> = {}): SyntheticRun {
+  const sparse: SyntheticRun = { // QUEUED run with most optional fields unset; tests override per case
+    id: "run_internal_1",
+    friendlyId: "run_friendly_1",
+    status: "QUEUED",
+    cancelledAt: undefined,
+    cancelReason: undefined,
+    delayUntil: undefined,
+    taskIdentifier: "hello-world",
+    createdAt: NOW,
+    payload: undefined,
+    payloadType: undefined,
+    metadata: undefined,
+    metadataType: undefined,
+    seedMetadata: undefined,
+    seedMetadataType: undefined,
+    idempotencyKey: undefined,
+    idempotencyKeyOptions: undefined,
+    isTest: false,
+    depth: 0,
+    ttl: undefined,
+    tags: [],
+    runTags: [],
+    lockedToVersion: undefined,
+    resumeParentOnCompletion: false,
+    parentTaskRunId: undefined,
+    traceId: "trace_1",
+    spanId: "span_1",
+    parentSpanId: undefined,
+    runtimeEnvironmentId: "env_a",
+    engine: "V2",
+    workerQueue: undefined,
+    queue: undefined,
+    concurrencyKey: undefined,
+    machinePreset: undefined,
+    realtimeStreamsVersion: undefined,
+    maxAttempts: undefined,
+    maxDurationInSeconds: undefined,
+    replayedFromTaskRunFriendlyId: undefined,
+    annotations: undefined,
+    traceContext: undefined,
+    scheduleId: undefined,
+    batchId: undefined,
+    parentTaskRunFriendlyId: undefined,
+    rootTaskRunFriendlyId: undefined,
+  };
+  return { ...sparse, ...overrides };
+}
+
+describe("buildSyntheticTraceForBufferedRun", () => {
+  it("populates the synthesised root span from snapshot identity fields", () => {
+    const { events } = buildSyntheticTraceForBufferedRun(makeSyntheticRun());
+    expect(events).toHaveLength(1);
+    const [rootSpan] = events;
+    expect(rootSpan.id).toBe("span_1");
+    expect(rootSpan.data.message).toBe("hello-world");
+    expect(rootSpan.data.startTime).toEqual(NOW);
+    expect(rootSpan.data.isRoot).toBe(true);
+    expect(rootSpan.data.offset).toBe(0);
+    expect(rootSpan.data.level).toBe("TRACE");
+  });
+
+  it("defaults the span message to 'Task' when the snapshot has no taskIdentifier", () => {
+    const { events } = buildSyntheticTraceForBufferedRun(
+      makeSyntheticRun({ taskIdentifier: undefined })
+    );
+    expect(events[0].data.message).toBe("Task");
+  });
+
+  it("falls back to an empty-string span id when the snapshot has no spanId", () => {
+    const { events } = buildSyntheticTraceForBufferedRun(
+      makeSyntheticRun({ spanId: undefined })
+    );
+    expect(events[0].id).toBe("");
+    // An empty id still counts as root because it equals the rootId fallback.
+    expect(events[0].data.isRoot).toBe(true);
+  });
+
+  it("renders a QUEUED buffered run as an executing, partial root span", () => {
+    const { rootSpanStatus, events } = buildSyntheticTraceForBufferedRun(makeSyntheticRun({ status: "QUEUED" }));
+    expect(rootSpanStatus).toBe("executing");
+    expect(events[0].data.isPartial).toBe(true);
+    expect(events[0].data.isError).toBe(false);
+    expect(events[0].data.isCancelled).toBe(false);
+    // For a partial span the duration is null, which the timeline treats as
+    // "still running" (see the duration mapping in syntheticTrace.server.ts).
+    expect(events[0].data.duration).toBeNull();
+  });
+
+  it("renders a CANCELED buffered run as a completed, non-partial cancelled span", () => {
+    const { rootSpanStatus, events } = buildSyntheticTraceForBufferedRun(
+      makeSyntheticRun({ status: "CANCELED", cancelledAt: NOW })
+    );
+    expect(rootSpanStatus).toBe("completed");
+    expect(events[0].data.isPartial).toBe(false);
+    expect(events[0].data.isCancelled).toBe(true);
+    expect(events[0].data.isError).toBe(false);
+    // A non-partial span reports its numeric duration (0 here) instead of null.
+    expect(events[0].data.duration).toBe(0);
+  });
+
+  it("renders a FAILED buffered run as a failed, non-partial errored span", () => {
+    const { rootSpanStatus, events } = buildSyntheticTraceForBufferedRun(
+      makeSyntheticRun({
+        status: "FAILED",
+        error: { code: "GATE_REJECTED", message: "buffer rejected the run" },
+      })
+    );
+    expect(rootSpanStatus).toBe("failed");
+    expect(events[0].data.isPartial).toBe(false);
+    expect(events[0].data.isError).toBe(true);
+    expect(events[0].data.isCancelled).toBe(false);
+    expect(events[0].data.duration).toBe(0);
+  });
+
+  it("floors the trace duration to a minimum of 1ms (in nanoseconds) so the timeline has a positive extent", () => {
+    const { duration } = buildSyntheticTraceForBufferedRun(makeSyntheticRun());
+    expect(duration).toBe(ONE_MS_IN_NS);
+  });
+
+  it("reports the buffered createdAt as the trace's rootStartedAt and leaves startedAt null", () => {
+    const { rootStartedAt, startedAt } = buildSyntheticTraceForBufferedRun(makeSyntheticRun());
+    expect(rootStartedAt).toEqual(NOW);
+    expect(startedAt).toBeNull();
+  });
+
+  it("returns no link or override metadata (buffered traces are single-span)", () => {
+    const synthetic = buildSyntheticTraceForBufferedRun(makeSyntheticRun());
+    expect(synthetic.linkedRunIdBySpanId).toEqual({});
+    expect(synthetic.overridesBySpanId).toBeUndefined();
+    expect(synthetic.queuedDuration).toBeUndefined();
+  });
+
+  it("synthesises an empty events list (no timeline events from the buffer)", () => {
+    const { events } = buildSyntheticTraceForBufferedRun(makeSyntheticRun());
+    expect(events[0].data.events).toEqual([]);
+    expect(events[0].data.timelineEvents).toEqual([]);
+  });
+});
diff --git a/apps/webapp/test/mollifierTripEvaluator.test.ts b/apps/webapp/test/mollifierTripEvaluator.test.ts
index b9a9bf8c94a..14ac0cc55bc 100644
--- a/apps/webapp/test/mollifierTripEvaluator.test.ts
+++ b/apps/webapp/test/mollifierTripEvaluator.test.ts
@@ -14,7 +14,7 @@ describe("createRealTripEvaluator", () => {
   redisTest(
     "returns divert=false when the sliding window stays under threshold",
     async ({ redisOptions }) => {
-      const buffer = new MollifierBuffer({ redisOptions, entryTtlSeconds: 600 });
+      const buffer = new MollifierBuffer({ redisOptions });
       try {
         const evaluator = createRealTripEvaluator({
           getBuffer: () => buffer,
@@ -32,7 +32,7 @@ describe("createRealTripEvaluator", () => {
   redisTest(
     "returns divert=true with reason per_env_rate once the window trips",
     async ({ redisOptions }) => {
-      const buffer = new MollifierBuffer({ redisOptions, entryTtlSeconds: 600 });
+      const buffer = new MollifierBuffer({ redisOptions });
       try {
         // threshold=2 → the 3rd call within windowMs is the first that trips.
         const options = { windowMs: 5000, threshold: 2, holdMs: 5000 } as const;
@@ -73,7 +73,7 @@ describe("createRealTripEvaluator", () => {
   redisTest(
     "returns divert=false when buffer throws (fail-open)",
     async ({ redisOptions }) => {
-      const buffer = new MollifierBuffer({ redisOptions, entryTtlSeconds: 600 });
+      const buffer = new MollifierBuffer({ redisOptions });
       // Closing the client up front means evaluateTrip will throw on the first
       // Redis command — a real failure mode, not a stub.
       await buffer.close();
diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts
index da42247111a..835ff90cc48 100644
--- a/internal-packages/run-engine/src/engine/index.ts
+++ b/internal-packages/run-engine/src/engine/index.ts
@@ -450,6 +450,199 @@ export class RunEngine {
 
   //MARK: - Run functions
 
+  /**
+   * Writes a TaskRun row in CANCELED state directly, bypassing the trigger
+   * pipeline. Used by the mollifier drainer when a cancel API call lands on
+   * a buffered run before it materialises.
+   *
+   * Skips: queue insertion (no execution), waitpoint creation (the
+   * mollifier gate refuses to buffer triggerAndWait children, so a
+   * cancelled buffered run never has a waiting parent to unblock),
+   * concurrency reservation. Emits `runCancelled` by default — callers
+   * working on buffered-only runs (no primary trace event exists) can
+   * opt out via `emitRunCancelledEvent: false` to avoid the systematic
+   * "Failed to cancel run event" noise the handler would log when its
+   * `cancelRunEvent` call can't find a span.
+   *
+   * Idempotent: on Prisma's P2002 unique-constraint violation (double
+   * drainer pop after requeue) the row with the same id is looked up and
+   * returned if it is already CANCELED, without re-emitting runCancelled.
+   * A non-CANCELED existing row throws a conflict Error; if no row with
+   * that id is found, the original P2002 error is rethrown.
+   */
+  async createCancelledRun(
+    {
+      snapshot,
+      cancelledAt,
+      cancelReason,
+      emitRunCancelledEvent = true,
+    }: {
+      snapshot: TriggerParams;
+      cancelledAt: Date;
+      cancelReason: string;
+      /**
+       * Whether to emit the `runCancelled` engine-bus event. Defaults to
+       * true.
+       *
+       * Set to `false` for buffered-only runs that never had a primary
+       * trace event written (the mollifier gate never called
+       * `repository.traceEvent` for them). The `runCancelled` handler in
+       * `runEngineHandlers.server.ts` calls `cancelRunEvent`, which
+       * looks up the run's primary span in the event store — for
+       * buffered-only runs that span doesn't exist, so the lookup fails,
+       * the handler's `tryCatch` swallows it, and a "[runCancelled]
+       * Failed to cancel run event" error is logged for every cancelled
+       * buffered run. Suppressing the emit avoids that systematic noise.
+       * The CANCELED PG row is still written; only the trace-event
+       * mirror is skipped.
+       */
+      emitRunCancelledEvent?: boolean;
+    },
+    tx?: PrismaClientOrTransaction,
+  ): Promise<TaskRun> {
+    const prisma = tx ?? this.prisma;
+    return startSpan(this.tracer, "createCancelledRun", async (span) => {
+      span.setAttribute("friendlyId", snapshot.friendlyId);
+      span.setAttribute("taskIdentifier", snapshot.taskIdentifier);
+      const id = RunId.fromFriendlyId(snapshot.friendlyId);
+      const error: TaskRunError = { type: "STRING_ERROR", raw: cancelReason };
+
+      try {
+        const taskRun = await prisma.taskRun.create({
+          data: {
+            id,
+            engine: "V2",
+            status: "CANCELED",
+            friendlyId: snapshot.friendlyId,
+            runtimeEnvironmentId: snapshot.environment.id,
+            environmentType: snapshot.environment.type,
+            organizationId: snapshot.environment.organization.id,
+            projectId: snapshot.environment.project.id,
+            idempotencyKey: snapshot.idempotencyKey,
+            idempotencyKeyExpiresAt: snapshot.idempotencyKeyExpiresAt,
+            idempotencyKeyOptions: snapshot.idempotencyKeyOptions,
+            taskIdentifier: snapshot.taskIdentifier,
+            payload: snapshot.payload,
+            payloadType: snapshot.payloadType,
+            context: snapshot.context,
+            traceContext: snapshot.traceContext,
+            traceId: snapshot.traceId,
+            spanId: snapshot.spanId,
+            parentSpanId: snapshot.parentSpanId,
+            lockedToVersionId: snapshot.lockedToVersionId,
+            taskVersion: snapshot.taskVersion,
+            sdkVersion: snapshot.sdkVersion,
+            cliVersion: snapshot.cliVersion,
+            concurrencyKey: snapshot.concurrencyKey,
+            queue: snapshot.queue,
+            lockedQueueId: snapshot.lockedQueueId,
+            workerQueue: snapshot.workerQueue,
+            isTest: snapshot.isTest,
+            taskEventStore: snapshot.taskEventStore,
+            // Defensive: the snapshot comes from a cjson-encoded buffer
+            // payload, where empty Lua tables encode as `{}` not `[]`. If
+            // the drainer pops a buffered run with no tags, snapshot.tags
+            // will be an empty object, which Prisma misreads as a relation
+            // update op. Normalise to a real array (or undefined for the
+            // empty case).
+            runTags: Array.isArray(snapshot.tags) && snapshot.tags.length > 0
+              ? snapshot.tags
+              : undefined,
+            oneTimeUseToken: snapshot.oneTimeUseToken,
+            parentTaskRunId: snapshot.parentTaskRunId,
+            rootTaskRunId: snapshot.rootTaskRunId,
+            replayedFromTaskRunFriendlyId: snapshot.replayedFromTaskRunFriendlyId,
+            batchId: snapshot.batch?.id,
+            resumeParentOnCompletion: snapshot.resumeParentOnCompletion,
+            depth: snapshot.depth,
+            seedMetadata: snapshot.seedMetadata,
+            seedMetadataType: snapshot.seedMetadataType,
+            metadata: snapshot.metadata,
+            metadataType: snapshot.metadataType,
+            machinePreset: snapshot.machine,
+            scheduleId: snapshot.scheduleId,
+            scheduleInstanceId: snapshot.scheduleInstanceId,
+            createdAt: snapshot.createdAt,
+            bulkActionGroupIds: snapshot.bulkActionId ? [snapshot.bulkActionId] : undefined,
+            planType: snapshot.planType,
+            realtimeStreamsVersion: snapshot.realtimeStreamsVersion,
+            streamBasinName: snapshot.streamBasinName,
+            annotations: snapshot.annotations,
+            completedAt: cancelledAt,
+            updatedAt: cancelledAt,
+            error: error as unknown as Prisma.InputJsonValue,
+            attemptNumber: 0,
+            executionSnapshots: {
+              create: {
+                engine: "V2",
+                executionStatus: "FINISHED",
+                description: "Run cancelled before materialisation",
+                runStatus: "CANCELED",
+                environmentId: snapshot.environment.id,
+                environmentType: snapshot.environment.type,
+                projectId: snapshot.environment.project.id,
+                organizationId: snapshot.environment.organization.id,
+              },
+            },
+          },
+        });
+
+        if (emitRunCancelledEvent) {
+          this.eventBus.emit("runCancelled", {
+            time: cancelledAt,
+            run: {
+              id: taskRun.id,
+              status: taskRun.status,
+              friendlyId: taskRun.friendlyId,
+              spanId: taskRun.spanId,
+              taskEventStore: taskRun.taskEventStore,
+              createdAt: taskRun.createdAt,
+              completedAt: taskRun.completedAt,
+              error,
+              updatedAt: taskRun.updatedAt,
+              attemptNumber: taskRun.attemptNumber ?? 0,
+            },
+            organization: { id: snapshot.environment.organization.id },
+            project: { id: snapshot.environment.project.id },
+            environment: { id: snapshot.environment.id },
+          });
+        }
+
+        return taskRun;
+      } catch (err) {
+        // P2002 = unique constraint violation; a double-pop after a drainer
+        // requeue can reach this. NOTE(review): with an interactive `tx` the
+        // failed insert may abort the transaction before this lookup — confirm.
+        if (
+          err instanceof Prisma.PrismaClientKnownRequestError &&
+          err.code === "P2002"
+        ) {
+          const existing = await prisma.taskRun.findFirst({ where: { id } });
+          if (existing) {
+            // Only treat the conflict as idempotent when the existing
+            // row is ALREADY canceled. If a non-canceled row landed
+            // first (e.g. the drainer's normal `engine.trigger` replay
+            // path raced ahead of the cancel) we surface a conflict
+            // rather than silently reporting "cancelled" — the run is
+            // genuinely live and the caller must decide between
+            // engine.cancelRun() and skipping.
+            if (existing.status === "CANCELED") {
+              this.logger.info(
+                "createCancelledRun: row already exists, returning existing (idempotent)",
+                { friendlyId: snapshot.friendlyId },
+              );
+              return existing;
+            }
+            throw new Error(
+              `createCancelledRun conflict: existing run ${snapshot.friendlyId} has status ${existing.status}`,
+            );
+          }
+        }
+        throw err;
+      }
+    });
+  }
+
   /** "Triggers" one run. */
   async trigger(
     {
@@ -648,7 +841,16 @@ export class RunEngine {
               priorityMs,
               queueTimestamp: queueTimestamp ?? delayUntil ?? new Date(),
               ttl: resolvedTtl,
-              runTags: tags.length === 0 ? undefined : tags,
+              // Defensive: when the mollifier drainer replays a buffered
+              // snapshot whose payload was rewritten by a buffer-side Lua
+              // mutate (e.g. append_tags clears an empty list), cjson
+              // encodes an empty Lua table as `{}` rather than `[]`. JS
+              // parses that back as an empty object, and `{}.length` is
+              // undefined — the original `tags.length === 0` check would
+              // pass `{}` straight to Prisma's `String[]` column. Mirror
+              // the same Array.isArray guard that `createCancelledRun`
+              // uses for symmetry with the trigger replay path.
+              runTags: Array.isArray(tags) && tags.length > 0 ? tags : undefined,
               oneTimeUseToken,
               parentTaskRunId,
               rootTaskRunId,
@@ -881,6 +1083,7 @@ export class RunEngine {
     taskEventStore,
     queue: queueOverride,
     lockedQueueId: lockedQueueIdOverride,
+    emitRunFailedEvent = true,
   }: {
     friendlyId: string;
     environment: {
@@ -908,6 +1111,19 @@ export class RunEngine {
     queue?: string;
     /** Resolved TaskQueue.id when the task is locked to a specific queue. */
     lockedQueueId?: string;
+    /**
+     * Whether to emit the `runFailed` engine-bus event. Defaults to true.
+     *
+     * Set to `false` when the caller is ALREADY managing the trace-event
+     * lifecycle for this run via `repository.traceEvent({ incomplete: false,
+     * isError: true, ... })`. In that path the outer trace event handles
+     * span completion itself; emitting `runFailed` from here causes the
+     * `runFailed` → `completeFailedRunEvent` handler to write a second
+     * completion row for the same (traceId, spanId), racing with the
+     * outer trace event's own write. The alert side of `runFailed` is
+     * preserved by emitting from the caller after `traceEvent` returns.
+     */
+    emitRunFailedEvent?: boolean;
   }): Promise<TaskRun> {
     return startSpan(
       this.tracer,
@@ -983,6 +1199,57 @@ export class RunEngine {
           });
         }
 
+        // Emit `runFailed` so the alert pipeline picks up the
+        // SYSTEM_FAILURE row and the event-store handler writes the
+        // completion event into the trace. Without this the mollifier
+        // drainer's terminal failures (and batch-trigger's
+        // exceed-limit failures) land in PG silently — visible in the
+        // dashboard list but never reaching customers' configured
+        // ERROR alert channels.
+        //
+        // Gated by `emitRunFailedEvent` so call sites that already wrap
+        // this inside `repository.traceEvent({ incomplete: false,
+        // isError: true })` can opt out — the outer trace event writes
+        // the completion row itself, and a second write via
+        // `completeFailedRunEvent` would race against it. Callers that
+        // disable the emit are responsible for triggering the alerts
+        // side themselves (e.g. by calling
+        // `PerformTaskRunAlertsService.enqueue` directly after the
+        // trace event closes).
+        if (!emitRunFailedEvent) {
+          return taskRun;
+        }
+        this.eventBus.emit("runFailed", {
+          time: taskRun.completedAt ?? new Date(),
+          run: {
+            id: taskRun.id,
+            status: taskRun.status,
+            spanId: taskRun.spanId,
+            error,
+            taskEventStore: taskRun.taskEventStore,
+            createdAt: taskRun.createdAt,
+            completedAt: taskRun.completedAt,
+            updatedAt: taskRun.updatedAt,
+            // This row never attempted execution — it's a synthesised
+            // terminal failure. The alert payload's `attemptNumber=0`
+            // is the signal downstream consumers can use to
+            // distinguish a never-ran failure from a run that
+            // exhausted its retries.
+            attemptNumber: 0,
+            usageDurationMs: 0,
+            costInCents: 0,
+          },
+          organization: {
+            id: environment.organization.id,
+          },
+          project: {
+            id: environment.project.id,
+          },
+          environment: {
+            id: environment.id,
+          },
+        });
+
         return taskRun;
       },
       {
diff --git a/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts b/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts
new file mode 100644
index 00000000000..68662074ea2
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts
@@ -0,0 +1,345 @@
+import { containerTest } from "@internal/testcontainers";
+import { trace } from "@internal/tracing";
+import { RunId } from "@trigger.dev/core/v3/isomorphic";
+
+function freshRunId() {
+  return RunId.generate().friendlyId;
+}
+import { expect } from "vitest";
+import { RunEngine } from "../index.js";
+import type { EventBusEventArgs } from "../eventBus.js";
+import { setupAuthenticatedEnvironment } from "./setup.js";
+
+vi.setConfig({ testTimeout: 60_000 });
+
+function baseEngineOptions(redisOptions: Parameters<typeof RunEngine>[0]["queue"]["redis"]) {
+  return {
+    worker: {
+      redis: redisOptions,
+      workers: 1,
+      tasksPerWorker: 10,
+      pollIntervalMs: 100,
+    },
+    queue: {
+      redis: redisOptions,
+      masterQueueConsumersDisabled: true,
+      processWorkerQueueDebounceMs: 50,
+    },
+    runLock: {
+      redis: redisOptions,
+    },
+    machines: {
+      defaultMachine: "small-1x" as const,
+      machines: {
+        "small-1x": {
+          name: "small-1x" as const,
+          cpu: 0.5,
+          memory: 0.5,
+          centsPerMs: 0.0001,
+        },
+      },
+      baseCostInCents: 0.0001,
+    },
+    tracer: trace.getTracer("test", "0.0.0"),
+  };
+}
+
+// engine.createCancelledRun writes a CANCELED TaskRun row directly from
+// a buffer snapshot. These tests verify the bypass-queue /
+// bypass-waitpoint / emit-runCancelled contract.
+describe("RunEngine.createCancelledRun", () => {
+  containerTest(
+    "writes CANCELED PG row with snapshot fields, completedAt, error",
+    async ({ prisma, redisOptions }) => {
+      const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+      const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) });
+      try {
+        const friendlyId = freshRunId();
+        const cancelledAt = new Date("2026-05-20T12:00:00.000Z");
+        const cancelReason = "Canceled by user";
+
+        const result = await engine.createCancelledRun({
+          snapshot: {
+            friendlyId,
+            environment: env,
+            taskIdentifier: "test-task",
+            payload: '{"hello":"world"}',
+            payloadType: "application/json",
+            context: {},
+            traceContext: {},
+            traceId: "0000000000000000aaaa000000000000",
+            spanId: "bbbb000000000000",
+            queue: "task/test-task",
+            isTest: false,
+            tags: ["test-tag"],
+          },
+          cancelledAt,
+          cancelReason,
+        });
+
+        expect(result.status).toBe("CANCELED");
+        expect(result.friendlyId).toBe(friendlyId);
+        expect(result.id).toBe(RunId.fromFriendlyId(friendlyId));
+        expect(result.completedAt?.toISOString()).toBe(cancelledAt.toISOString());
+        expect(result.taskIdentifier).toBe("test-task");
+        expect(result.runTags).toEqual(["test-tag"]);
+        expect(result.payload).toBe('{"hello":"world"}');
+        const err = result.error as { type?: string; raw?: string };
+        expect(err.type).toBe("STRING_ERROR");
+        expect(err.raw).toBe(cancelReason);
+
+        // Verify the row was actually persisted to PG (findFirst returns it).
+        const stored = await prisma.taskRun.findFirst({
+          where: { friendlyId },
+        });
+        expect(stored).not.toBeNull();
+        expect(stored!.status).toBe("CANCELED");
+      } finally {
+        await engine.quit();
+      }
+    },
+  );
+
+  containerTest(
+    "emits runCancelled with correct payload",
+    async ({ prisma, redisOptions }) => {
+      const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+      const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) });
+      const captured: EventBusEventArgs<"runCancelled">[0][] = [];
+      engine.eventBus.on("runCancelled", (event) => {
+        captured.push(event);
+      });
+
+      try {
+        const cancelledAt = new Date();
+        const cancelReason = "Test cancel";
+        const friendlyId = freshRunId();
+        await engine.createCancelledRun({
+          snapshot: {
+            friendlyId,
+            environment: env,
+            taskIdentifier: "test-task",
+            payload: "{}",
+            payloadType: "application/json",
+            context: {},
+            traceContext: {},
+            traceId: "0000000000000000cccc000000000000",
+            spanId: "dddd000000000000",
+            queue: "task/test-task",
+            isTest: false,
+            tags: [],
+          },
+          cancelledAt,
+          cancelReason,
+        });
+
+        expect(captured).toHaveLength(1);
+        expect(captured[0]!.run.status).toBe("CANCELED");
+        expect(captured[0]!.run.friendlyId).toBe(friendlyId);
+        expect(captured[0]!.run.error).toEqual({ type: "STRING_ERROR", raw: cancelReason });
+        expect(captured[0]!.organization.id).toBe(env.organization.id);
+      } finally {
+        await engine.quit();
+      }
+    },
+  );
+
+  containerTest(
+    "emitRunCancelledEvent: false suppresses the bus emit but still writes the CANCELED PG row",
+    async ({ prisma, redisOptions }) => {
+      // The mollifier drainer passes `emitRunCancelledEvent: false` for
+      // buffered-only runs because the runCancelled handler's
+      // `cancelRunEvent` lookup fails for them (no primary trace event
+      // span exists — the mollifier gate never called
+      // `repository.traceEvent` for this run). Without the gate, every
+      // cancelled buffered run produces a `[runCancelled] Failed to
+      // cancel run event` error log. This pins the gate's contract: PG
+      // row still lands, bus emit suppressed.
+      const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+      const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) });
+      const captured: EventBusEventArgs<"runCancelled">[0][] = [];
+      engine.eventBus.on("runCancelled", (event) => {
+        captured.push(event);
+      });
+
+      try {
+        const friendlyId = freshRunId();
+        const result = await engine.createCancelledRun({
+          snapshot: {
+            friendlyId,
+            environment: env,
+            taskIdentifier: "test-task",
+            payload: "{}",
+            payloadType: "application/json",
+            context: {},
+            traceContext: {},
+            traceId: "0000000000000000eeee000000000000",
+            spanId: "ffff000000000000",
+            queue: "task/test-task",
+            isTest: false,
+            tags: [],
+          },
+          cancelledAt: new Date(),
+          cancelReason: "Test cancel (silent emit)",
+          emitRunCancelledEvent: false,
+        });
+
+        // PG row still lands.
+        expect(result.status).toBe("CANCELED");
+        expect(result.friendlyId).toBe(friendlyId);
+        // Bus emit suppressed.
+        expect(captured).toHaveLength(0);
+      } finally {
+        await engine.quit();
+      }
+    },
+  );
+
+  containerTest(
+    "idempotent on double-pop: second call returns existing row without re-emitting",
+    async ({ prisma, redisOptions }) => {
+      const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+      const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) });
+      const captured: EventBusEventArgs<"runCancelled">[0][] = [];
+      engine.eventBus.on("runCancelled", (event) => {
+        captured.push(event);
+      });
+
+      try {
+        const snapshot = {
+          friendlyId: freshRunId(),
+          environment: env,
+          taskIdentifier: "test-task",
+          payload: "{}",
+          payloadType: "application/json",
+          context: {},
+          traceContext: {},
+          traceId: "0000000000000000eeee000000000000",
+          spanId: "ffff000000000000",
+          queue: "task/test-task",
+          isTest: false,
+          tags: [],
+        };
+        const cancelledAt = new Date();
+        const cancelReason = "Test idempotent";
+
+        const first = await engine.createCancelledRun({ snapshot, cancelledAt, cancelReason });
+        const second = await engine.createCancelledRun({ snapshot, cancelledAt, cancelReason });
+
+        expect(second.id).toBe(first.id);
+        // Only the first call's emit fired; the P2002 path skips re-emission.
+        expect(captured).toHaveLength(1);
+      } finally {
+        await engine.quit();
+      }
+    },
+  );
+
+  // Regression: cjson encodes empty Lua tables as `{}`, not `[]`. When
+  // the drainer pops a buffered run that never had a tag set, the
+  // deserialised snapshot's `tags` field is an empty object. The old
+  // implementation passed it straight into Prisma's `runTags:` field;
+  // Prisma read the object as the scalar-list `{ set: [...] }` input and
+  // threw `Argument 'set' is missing`. The drainer caught the error and
+  // marked the buffer entry FAILED — so the CANCELED PG row never
+  // landed.
+  containerTest(
+    "tolerates snapshot.tags being an empty object (cjson edge case)",
+    async ({ prisma, redisOptions }) => {
+      const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+      const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) });
+      try {
+        const friendlyId = freshRunId();
+        // Cast through unknown to simulate the cjson-decode output shape
+        // for an empty Lua table — TypeScript's snapshot type says
+        // string[], but the buffer Lua delivers {} for the empty case.
+        const result = await engine.createCancelledRun({
+          snapshot: {
+            friendlyId,
+            environment: env,
+            taskIdentifier: "test-task",
+            payload: "{}",
+            payloadType: "application/json",
+            context: {},
+            traceContext: {},
+            traceId: "0000000000000000abcd000000000000",
+            spanId: "1234000000000000",
+            queue: "task/test-task",
+            isTest: false,
+            tags: {} as unknown as string[],
+          },
+          cancelledAt: new Date(),
+          cancelReason: "Cancelled — empty tags",
+        });
+        expect(result.status).toBe("CANCELED");
+        expect(result.friendlyId).toBe(friendlyId);
+        // `{}` carries no usable tags, so none should be written; the
+        // returned row must expose `runTags` as an empty array.
+        expect(result.runTags).toEqual([]);
+      } finally {
+        await engine.quit();
+      }
+    },
+  );
+
+  // Regression: the P2002-on-id idempotency path used to return ANY
+  // existing row, which would silently report success even if a live
+  // (non-CANCELED) row landed first. The guard now requires the
+  // existing row's status to be CANCELED; anything else surfaces a
+  // conflict so the caller can route to engine.cancelRun() or skip.
+  containerTest(
+    "P2002 conflict with non-CANCELED existing row throws (does not silently succeed)",
+    async ({ prisma, redisOptions }) => {
+      const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+      const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) });
+      try {
+        const friendlyId = freshRunId();
+        const id = RunId.fromFriendlyId(friendlyId);
+
+        // Plant a live (non-CANCELED) row with the same id so the
+        // cancelled-run INSERT hits P2002 and the guard finds a row
+        // that ISN'T CANCELED.
+        await prisma.taskRun.create({
+          data: {
+            id,
+            friendlyId,
+            taskIdentifier: "test-task",
+            payload: "{}",
+            payloadType: "application/json",
+            status: "PENDING",
+            runtimeEnvironmentId: env.id,
+            projectId: env.project.id,
+            organizationId: env.organizationId,
+            queue: "task/test-task",
+            traceId: "0000000000000000aaaa000000000000",
+            spanId: "bbbb000000000000",
+            engine: "V2",
+          },
+        });
+
+        await expect(
+          engine.createCancelledRun({
+            snapshot: {
+              friendlyId,
+              environment: env,
+              taskIdentifier: "test-task",
+              payload: "{}",
+              payloadType: "application/json",
+              context: {},
+              traceContext: {},
+              traceId: "0000000000000000aaaa000000000000",
+              spanId: "bbbb000000000000",
+              queue: "task/test-task",
+              isTest: false,
+              tags: [],
+            },
+            cancelledAt: new Date(),
+            cancelReason: "Should not silently overwrite a live row",
+          }),
+        ).rejects.toThrow(/createCancelledRun conflict.*PENDING/);
+      } finally {
+        await engine.quit();
+      }
+    },
+  );
+});
diff --git a/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts b/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts
new file mode 100644
index 00000000000..84d33baa87d
--- /dev/null
+++ b/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts
@@ -0,0 +1,176 @@
+import { containerTest } from "@internal/testcontainers";
+import { trace } from "@internal/tracing";
+import { generateFriendlyId } from "@trigger.dev/core/v3/isomorphic";
+import { expect } from "vitest";
+import { RunEngine } from "../index.js";
+import { EventBusEventArgs } from "../eventBus.js";
+import { setupAuthenticatedEnvironment } from "./setup.js";
+
+vi.setConfig({ testTimeout: 60_000 });
+
+describe("RunEngine.createFailedTaskRun", () => {
+  containerTest("emits runFailed so the alert pipeline wakes up", async ({ prisma, redisOptions }) => {
+    // The mollifier drainer (and batch-trigger over-limit path) call
+    // createFailedTaskRun to write a terminal SYSTEM_FAILURE PG row
+    // for runs that never actually executed. Without an explicit
+    // runFailed emit, the row lands silently — the
+    // runEngineHandlers' `runFailed` listener (which enqueues
+    // PerformTaskRunAlertsService) never fires, so customers'
+    // configured TASK_RUN alert channels miss the failure entirely.
+    //
+    // Regression intent: if the emit is removed or moved out of
+    // createFailedTaskRun's success path, this test fails. The
+    // shape assertions pin the fields the alert delivery service
+    // reads from the event payload (run.id, run.status, error,
+    // attemptNumber=0 as the never-ran-marker).
+    const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+    const engine = new RunEngine({
+      prisma,
+      worker: {
+        redis: redisOptions,
+        workers: 1,
+        tasksPerWorker: 10,
+        pollIntervalMs: 100,
+      },
+      queue: {
+        redis: redisOptions,
+        masterQueueConsumersDisabled: true,
+        processWorkerQueueDebounceMs: 50,
+      },
+      runLock: {
+        redis: redisOptions,
+      },
+      machines: {
+        defaultMachine: "small-1x",
+        machines: {
+          "small-1x": {
+            name: "small-1x" as const,
+            cpu: 0.5,
+            memory: 0.5,
+            centsPerMs: 0.0001,
+          },
+        },
+        baseCostInCents: 0.0005,
+      },
+      tracer: trace.getTracer("test", "0.0.0"),
+    });
+
+    try {
+      const failedEvents: EventBusEventArgs<"runFailed">[0][] = [];
+      engine.eventBus.on("runFailed", (event) => {
+        failedEvents.push(event);
+      });
+
+      const friendlyId = generateFriendlyId("run");
+      const taskIdentifier = "drainer-terminal-test";
+
+      const failed = await engine.createFailedTaskRun({
+        friendlyId,
+        environment: {
+          id: authenticatedEnvironment.id,
+          type: authenticatedEnvironment.type,
+          project: { id: authenticatedEnvironment.project.id },
+          organization: { id: authenticatedEnvironment.organization.id },
+        },
+        taskIdentifier,
+        payload: "{}",
+        payloadType: "application/json",
+        error: {
+          type: "STRING_ERROR",
+          raw: "Mollifier drainer terminal failure: synthetic engine.trigger panic",
+        },
+        traceId: "0123456789abcdef0123456789abcdef",
+        spanId: "fedcba9876543210",
+      });
+
+      expect(failed.status).toBe("SYSTEM_FAILURE");
+
+      expect(failedEvents).toHaveLength(1);
+      const event = failedEvents[0];
+      expect(event.run.id).toBe(failed.id);
+      expect(event.run.status).toBe("SYSTEM_FAILURE");
+      expect(event.run.spanId).toBe("fedcba9876543210");
+      // attemptNumber=0 is the marker that the run never executed —
+      // it's a synthesised terminal failure, not an exhausted-retries
+      // failure. Downstream consumers can use this to distinguish.
+      expect(event.run.attemptNumber).toBe(0);
+      expect(event.run.usageDurationMs).toBe(0);
+      expect(event.run.costInCents).toBe(0);
+      expect(event.run.error).toEqual({
+        type: "STRING_ERROR",
+        raw: "Mollifier drainer terminal failure: synthetic engine.trigger panic",
+      });
+      expect(event.organization.id).toBe(authenticatedEnvironment.organization.id);
+      expect(event.project.id).toBe(authenticatedEnvironment.project.id);
+      expect(event.environment.id).toBe(authenticatedEnvironment.id);
+    } finally {
+      await engine.quit();
+    }
+  });
+
+  // The TriggerFailedTaskService.call() path wraps createFailedTaskRun
+  // inside `repository.traceEvent({ incomplete: false, isError: true })`
+  // which already writes the completion row for the (traceId, spanId).
+  // Emitting `runFailed` from here would cause the
+  // `completeFailedRunEvent` handler to race a second write against
+  // the same span — the `emitRunFailedEvent: false` opt-out is what
+  // suppresses the emit. The PG row + alert side stay correct because
+  // the caller enqueues `PerformTaskRunAlertsService.enqueue(run.id)`
+  // directly after the trace event closes.
+  containerTest(
+    "emitRunFailedEvent: false suppresses the bus emit but still creates the PG row",
+    async ({ prisma, redisOptions }) => {
+      const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+      const engine = new RunEngine({
+        prisma,
+        worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 },
+        queue: { redis: redisOptions, masterQueueConsumersDisabled: true, processWorkerQueueDebounceMs: 50 },
+        runLock: { redis: redisOptions },
+        machines: {
+          defaultMachine: "small-1x",
+          machines: {
+            "small-1x": { name: "small-1x" as const, cpu: 0.5, memory: 0.5, centsPerMs: 0.0001 },
+          },
+          baseCostInCents: 0.0005,
+        },
+        tracer: trace.getTracer("test", "0.0.0"),
+      });
+
+      try {
+        const failedEvents: EventBusEventArgs<"runFailed">[0][] = [];
+        engine.eventBus.on("runFailed", (event) => {
+          failedEvents.push(event);
+        });
+
+        const friendlyId = generateFriendlyId("run");
+        const failed = await engine.createFailedTaskRun({
+          friendlyId,
+          environment: {
+            id: authenticatedEnvironment.id,
+            type: authenticatedEnvironment.type,
+            project: { id: authenticatedEnvironment.project.id },
+            organization: { id: authenticatedEnvironment.organization.id },
+          },
+          taskIdentifier: "outer-trace-event-test",
+          payload: "{}",
+          payloadType: "application/json",
+          error: { type: "STRING_ERROR", raw: "outer trace event manages span" },
+          traceId: "0123456789abcdef0123456789abcdef",
+          spanId: "fedcba9876543210",
+          emitRunFailedEvent: false,
+        });
+
+        // PG row landed (caller still gets a usable TaskRun).
+        expect(failed.status).toBe("SYSTEM_FAILURE");
+        expect(failed.friendlyId).toBe(friendlyId);
+
+        // Bus emit was suppressed.
+        expect(failedEvents).toHaveLength(0);
+      } finally {
+        await engine.quit();
+      }
+    },
+  );
+});
diff --git a/packages/core/src/v3/schemas/api.ts b/packages/core/src/v3/schemas/api.ts
index e86e503de47..570f03aca2c 100644
--- a/packages/core/src/v3/schemas/api.ts
+++ b/packages/core/src/v3/schemas/api.ts
@@ -157,6 +157,14 @@ export const IdempotencyKeyOptionsSchema = z.object({
 
 export type IdempotencyKeyOptionsSchema = z.infer<typeof IdempotencyKeyOptionsSchema>;
 
+// Coerces user-supplied concurrencyKey values to string. The downstream Prisma
+// column is String?, so passing a number (a common foot-gun when callers do
+// `concurrencyKey: payload.userId`) used to fail at `prisma.taskRun.create`
+// with PrismaClientValidationError. Accept the intent and stringify here.
+const ConcurrencyKeySchema = z
+  .union([z.string(), z.number()])
+  .transform((value) => String(value));
+
 export const TriggerTaskRequestBody = z.object({
   payload: z.any(),
   context: z.any(),
@@ -195,7 +203,7 @@ export const TriggerTaskRequestBody = z.object({
           concurrencyLimit: z.number().int().optional(),
         })
         .optional(),
-      concurrencyKey: z.string().optional(),
+      concurrencyKey: ConcurrencyKeySchema.optional(),
       delay: z.string().or(z.coerce.date()).optional(),
       idempotencyKey: z
         .string()
@@ -253,7 +261,7 @@ export const BatchTriggerTaskItem = z.object({
   context: z.any(),
   options: z
     .object({
-      concurrencyKey: z.string().optional(),
+      concurrencyKey: ConcurrencyKeySchema.optional(),
       delay: z.string().or(z.coerce.date()).optional(),
       idempotencyKey: z
         .string()
@@ -401,7 +409,12 @@ export type CreateBatchResponse = z.infer<typeof CreateBatchResponse>;
 
 /**
  * Phase 2: Individual item in the NDJSON stream
- * Each line in the NDJSON body should match this schema
+ * Each line in the NDJSON body should match this schema.
+ *
+ * `options` reuses the strict shape from BatchTriggerTaskItem so that the
+ * Phase-2 streaming path validates option fields identically to the V2/V3
+ * batch trigger endpoints — historically this used z.record(z.unknown()) and
+ * let unvalidated values (e.g. a numeric concurrencyKey) reach Prisma.
  */
 export const BatchItemNDJSON = z.object({
   /** Zero-based index of this item (used for idempotency and ordering) */
@@ -411,7 +424,7 @@ export const BatchItemNDJSON = z.object({
   /** The payload for this task run */
   payload: z.unknown().optional(),
   /** Options for this specific item */
-  options: z.record(z.unknown()).optional(),
+  options: BatchTriggerTaskItem.shape.options,
 });
 
 export type BatchItemNDJSON = z.infer<typeof BatchItemNDJSON>;
diff --git a/packages/core/src/v3/schemas/batchItemNDJSON.test.ts b/packages/core/src/v3/schemas/batchItemNDJSON.test.ts
new file mode 100644
index 00000000000..f130bba4450
--- /dev/null
+++ b/packages/core/src/v3/schemas/batchItemNDJSON.test.ts
@@ -0,0 +1,88 @@
+import { describe, it, expect } from "vitest";
+import { BatchItemNDJSON, BatchTriggerTaskItem, TriggerTaskRequestBody } from "./api.js";
+
+describe("concurrencyKey coercion", () => {
+  // Phase-2 NDJSON used to accept arbitrary shapes for `options`, so a numeric
+  // concurrencyKey (a common foot-gun when callers pass
+  // `concurrencyKey: payload.userId`) reached Prisma untouched and failed
+  // there with PrismaClientValidationError. The schema now coerces
+  // number → string at the API boundary across every trigger path.
+  describe("BatchItemNDJSON", () => {
+    it("coerces a numeric concurrencyKey to a string", () => {
+      const result = BatchItemNDJSON.safeParse({
+        index: 0,
+        task: "user-workflow-tick",
+        payload: { json: { userId: 51262 } },
+        options: { concurrencyKey: 51262 },
+      });
+
+      expect(result.success).toBe(true);
+      if (result.success) {
+        expect(result.data.options?.concurrencyKey).toBe("51262");
+      }
+    });
+
+    it("accepts a string concurrencyKey unchanged", () => {
+      const result = BatchItemNDJSON.safeParse({
+        index: 0,
+        task: "user-workflow-tick",
+        payload: { json: { userId: 51262 } },
+        options: { concurrencyKey: "user-51262" },
+      });
+
+      expect(result.success).toBe(true);
+      if (result.success) {
+        expect(result.data.options?.concurrencyKey).toBe("user-51262");
+      }
+    });
+
+    it("accepts an item with no options", () => {
+      const result = BatchItemNDJSON.safeParse({
+        index: 0,
+        task: "user-workflow-tick",
+        payload: { json: { userId: 51262 } },
+      });
+
+      expect(result.success).toBe(true);
+    });
+
+    it("rejects a non-numeric, non-string concurrencyKey", () => {
+      const result = BatchItemNDJSON.safeParse({
+        index: 0,
+        task: "user-workflow-tick",
+        options: { concurrencyKey: { nested: "object" } },
+      });
+
+      expect(result.success).toBe(false);
+    });
+  });
+
+  describe("BatchTriggerTaskItem", () => {
+    it("coerces a numeric concurrencyKey to a string", () => {
+      const result = BatchTriggerTaskItem.safeParse({
+        task: "user-workflow-tick",
+        payload: { userId: 51262 },
+        options: { concurrencyKey: 51262 },
+      });
+
+      expect(result.success).toBe(true);
+      if (result.success) {
+        expect(result.data.options?.concurrencyKey).toBe("51262");
+      }
+    });
+  });
+
+  describe("TriggerTaskRequestBody", () => {
+    it("coerces a numeric concurrencyKey to a string", () => {
+      const result = TriggerTaskRequestBody.safeParse({
+        payload: { userId: 51262 },
+        options: { concurrencyKey: 51262 },
+      });
+
+      expect(result.success).toBe(true);
+      if (result.success) {
+        expect(result.data.options?.concurrencyKey).toBe("51262");
+      }
+    });
+  });
+});
diff --git a/packages/redis-worker/src/mollifier/buffer.test.ts b/packages/redis-worker/src/mollifier/buffer.test.ts
index c8f7b95c97a..b47e41589e3 100644
--- a/packages/redis-worker/src/mollifier/buffer.test.ts
+++ b/packages/redis-worker/src/mollifier/buffer.test.ts
@@ -2,7 +2,52 @@ import { describe, expect, it } from "vitest";
 import { BufferEntrySchema, serialiseSnapshot, deserialiseSnapshot } from "./schemas.js";
 import { redisTest } from "@internal/testcontainers";
 import { Logger } from "@trigger.dev/core/logger";
-import { MollifierBuffer } from "./buffer.js";
+import {
+  MollifierBuffer,
+  idempotencyLookupKeyFor,
+  makeIdempotencyClaimKey,
+  mollifierReconnectDelayMs,
+} from "./buffer.js";
+
+describe("mollifierReconnectDelayMs", () => {
+  it("grows linearly with the attempt count and caps the base at 1s", () => {
+    // random=()=>1 yields the top of the equal-jitter band (== base).
+    const top = (times: number) => mollifierReconnectDelayMs(times, () => 1);
+    expect(top(1)).toBe(50);
+    expect(top(4)).toBe(200);
+    expect(top(20)).toBe(1000);
+    // Past the cap the base stays at 1000.
+    expect(top(100)).toBe(1000);
+  });
+
+  it("applies equal jitter: result is uniform in [base/2, base]", () => {
+    // base for times=10 is 500, so the band is [250, 500].
+    expect(mollifierReconnectDelayMs(10, () => 0)).toBe(250); // floor of band
+    expect(mollifierReconnectDelayMs(10, () => 0.999999)).toBe(500); // top of band
+    const mid = mollifierReconnectDelayMs(10, () => 0.5);
+    expect(mid).toBeGreaterThanOrEqual(250);
+    expect(mid).toBeLessThanOrEqual(500);
+  });
+
+  it("never exceeds the original fixed-schedule envelope (strictly an improvement)", () => {
+    for (const times of [1, 2, 5, 10, 20, 50]) {
+      const cap = Math.min(times * 50, 1000);
+      for (const r of [0, 0.25, 0.5, 0.75, 0.999999]) {
+        const delay = mollifierReconnectDelayMs(times, () => r);
+        expect(delay).toBeLessThanOrEqual(cap);
+        expect(delay).toBeGreaterThanOrEqual(Math.floor(cap / 2));
+      }
+    }
+  });
+
+  it("decorrelates concurrent reconnects (distinct values across random draws)", () => {
+    const draws = [0.05, 0.3, 0.55, 0.8, 0.95].map((r) =>
+      mollifierReconnectDelayMs(20, () => r),
+    );
+    // Lockstep would collapse to a single value; jitter spreads them.
+    expect(new Set(draws).size).toBeGreaterThan(1);
+  });
+});
 
 describe("schemas", () => {
   it("serialiseSnapshot then deserialiseSnapshot is identity for plain objects", () => {
@@ -20,12 +65,32 @@ describe("schemas", () => {
       status: "QUEUED",
       attempts: "0",
       createdAt: "2026-05-11T10:00:00.000Z",
+      createdAtMicros: "1747044000000000",
     };
     const parsed = BufferEntrySchema.parse(raw);
     expect(parsed.runId).toBe("run_abc");
     expect(parsed.status).toBe("QUEUED");
     expect(parsed.attempts).toBe(0);
     expect(parsed.createdAt).toBeInstanceOf(Date);
+    expect(parsed.createdAtMicros).toBe(1747044000000000);
+  });
+
+  it("BufferEntrySchema defaults createdAtMicros for entries written before the field existed", () => {
+    // Backward compat: an entry written by an accept Lua predating
+    // createdAtMicros (only the original 7 fields) must still parse on
+    // pop rather than being silently dropped.
+    const raw = {
+      runId: "run_old",
+      envId: "env_1",
+      orgId: "org_1",
+      payload: serialiseSnapshot({}),
+      status: "QUEUED",
+      attempts: "0",
+      createdAt: "2026-05-11T10:00:00.000Z",
+      // no createdAtMicros
+    };
+    const parsed = BufferEntrySchema.parse(raw);
+    expect(parsed.createdAtMicros).toBe(0);
   });
 
   it("BufferEntrySchema parses a FAILED entry with lastError", () => {
@@ -37,6 +102,7 @@ describe("schemas", () => {
       status: "FAILED",
       attempts: "3",
       createdAt: "2026-05-11T10:00:00.000Z",
+      createdAtMicros: "1747044000000000",
       lastError: JSON.stringify({ code: "P2024", message: "connection lost" }),
     };
     const parsed = BufferEntrySchema.parse(raw);
@@ -52,7 +118,6 @@ describe("MollifierBuffer construction", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -68,7 +133,6 @@ describe("MollifierBuffer.accept", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -105,7 +169,6 @@ describe("MollifierBuffer.pop", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -132,7 +195,6 @@ describe("MollifierBuffer.pop", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -151,7 +213,6 @@ describe("MollifierBuffer.pop", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -169,24 +230,56 @@ describe("MollifierBuffer.pop", () => {
 });
 
 describe("MollifierBuffer.ack", () => {
-  redisTest("ack deletes the entry", { timeout: 20_000 }, async ({ redisContainer }) => {
+  redisTest(
+    "ack marks entry materialised and applies the grace TTL — entry persists as a read-fallback safety net",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+
+      try {
+        await buffer.accept({ runId: "run_x", envId: "env_a", orgId: "org_1", payload: "{}" });
+        await buffer.pop("env_a");
+        await buffer.ack("run_x");
+
+        const after = await buffer.getEntry("run_x"); // entry must still be readable after ack
+        expect(after).not.toBeNull();
+        expect(after!.materialised).toBe(true);
+
+        // The ack grace TTL is the only context where an entry hash gets an
+        // EXPIRE — accept no longer sets one. Expect 0 < ttl <= 30 seconds.
+        const ttl = await buffer.getEntryTtlSeconds("run_x");
+        expect(ttl).toBeGreaterThan(0);
+        expect(ttl).toBeLessThanOrEqual(30);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest("ack on missing entry is a no-op", { timeout: 20_000 }, async ({ redisContainer }) => {
     const buffer = new MollifierBuffer({
       redisOptions: {
         host: redisContainer.getHost(),
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
     try {
-      await buffer.accept({ runId: "run_x", envId: "env_a", orgId: "org_1", payload: "{}" });
-      await buffer.pop("env_a");
-      await buffer.ack("run_x");
-
-      const after = await buffer.getEntry("run_x");
-      expect(after).toBeNull();
+      await buffer.ack("run_ghost");
+      const stored = await buffer.getEntry("run_ghost");
+      expect(stored).toBeNull();
+      // Critical: no partial hash created.
+      const raw = await buffer["redis"].hgetall("mollifier:entries:run_ghost");
+      expect(Object.keys(raw)).toHaveLength(0);
     } finally {
       await buffer.close();
     }
@@ -204,13 +297,12 @@ describe("MollifierBuffer.pop orphan handling", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
       try {
-        // Simulate a TTL-expired orphan: queue ref exists, entry hash does not.
-        await buffer["redis"].lpush("mollifier:queue:env_a", "run_orphan");
+        // Simulate an evicted orphan: queue ref exists, entry hash does not.
+        await buffer["redis"].rpush("mollifier:queue:env_a", "run_orphan");
 
         const popped = await buffer.pop("env_a");
         expect(popped).toBeNull();
@@ -238,17 +330,17 @@ describe("MollifierBuffer.pop orphan handling", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
       try {
-        // Layout (oldest-first, since RPOP takes from tail): orphan, valid, orphan.
-        // LPUSH puts items at the head, so to get RPOP order [orphan_a, valid, orphan_b]
-        // we LPUSH in reverse: orphan_b first, then valid, then orphan_a.
-        await buffer["redis"].lpush("mollifier:queue:env_a", "orphan_b");
+        // Build the queue so RPOP (tail-first) yields: orphan_a, valid,
+        // orphan_b. accept LPUSHes "valid"; RPUSH puts orphan_a at the
+        // tail (popped first), LPUSH puts orphan_b at the head (popped
+        // last). First pop skips orphan_a, returns valid; orphan_b remains.
         await buffer.accept({ runId: "valid", envId: "env_a", orgId: "org_1", payload: "{}" });
-        await buffer["redis"].lpush("mollifier:queue:env_a", "orphan_a");
+        await buffer["redis"].rpush("mollifier:queue:env_a", "orphan_a");
+        await buffer["redis"].lpush("mollifier:queue:env_a", "orphan_b");
 
         const popped = await buffer.pop("env_a");
         expect(popped).not.toBeNull();
@@ -283,7 +375,6 @@ describe("MollifierBuffer.requeue", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -305,30 +396,43 @@ describe("MollifierBuffer.requeue", () => {
 });
 
 describe("MollifierBuffer.fail", () => {
-  redisTest("fail transitions to FAILED and stores lastError", { timeout: 20_000 }, async ({ redisContainer }) => {
-    const buffer = new MollifierBuffer({
-      redisOptions: {
-        host: redisContainer.getHost(),
-        port: redisContainer.getPort(),
-        password: redisContainer.getPassword(),
-      },
-      entryTtlSeconds: 600,
-      logger: new Logger("test", "log"),
-    });
+  redisTest(
+    "fail returns true and tears the entry down (drainer-terminal cleanup)",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // Post-TTL-drop design: the drainer's createFailedTaskRun has
+      // already written a SYSTEM_FAILURE PG row by the time we call
+      // fail(), so the entry hash is no longer load-bearing. fail
+      // returns true and removes the entry; without this teardown,
+      // failed entries would accumulate forever now that there is no
+      // accept-time TTL. The Lua also DELs the idempotency lookup so
+      // future retries with the same key go through to PG instead of
+      // hitting an orphan dedup record.
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
 
-    try {
-      await buffer.accept({ runId: "run_f", envId: "env_a", orgId: "org_1", payload: "{}" });
-      await buffer.pop("env_a");
-      const failed = await buffer.fail("run_f", { code: "VALIDATION", message: "boom" });
-      expect(failed).toBe(true);
+      try {
+        await buffer.accept({ runId: "run_f", envId: "env_a", orgId: "org_1", payload: "{}" });
+        await buffer.pop("env_a");
+        const failed = await buffer.fail("run_f", { code: "VALIDATION", message: "boom" });
+        expect(failed).toBe(true);
 
-      const entry = await buffer.getEntry("run_f");
-      expect(entry!.status).toBe("FAILED");
-      expect(entry!.lastError).toEqual({ code: "VALIDATION", message: "boom" });
-    } finally {
-      await buffer.close();
-    }
-  });
+        // Entry hash is gone post-fail: checked via getEntry and via a raw HGETALL.
+        const entry = await buffer.getEntry("run_f");
+        expect(entry).toBeNull();
+        const raw = await buffer["redis"].hgetall("mollifier:entries:run_f");
+        expect(Object.keys(raw)).toHaveLength(0);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
 
   redisTest(
     "fail on missing entry is a no-op (returns false; no partial hash created)",
@@ -340,7 +444,6 @@ describe("MollifierBuffer.fail", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -358,30 +461,94 @@ describe("MollifierBuffer.fail", () => {
       }
     },
   );
+
+  redisTest(
+    "fail DELs the idempotency lookup so a same-key retry goes through to PG",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // Symmetric with the ack path: the failMollifierEntry Lua reads the
+      // idempotencyLookupKey off the hash and DELs it. Without this, a
+      // post-fail retry with the same idempotency key would hit the
+      // orphaned dedup record and resolve to a run that no longer exists.
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+
+      try {
+        await buffer.accept({
+          runId: "run_fk",
+          envId: "env_a",
+          orgId: "org_1",
+          payload: "{}",
+          idempotencyKey: "kf",
+          taskIdentifier: "t",
+        });
+        const lookupKey = idempotencyLookupKeyFor({
+          envId: "env_a",
+          taskIdentifier: "t",
+          idempotencyKey: "kf",
+        });
+        // Before fail: the lookup key maps to the accepted runId.
+        expect(await buffer["redis"].get(lookupKey)).toBe("run_fk");
+
+        await buffer.pop("env_a");
+        const failed = await buffer.fail("run_fk", { code: "VALIDATION", message: "boom" });
+        expect(failed).toBe(true);
+
+        // After fail: the lookup is cleared, so the slot is reclaimable — a
+        // fresh accept with the same tuple succeeds rather than deduping.
+        expect(await buffer["redis"].get(lookupKey)).toBeNull();
+        const reaccept = await buffer.accept({
+          runId: "run_fk2",
+          envId: "env_a",
+          orgId: "org_1",
+          payload: "{}",
+          idempotencyKey: "kf",
+          taskIdentifier: "t",
+        });
+        expect(reaccept).toEqual({ kind: "accepted" });
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
 });
 
 describe("MollifierBuffer TTL", () => {
-  redisTest("entry has TTL applied on accept", { timeout: 20_000 }, async ({ redisContainer }) => {
-    const buffer = new MollifierBuffer({
-      redisOptions: {
-        host: redisContainer.getHost(),
-        port: redisContainer.getPort(),
-        password: redisContainer.getPassword(),
-      },
-      entryTtlSeconds: 600,
-      logger: new Logger("test", "log"),
-    });
+  redisTest(
+    "entry has NO TTL applied on accept — drainer is the only cleanup path",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // Regression guard for the design change: buffer entries must
+      // persist until the drainer ACKs or FAILs them. An accept-time
+      // EXPIRE would re-introduce silent loss of entries while the
+      // drainer is offline, and the stale-entry alerting pipeline
+      // depends on that loss *not* happening.
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
 
-    try {
-      await buffer.accept({ runId: "run_t", envId: "env_a", orgId: "org_1", payload: "{}" });
+      try {
+        await buffer.accept({ runId: "run_t", envId: "env_a", orgId: "org_1", payload: "{}" });
 
-      const ttl = await buffer.getEntryTtlSeconds("run_t");
-      expect(ttl).toBeGreaterThan(0);
-      expect(ttl).toBeLessThanOrEqual(600);
-    } finally {
-      await buffer.close();
-    }
-  });
+        // Redis returns -1 when the key exists but has no expiry set.
+        const ttl = await buffer.getEntryTtlSeconds("run_t");
+        expect(ttl).toBe(-1);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
 });
 
 describe("MollifierBuffer payload encoding", () => {
@@ -395,7 +562,6 @@ describe("MollifierBuffer payload encoding", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -437,7 +603,6 @@ describe("MollifierBuffer.requeue on missing entry", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -458,16 +623,22 @@ describe("MollifierBuffer.requeue on missing entry", () => {
 
 describe("MollifierBuffer.requeue ordering", () => {
   redisTest(
-    "requeued entry is popped AFTER other queued entries on the same env (FIFO retry)",
+    "requeued entry gets retry priority (RPUSH to the RPOP/tail end), popping ahead of newer items",
     { timeout: 20_000 },
     async ({ redisContainer }) => {
+      // LIST: accept LPUSHes at the head, pop RPOPs from the tail, so the
+      // first-accepted entry pops first. requeue RPUSHes back to the tail,
+      // giving a transiently failed entry *retry priority* — it pops next,
+      // ahead of newer queued items, rather than going to the back. (This
+      // is deliberately not FIFO relative to the rest of the queue.)
+      // `maxAttempts` in the drainer bounds the retry loop for a
+      // persistently failing entry (after which it goes to `fail`, not requeue).
       const buffer = new MollifierBuffer({
         redisOptions: {
           host: redisContainer.getHost(),
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -481,12 +652,13 @@ describe("MollifierBuffer.requeue ordering", () => {
 
         await buffer.requeue("a");
 
+        // a was RPUSHed back to the tail → pops next, ahead of b and c.
         const next = await buffer.pop("env_a");
-        expect(next!.runId).toBe("b");
+        expect(next!.runId).toBe("a");
         const after = await buffer.pop("env_a");
-        expect(after!.runId).toBe("c");
+        expect(after!.runId).toBe("b");
         const last = await buffer.pop("env_a");
-        expect(last!.runId).toBe("a");
+        expect(last!.runId).toBe("c");
       } finally {
         await buffer.close();
       }
@@ -508,7 +680,6 @@ describe("MollifierBuffer.evaluateTrip", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -530,7 +701,6 @@ describe("MollifierBuffer.evaluateTrip", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -557,7 +727,6 @@ describe("MollifierBuffer.evaluateTrip", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -585,7 +754,6 @@ describe("MollifierBuffer.evaluateTrip", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -610,7 +778,6 @@ describe("MollifierBuffer.evaluateTrip", () => {
         port: redisContainer.getPort(),
         password: redisContainer.getPassword(),
       },
-      entryTtlSeconds: 600,
       logger: new Logger("test", "log"),
     });
 
@@ -638,7 +805,6 @@ describe("MollifierBuffer.evaluateTrip", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -671,7 +837,6 @@ describe("MollifierBuffer.evaluateTrip", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -707,22 +872,21 @@ describe("MollifierBuffer entry lifecycle invariants", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
       try {
         await buffer.accept({ runId: "run_ttl", envId: "env_a", orgId: "org_1", payload: "{}" });
         const beforeTtl = await buffer.getEntryTtlSeconds("run_ttl");
-        expect(beforeTtl).toBeGreaterThan(0);
+        expect(beforeTtl).toBe(-1);
 
         await buffer.pop("env_a");
         const afterTtl = await buffer.getEntryTtlSeconds("run_ttl");
 
-        // TTL must still be present (>0). Redis returns -1 if the key has no
-        // TTL — that's the leak shape we're guarding against.
-        expect(afterTtl).toBeGreaterThan(0);
-        expect(afterTtl).toBeLessThanOrEqual(beforeTtl);
+        // No TTL applied at any point during accept/pop — the entry
+        // persists until the drainer ACKs or FAILs. Returning -1 from
+        // Redis here is the expected steady state, not a leak.
+        expect(afterTtl).toBe(-1);
       } finally {
         await buffer.close();
       }
@@ -739,7 +903,6 @@ describe("MollifierBuffer entry lifecycle invariants", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -795,7 +958,6 @@ describe("MollifierBuffer.accept idempotency", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -813,8 +975,8 @@ describe("MollifierBuffer.accept idempotency", () => {
           payload: serialiseSnapshot({ first: false }),
         });
 
-        expect(first).toBe(true);
-        expect(second).toBe(false);
+        expect(first).toEqual({ kind: "accepted" });
+        expect(second).toEqual({ kind: "duplicate_run_id" });
 
         // First payload preserved; second was a no-op.
         const stored = await buffer.getEntry("run_dup");
@@ -844,7 +1006,6 @@ describe("MollifierBuffer.accept idempotency", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -855,7 +1016,7 @@ describe("MollifierBuffer.accept idempotency", () => {
         expect(stored!.status).toBe("DRAINING");
 
         const dup = await buffer.accept({ runId: "run_dr", envId: "env_a", orgId: "org_1", payload: "{}" });
-        expect(dup).toBe(false);
+        expect(dup).toEqual({ kind: "duplicate_run_id" });
 
         const afterDup = await buffer.getEntry("run_dr");
         expect(afterDup!.status).toBe("DRAINING"); // unchanged
@@ -866,16 +1027,21 @@ describe("MollifierBuffer.accept idempotency", () => {
   );
 
   redisTest(
-    "accept refused while existing entry is FAILED",
+    "runId slot is reclaimable after fail tears the entry down",
     { timeout: 20_000 },
     async ({ redisContainer }) => {
+      // Post-TTL-drop design: fail() deletes the entry hash because
+      // the SYSTEM_FAILURE PG row is the canonical record of the
+      // failure. The runId slot is therefore free for a fresh accept
+      // afterwards — runIds are server-generated CUIDs and don't
+      // collide in practice, but the contract pinning here documents
+      // that a re-acceptance does NOT see a phantom "FAILED" entry.
       const buffer = new MollifierBuffer({
         redisOptions: {
           host: redisContainer.getHost(),
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -883,15 +1049,20 @@ describe("MollifierBuffer.accept idempotency", () => {
         await buffer.accept({ runId: "run_fl", envId: "env_a", orgId: "org_1", payload: "{}" });
         await buffer.pop("env_a");
         await buffer.fail("run_fl", { code: "VALIDATION", message: "boom" });
-        const stored = await buffer.getEntry("run_fl");
-        expect(stored!.status).toBe("FAILED");
 
-        const dup = await buffer.accept({ runId: "run_fl", envId: "env_a", orgId: "org_1", payload: "{}" });
-        expect(dup).toBe(false);
+        // Entry hash gone after fail (see "fail returns true and tears
+        // the entry down" — this test pins the accept-side effect).
+        expect(await buffer.getEntry("run_fl")).toBeNull();
 
-        const afterDup = await buffer.getEntry("run_fl");
-        expect(afterDup!.status).toBe("FAILED"); // unchanged
-        expect(afterDup!.lastError).toEqual({ code: "VALIDATION", message: "boom" });
+        const fresh = await buffer.accept({
+          runId: "run_fl",
+          envId: "env_a",
+          orgId: "org_1",
+          payload: '{"fresh":true}',
+        });
+        expect(fresh).toEqual({ kind: "accepted" });
+        const after = await buffer.getEntry("run_fl");
+        expect(after?.status).toBe("QUEUED");
       } finally {
         await buffer.close();
       }
@@ -899,16 +1070,21 @@ describe("MollifierBuffer.accept idempotency", () => {
   );
 
   redisTest(
-    "re-accept after ack works (terminal entry can be re-accepted)",
+    "accept refused while a previously-acked (materialised) entry is still inside its grace TTL",
     { timeout: 20_000 },
     async ({ redisContainer }) => {
+      // After ack, the entry hash persists for the grace window as a
+      // read-fallback safety net. RunIds are server-generated and
+      // never collide in practice, but defense-in-depth: accept refuses
+      // while *any* entry exists for the runId, including materialised
+      // ones. The entry hash's TTL is now ~30s instead of the original
+      // entryTtlSeconds.
       const buffer = new MollifierBuffer({
         redisOptions: {
           host: redisContainer.getHost(),
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -922,7 +1098,6 @@ describe("MollifierBuffer.accept idempotency", () => {
         await buffer.pop("env_a");
         await buffer.ack("run_x");
 
-        // Entry is gone — re-accept should succeed.
         const reAccept = await buffer.accept({
           runId: "run_x",
           envId: "env_a",
@@ -930,8 +1105,11 @@ describe("MollifierBuffer.accept idempotency", () => {
           payload: "{}",
         });
 
-        expect(first).toBe(true);
-        expect(reAccept).toBe(true);
+        expect(first).toEqual({ kind: "accepted" });
+        expect(reAccept).toEqual({ kind: "duplicate_run_id" });
+
+        const stored = await buffer.getEntry("run_x");
+        expect(stored!.materialised).toBe(true);
       } finally {
         await buffer.close();
       }
@@ -950,7 +1128,6 @@ describe("MollifierBuffer envs set lifecycle", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -976,7 +1153,6 @@ describe("MollifierBuffer envs set lifecycle", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -1006,7 +1182,6 @@ describe("MollifierBuffer envs set lifecycle", () => {
           port: redisContainer.getPort(),
           password: redisContainer.getPassword(),
         },
-        entryTtlSeconds: 600,
         logger: new Logger("test", "log"),
       });
 
@@ -1025,3 +1200,1527 @@ describe("MollifierBuffer envs set lifecycle", () => {
     },
   );
 });
+
+describe("MollifierBuffer idempotency lookup", () => {
+  redisTest(
+    "accept with idempotencyKey + taskIdentifier writes the lookup with no TTL",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // Post-TTL-drop design: the idempotency lookup has no TTL, so it
+      // can never expire ahead of the entry hash. That ordering used to
+      // cause a dedup-drift bug: once the lookup expired but the entry
+      // didn't, a retry with the same key would create a *new*
+      // buffered run for the same key. The drainer's ack and fail
+      // both DEL the lookup as part of teardown.
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        const result = await buffer.accept({
+          runId: "ri1",
+          envId: "env_i",
+          orgId: "org_1",
+          payload: "{}",
+          idempotencyKey: "ikey-1",
+          taskIdentifier: "my-task",
+        });
+        expect(result).toEqual({ kind: "accepted" });
+
+        const lookupKey = idempotencyLookupKeyFor({
+          envId: "env_i",
+          taskIdentifier: "my-task",
+          idempotencyKey: "ikey-1",
+        });
+        const stored = await buffer["redis"].get(lookupKey);
+        expect(stored).toBe("ri1"); // lookup value is the accepted runId
+        // Redis TTL returns -1 when the key exists but has no expiry set.
+        expect(await buffer["redis"].ttl(lookupKey)).toBe(-1);
+
+        const entry = await buffer.getEntry("ri1");
+        expect(entry!.idempotencyLookupKey).toBe(lookupKey); // entry records its own lookup key
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "second accept with same (env, task, idempotencyKey) returns duplicate_idempotency with the winner's runId",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        const first = await buffer.accept({
+          runId: "ri-a",
+          envId: "env_i",
+          orgId: "org_1",
+          payload: "{}",
+          idempotencyKey: "ikey-2",
+          taskIdentifier: "my-task",
+        });
+        const second = await buffer.accept({
+          runId: "ri-b", // different runId, same (env, task, idempotencyKey) tuple as `first`
+          envId: "env_i",
+          orgId: "org_1",
+          payload: "{}",
+          idempotencyKey: "ikey-2",
+          taskIdentifier: "my-task",
+        });
+
+        expect(first).toEqual({ kind: "accepted" });
+        expect(second).toEqual({
+          kind: "duplicate_idempotency",
+          existingRunId: "ri-a",
+        });
+
+        // The losing accept must not have created an entry for its own runId.
+        const loserEntry = await buffer.getEntry("ri-b");
+        expect(loserEntry).toBeNull();
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "lookupIdempotency hits when the run is buffered",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "rl1",
+          envId: "env_i",
+          orgId: "org_1",
+          payload: "{}",
+          idempotencyKey: "k1",
+          taskIdentifier: "t",
+        });
+        const found = await buffer.lookupIdempotency({
+          envId: "env_i",
+          taskIdentifier: "t",
+          idempotencyKey: "k1",
+        });
+        expect(found).toBe("rl1"); // the runId passed to accept above
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "lookupIdempotency returns null when no lookup is bound",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        const found = await buffer.lookupIdempotency({
+          envId: "env_i",
+          taskIdentifier: "t",
+          idempotencyKey: "absent", // nothing is accepted under this key in this test
+        });
+        expect(found).toBeNull();
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "lookupIdempotency self-heals when the lookup points at an expired entry",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        // Plant a stale lookup by hand: no accept is made for "rl-stale", so no entry exists for it.
+        const lookupKey = idempotencyLookupKeyFor({
+          envId: "env_i",
+          taskIdentifier: "t",
+          idempotencyKey: "stale",
+        });
+        await buffer["redis"].set(lookupKey, "rl-stale", "EX", 600);
+        expect(await buffer["redis"].get(lookupKey)).toBe("rl-stale");
+
+        const found = await buffer.lookupIdempotency({
+          envId: "env_i",
+          taskIdentifier: "t",
+          idempotencyKey: "stale",
+        });
+        expect(found).toBeNull();
+        // Self-healed: the dangling lookup key must be gone after the lookup call.
+        expect(await buffer["redis"].get(lookupKey)).toBeNull();
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "ack DELs the idempotency lookup along with marking materialised",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "ra1",
+          envId: "env_i",
+          orgId: "org_1",
+          payload: "{}",
+          idempotencyKey: "ka",
+          taskIdentifier: "t",
+        });
+        await buffer.pop("env_i");
+        await buffer.ack("ra1");
+
+        const lookupKey = idempotencyLookupKeyFor({
+          envId: "env_i",
+          taskIdentifier: "t",
+          idempotencyKey: "ka",
+        });
+        expect(await buffer["redis"].get(lookupKey)).toBeNull(); // lookup key is gone after ack
+        const entry = await buffer.getEntry("ra1");
+        expect(entry!.materialised).toBe(true); // entry itself is still readable, flagged materialised
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "resetIdempotency clears snapshot fields + lookup; returns the runId",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // resetIdempotency must report which run held the key, delete the
+      // lookup, and null only the idempotency fields inside the snapshot.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+      try {
+        await buffer.accept({
+          runId: "rr1",
+          envId: "env_i",
+          orgId: "org_1",
+          payload: serialiseSnapshot({
+            idempotencyKey: "kr",
+            idempotencyKeyExpiresAt: "2026-12-01T00:00:00Z",
+            other: "field",
+          }),
+          idempotencyKey: "kr",
+          taskIdentifier: "t",
+        });
+
+        const reset = await buffer.resetIdempotency({
+          envId: "env_i",
+          taskIdentifier: "t",
+          idempotencyKey: "kr",
+        });
+        expect(reset.clearedRunId).toBe("rr1");
+
+        // The lookup key no longer exists.
+        const resetLookupKey = idempotencyLookupKeyFor({
+          envId: "env_i",
+          taskIdentifier: "t",
+          idempotencyKey: "kr",
+        });
+        expect(await buffer["redis"].get(resetLookupKey)).toBeNull();
+
+        // Idempotency fields become null; unrelated snapshot fields survive.
+        const stored = await buffer.getEntry("rr1");
+        const snapshot = JSON.parse(stored!.payload) as {
+          idempotencyKey: unknown;
+          idempotencyKeyExpiresAt: unknown;
+          other: string;
+        };
+        expect(snapshot.idempotencyKey).toBeNull();
+        expect(snapshot.idempotencyKeyExpiresAt).toBeNull();
+        expect(snapshot.other).toBe("field");
+        expect(stored!.idempotencyLookupKey).toBe("");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "resetIdempotency returns null when nothing is bound",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // With no run bound to the (env, task, key) tuple, reset must report
+      // clearedRunId === null.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+      try {
+        const reset = await buffer.resetIdempotency({
+          envId: "env_i",
+          taskIdentifier: "t",
+          idempotencyKey: "absent",
+        });
+        expect(reset.clearedRunId).toBeNull();
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "resetIdempotency also clears the pre-gate claim slot",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // The idempotency lookup and the cross-store claim are two pointers
+      // for one key, so a reset has to clear the pair. Leaving the claim
+      // behind would let a resolved/pending claim keep deduping new
+      // triggers until its TTL ran out, even though the binding had been
+      // reset.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const testLogger = new Logger("test", "log");
+      const buffer = new MollifierBuffer({ redisOptions, logger: testLogger });
+      const binding = { envId: "env_rc", taskIdentifier: "t", idempotencyKey: "krc" };
+      try {
+        // First, put a resolved claim in place for the key...
+        await buffer.claimIdempotency({ ...binding, token: "owner", ttlSeconds: 600 });
+        await buffer.publishClaim({ ...binding, token: "owner", runId: "rc1", ttlSeconds: 600 });
+        expect(await buffer.readClaim(binding)).toEqual({ kind: "resolved", runId: "rc1" });
+        // ...then buffer a run that holds the lookup for the same key.
+        await buffer.accept({
+          runId: "rc1",
+          envId: "env_rc",
+          orgId: "org_1",
+          payload: serialiseSnapshot({}),
+          idempotencyKey: "krc",
+          taskIdentifier: "t",
+        });
+
+        await buffer.resetIdempotency(binding);
+
+        // Neither pointer survives the reset.
+        expect(await buffer.lookupIdempotency(binding)).toBeNull();
+        expect(await buffer.readClaim(binding)).toBeNull();
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "accept self-heals a stale lookup: a new run rebinds when the bound entry was evicted",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // Scenario: the entry hash is evicted (maxmemory) while its
+      // idempotency lookup survives. A later accept carrying the same key
+      // must NOT be answered with the dead runId, which would block the key
+      // forever; the lookup has to be rebound to the new run and the run
+      // accepted.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const testLogger = new Logger("test", "log");
+      const buffer = new MollifierBuffer({ redisOptions, logger: testLogger });
+      const sharedKey = { idempotencyKey: "kheal", taskIdentifier: "t" };
+      try {
+        await buffer.accept({ runId: "heal_old", envId: "env_h", orgId: "org_1", payload: "{}", ...sharedKey });
+        // Delete only the entry hash so the lookup is left dangling.
+        await buffer["redis"].del("mollifier:entries:heal_old");
+        const danglingLookupKey = idempotencyLookupKeyFor({ envId: "env_h", ...sharedKey });
+        expect(await buffer["redis"].get(danglingLookupKey)).toBe("heal_old");
+
+        // Re-accepting under the same key must bind the new run instead of
+        // reporting a duplicate of the evicted one.
+        const outcome = await buffer.accept({
+          runId: "heal_new",
+          envId: "env_h",
+          orgId: "org_1",
+          payload: "{}",
+          ...sharedKey,
+        });
+        expect(outcome).toEqual({ kind: "accepted" });
+        expect(await buffer["redis"].get(danglingLookupKey)).toBe("heal_new");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "accept still dedups when the bound entry is live",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // Guard against the stale-lookup self-heal weakening ordinary dedup:
+      // while the bound entry is live the first run keeps the key, and the
+      // second accept is told which runId already owns it.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const testLogger = new Logger("test", "log");
+      const buffer = new MollifierBuffer({ redisOptions, logger: testLogger });
+      const sharedKey = { idempotencyKey: "klive", taskIdentifier: "t" };
+      try {
+        await buffer.accept({ runId: "live_win", envId: "env_h", orgId: "org_1", payload: "{}", ...sharedKey });
+        const outcome = await buffer.accept({
+          runId: "live_lose",
+          envId: "env_h",
+          orgId: "org_1",
+          payload: "{}",
+          ...sharedKey,
+        });
+        expect(outcome).toEqual({ kind: "duplicate_idempotency", existingRunId: "live_win" });
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+});
+
+describe("MollifierBuffer.casSetMetadata", () => {
+  redisTest(
+    "applies when expectedVersion matches; increments version; updates payload",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // Happy path: a CAS issued at version 0 against a freshly accepted
+      // entry is applied, bumps the version to 1, and rewrites the metadata.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+      try {
+        await buffer.accept({
+          runId: "cas1",
+          envId: "env_c",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ metadata: '{"v":1}', metadataType: "application/json" }),
+        });
+        const outcome = await buffer.casSetMetadata({
+          runId: "cas1",
+          expectedVersion: 0,
+          newMetadata: '{"v":2}',
+          newMetadataType: "application/json",
+        });
+        expect(outcome).toEqual({ kind: "applied", newVersion: 1 });
+
+        const stored = await buffer.getEntry("cas1");
+        expect(stored!.metadataVersion).toBe(1);
+        const snapshot = JSON.parse(stored!.payload) as { metadata: string };
+        expect(snapshot.metadata).toBe('{"v":2}');
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "returns version_conflict when expectedVersion is stale",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // A CAS whose expectedVersion lags the stored version is rejected and
+      // the caller is told the current version.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+      try {
+        await buffer.accept({
+          runId: "cas2",
+          envId: "env_c",
+          orgId: "org_1",
+          payload: serialiseSnapshot({}),
+        });
+        await buffer.casSetMetadata({
+          runId: "cas2",
+          expectedVersion: 0,
+          newMetadata: '{"a":1}',
+          newMetadataType: "application/json",
+        });
+
+        // Replaying expectedVersion 0 after the first write must conflict.
+        const replay = await buffer.casSetMetadata({
+          runId: "cas2",
+          expectedVersion: 0,
+          newMetadata: '{"a":2}',
+          newMetadataType: "application/json",
+        });
+        expect(replay).toEqual({ kind: "version_conflict", currentVersion: 1 });
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "returns not_found on a missing entry and busy on a DRAINING entry",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // Two refusals: an unknown runId yields not_found, and an entry that
+      // pop() has already taken off the queue (DRAINING) yields busy.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+      try {
+        const nf = await buffer.casSetMetadata({
+          runId: "absent",
+          expectedVersion: 0,
+          newMetadata: "{}",
+          newMetadataType: "application/json",
+        });
+        expect(nf).toEqual({ kind: "not_found" });
+
+        await buffer.accept({
+          runId: "cas3",
+          envId: "env_c",
+          orgId: "org_1",
+          payload: serialiseSnapshot({}),
+        });
+        await buffer.pop("env_c");
+        const busy = await buffer.casSetMetadata({
+          runId: "cas3",
+          expectedVersion: 0,
+          newMetadata: "{}",
+          newMetadataType: "application/json",
+        });
+        expect(busy).toEqual({ kind: "busy" });
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "returns busy on a materialised entry (post-ack grace window)",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // The guard rejects `materialised == 'true'` just like a non-QUEUED
+      // status. Once ack has run, the entry lingers QUEUED-but-materialised
+      // for the grace TTL, and a CAS landing in that window must leave it
+      // untouched.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const testLogger = new Logger("test", "log");
+      const buffer = new MollifierBuffer({ redisOptions, logger: testLogger });
+      try {
+        await buffer.accept({
+          runId: "cas_mat",
+          envId: "env_c",
+          orgId: "org_1",
+          payload: serialiseSnapshot({}),
+        });
+        await buffer.pop("env_c");
+        await buffer.ack("cas_mat");
+
+        const outcome = await buffer.casSetMetadata({
+          runId: "cas_mat",
+          expectedVersion: 0,
+          newMetadata: "{}",
+          newMetadataType: "application/json",
+        });
+        expect(outcome).toEqual({ kind: "busy" });
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "a mutateSnapshot set_metadata bumps metadataVersion so an in-flight CAS conflicts",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // CAS isolation. A reader fetches version N; a concurrent
+      // mutateSnapshot(set_metadata) then overwrites the metadata. Because
+      // both paths write payload.metadata, set_metadata bumps the very
+      // counter the CAS is gated on, so the reader's CAS at
+      // expectedVersion=N is reported as a conflict rather than silently
+      // winning.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const testLogger = new Logger("test", "log");
+      const buffer = new MollifierBuffer({ redisOptions, logger: testLogger });
+      try {
+        await buffer.accept({
+          runId: "cas_int",
+          envId: "env_c",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ metadata: '{"v":0}', metadataType: "application/json" }),
+        });
+        // The reader's view: version 0.
+        const seen = await buffer.getEntry("cas_int");
+        expect(seen!.metadataVersion).toBe(0);
+
+        // A snapshot mutation lands in between and moves the version to 1.
+        const mutation = await buffer.mutateSnapshot("cas_int", {
+          type: "set_metadata",
+          metadata: '{"v":1}',
+          metadataType: "application/json",
+        });
+        expect(mutation).toBe("applied_to_snapshot");
+        const bumped = await buffer.getEntry("cas_int");
+        expect(bumped!.metadataVersion).toBe(1);
+
+        // The CAS still carrying version 0 is refused, not applied.
+        const staleCas = await buffer.casSetMetadata({
+          runId: "cas_int",
+          expectedVersion: 0,
+          newMetadata: '{"v":2}',
+          newMetadataType: "application/json",
+        });
+        expect(staleCas).toEqual({ kind: "version_conflict", currentVersion: 1 });
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+});
+
+describe("MollifierBuffer.mutateSnapshot", () => {
+  redisTest(
+    "returns not_found when no entry exists for the runId",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // Mutating a runId that was never accepted must answer "not_found"
+      // rather than throwing.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+      try {
+        const outcome = await buffer.mutateSnapshot("nope", {
+          type: "append_tags",
+          tags: ["x"],
+        });
+        expect(outcome).toBe("not_found");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "append_tags on QUEUED entry appends and dedupes",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // append_tags keeps the stored order, adds unseen tags at the end,
+      // and never stores the same tag twice.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+      try {
+        await buffer.accept({
+          runId: "r1",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ tags: ["existing"] }),
+        });
+        const firstAppend = await buffer.mutateSnapshot("r1", {
+          type: "append_tags",
+          tags: ["existing", "new"],
+        });
+        expect(firstAppend).toBe("applied_to_snapshot");
+
+        const afterFirst = await buffer.getEntry("r1");
+        const firstSnapshot = JSON.parse(afterFirst!.payload) as { tags: string[] };
+        expect(firstSnapshot.tags).toEqual(["existing", "new"]);
+
+        // A second, overlapping append contributes only the unseen tag.
+        const secondAppend = await buffer.mutateSnapshot("r1", {
+          type: "append_tags",
+          tags: ["new", "third"],
+        });
+        expect(secondAppend).toBe("applied_to_snapshot");
+        const afterSecond = await buffer.getEntry("r1");
+        const secondSnapshot = JSON.parse(afterSecond!.payload) as { tags: string[] };
+        expect(secondSnapshot.tags).toEqual(["existing", "new", "third"]);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "append_tags creates payload.tags when absent",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // A snapshot accepted without a `tags` field gains one holding
+      // exactly the appended tags.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+      try {
+        await buffer.accept({
+          runId: "r2",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ taskId: "t" }),
+        });
+        const outcome = await buffer.mutateSnapshot("r2", {
+          type: "append_tags",
+          tags: ["a", "b"],
+        });
+        expect(outcome).toBe("applied_to_snapshot");
+        const stored = await buffer.getEntry("r2");
+        const snapshot = JSON.parse(stored!.payload) as { tags: string[] };
+        expect(snapshot.tags).toEqual(["a", "b"]);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "append_tags rejects with limit_exceeded when maxTags would be exceeded, writing nothing",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // maxTags caps the deduplicated tag count: overflow is rejected and
+      // leaves the stored tags untouched; reaching the cap exactly is fine.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+      try {
+        await buffer.accept({
+          runId: "r_cap",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ tags: ["a", "b"] }),
+        });
+
+        // Stored 2 + incoming 2 gives 4 distinct tags, over the cap of 3: refused.
+        const overflow = await buffer.mutateSnapshot("r_cap", {
+          type: "append_tags",
+          tags: ["c", "d"],
+          maxTags: 3,
+        });
+        expect(overflow).toBe("limit_exceeded");
+        const afterOverflow = await buffer.getEntry("r_cap");
+        const overflowSnapshot = JSON.parse(afterOverflow!.payload) as { tags: string[] };
+        expect(overflowSnapshot.tags).toEqual(["a", "b"]);
+
+        // "a" is already stored, so only "c" is new and the total stays at 3.
+        const withinCap = await buffer.mutateSnapshot("r_cap", {
+          type: "append_tags",
+          tags: ["a", "c"],
+          maxTags: 3,
+        });
+        expect(withinCap).toBe("applied_to_snapshot");
+        const afterWithinCap = await buffer.getEntry("r_cap");
+        const withinCapSnapshot = JSON.parse(afterWithinCap!.payload) as { tags: string[] };
+        expect(withinCapSnapshot.tags).toEqual(["a", "b", "c"]);
+
+        // A request that ends exactly at the cap is still accepted.
+        const atCap = await buffer.mutateSnapshot("r_cap", {
+          type: "append_tags",
+          tags: ["a", "b", "c"],
+          maxTags: 3,
+        });
+        expect(atCap).toBe("applied_to_snapshot");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "set_metadata replaces metadata + metadataType (last-write-wins)",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // set_metadata overwrites payload.metadata and payload.metadataType
+      // with the supplied values.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+      try {
+        await buffer.accept({
+          runId: "r3",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ metadata: '{"v":1}', metadataType: "application/json" }),
+        });
+        const outcome = await buffer.mutateSnapshot("r3", {
+          type: "set_metadata",
+          metadata: '{"v":2}',
+          metadataType: "application/json",
+        });
+        expect(outcome).toBe("applied_to_snapshot");
+        const stored = await buffer.getEntry("r3");
+        const snapshot = JSON.parse(stored!.payload) as {
+          metadata: string;
+          metadataType: string;
+        };
+        expect(snapshot.metadata).toBe('{"v":2}');
+        expect(snapshot.metadataType).toBe("application/json");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "set_delay sets payload.delayUntil",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // set_delay stores the supplied timestamp string verbatim under
+      // payload.delayUntil.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+      try {
+        await buffer.accept({
+          runId: "r4",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ taskId: "t" }),
+        });
+        const outcome = await buffer.mutateSnapshot("r4", {
+          type: "set_delay",
+          delayUntil: "2026-06-01T00:00:00.000Z",
+        });
+        expect(outcome).toBe("applied_to_snapshot");
+        const stored = await buffer.getEntry("r4");
+        const snapshot = JSON.parse(stored!.payload) as { delayUntil: string };
+        expect(snapshot.delayUntil).toBe("2026-06-01T00:00:00.000Z");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "mark_cancelled stamps cancelledAt + cancelReason",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // mark_cancelled records both the cancellation timestamp and the
+      // reason on the stored snapshot.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+      try {
+        await buffer.accept({
+          runId: "r5",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ taskId: "t" }),
+        });
+        const outcome = await buffer.mutateSnapshot("r5", {
+          type: "mark_cancelled",
+          cancelledAt: "2026-05-19T12:00:00.000Z",
+          cancelReason: "user-initiated",
+        });
+        expect(outcome).toBe("applied_to_snapshot");
+        const stored = await buffer.getEntry("r5");
+        const snapshot = JSON.parse(stored!.payload) as {
+          cancelledAt: string;
+          cancelReason: string;
+        };
+        expect(snapshot.cancelledAt).toBe("2026-05-19T12:00:00.000Z");
+        expect(snapshot.cancelReason).toBe("user-initiated");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "returns busy when entry is DRAINING",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // After pop() the entry is DRAINING, and a snapshot mutation must be
+      // refused with "busy".
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+      try {
+        await buffer.accept({
+          runId: "rd",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ tags: [] }),
+        });
+        await buffer.pop("env_m");
+        const outcome = await buffer.mutateSnapshot("rd", {
+          type: "append_tags",
+          tags: ["x"],
+        });
+        expect(outcome).toBe("busy");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "returns not_found when entry was FAILED (drainer-terminal teardown)",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // Why not_found rather than busy for a FAILED run:
+      // Post-TTL-drop design: fail() DELs the entry hash. By that point the
+      // drainer has already written the canonical SYSTEM_FAILURE PG row,
+      // and with no accept-time TTL, failed entries would otherwise pile up
+      // in Redis forever. A late mutation against a failed run therefore
+      // gets `not_found` — the same answer as for any other runId that
+      // has already been cleaned up.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const testLogger = new Logger("test", "log");
+      const buffer = new MollifierBuffer({ redisOptions, logger: testLogger });
+      try {
+        await buffer.accept({
+          runId: "rf",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ tags: [] }),
+        });
+        await buffer.pop("env_m");
+        await buffer.fail("rf", { code: "X", message: "boom" });
+        const outcome = await buffer.mutateSnapshot("rf", {
+          type: "append_tags",
+          tags: ["x"],
+        });
+        expect(outcome).toBe("not_found");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "returns busy when entry is materialised (post-ack grace window)",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // After pop + ack the entry still exists, and a snapshot mutation
+      // arriving in that post-ack window must be refused with "busy".
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+      try {
+        await buffer.accept({
+          runId: "rm",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ tags: [] }),
+        });
+        await buffer.pop("env_m");
+        await buffer.ack("rm");
+        const outcome = await buffer.mutateSnapshot("rm", {
+          type: "append_tags",
+          tags: ["x"],
+        });
+        expect(outcome).toBe("busy");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "Lua atomicity serialises concurrent mutations per-runId",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // 50 single-tag appends are issued concurrently against one runId; a
+      // lost update would show up as a tag missing from the final snapshot.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+      try {
+        await buffer.accept({
+          runId: "rcc",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ tags: [] }),
+        });
+
+        const tagsToAdd = Array.from({ length: 50 }, (_, i) => `t${i}`);
+        await Promise.all(
+          tagsToAdd.map((t) => buffer.mutateSnapshot("rcc", { type: "append_tags", tags: [t] })),
+        );
+        // Arrival order is unspecified: compare sorted copies (sort() mutates in place).
+        const entry = await buffer.getEntry("rcc");
+        const payload = JSON.parse(entry!.payload) as { tags: string[] };
+        expect([...payload.tags].sort()).toEqual([...tagsToAdd].sort());
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+});
+
+describe("MollifierBuffer LIST storage", () => {
+  redisTest(
+    "queue key is a LIST; createdAtMicros is a hash field, not a sort key",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // The per-env queue must be a Redis LIST of runIds (not a sorted
+      // set), with createdAtMicros kept as a field on the entry hash.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+
+      try {
+        await buffer.accept({ runId: "z1", envId: "env_z", orgId: "org_1", payload: "{}" });
+
+        // LIST-only commands must succeed against the queue key.
+        const len = await buffer["redis"].llen("mollifier:queue:env_z");
+        expect(len).toBe(1);
+        const members = await buffer["redis"].lrange("mollifier:queue:env_z", 0, -1);
+        expect(members).toEqual(["z1"]);
+
+        // Not a ZSET: a sorted-set command must fail with WRONGTYPE, not just any rejection.
+        await expect(buffer["redis"].zscore("mollifier:queue:env_z", "z1")).rejects.toThrow(/WRONGTYPE/);
+
+        // createdAtMicros lives on the entry hash (for dwell metrics) and
+        // is plausibly recent (within the last minute, as microseconds).
+        const micros = Number(await buffer["redis"].hget("mollifier:entries:z1", "createdAtMicros"));
+        const nowMicros = Date.now() * 1000;
+        expect(micros).toBeGreaterThan(nowMicros - 60_000_000);
+        expect(micros).toBeLessThanOrEqual(nowMicros + 1_000_000);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "pop returns entries in FIFO insertion order (independent of member lex order)",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // pop() must hand runs back in the order they were accepted, no
+      // matter how their runIds compare lexically.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+
+      try {
+        // The runIds are accepted in reverse-lex order, so a value-ordered
+        // queue would pop "aaa" first; insertion order (LPUSH head / RPOP tail) pops "zzz".
+        await buffer.accept({ runId: "zzz", envId: "env_o", orgId: "org_1", payload: "{}" });
+        await buffer.accept({ runId: "mmm", envId: "env_o", orgId: "org_1", payload: "{}" });
+        await buffer.accept({ runId: "aaa", envId: "env_o", orgId: "org_1", payload: "{}" });
+
+        const oldest = await buffer.pop("env_o");
+        expect(oldest!.runId).toBe("zzz");
+        const middle = await buffer.pop("env_o");
+        expect(middle!.runId).toBe("mmm");
+        const newest = await buffer.pop("env_o");
+        expect(newest!.runId).toBe("aaa");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "requeue re-enqueues to the LIST; createdAt is immutable across retries",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // The baseline createdAtMicros is asserted non-null first: if the
+      // field were missing, null === null would let the final check pass.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+
+      try {
+        await buffer.accept({ runId: "rq", envId: "env_rq", orgId: "org_1", payload: "{}" });
+        const originalMicros = await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros");
+        expect(originalMicros).not.toBeNull();
+        await buffer.pop("env_rq");
+        // Queue is empty after the pop.
+        expect(await buffer["redis"].llen("mollifier:queue:env_rq")).toBe(0);
+
+        await buffer.requeue("rq");
+
+        // Back on the LIST, and createdAtMicros is unchanged.
+        expect(await buffer["redis"].lrange("mollifier:queue:env_rq", 0, -1)).toEqual(["rq"]);
+        const newMicros = await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros");
+        expect(newMicros).toBe(originalMicros);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+});
+
+describe("MollifierBuffer.listEntriesForEnv", () => {
+  redisTest(
+    "returns up to maxCount entries from the queue without consuming them",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // listEntriesForEnv is a bounded peek: it returns at most maxCount
+      // distinct queued entries and leaves every one of them poppable.
+      const redisOptions = {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      };
+      const buffer = new MollifierBuffer({ redisOptions, logger: new Logger("test", "log") });
+
+      try {
+        await buffer.accept({ runId: "r1", envId: "env_a", orgId: "org_1", payload: "{}" });
+        await buffer.accept({ runId: "r2", envId: "env_a", orgId: "org_1", payload: "{}" });
+        await buffer.accept({ runId: "r3", envId: "env_a", orgId: "org_1", payload: "{}" });
+
+        const listed = await buffer.listEntriesForEnv("env_a", 2);
+        expect(listed).toHaveLength(2);
+        const listedIds = listed.map((e) => e.runId);
+        expect(new Set(listedIds).size).toBe(2);
+        for (const listedId of listedIds) expect(["r1", "r2", "r3"]).toContain(listedId);
+
+        // Listing consumed nothing: three pops still yield all three runs.
+        const drained: string[] = [];
+        for (let attempt = 0; attempt < 3; attempt += 1) {
+          const next = await buffer.pop("env_a");
+          if (next) drained.push(next.runId);
+        }
+        expect(new Set(drained)).toEqual(new Set(["r1", "r2", "r3"]));
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest("returns empty array when env queue is empty", { timeout: 20_000 }, async ({ redisContainer }) => {
+    const buffer = new MollifierBuffer({
+      redisOptions: {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      },
+      logger: new Logger("test", "log"),
+    });
+
+    try {
+      expect(await buffer.listEntriesForEnv("env_empty", 10)).toEqual([]);
+    } finally {
+      await buffer.close();
+    }
+  });
+
+  redisTest(
+    "skips entries whose hash was torn down between LRANGE and HGETALL (concurrent drainer ack/fail race)",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // The drainer can RPOP + ack/fail an entry between our LRANGE and
+      // the per-runId HGETALL — its DEL of the entry hash races our read.
+      // listEntriesForEnv must tolerate this: skip the runId, return
+      // every other entry. This is exercised here by simulating the race:
+      // DEL one entry's hash while its runId is still on the queue LIST.
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+
+      try {
+        await buffer.accept({ runId: "r_a", envId: "env_race", orgId: "org_1", payload: "{}" });
+        await buffer.accept({ runId: "r_b", envId: "env_race", orgId: "org_1", payload: "{}" });
+
+        // Tear down r_a's hash to simulate the drainer winning the race.
+        // The runId stays on the queue LIST but its entry hash is gone —
+        // listEntriesForEnv must tolerate the missing HGETALL result.
+        await buffer["redis"].del("mollifier:entries:r_a");
+
+        const entries = await buffer.listEntriesForEnv("env_race", 10);
+        expect(entries.map((e) => e.runId).sort()).toEqual(["r_b"]);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest("maxCount <= 0 returns empty without hitting redis", { timeout: 20_000 }, async ({ redisContainer }) => {
+    const buffer = new MollifierBuffer({
+      redisOptions: {
+        host: redisContainer.getHost(),
+        port: redisContainer.getPort(),
+        password: redisContainer.getPassword(),
+      },
+      logger: new Logger("test", "log"),
+    });
+
+    try {
+      expect(await buffer.listEntriesForEnv("env_a", 0)).toEqual([]);
+      expect(await buffer.listEntriesForEnv("env_a", -5)).toEqual([]);
+    } finally {
+      await buffer.close();
+    }
+  });
+});
+
+// Composite-key safety. The Redis-key builders concatenate
+// `(envId, taskIdentifier, idempotencyKey)` with `:` separators; without
+// per-segment encoding, `taskIdentifier="a:b"` and `idempotencyKey="x"`
+// would map to the same key as `taskIdentifier="a"` and
+// `idempotencyKey="b:x"`. base64url encoding has no `:` in its alphabet,
+// so the encoded keys are unique per tuple.
+describe("MollifierBuffer composite-key encoding (collision resistance)", () => {
+  redisTest(
+    "two accepts whose unencoded keys would alias don't collide on the idempotency lookup",
+    { timeout: 30_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        // Aliased tuples under raw `:` concatenation:
+        //   env_x : "a:b" : "x"   →   "mollifier:idempotency:env_x:a:b:x"
+        //   env_x : "a"   : "b:x" →   "mollifier:idempotency:env_x:a:b:x"
+        const r1 = await buffer.accept({
+          runId: "ck_run_1",
+          envId: "env_x",
+          orgId: "org_1",
+          payload: "{}",
+          taskIdentifier: "a:b",
+          idempotencyKey: "x",
+        });
+        const r2 = await buffer.accept({
+          runId: "ck_run_2",
+          envId: "env_x",
+          orgId: "org_1",
+          payload: "{}",
+          taskIdentifier: "a",
+          idempotencyKey: "b:x",
+        });
+        // Both accepted — no false-positive collision.
+        expect(r1).toEqual({ kind: "accepted" });
+        expect(r2).toEqual({ kind: "accepted" });
+
+        // Each tuple resolves to its own runId.
+        const hit1 = await buffer.lookupIdempotency({
+          envId: "env_x",
+          taskIdentifier: "a:b",
+          idempotencyKey: "x",
+        });
+        const hit2 = await buffer.lookupIdempotency({
+          envId: "env_x",
+          taskIdentifier: "a",
+          idempotencyKey: "b:x",
+        });
+        expect(hit1).toBe("ck_run_1");
+        expect(hit2).toBe("ck_run_2");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "encoded lookup key contains no ':' separator beyond the namespace",
+    { timeout: 20_000 },
+    async () => {
+      // Pure-function test — verifies the encoded key's shape without
+      // needing a live buffer. Re-uses the redisTest fixture for
+      // parallelism with other describe blocks but doesn't touch redis.
+      const key = idempotencyLookupKeyFor({
+        envId: "env_x",
+        taskIdentifier: "a:b",
+        idempotencyKey: "x:y:z",
+      });
+      // namespace prefix is exactly `mollifier:idempotency:` (two `:`),
+      // then three base64url segments separated by two more `:` —
+      // never the customer-supplied colons.
+      const colonCount = key.split(":").length - 1;
+      expect(colonCount).toBe(4);
+      // base64url alphabet has no `:`, `+`, `/`, or `=`.
+      const afterNamespace = key.slice("mollifier:idempotency:".length);
+      expect(afterNamespace).toMatch(/^[A-Za-z0-9_\-]+:[A-Za-z0-9_\-]+:[A-Za-z0-9_\-]+$/);
+    },
+  );
+});
+
+// Pre-gate claim ownership protection. The claim slot stores
+// `"pending:<token>"` so publish and release compare-and-act on the
+// caller's token — a late release from a previous claimant whose TTL
+// expired cannot erase a new owner's claim.
+describe("MollifierBuffer pre-gate claim — ownership token safety", () => {
+  const claimInput = {
+    envId: "env_c",
+    taskIdentifier: "task_c",
+    idempotencyKey: "key_c",
+  };
+
+  redisTest(
+    "claimIdempotency: first caller gets 'claimed', second concurrent caller gets 'pending'",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        const first = await buffer.claimIdempotency({
+          ...claimInput,
+          token: "token-A",
+          ttlSeconds: 30,
+        });
+        expect(first.kind).toBe("claimed");
+
+        // Second concurrent caller with a different token sees pending.
+        const second = await buffer.claimIdempotency({
+          ...claimInput,
+          token: "token-B",
+          ttlSeconds: 30,
+        });
+        expect(second.kind).toBe("pending");
+
+        // readClaim distinguishes pending from resolved without leaking
+        // the token to the loser.
+        const read = await buffer.readClaim(claimInput);
+        expect(read?.kind).toBe("pending");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "releaseClaim with the wrong token is a no-op (compare-and-delete)",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.claimIdempotency({ ...claimInput, token: "owner", ttlSeconds: 30 });
+
+        // Pretend a stale claimant fires a release with their old token.
+        await buffer.releaseClaim({ ...claimInput, token: "stale-impostor" });
+
+        // The owner's claim survives.
+        const stillThere = await buffer.readClaim(claimInput);
+        expect(stillThere?.kind).toBe("pending");
+
+        // The owner can still release.
+        await buffer.releaseClaim({ ...claimInput, token: "owner" });
+        expect(await buffer.readClaim(claimInput)).toBeNull();
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "publishClaim with the wrong token is a no-op and returns false",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.claimIdempotency({ ...claimInput, token: "owner", ttlSeconds: 30 });
+
+        const wrongTokenPublish = await buffer.publishClaim({
+          ...claimInput,
+          token: "stale-impostor",
+          runId: "imposter-run",
+          ttlSeconds: 60,
+        });
+        expect(wrongTokenPublish).toBe(false);
+
+        // Claim slot unchanged.
+        const stillPending = await buffer.readClaim(claimInput);
+        expect(stillPending?.kind).toBe("pending");
+
+        const goodPublish = await buffer.publishClaim({
+          ...claimInput,
+          token: "owner",
+          runId: "real-run",
+          ttlSeconds: 60,
+        });
+        expect(goodPublish).toBe(true);
+
+        const resolved = await buffer.readClaim(claimInput);
+        expect(resolved).toEqual({ kind: "resolved", runId: "real-run" });
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "regression: stale release after TTL expiry does NOT erase a fresh claim",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      // Hazard from CodeRabbit r3290070707:
+      //   1. Claimant A SETNXs the slot with their token, then stalls.
+      //   2. TTL expires, slot vanishes.
+      //   3. Claimant B SETNXs the slot with a DIFFERENT token.
+      //   4. Claimant A finally finishes (or errors) and calls
+      //      releaseClaim with their original token.
+      // Without compare-and-delete, A's release would wipe B's slot and
+      // any concurrent customer of B's idempotency key would see "no
+      // claim" and re-issue, breaking same-key dedup.
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        // Step 1: A claims with token "A".
+        const a = await buffer.claimIdempotency({
+          ...claimInput,
+          token: "A",
+          ttlSeconds: 1, // short TTL to simulate expiry quickly
+        });
+        expect(a.kind).toBe("claimed");
+
+        // Step 2: simulate TTL expiry — DEL the slot directly so the
+        // test doesn't rely on wall-clock sleeping. Targets the same key
+        // the buffer writes via the exported builder, so a key-format
+        // change can't silently make this DEL miss.
+        await buffer["redis"].del(makeIdempotencyClaimKey(claimInput));
+
+        // Step 3: B claims with token "B".
+        const b = await buffer.claimIdempotency({
+          ...claimInput,
+          token: "B",
+          ttlSeconds: 30,
+        });
+        expect(b.kind).toBe("claimed");
+
+        // Step 4: A's late release. MUST be a no-op.
+        await buffer.releaseClaim({ ...claimInput, token: "A" });
+
+        // B's claim survives intact.
+        const after = await buffer.readClaim(claimInput);
+        expect(after?.kind).toBe("pending");
+
+        // B can still publish.
+        const published = await buffer.publishClaim({
+          ...claimInput,
+          token: "B",
+          runId: "B-run",
+          ttlSeconds: 60,
+        });
+        expect(published).toBe(true);
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+});
diff --git a/packages/redis-worker/src/mollifier/buffer.ts b/packages/redis-worker/src/mollifier/buffer.ts
index f739e3ff362..71920bb4ff4 100644
--- a/packages/redis-worker/src/mollifier/buffer.ts
+++ b/packages/redis-worker/src/mollifier/buffer.ts
@@ -10,25 +10,119 @@ import { BufferEntry, BufferEntrySchema } from "./schemas.js";
 
 export type MollifierBufferOptions = {
   redisOptions: RedisOptions;
-  entryTtlSeconds: number;
   logger?: Logger;
 };
 
+// Grace TTL applied to the entry hash on drainer ack. The entry survives
+// this long after materialisation so direct reads (retrieve, trace, etc.)
+// have a safety net while PG replica lag settles.
+const ACK_GRACE_TTL_SECONDS = 30;
+
+// ioredis reconnect backoff for the mollifier buffer client. The base
+// grows linearly with the attempt count and is capped at 1s (the same
+// envelope as the previous fixed `Math.min(times * 50, 1000)` schedule).
+// We then apply equal jitter — a uniform pick in `[base/2, base]` — so a
+// fleet of webapp instances reconnecting after the same Redis blip don't
+// retry in lockstep and stampede Redis on recovery (thundering herd).
+// Because the jittered value never exceeds the original cap, this is never
+// slower than before — just decorrelated. Mirrors the jittered-backoff
+// approach the mutate-fallback wait loop adopted for the same reason.
+export function mollifierReconnectDelayMs(
+  times: number,
+  random: () => number = Math.random,
+): number {
+  const base = Math.min(times * 50, 1000);
+  const half = Math.floor(base / 2);
+  return half + Math.round(random() * (base - half));
+}
+
+export type SnapshotPatch =
+  // `maxTags`, when set, caps the deduped tag count atomically inside the
+  // Lua: if appending would push the snapshot over the limit the patch is
+  // rejected ("limit_exceeded") and nothing is written, mirroring the
+  // PG-path MAX_TAGS_PER_RUN check so a buffered run can't accumulate more
+  // tags than the trigger validator would have allowed at creation.
+  | { type: "append_tags"; tags: string[]; maxTags?: number }
+  | { type: "set_metadata"; metadata: string; metadataType: string }
+  | { type: "set_delay"; delayUntil: string }
+  | { type: "mark_cancelled"; cancelledAt: string; cancelReason?: string };
+
+export type MutateSnapshotResult =
+  | "applied_to_snapshot"
+  | "not_found"
+  | "busy"
+  | "limit_exceeded";
+
+export type CasSetMetadataResult =
+  | { kind: "applied"; newVersion: number }
+  | { kind: "version_conflict"; currentVersion: number }
+  | { kind: "not_found" }
+  | { kind: "busy" };
+
+export type AcceptResult =
+  | { kind: "accepted" }
+  | { kind: "duplicate_run_id" }
+  | { kind: "duplicate_idempotency"; existingRunId: string };
+
+export type IdempotencyLookupInput = {
+  envId: string;
+  taskIdentifier: string;
+  idempotencyKey: string;
+};
+
+// Reversible encoding for Redis-key segments. The composite-key builders
+// concatenate `envId`, `taskIdentifier`, and `idempotencyKey` with `:`
+// separators; if any segment contains a literal `:` (envId is internal
+// and `:`-free, but taskIdentifier and idempotencyKey are
+// customer-supplied) different tuples would map to the same Redis key
+// and dedupe the wrong run. base64url has no `:` in its alphabet and is
+// bijective on the input string, so the encoded keys are
+// collision-free.
+function encodeKeyPart(value: string): string {
+  return Buffer.from(value, "utf8").toString("base64url");
+}
+
+// Exported so tests can compute the same Redis key the buffer writes
+// without hard-coding the encoding (which is a buffer-internal detail).
+export function idempotencyLookupKeyFor(input: IdempotencyLookupInput): string {
+  return `mollifier:idempotency:${encodeKeyPart(input.envId)}:${encodeKeyPart(input.taskIdentifier)}:${encodeKeyPart(input.idempotencyKey)}`;
+}
+
+// Pre-gate claim key namespace, distinct from `mollifier:idempotency` so the
+// existing buffer-side dedup stays isolated. The claim is the
+// authoritative cross-store "this idempotency key is in flight or
+// resolved" pointer used by the trigger hot path. Values:
+//   "pending:<token>"  → claimed by a trigger pipeline; `<token>` is the
+//                        caller-supplied ownership token. Release and
+//                        publish compare-and-act on this token so a
+//                        late release from a previous claimant whose TTL
+//                        expired cannot erase a new owner's claim.
+//   <runId>            → the winning trigger's resolved runId.
+const PENDING_PREFIX = "pending:";
+
+// Exported (like `idempotencyLookupKeyFor`) so tests can target the same
+// claim key the buffer writes without hard-coding the encoding.
+export function makeIdempotencyClaimKey(input: IdempotencyLookupInput): string {
+  return `mollifier:claim:${encodeKeyPart(input.envId)}:${encodeKeyPart(input.taskIdentifier)}:${encodeKeyPart(input.idempotencyKey)}`;
+}
+
+export type IdempotencyClaimResult =
+  | { kind: "claimed" }
+  | { kind: "pending" }
+  | { kind: "resolved"; runId: string };
+
 export class MollifierBuffer {
   private readonly redis: Redis;
-  private readonly entryTtlSeconds: number;
   private readonly logger: Logger;
 
   constructor(options: MollifierBufferOptions) {
-    this.entryTtlSeconds = options.entryTtlSeconds;
     this.logger = options.logger ?? new Logger("MollifierBuffer", "debug");
 
     this.redis = createRedisClient(
       {
         ...options.redisOptions,
         retryStrategy(times) {
-          const delay = Math.min(times * 50, 1000);
-          return delay;
+          return mollifierReconnectDelayMs(times);
         },
         maxRetriesPerRequest: 20,
       },
@@ -41,19 +135,45 @@ export class MollifierBuffer {
     this.#registerCommands();
   }
 
-  // Returns true if the entry was newly written; false if a duplicate runId
-  // was already buffered (idempotent no-op). Callers can use the boolean to
-  // record a duplicate-accept metric without affecting buffer state.
+  // Three outcomes:
+  //   - { kind: "accepted" } — entry was newly written.
+  //   - { kind: "duplicate_run_id" } — runId was already buffered (idempotent
+  //     no-op, same semantic as the previous boolean-false return).
+  //   - { kind: "duplicate_idempotency", existingRunId } — the (env, task,
+  //     idempotencyKey) tuple was already bound to another buffered run.
+  //     The Lua's atomic SETNX is the race-winner; the second caller gets
+  //     the winner's runId so it can return that as the trigger response.
   async accept(input: {
     runId: string;
     envId: string;
     orgId: string;
     payload: string;
-  }): Promise<boolean> {
+    // Optional idempotency-key triple. When all three are present we
+    // SETNX a Redis lookup at `mollifier:idempotency:{env}:{task}:{key}`
+    // pointing at the runId so trigger-time dedup during the buffered
+    // window resolves the same way PG's unique constraint resolves it
+    // post-materialisation.
+    idempotencyKey?: string;
+    taskIdentifier?: string;
+  }): Promise<AcceptResult> {
     const entryKey = `mollifier:entries:${input.runId}`;
     const queueKey = `mollifier:queue:${input.envId}`;
     const orgsKey = "mollifier:orgs";
-    const createdAt = new Date().toISOString();
+    const nowMs = Date.now();
+    const createdAt = new Date(nowMs).toISOString();
+    // Microsecond epoch, stored as a hash field for dwell-time metrics
+    // (stale sweep, drainer dwell span). FIFO ordering comes from the
+    // LIST itself (LPUSH head / RPOP tail), not from this value — it is
+    // no longer a queue sort key.
+    const createdAtMicros = nowMs * 1000;
+    const idempotencyLookupKey =
+      input.idempotencyKey && input.taskIdentifier
+        ? idempotencyLookupKeyFor({
+            envId: input.envId,
+            taskIdentifier: input.taskIdentifier,
+            idempotencyKey: input.idempotencyKey,
+          })
+        : "";
     const result = await this.redis.acceptMollifierEntry(
       entryKey,
       queueKey,
@@ -63,10 +183,18 @@ export class MollifierBuffer {
       input.orgId,
       input.payload,
       createdAt,
-      String(this.entryTtlSeconds),
+      String(createdAtMicros),
       "mollifier:org-envs:",
+      idempotencyLookupKey,
+      "mollifier:entries:",
     );
-    return result === 1;
+    // Lua returns 1 (accepted), 0 (duplicate runId), or a string runId
+    // (duplicate idempotency — value is the existing winner's runId).
+    if (typeof result === "string" && result.length > 0) {
+      return { kind: "duplicate_idempotency", existingRunId: result };
+    }
+    if (result === 1) return { kind: "accepted" };
+    return { kind: "duplicate_run_id" };
   }
 
   async pop(envId: string): Promise<BufferEntry | null> {
@@ -128,8 +256,245 @@ export class MollifierBuffer {
     return this.redis.smembers(`mollifier:org-envs:${orgId}`);
   }
 
+  // Read-only enumeration of currently-queued entries for a single env.
+  // Used by the stale-sweep to compute per-entry dwell time, so order is
+  // immaterial — LRANGE returns them newest-first (LPUSH head) but the
+  // caller scans the whole window. Non-destructive: the drainer still
+  // RPOPs these entries in FIFO order.
+  //
+  // The entry HGETALLs are issued in a single pipelined batch (one
+  // network round-trip instead of N) — at the stale-sweep's default
+  // maxCount=1000 the serial implementation cost ~1000 RTTs per env,
+  // which dominated sweep wall-time at any meaningful backlog.
+  //
+  // A missing entry (empty hash) is skipped: the drainer's RPOP+DEL of
+  // the entry hash can race our LRANGE→HGETALL window, so a runId on
+  // the queue with no backing hash is an expected concurrency outcome,
+  // not an error.
+  async listEntriesForEnv(envId: string, maxCount: number): Promise<BufferEntry[]> {
+    if (maxCount <= 0) return [];
+    const runIds = await this.redis.lrange(
+      `mollifier:queue:${envId}`,
+      0,
+      maxCount - 1,
+    );
+    if (runIds.length === 0) return [];
+
+    const pipeline = this.redis.pipeline();
+    for (const runId of runIds) {
+      pipeline.hgetall(`mollifier:entries:${runId}`);
+    }
+    const results = await pipeline.exec();
+    if (!results) return [];
+
+    const entries: BufferEntry[] = [];
+    for (let i = 0; i < results.length; i++) {
+      const [err, raw] = results[i] as [Error | null, Record<string, string> | null];
+      if (err) {
+        this.logger.error("MollifierBuffer.listEntriesForEnv: hgetall failed", {
+          runId: runIds[i],
+          err: err.message,
+        });
+        continue;
+      }
+      if (!raw || Object.keys(raw).length === 0) continue;
+      const parsed = BufferEntrySchema.safeParse(raw);
+      if (!parsed.success) {
+        this.logger.error("MollifierBuffer.listEntriesForEnv: invalid entry shape", {
+          runId: runIds[i],
+          errors: parsed.error.flatten(),
+        });
+        continue;
+      }
+      entries.push(parsed.data);
+    }
+    return entries;
+  }
+
+  // Atomic snapshot mutation. Used by customer-mutation API endpoints
+  // (tags, metadata-put, reschedule, cancel) when the run is still in
+  // the buffer. Four outcomes:
+  //   - "applied_to_snapshot": entry was QUEUED + not materialised; the
+  //     drainer will read the patched payload on its next pop.
+  //   - "not_found": no entry hash exists for this runId — including a
+  //     FAILED entry, whose hash the drainer-terminal `fail` path DELs.
+  //   - "busy": entry is DRAINING or materialised. The API
+  //     wait-and-bounces through PG.
+  //   - "limit_exceeded": an `append_tags` patch carrying `maxTags` would
+  //     push the deduped tag count over the cap; nothing is written.
+  async mutateSnapshot(runId: string, patch: SnapshotPatch): Promise<MutateSnapshotResult> {
+    const result = (await this.redis.mutateMollifierSnapshot(
+      `mollifier:entries:${runId}`,
+      JSON.stringify(patch),
+    )) as string;
+    if (
+      result === "applied_to_snapshot" ||
+      result === "not_found" ||
+      result === "busy" ||
+      result === "limit_exceeded"
+    ) {
+      return result;
+    }
+    throw new Error(`MollifierBuffer.mutateSnapshot: unexpected Lua return value: ${result}`);
+  }
+
+  // Optimistic compare-and-swap on the snapshot's metadata. Caller reads
+  // the current metadataVersion via getEntry, applies operations in JS via
+  // `applyMetadataOperations`, then calls this with the new metadata + the
+  // expected version. Lua refuses if the version has moved (caller retries
+  // up to N times). Mirrors the PG-side `UpdateMetadataService` retry
+  // loop so concurrent increment/append operations don't lose deltas.
+  async casSetMetadata(input: {
+    runId: string;
+    expectedVersion: number;
+    newMetadata: string;
+    newMetadataType: string;
+  }): Promise<CasSetMetadataResult> {
+    const entryKey = `mollifier:entries:${input.runId}`;
+    const raw = (await this.redis.casSetMollifierMetadata(
+      entryKey,
+      String(input.expectedVersion),
+      input.newMetadata,
+      input.newMetadataType,
+    )) as string;
+    if (raw === "not_found") return { kind: "not_found" };
+    if (raw === "busy") return { kind: "busy" };
+    if (raw.startsWith("conflict:")) {
+      return { kind: "version_conflict", currentVersion: Number(raw.slice("conflict:".length)) };
+    }
+    if (raw.startsWith("applied:")) {
+      return { kind: "applied", newVersion: Number(raw.slice("applied:".length)) };
+    }
+    throw new Error(`MollifierBuffer.casSetMetadata: unexpected Lua return: ${raw}`);
+  }
+
+  // Atomic pre-gate claim on a (env, task, idempotencyKey) tuple. One
+  // call across both PG and buffer paths serialises through this claim;
+  // closes the race the buffer-side SETNX leaves open during the
+  // gate-transition burst window.
+  //
+  // The caller supplies an opaque `token` (UUID) on claim. The same token
+  // MUST be passed to `publishClaim` / `releaseClaim`, which compare-and-
+  // act so a late release from a previous claimant whose TTL expired
+  // cannot erase a new owner's claim.
+  //
+  // - "claimed": we now own the claim, the caller proceeds with the
+  //   trigger pipeline and must `publishClaim` on success or
+  //   `releaseClaim` on failure.
+  // - "pending": another trigger owns the claim and hasn't published
+  //   yet; the caller should poll.
+  // - "resolved": the claim already holds a runId; the caller can
+  //   return that runId as a cached hit.
+  async claimIdempotency(
+    input: IdempotencyLookupInput & { token: string; ttlSeconds: number },
+  ): Promise<IdempotencyClaimResult> {
+    const claimKey = makeIdempotencyClaimKey(input);
+    const raw = (await this.redis.claimMollifierIdempotency(
+      claimKey,
+      `${PENDING_PREFIX}${input.token}`,
+      PENDING_PREFIX,
+      String(input.ttlSeconds),
+    )) as string;
+    if (raw === "claimed") return { kind: "claimed" };
+    if (raw === "pending") return { kind: "pending" };
+    if (raw.startsWith("resolved:")) {
+      return { kind: "resolved", runId: raw.slice("resolved:".length) };
+    }
+    throw new Error(`MollifierBuffer.claimIdempotency: unexpected return: ${raw}`);
+  }
+
+  // Publish the winning runId to the claim so subsequent claimants /
+  // waiters see "resolved". TTL bounded by the customer's
+  // `idempotencyKeyExpiresAt` minus now; caller computes.
+  //
+  // Compare-and-set on the caller's token: if the current value isn't
+  // our pending marker (TTL expired and another claimant moved in, or
+  // someone else already published), the publish is a no-op. The caller
+  // can treat any such case as "we lost the claim" and re-read.
+  // Returns true if we published; false if the claim slot was no longer
+  // ours.
+  async publishClaim(
+    input: IdempotencyLookupInput & { token: string; runId: string; ttlSeconds: number },
+  ): Promise<boolean> {
+    const claimKey = makeIdempotencyClaimKey(input);
+    const result = (await this.redis.publishMollifierClaim(
+      claimKey,
+      `${PENDING_PREFIX}${input.token}`,
+      input.runId,
+      String(input.ttlSeconds),
+    )) as number;
+    return result === 1;
+  }
+
+  // Release the claim on pipeline error so waiters can re-claim and
+  // retry. Idempotent.
+  //
+  // Compare-and-delete on the caller's token: only deletes if the
+  // current value is exactly our pending marker. A late release from a
+  // claimant whose TTL expired is a no-op, so a new owner's claim is
+  // never wiped by a slow predecessor.
+  async releaseClaim(input: IdempotencyLookupInput & { token: string }): Promise<void> {
+    const claimKey = makeIdempotencyClaimKey(input);
+    await this.redis.releaseMollifierClaim(
+      claimKey,
+      `${PENDING_PREFIX}${input.token}`,
+    );
+  }
+
+  // Read the current claim value, used by the wait/poll loop on losers
+  // to detect "pending" → "resolved" transitions and timeouts.
+  async readClaim(input: IdempotencyLookupInput): Promise<IdempotencyClaimResult | null> {
+    const claimKey = makeIdempotencyClaimKey(input);
+    const value = await this.redis.get(claimKey);
+    if (value === null) return null;
+    if (value.startsWith(PENDING_PREFIX)) return { kind: "pending" };
+    return { kind: "resolved", runId: value };
+  }
+
+  // Resolve a buffered run by (env, task, idempotencyKey) tuple. Used by
+  // `IdempotencyKeyConcern.handleTriggerRequest` after the PG check
+  // misses — same key may belong to a buffered run waiting to drain. The
+  // lookup self-heals: if the lookup points at an entry hash that's gone,
+  // we clear the lookup and report a miss. The clear is a compare-and-
+  // delete (only if the key still holds the stale runId we observed) so a
+  // fresh accept that rebinds the key between our GET and DEL isn't wiped.
+  async lookupIdempotency(input: IdempotencyLookupInput): Promise<string | null> {
+    const lookupKey = idempotencyLookupKeyFor(input);
+    const runId = await this.redis.get(lookupKey);
+    if (!runId) return null;
+    const entry = await this.getEntry(runId);
+    if (!entry) {
+      await this.redis.delMollifierKeyIfEquals(lookupKey, runId);
+      return null;
+    }
+    return runId;
+  }
+
+  // Clear the idempotency binding from a buffered run. Used by
+  // `ResetIdempotencyKeyService` alongside the existing PG-side
+  // `updateMany`. Returns the runId that was cleared, or null if no
+  // buffered run held this key.
+  async resetIdempotency(input: IdempotencyLookupInput): Promise<{ clearedRunId: string | null }> {
+    const lookupKey = idempotencyLookupKeyFor(input);
+    const claimKey = makeIdempotencyClaimKey(input);
+    const clearedRunId = (await this.redis.resetMollifierIdempotency(
+      lookupKey,
+      "mollifier:entries:",
+      claimKey,
+    )) as string;
+    return { clearedRunId: clearedRunId.length > 0 ? clearedRunId : null };
+  }
+
+  // Marks the entry as materialised (PG row written) and resets its TTL to
+  // the grace window. Entry hash persists past ack as a read-fallback
+  // safety net for the brief PG replica-lag window between drainer-side
+  // write and reader-side visibility. Also clears the associated
+  // idempotency lookup if one was set on accept.
   async ack(runId: string): Promise<void> {
-    await this.redis.del(`mollifier:entries:${runId}`);
+    await this.redis.ackMollifierEntry(
+      `mollifier:entries:${runId}`,
+      String(ACK_GRACE_TTL_SECONDS),
+    );
   }
 
   async requeue(runId: string): Promise<void> {
@@ -142,9 +507,12 @@ export class MollifierBuffer {
     );
   }
 
-  // Returns true if the entry transitioned to FAILED; false if the entry no
-  // longer exists (TTL expired between pop and fail). Caller can use the
-  // boolean to skip downstream FAILED handling for ghost entries.
+  // Returns true if a live entry was torn down; false if the entry no
+  // longer existed (already failed, evicted, expired after an ack's grace
+  // TTL, or manually removed — there is no accept-time TTL). Note FAILED
+  // is not an observable state: the Lua marks the hash FAILED then DELs
+  // it in the same atomic script, so a subsequent getEntry returns null.
+  // Caller can use the boolean to skip FAILED handling for ghost entries.
   async fail(runId: string, error: { code: string; message: string }): Promise<boolean> {
     const result = await this.redis.failMollifierEntry(
       `mollifier:entries:${runId}`,
@@ -153,10 +521,16 @@ export class MollifierBuffer {
     return result === 1;
   }
 
+  // Returns the Redis TTL (seconds) of the entry hash: -2 if the hash
+  // does not exist, -1 if it has no TTL — the steady state under the
+  // current design, where entries persist until drainer ack/fail — and
+  // a positive value only during the post-ack grace window
+  // (ACK_GRACE_TTL_SECONDS); tests around the grace TTL rely on it.
   async getEntryTtlSeconds(runId: string): Promise<number> {
     return this.redis.ttl(`mollifier:entries:${runId}`);
   }
 
+
   async evaluateTrip(
     envId: string,
     options: { windowMs: number; threshold: number; holdMs: number },
@@ -190,8 +564,10 @@ export class MollifierBuffer {
         local orgId = ARGV[3]
         local payload = ARGV[4]
         local createdAt = ARGV[5]
-        local ttlSeconds = tonumber(ARGV[6])
+        local createdAtMicros = ARGV[6]
         local orgEnvsPrefix = ARGV[7]
+        local idempotencyLookupKey = ARGV[8] or ''
+        local entryPrefix = ARGV[9]
 
         -- Idempotent: refuse if an entry for this runId already exists in any
         -- state. Caller-side dedup is also enforced via API idempotency keys,
@@ -200,6 +576,27 @@ export class MollifierBuffer {
           return 0
         end
 
+        -- Idempotency-key dedup. If the caller passed a lookup key
+        -- and it's already bound to another buffered run, return the
+        -- winner's runId so the loser's API response can echo it as a
+        -- cached hit. Otherwise SET the lookup (no TTL — lifecycle is
+        -- paired with the entry hash; drainer ack/fail clear it
+        -- explicitly).
+        if idempotencyLookupKey ~= '' then
+          local existing = redis.call('GET', idempotencyLookupKey)
+          if existing then
+            -- Self-heal: only honour the binding if its entry hash still
+            -- exists. If the entry was evicted (maxmemory) but the lookup
+            -- survived, the binding is stale — fall through and rebind to
+            -- this run rather than returning a dead runId that would block
+            -- the key indefinitely. Mirrors lookupIdempotency's self-heal.
+            if redis.call('EXISTS', entryPrefix .. existing) == 1 then
+              return existing
+            end
+          end
+          redis.call('SET', idempotencyLookupKey, runId)
+        end
+
         redis.call('HSET', entryKey,
           'runId', runId,
           'envId', envId,
@@ -207,8 +604,20 @@ export class MollifierBuffer {
           'payload', payload,
           'status', 'QUEUED',
           'attempts', '0',
-          'createdAt', createdAt)
-        redis.call('EXPIRE', entryKey, ttlSeconds)
+          'createdAt', createdAt,
+          'createdAtMicros', createdAtMicros,
+          'idempotencyLookupKey', idempotencyLookupKey,
+          'metadataVersion', '0')
+        -- No EXPIRE on the entry hash. Buffer entries persist until the
+        -- drainer ACKs (post-materialise grace) or FAILs them — the
+        -- drainer is the only recovery mechanism, so silent TTL-based
+        -- eviction would lose runs with no customer-visible signal.
+        -- Memory pressure from an offline drainer is the alertable
+        -- failure mode instead; see _ops/mollifier-ops.md.
+        -- LIST queue: LPUSH at the head, drainer RPOPs from the tail, so
+        -- insertion order == drain order (FIFO). createdAtMicros is kept
+        -- as a hash field for dwell metrics only — it is no longer a sort
+        -- key now that the buffer has no list/pagination surface.
         redis.call('LPUSH', queueKey, runId)
         -- Org-level membership: maintained atomically with the per-env
         -- queue so the drainer can walk orgs → envs-for-org and
@@ -239,7 +648,12 @@ export class MollifierBuffer {
         local nextAttempts = tonumber(currentAttempts or '0') + 1
 
         redis.call('HSET', entryKey, 'status', 'QUEUED', 'attempts', tostring(nextAttempts))
-        redis.call('LPUSH', queuePrefix .. envId, runId)
+        -- Requeue RPUSHes to the tail (the RPOP end) so a transiently
+        -- failed entry pops next rather than going to the back of the
+        -- line behind a fresh backlog. createdAt is immutable across
+        -- retries; the drainer's maxAttempts caps the
+        -- retry loop so a poisoned entry doesn't head-of-line forever.
+        redis.call('RPUSH', queuePrefix .. envId, runId)
         -- Re-track the org/env: pop may have SREM'd them when the queue
         -- last emptied. SADDs are idempotent if the values are still
         -- present.
@@ -274,11 +688,13 @@ export class MollifierBuffer {
           end
         end
 
-        -- Loop to skip orphan queue references — runIds whose entry hash has
-        -- expired (TTL hit). HSET on a missing key would CREATE a partial
-        -- hash without a TTL, leaking memory. The loop is bounded by queue
-        -- length; entire Lua script remains atomic.
+        -- Loop to skip orphan queue references — runIds whose entry hash is
+        -- gone (e.g. Redis maxmemory eviction, since QUEUED entries carry
+        -- no TTL of their own). HSET on a missing key would CREATE a
+        -- partial hash without a TTL, leaking memory. The loop is bounded
+        -- by queue length; entire Lua script remains atomic.
         while true do
+          -- RPOP returns the tail member (oldest, FIFO), or false when empty.
           local runId = redis.call('RPOP', queueKey)
           if not runId then
             -- Queue is empty AND we have no entry to read orgId from, so
@@ -296,16 +712,274 @@ export class MollifierBuffer {
               result[raw[i]] = raw[i + 1]
             end
             -- Prune org-level membership if this pop drained the queue.
-            -- Atomic with the RPOP above — a concurrent accept AFTER this
-            -- script will SADD both back along with its LPUSH.
+            -- Atomic with the RPOP above — a concurrent accept AFTER
+            -- this script will SADD both back along with its LPUSH.
             if redis.call('LLEN', queueKey) == 0 then
               pruneOrgMembership(result['orgId'])
             end
             return cjson.encode(result)
           end
-          -- Orphan queue reference: entry TTL expired while runId was queued.
-          -- Discard the reference and loop to the next.
+          -- Orphan queue reference: entry hash gone (evicted) while runId
+          -- was queued. Discard the reference and loop to the next.
+        end
+      `,
+    });
+
+    this.redis.defineCommand("casSetMollifierMetadata", {
+      numberOfKeys: 1,
+      lua: `
+        local entryKey = KEYS[1]
+        local expectedVersion = tonumber(ARGV[1])
+        local newMetadata = ARGV[2]
+        local newMetadataType = ARGV[3]
+
+        if redis.call('EXISTS', entryKey) == 0 then
+          return 'not_found'
+        end
+
+        local status = redis.call('HGET', entryKey, 'status')
+        local materialised = redis.call('HGET', entryKey, 'materialised')
+        if status ~= 'QUEUED' or materialised == 'true' then
+          return 'busy'
+        end
+
+        local currentVersionStr = redis.call('HGET', entryKey, 'metadataVersion') or '0'
+        local currentVersion = tonumber(currentVersionStr) or 0
+        if currentVersion ~= expectedVersion then
+          return 'conflict:' .. tostring(currentVersion)
+        end
+
+        -- Write the new metadata onto the snapshot's payload JSON. We
+        -- keep the rest of the payload intact — only metadata/metadataType
+        -- change. metadataVersion is denormalised on the hash for cheap
+        -- CAS reads; it's intentionally NOT stored inside the payload
+        -- itself (PG-side metadataVersion is a column, not a JSON field).
+        local payloadJson = redis.call('HGET', entryKey, 'payload')
+        local ok, payload = pcall(cjson.decode, payloadJson)
+        if not ok then return 'busy' end
+        payload.metadata = newMetadata
+        payload.metadataType = newMetadataType
+
+        local newVersion = currentVersion + 1
+        redis.call('HSET', entryKey,
+          'payload', cjson.encode(payload),
+          'metadataVersion', tostring(newVersion))
+        return 'applied:' .. tostring(newVersion)
+      `,
+    });
+
+    this.redis.defineCommand("claimMollifierIdempotency", {
+      numberOfKeys: 1,
+      lua: `
+        local claimKey = KEYS[1]
+        local pendingMarker = ARGV[1]   -- "pending:<caller-token>"
+        local pendingPrefix = ARGV[2]   -- "pending:"
+        local ttl = tonumber(ARGV[3])
+
+        -- SETNX-with-TTL: atomic; only one caller can win.
+        local won = redis.call('SET', claimKey, pendingMarker, 'NX', 'EX', ttl)
+        if won then
+          return 'claimed'
+        end
+
+        local existing = redis.call('GET', claimKey)
+        if not existing then
+          -- The slot expired in the race window between the SET NX
+          -- failing and this GET. It's free now — claim it so we don't
+          -- string.sub a nil and error out.
+          redis.call('SET', claimKey, pendingMarker, 'EX', ttl)
+          return 'claimed'
+        end
+        -- Any "pending:*" value is a live claim — the caller-supplied
+        -- token differentiates ownership but is opaque to losers.
+        if string.sub(existing, 1, string.len(pendingPrefix)) == pendingPrefix then
+          return 'pending'
+        end
+        return 'resolved:' .. existing
+      `,
+    });
+
+    // Publish a winning runId to a claim slot we own. Compare-and-set on
+    // the caller's pending marker: if the slot is no longer ours (TTL
+    // expired and another claimant moved in, or already resolved by
+    // someone else), we no-op. Returns 1 on publish, 0 on no-op.
+    this.redis.defineCommand("publishMollifierClaim", {
+      numberOfKeys: 1,
+      lua: `
+        local claimKey = KEYS[1]
+        local ownerMarker = ARGV[1]   -- "pending:<our-token>"
+        local runId = ARGV[2]
+        local ttl = tonumber(ARGV[3])
+
+        local existing = redis.call('GET', claimKey)
+        if existing == ownerMarker then
+          redis.call('SET', claimKey, runId, 'EX', ttl)
+          return 1
+        end
+        return 0
+      `,
+    });
+
+    // Release a claim slot we own. Compare-and-delete on the caller's
+    // pending marker: a late release from a previous claimant whose TTL
+    // expired is a no-op, so a new owner's claim is never wiped.
+    this.redis.defineCommand("releaseMollifierClaim", {
+      numberOfKeys: 1,
+      lua: `
+        local claimKey = KEYS[1]
+        local ownerMarker = ARGV[1]   -- "pending:<our-token>"
+
+        local existing = redis.call('GET', claimKey)
+        if existing == ownerMarker then
+          redis.call('DEL', claimKey)
+          return 1
+        end
+        return 0
+      `,
+    });
+
+    this.redis.defineCommand("resetMollifierIdempotency", {
+      numberOfKeys: 1,
+      lua: `
+        local lookupKey = KEYS[1]
+        local entryPrefix = ARGV[1]
+        local claimKey = ARGV[2]
+
+        -- Reset reopens the key across BOTH the buffer lookup and the
+        -- cross-store pre-gate claim pointer. Without clearing the claim,
+        -- a resolved/pending claim would keep deduping new triggers for
+        -- the rest of its TTL even though the binding was reset. DEL is
+        -- unconditional — the claim is gone regardless of whether a
+        -- buffered run currently holds the lookup.
+        redis.call('DEL', claimKey)
+
+        local runId = redis.call('GET', lookupKey)
+        if not runId then
+          return ''
+        end
+
+        local entryKey = entryPrefix .. runId
+        if redis.call('EXISTS', entryKey) == 0 then
+          -- Stale lookup. Lazy cleanup.
+          redis.call('DEL', lookupKey)
+          return ''
+        end
+
+        -- Clear the idempotency fields on the snapshot payload so the
+        -- drainer's eventual engine.trigger call inserts a PG row
+        -- without the key set.
+        local payloadJson = redis.call('HGET', entryKey, 'payload')
+        if payloadJson then
+          local ok, payload = pcall(cjson.decode, payloadJson)
+          if ok then
+            payload.idempotencyKey = cjson.null
+            payload.idempotencyKeyExpiresAt = cjson.null
+            redis.call('HSET', entryKey, 'payload', cjson.encode(payload))
+          end
+        end
+        -- Clear the denormalised lookup pointer on the hash so a later
+        -- ack doesn't try to DEL a key that's already gone.
+        redis.call('HSET', entryKey, 'idempotencyLookupKey', '')
+        redis.call('DEL', lookupKey)
+        return runId
+      `,
+    });
+
+    this.redis.defineCommand("mutateMollifierSnapshot", {
+      numberOfKeys: 1,
+      lua: `
+        local entryKey = KEYS[1]
+        local patchJson = ARGV[1]
+
+        if redis.call('EXISTS', entryKey) == 0 then
+          return 'not_found'
+        end
+
+        local status = redis.call('HGET', entryKey, 'status')
+        local materialised = redis.call('HGET', entryKey, 'materialised')
+        if status ~= 'QUEUED' or materialised == 'true' then
+          return 'busy'
+        end
+
+        local payloadJson = redis.call('HGET', entryKey, 'payload')
+        local ok, payload = pcall(cjson.decode, payloadJson)
+        if not ok then return 'busy' end
+
+        local patch = cjson.decode(patchJson)
+
+        if patch.type == 'append_tags' then
+          -- Rebuild tags as a dense array: existing tags first, then new
+          -- ones not already present. NOTE(review): an empty Lua table
+          -- cjson-encodes as {} rather than [] — confirm readers cope.
+          local existing = payload.tags or {}
+          local seen = {}
+          local merged = {}
+          for _, t in ipairs(existing) do
+            if not seen[t] then
+              seen[t] = true
+              table.insert(merged, t)
+            end
+          end
+          for _, t in ipairs(patch.tags or {}) do
+            if not seen[t] then
+              seen[t] = true
+              table.insert(merged, t)
+            end
+          end
+          -- Cap the deduped count when the caller supplies a limit, so a
+          -- buffered run can't exceed MAX_TAGS_PER_RUN via the tags API.
+          -- Reject the whole patch (write nothing) rather than truncating.
+          if patch.maxTags ~= nil and #merged > patch.maxTags then
+            return 'limit_exceeded'
+          end
+          payload.tags = merged
+        elseif patch.type == 'set_metadata' then
+          payload.metadata = patch.metadata
+          payload.metadataType = patch.metadataType
+          -- Bump the denormalised metadataVersion so an in-flight
+          -- casSetMetadata (optimistic CAS keyed on this counter) sees
+          -- the concurrent write as a version conflict and retries,
+          -- instead of clobbering it under a now-stale expectedVersion.
+          local currentVersion = tonumber(redis.call('HGET', entryKey, 'metadataVersion') or '0') or 0
+          redis.call('HSET', entryKey, 'metadataVersion', tostring(currentVersion + 1))
+        elseif patch.type == 'set_delay' then
+          payload.delayUntil = patch.delayUntil
+        elseif patch.type == 'mark_cancelled' then
+          payload.cancelledAt = patch.cancelledAt
+          payload.cancelReason = patch.cancelReason
+        else
+          return 'busy'
         end
+
+        redis.call('HSET', entryKey, 'payload', cjson.encode(payload))
+        return 'applied_to_snapshot'
+      `,
+    });
+
+    this.redis.defineCommand("ackMollifierEntry", {
+      numberOfKeys: 1,
+      lua: `
+        local entryKey = KEYS[1]
+        local graceTtlSeconds = tonumber(ARGV[1])
+
+        -- Guard: never create a partial entry. If the hash is gone between
+        -- pop and ack (concurrent fail or eviction — QUEUED entries carry
+        -- no TTL), the run is gone, nothing to mark materialised.
+        if redis.call('EXISTS', entryKey) == 0 then
+          return 0
+        end
+
+        -- If the entry was accepted with an idempotency key, the lookup
+        -- string was stored on the hash at accept time. Clear it now —
+        -- PG becomes canonical for the key post-materialisation.
+        local lookupKey = redis.call('HGET', entryKey, 'idempotencyLookupKey')
+        if lookupKey and lookupKey ~= '' then
+          redis.call('DEL', lookupKey)
+        end
+
+        redis.call('HSET', entryKey, 'materialised', 'true')
+        redis.call('EXPIRE', entryKey, graceTtlSeconds)
+        return 1
       `,
     });
 
@@ -315,17 +989,49 @@ export class MollifierBuffer {
         local entryKey = KEYS[1]
         local errorPayload = ARGV[1]
 
-        -- Guard: never create a partial entry. If the hash expired between
-        -- pop and fail, the run is gone — nothing to mark FAILED.
+        -- Guard: nothing to mark FAILED if the hash is gone (concurrent
+        -- fail, post-ack grace-TTL expiry, eviction or manual cleanup).
+        -- Returning 0 lets the caller tell "marked failed" from "no-op".
         if redis.call('EXISTS', entryKey) == 0 then
           return 0
         end
 
         redis.call('HSET', entryKey, 'status', 'FAILED', 'lastError', errorPayload)
+
+        -- Terminal-failure contract: the drainer's onTerminalFailure
+        -- callback (see MollifierDrainer.processEntry) has been
+        -- invoked before this fail() and has either written a
+        -- SYSTEM_FAILURE PG row (for both non-retryable AND
+        -- max-attempts-exhausted retryable errors) or chosen to fall
+        -- through (genuinely bad snapshot the engine can't materialise
+        -- a row from). Either way the buffer entry is no longer
+        -- load-bearing here. Clear the idempotency lookup -- PG's
+        -- unique constraint is the canonical dedup mechanism
+        -- post-materialise -- and drop the entry hash so failed runs
+        -- don't accrete forever now that there's no accept-time TTL.
+        local lookupKey = redis.call('HGET', entryKey, 'idempotencyLookupKey')
+        if lookupKey and lookupKey ~= '' then
+          redis.call('DEL', lookupKey)
+        end
+        redis.call('DEL', entryKey)
         return 1
       `,
     });
 
+    // Compare-and-delete: DEL the key only if it still holds the expected
+    // value. Used by lookupIdempotency's stale-lookup self-heal so a
+    // concurrent accept that rebinds the key between the reader's GET and
+    // this DEL isn't clobbered.
+    this.redis.defineCommand("delMollifierKeyIfEquals", {
+      numberOfKeys: 1,
+      lua: `
+        if redis.call('GET', KEYS[1]) == ARGV[1] then
+          return redis.call('DEL', KEYS[1])
+        end
+        return 0
+      `,
+    });
+
     this.redis.defineCommand("mollifierEvaluateTrip", {
       numberOfKeys: 2,
       lua: `
@@ -362,10 +1068,12 @@ declare module "@internal/redis" {
       orgId: string,
       payload: string,
       createdAt: string,
-      ttlSeconds: string,
+      createdAtMicros: string,
       orgEnvsPrefix: string,
-      callback?: Callback<number>,
-    ): Result<number, Context>;
+      idempotencyLookupKey: string,
+      entryPrefix: string,
+      callback?: Callback<number | string>,
+    ): Result<number | string, Context>;
     popAndMarkDraining(
       queueKey: string,
       orgsKey: string,
@@ -382,11 +1090,58 @@ declare module "@internal/redis" {
       orgEnvsPrefix: string,
       callback?: Callback<number>,
     ): Result<number, Context>;
+    mutateMollifierSnapshot(
+      entryKey: string,
+      patchJson: string,
+      callback?: Callback<string>,
+    ): Result<string, Context>;
+    casSetMollifierMetadata(
+      entryKey: string,
+      expectedVersion: string,
+      newMetadata: string,
+      newMetadataType: string,
+      callback?: Callback<string>,
+    ): Result<string, Context>;
+    resetMollifierIdempotency(
+      lookupKey: string,
+      entryPrefix: string,
+      claimKey: string,
+      callback?: Callback<string>,
+    ): Result<string, Context>;
+    claimMollifierIdempotency(
+      claimKey: string,
+      pendingMarker: string,
+      pendingPrefix: string,
+      ttlSeconds: string,
+      callback?: Callback<string>,
+    ): Result<string, Context>;
+    publishMollifierClaim(
+      claimKey: string,
+      ownerMarker: string,
+      runId: string,
+      ttlSeconds: string,
+      callback?: Callback<number>,
+    ): Result<number, Context>;
+    releaseMollifierClaim(
+      claimKey: string,
+      ownerMarker: string,
+      callback?: Callback<number>,
+    ): Result<number, Context>;
+    ackMollifierEntry(
+      entryKey: string,
+      graceTtlSeconds: string,
+      callback?: Callback<number>,
+    ): Result<number, Context>;
     failMollifierEntry(
       entryKey: string,
       errorPayload: string,
       callback?: Callback<number>,
     ): Result<number, Context>;
+    delMollifierKeyIfEquals(
+      key: string,
+      expected: string,
+      callback?: Callback<number>,
+    ): Result<number, Context>;
     mollifierEvaluateTrip(
       rateKey: string,
       trippedKey: string,
diff --git a/packages/redis-worker/src/mollifier/drainer.test.ts b/packages/redis-worker/src/mollifier/drainer.test.ts
index c8f68977f69..c6832e94c77 100644
--- a/packages/redis-worker/src/mollifier/drainer.test.ts
+++ b/packages/redis-worker/src/mollifier/drainer.test.ts
@@ -6,7 +6,6 @@ import { MollifierDrainer } from "./drainer.js";
 import { serialiseSnapshot } from "./schemas.js";
 
 const noopOptions = {
-  entryTtlSeconds: 600,
   logger: new Logger("test", "log"),
 };
 
@@ -87,8 +86,11 @@ describe("MollifierDrainer.runOnce", () => {
         payload: { foo: 1 },
       });
 
+      // After ack the entry persists as a read-fallback safety net with
+      // materialised=true and a fresh grace TTL.
       const entry = await buffer.getEntry("run_1");
-      expect(entry).toBeNull();
+      expect(entry).not.toBeNull();
+      expect(entry!.materialised).toBe(true);
     } finally {
       await buffer.close();
     }
@@ -167,9 +169,14 @@ describe("MollifierDrainer error handling", () => {
       expect(after2!.status).toBe("QUEUED");
       expect(after2!.attempts).toBe(2);
 
-      await drainer.runOnce();
+      const result3 = await drainer.runOnce();
+      // On attempt 3 the drainer hits maxAttempts and calls fail(),
+      // which deletes the entry — once the drainer-handler has written
+      // the SYSTEM_FAILURE PG row the buffer entry is no longer
+      // load-bearing. The runOnce result is the surviving signal.
       const after3 = await buffer.getEntry("run_r");
-      expect(after3!.status).toBe("FAILED");
+      expect(after3).toBeNull();
+      expect(result3.failed).toBe(1);
       expect(calls).toBe(3);
     } finally {
       await buffer.close();
@@ -202,11 +209,13 @@ describe("MollifierDrainer error handling", () => {
     try {
       await buffer.accept({ runId: "run_nr", envId: "env_a", orgId: "org_1", payload: "{}" });
 
-      await drainer.runOnce();
+      const result = await drainer.runOnce();
 
+      // fail() deletes the entry once the drainer-handler has written
+      // the canonical SYSTEM_FAILURE PG row.
       const entry = await buffer.getEntry("run_nr");
-      expect(entry!.status).toBe("FAILED");
-      expect(entry!.lastError).toEqual({ code: "Error", message: "validation failure" });
+      expect(entry).toBeNull();
+      expect(result.failed).toBe(1);
     } finally {
       await buffer.close();
     }
@@ -270,6 +279,296 @@ describe("MollifierDrainer error handling", () => {
   );
 });
 
+// `onTerminalFailure` is the callback the drainer fires on any terminal
+// path (non-retryable OR max-attempts-exhausted retryable) before it
+// calls `buffer.fail()`. Webapp wires it to `createFailedTaskRun` so the
+// customer's run lands a SYSTEM_FAILURE PG row in both cases. Pre-fix,
+// the retryable-exhausted path called `buffer.fail()` with no PG row,
+// silently losing the run. These tests pin both terminal causes plus the
+// retry-on-retryable-callback-failure escape hatch.
+describe("MollifierDrainer.onTerminalFailure", () => {
+  redisTest(
+    "fires with cause max-attempts-exhausted after retryable failures exhaust",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        ...noopOptions,
+      });
+
+      let handlerCalls = 0;
+      const handler = async () => {
+        handlerCalls++;
+        throw new Error("retryable PG blip");
+      };
+
+      type TerminalCallArgs = {
+        runId: string;
+        attempts: number;
+        cause: "non-retryable" | "max-attempts-exhausted";
+        errorMessage: string;
+      };
+      const terminalCalls: TerminalCallArgs[] = [];
+
+      const drainer = new MollifierDrainer({
+        buffer,
+        handler,
+        onTerminalFailure: async (input) => {
+          terminalCalls.push({
+            runId: input.runId,
+            attempts: input.attempts,
+            cause: input.cause,
+            errorMessage: input.error.message,
+          });
+        },
+        concurrency: 1,
+        maxAttempts: 2,
+        isRetryable: () => true,
+        logger: new Logger("test-drainer", "log"),
+      });
+
+      try {
+        await buffer.accept({ runId: "run_exhaust", envId: "env_a", orgId: "org_1", payload: "{}" });
+
+        // Attempt 1: retryable error → requeue, no terminal callback fires.
+        const r1 = await drainer.runOnce();
+        expect(r1.failed).toBe(1);
+        expect(terminalCalls).toHaveLength(0);
+        const after1 = await buffer.getEntry("run_exhaust");
+        expect(after1!.status).toBe("QUEUED");
+        expect(after1!.attempts).toBe(1);
+
+        // Attempt 2: maxAttempts (2) reached → terminal callback fires
+        // with cause "max-attempts-exhausted", THEN buffer.fail() deletes.
+        const r2 = await drainer.runOnce();
+        expect(r2.failed).toBe(1);
+        expect(handlerCalls).toBe(2);
+        expect(terminalCalls).toHaveLength(1);
+        expect(terminalCalls[0]).toMatchObject({
+          runId: "run_exhaust",
+          attempts: 2,
+          cause: "max-attempts-exhausted",
+          errorMessage: "retryable PG blip",
+        });
+        // buffer entry torn down post-callback.
+        expect(await buffer.getEntry("run_exhaust")).toBeNull();
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "fires with cause non-retryable on the first non-retryable error",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        ...noopOptions,
+      });
+
+      const handler = async () => {
+        throw new Error("validation failure");
+      };
+
+      const terminalCalls: Array<{ cause: string; attempts: number }> = [];
+      const drainer = new MollifierDrainer({
+        buffer,
+        handler,
+        onTerminalFailure: async (input) => {
+          terminalCalls.push({ cause: input.cause, attempts: input.attempts });
+        },
+        concurrency: 1,
+        // Generous attempts budget — non-retryable should bypass it
+        // entirely and terminate on the first attempt.
+        maxAttempts: 5,
+        isRetryable: () => false,
+        logger: new Logger("test-drainer", "log"),
+      });
+
+      try {
+        await buffer.accept({ runId: "run_nr", envId: "env_a", orgId: "org_1", payload: "{}" });
+
+        const r = await drainer.runOnce();
+        expect(r.failed).toBe(1);
+        expect(terminalCalls).toHaveLength(1);
+        expect(terminalCalls[0]).toEqual({ cause: "non-retryable", attempts: 1 });
+        expect(await buffer.getEntry("run_nr")).toBeNull();
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "callback throwing a retryable error requeues instead of failing",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        ...noopOptions,
+      });
+
+      // Handler always fails. `isRetryable` below returns true, so the
+      // entry requeues until maxAttempts and only then hits onTerminalFailure.
+      const handler = async () => {
+        throw new Error("validation failure");
+      };
+
+      let callbackInvocations = 0;
+      const drainer = new MollifierDrainer({
+        buffer,
+        handler,
+        onTerminalFailure: async () => {
+          callbackInvocations++;
+          // Simulate PG still unreachable when we try to write the
+          // SYSTEM_FAILURE row — drainer should requeue, not fail.
+          const err: Error & { code?: string } = new Error("Can't reach database server");
+          err.code = "P1001";
+          throw err;
+        },
+        concurrency: 1,
+        maxAttempts: 3,
+        // `isRetryable: () => true` classifies BOTH the handler's
+        // `validation failure` and the callback's `P1001` as retryable
+        // (callers like the webapp use a narrower predicate). The handler
+        // error therefore only turns terminal once maxAttempts is
+        // exhausted (tick 3 below); the callback's throw is then seen as
+        // retryable, so the drainer requeues instead of calling fail().
+        isRetryable: () => true,
+        logger: new Logger("test-drainer", "log"),
+      });
+
+      try {
+        await buffer.accept({ runId: "run_cb_retry", envId: "env_a", orgId: "org_1", payload: "{}" });
+
+        // Tick 1: handler throws → attempts=1 < maxAttempts=3 → requeue
+        // (no callback invocation, retryable path).
+        const r1 = await drainer.runOnce();
+        expect(r1.failed).toBe(1);
+        expect(callbackInvocations).toBe(0);
+        const after1 = await buffer.getEntry("run_cb_retry");
+        expect(after1!.status).toBe("QUEUED");
+        expect(after1!.attempts).toBe(1);
+
+        // Tick 2: handler throws → attempts=2 < 3 → requeue again.
+        const r2 = await drainer.runOnce();
+        expect(r2.failed).toBe(1);
+        expect(callbackInvocations).toBe(0);
+
+        // Tick 3: handler throws → attempts=3 (the nextAttempts check is
+        // `< maxAttempts`, so 3 < 3 is false) → terminal. Callback throws
+        // retryable → drainer requeues instead of fail(). Entry survives.
+        const r3 = await drainer.runOnce();
+        expect(r3.failed).toBe(1);
+        expect(callbackInvocations).toBe(1);
+        const after3 = await buffer.getEntry("run_cb_retry");
+        expect(after3).not.toBeNull();
+        expect(after3!.status).toBe("QUEUED");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "callback throwing a non-retryable error falls through to buffer.fail()",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        ...noopOptions,
+      });
+
+      const handler = async () => {
+        throw new Error("validation failure");
+      };
+
+      const drainer = new MollifierDrainer({
+        buffer,
+        handler,
+        onTerminalFailure: async () => {
+          // Genuinely bad write (e.g. snapshot too malformed to insert).
+          // Drainer must NOT loop on this — falls through to buffer.fail.
+          throw new Error("malformed snapshot");
+        },
+        concurrency: 1,
+        maxAttempts: 3,
+        isRetryable: () => false,
+        logger: new Logger("test-drainer", "log"),
+      });
+
+      try {
+        await buffer.accept({ runId: "run_cb_dead", envId: "env_a", orgId: "org_1", payload: "{}" });
+
+        const r = await drainer.runOnce();
+        expect(r.failed).toBe(1);
+        // Entry was failed despite the callback throwing — the
+        // non-retryable branch of the callback-error guard sends it to
+        // buffer.fail so a poisoned run can't loop forever.
+        expect(await buffer.getEntry("run_cb_dead")).toBeNull();
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
+  redisTest(
+    "no onTerminalFailure provided keeps pre-fix behaviour (buffer.fail with no callback)",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        ...noopOptions,
+      });
+
+      const handler = async () => {
+        throw new Error("validation failure");
+      };
+
+      const drainer = new MollifierDrainer({
+        buffer,
+        handler,
+        // onTerminalFailure intentionally omitted — verifies the option
+        // is genuinely optional and backwards-compatible.
+        concurrency: 1,
+        maxAttempts: 2,
+        isRetryable: () => false,
+        logger: new Logger("test-drainer", "log"),
+      });
+
+      try {
+        await buffer.accept({ runId: "run_no_cb", envId: "env_a", orgId: "org_1", payload: "{}" });
+        const r = await drainer.runOnce();
+        expect(r.failed).toBe(1);
+        expect(await buffer.getEntry("run_no_cb")).toBeNull();
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+});
+
 // Transient Redis errors used to permanently kill the loop because
 // `processOneFromEnv` didn't catch `buffer.pop()` rejections — the error
 // bubbled through `Promise.all` → `runOnce` → `loop`'s outer catch and
@@ -972,7 +1271,7 @@ describe("MollifierDrainer additional coverage", () => {
     // ack() lives inside the same try as the handler call, so if the
     // handler succeeds but ack throws (e.g. transient Redis blip), the
     // entry is routed through the retry/terminal path even though the
-    // handler-side work completed. Phase 2's engine-replay handler will
+    // handler-side work completed. A later engine-replay handler will
     // need idempotency to absorb the re-execution this implies on retry,
     // OR ack should be lifted out of the try block.
     let handlerCalls = 0;
diff --git a/packages/redis-worker/src/mollifier/drainer.ts b/packages/redis-worker/src/mollifier/drainer.ts
index 407b389e14e..20b5ee3ae1f 100644
--- a/packages/redis-worker/src/mollifier/drainer.ts
+++ b/packages/redis-worker/src/mollifier/drainer.ts
@@ -12,9 +12,30 @@ export type MollifierDrainerHandler<TPayload> = (input: {
   createdAt: Date;
 }) => Promise<void>;
 
+// Invoked before `buffer.fail()` on every terminal path — a non-retryable
+// error OR a retryable error after maxAttempts. Lets the caller land a
+// SYSTEM_FAILURE PG row so the customer sees the run instead of it silently
+// disappearing alongside the buffer entry. Throwing a retryable error from
+// the callback causes the drainer to requeue rather than fail (so the PG
+// write gets another chance once PG recovers), which means the callback may
+// run more than once for the same entry. Throwing anything else falls through
+// to `buffer.fail()` to avoid an infinite loop on a genuinely bad payload.
+export type MollifierDrainerTerminalFailureCause = "non-retryable" | "max-attempts-exhausted";
+export type MollifierDrainerTerminalFailureHandler<TPayload> = (input: {
+  runId: string;
+  envId: string;
+  orgId: string;
+  payload: TPayload;
+  attempts: number;
+  createdAt: Date;
+  error: { code: string; message: string };
+  cause: MollifierDrainerTerminalFailureCause;
+}) => Promise<void>;
+
 export type MollifierDrainerOptions<TPayload> = {
   buffer: MollifierBuffer;
   handler: MollifierDrainerHandler<TPayload>;
+  onTerminalFailure?: MollifierDrainerTerminalFailureHandler<TPayload>;
   concurrency: number;
   maxAttempts: number;
   isRetryable: (err: unknown) => boolean;
@@ -42,6 +63,7 @@ export type DrainResult = {
 export class MollifierDrainer<TPayload = unknown> {
   private readonly buffer: MollifierBuffer;
   private readonly handler: MollifierDrainerHandler<TPayload>;
+  private readonly onTerminalFailure?: MollifierDrainerTerminalFailureHandler<TPayload>;
   private readonly maxAttempts: number;
   private readonly isRetryable: (err: unknown) => boolean;
   private readonly pollIntervalMs: number;
@@ -60,6 +82,7 @@ export class MollifierDrainer<TPayload = unknown> {
   constructor(options: MollifierDrainerOptions<TPayload>) {
     this.buffer = options.buffer;
     this.handler = options.handler;
+    this.onTerminalFailure = options.onTerminalFailure;
     this.maxAttempts = options.maxAttempts;
     this.isRetryable = options.isRetryable;
     this.pollIntervalMs = options.pollIntervalMs ?? 100;
@@ -275,13 +298,56 @@ export class MollifierDrainer<TPayload = unknown> {
         });
         return "failed";
       }
+      const cause: MollifierDrainerTerminalFailureCause = this.isRetryable(err)
+        ? "max-attempts-exhausted"
+        : "non-retryable";
       const code = err instanceof Error ? err.name : "Unknown";
       const message = err instanceof Error ? err.message : String(err);
+      // Run the terminal-failure callback BEFORE buffer.fail() so a
+      // SYSTEM_FAILURE PG row can land while the entry is still around to
+      // read from (and so we don't lose the run if the callback's own
+      // write itself needs a retry). If the callback throws a retryable
+      // error, requeue the entry instead of fail()ing — PG is still
+      // unreachable, give it another tick. Any other callback failure
+      // falls through to buffer.fail() so a genuinely bad snapshot
+      // doesn't loop forever.
+      if (this.onTerminalFailure) {
+        try {
+          await this.onTerminalFailure({
+            runId: entry.runId,
+            envId: entry.envId,
+            orgId: entry.orgId,
+            payload: deserialiseSnapshot<TPayload>(entry.payload),
+            attempts: nextAttempts,
+            createdAt: entry.createdAt,
+            error: { code, message },
+            cause,
+          });
+        } catch (writeErr) {
+          if (this.isRetryable(writeErr)) {
+            await this.buffer.requeue(entry.runId);
+            this.logger.warn(
+              "MollifierDrainer: terminal-failure callback retryable; requeued",
+              {
+                runId: entry.runId,
+                attempts: nextAttempts,
+                writeErr,
+              },
+            );
+            return "failed";
+          }
+          this.logger.error("MollifierDrainer: terminal-failure callback failed", {
+            runId: entry.runId,
+            writeErr,
+          });
+        }
+      }
       await this.buffer.fail(entry.runId, { code, message });
       this.logger.error("MollifierDrainer: terminal failure", {
         runId: entry.runId,
         code,
         message,
+        cause,
       });
       return "failed";
     }
diff --git a/packages/redis-worker/src/mollifier/index.ts b/packages/redis-worker/src/mollifier/index.ts
index 5e6fe202e3d..c7875a7d55f 100644
--- a/packages/redis-worker/src/mollifier/index.ts
+++ b/packages/redis-worker/src/mollifier/index.ts
@@ -1,8 +1,21 @@
-export { MollifierBuffer, type MollifierBufferOptions } from "./buffer.js";
+export {
+  MollifierBuffer,
+  type MollifierBufferOptions,
+  type SnapshotPatch,
+  type AcceptResult,
+  type MutateSnapshotResult,
+  type CasSetMetadataResult,
+  type IdempotencyClaimResult,
+  type IdempotencyLookupInput,
+  idempotencyLookupKeyFor,
+  makeIdempotencyClaimKey,
+} from "./buffer.js";
 export {
   MollifierDrainer,
   type MollifierDrainerOptions,
   type MollifierDrainerHandler,
+  type MollifierDrainerTerminalFailureHandler,
+  type MollifierDrainerTerminalFailureCause,
   type DrainResult,
 } from "./drainer.js";
 export {
diff --git a/packages/redis-worker/src/mollifier/schemas.ts b/packages/redis-worker/src/mollifier/schemas.ts
index f93b0f0a3c3..5acd0c7c15d 100644
--- a/packages/redis-worker/src/mollifier/schemas.ts
+++ b/packages/redis-worker/src/mollifier/schemas.ts
@@ -27,6 +27,10 @@ const stringToDate = z.string().transform((v, ctx) => {
   return d;
 });
 
+const stringToBool = z
+  .union([z.literal("true"), z.literal("false")])
+  .transform((v) => v === "true");
+
 const stringToError = z.string().transform((v, ctx) => {
   try {
     return BufferEntryError.parse(JSON.parse(v));
@@ -44,6 +48,27 @@ export const BufferEntrySchema = z.object({
   status: BufferEntryStatus,
   attempts: stringToInt,
   createdAt: stringToDate,
+  // Microsecond epoch of accept time, kept as a hash field for dwell
+  // metrics. Not a queue sort key (the queue is a FIFO LIST). Defaulted
+  // so an entry written by an accept Lua predating this field — or one
+  // surviving across the deploy that introduced it — still parses instead
+  // of being silently dropped on pop.
+  createdAtMicros: stringToInt.default("0"),
+  // Drainer-ack flag: `true` once the drainer has materialised this run
+  // into PG. The hash persists for a short grace TTL after ack so direct
+  // reads (retrieve, trace, etc.) still resolve while PG replica lag
+  // settles. Absent on pre-ack entries.
+  materialised: stringToBool.default("false"),
+  // Denormalised pointer to the Redis idempotency lookup key (set when
+  // the run was accepted with an idempotency key, empty otherwise). The
+  // ack Lua reads this to DEL the lookup atomically with marking the
+  // entry materialised.
+  idempotencyLookupKey: z.string().optional().default(""),
+  // Optimistic-lock counter for the snapshot's `metadata` field.
+  // Incremented atomically by the CAS metadata Lua. Matches the
+  // semantics of `TaskRun.metadataVersion` on the PG side (which the
+  // UpdateMetadataService uses for the same retry-on-conflict pattern).
+  metadataVersion: stringToInt.default("0"),
   lastError: stringToError.optional(),
 });
 
diff --git a/packages/trigger-sdk/src/v3/ai-shared.ts b/packages/trigger-sdk/src/v3/ai-shared.ts
index 35b61910563..a0ea3036cff 100644
--- a/packages/trigger-sdk/src/v3/ai-shared.ts
+++ b/packages/trigger-sdk/src/v3/ai-shared.ts
@@ -16,7 +16,7 @@
  */
 
 import type { Task, AnyTask } from "@trigger.dev/core/v3";
-import type { ModelMessage, UIMessage } from "ai";
+import type { InferUITools, ModelMessage, ToolSet, UIDataTypes, UIMessage } from "ai";
 
 /**
  * Message-part `type` value for the pending-message data part the agent
@@ -199,6 +199,26 @@ export type InferChatUIMessage<TTask extends AnyTask> = TTask extends Task<
   ? TUIM
   : UIMessage;
 
+/**
+ * Derive the chat `UIMessage` type for a given tool set. The tool-part types
+ * (`tool-${name}` with typed input/output) are inferred from the tools. Use
+ * this to declare the message type from your tools (e.g. to pass to
+ * `chat.withUIMessage<...>()` or to type the frontend) without hand-writing
+ * the `UIMessage<unknown, UIDataTypes, InferUITools<...>>` triple.
+ *
+ * @example
+ * ```ts
+ * import type { InferChatUIMessageFromTools } from "@trigger.dev/sdk/ai";
+ * const tools = { search, readFile };
+ * type ChatUiMessage = InferChatUIMessageFromTools<typeof tools>;
+ * ```
+ */
+export type InferChatUIMessageFromTools<TTools extends ToolSet> = UIMessage<
+  unknown,
+  UIDataTypes,
+  InferUITools<TTools>
+>;
+
 /**
  * Upsert an incoming wire message into the customer's DB-backed chain
  * inside a `hydrateMessages` hook. Returns `true` iff the chain was
diff --git a/packages/trigger-sdk/src/v3/ai.ts b/packages/trigger-sdk/src/v3/ai.ts
index d1a6a226023..62aad9d8c57 100644
--- a/packages/trigger-sdk/src/v3/ai.ts
+++ b/packages/trigger-sdk/src/v3/ai.ts
@@ -102,7 +102,16 @@ const METADATA_KEY = "tool.execute.options";
  * stopped/aborted conversations with partial tool parts.
  */
 function toModelMessages(messages: UIMessage[]): Promise<ModelMessage[]> {
-  return convertToModelMessages(messages, { ignoreIncompleteToolCalls: true });
+  // Pass the resolved per-turn `tools` (if any) so the AI SDK can look up each
+  // tool's `toModelOutput` and re-apply it to prior-turn tool results. Without
+  // `tools` it falls back to JSON-stringifying the raw output (TRI-10149). The
+  // conditional spread keeps the options object byte-identical to the no-tools
+  // path when nothing was declared.
+  const tools = locals.get(chatResolvedToolsKey);
+  return convertToModelMessages(messages, {
+    ignoreIncompleteToolCalls: true,
+    ...(tools ? { tools } : {}),
+  });
 }
 
 export type ToolCallExecutionOptions = {
@@ -1425,7 +1434,10 @@ export type ChatTaskSignals = {
  * The full payload passed to a `chatAgent` run function.
  * Extends `ChatTaskPayload` (the wire payload) with abort signals.
  */
-export type ChatTaskRunPayload<TClientData = unknown> = ChatTaskPayload<TClientData> &
+export type ChatTaskRunPayload<
+  TClientData = unknown,
+  TTools extends ToolSet = ToolSet,
+> = ChatTaskPayload<TClientData> &
   ChatTaskSignals & {
     /**
      * Task run context — same object as the `ctx` passed to a standard `task({ run })` handler’s second argument.
@@ -1436,6 +1448,21 @@ export type ChatTaskRunPayload<TClientData = unknown> = ChatTaskPayload<TClientD
     previousTurnUsage?: LanguageModelUsage;
     /** Cumulative token usage across all completed turns so far. */
     totalUsage: LanguageModelUsage;
+    /**
+     * The resolved tool set for this turn, the same `tools` you declared on
+     * `chat.agent({ tools })` (or the result of the per-turn `tools` function).
+     * Pass straight to `streamText({ tools })` so you don't redeclare them:
+     *
+     * ```ts
+     * run: ({ messages, tools, signal }) =>
+     *   streamText({ model, messages, tools, abortSignal: signal })
+     * ```
+     *
+     * Declaring `tools` on the config is also what lets the SDK re-run each
+     * tool's `toModelOutput` when it re-converts prior-turn history (see the
+     * `tools` option on `chat.agent`). Empty object when no `tools` were declared.
+     */
+    tools: TTools;
   };
 
 // Input streams for bidirectional chat communication
@@ -2366,6 +2393,20 @@ const chatPrepareMessagesKey =
   locals.create<(event: PrepareMessagesEvent<unknown>) => ModelMessage[] | Promise<ModelMessage[]>>(
     "chat.prepareMessages"
   );
+/**
+ * @internal The raw `tools` option from `chat.agent({ tools })`, either a
+ * static `ToolSet` or a per-turn function. Set once at boot.
+ */
+const chatToolsOptionKey = locals.create<
+  ToolSet | ((event: ResolveToolsEvent<unknown>) => ToolSet | Promise<ToolSet>)
+>("chat.toolsOption");
+/**
+ * @internal The concrete `ToolSet` resolved for the current turn. Read by
+ * `toModelMessages` so `convertToModelMessages` can re-run `toModelOutput` on
+ * prior-turn tool results. Unset when no `tools` were declared (preserves the
+ * exact pre-feature conversion behavior).
+ */
+const chatResolvedToolsKey = locals.create<ToolSet>("chat.resolvedTools");
 
 /** @internal Flag set by `chat.requestUpgrade()` to exit the loop after the current turn. */
 const chatUpgradeRequestedKey = locals.create<boolean>("chat.upgradeRequested");
@@ -2626,6 +2667,25 @@ export type PrepareMessagesEvent<TClientData = unknown> = {
   clientData?: TClientData;
 };
 
+/**
+ * Event passed to the per-turn `tools` function form on `chat.agent`.
+ *
+ * Use this when the active tool set depends on per-turn context (the user, a
+ * feature flag, etc.). The returned `ToolSet` converts this turn's history and
+ * is also passed to `run()` as `tools`. Conversion reads only `inputSchema` and
+ * `toModelOutput`, so omit `execute` only if `run()` doesn't need it either.
+ */
+export type ResolveToolsEvent<TClientData = unknown> = {
+  /** The chat session ID. */
+  chatId: string;
+  /** The current turn number (0-indexed). */
+  turn: number;
+  /** Whether this run is continuing an existing chat. */
+  continuation: boolean;
+  /** Custom data from the frontend. */
+  clientData?: TClientData;
+};
+
 /**
  * Data shape for `data-compaction` stream chunks emitted during compaction.
  * Use to type the `data` field when rendering compaction parts in the frontend.
@@ -2800,6 +2860,41 @@ async function applyPrepareMessages(
   );
 }
 
+/**
+ * Resolve the `tools` option into a concrete `ToolSet` and cache it in locals so
+ * `toModelMessages` can pass it to `convertToModelMessages`. For the function
+ * form, invokes the user function with the given context (or the current turn
+ * context when no override is passed). Pass an `override` for the boot-time
+ * history conversion, which runs before the per-turn context exists and uses
+ * the run/continuation payload's `clientData`.
+ *
+ * Fails closed: a throwing resolver propagates rather than carrying a prior
+ * turn's set forward. The function form can gate capabilities by user or flag,
+ * so reusing stale tools would leak capabilities. No-op when no `tools` were
+ * declared.
+ * @internal
+ */
+async function resolveTurnTools(
+  override?: { chatId: string; turn: number; continuation: boolean; clientData: unknown }
+): Promise<void> {
+  const option = locals.get(chatToolsOptionKey);
+  if (!option) return;
+
+  if (typeof option !== "function") {
+    locals.set(chatResolvedToolsKey, option);
+    return;
+  }
+
+  const ctx = override ?? locals.get(chatTurnContextKey);
+  const resolved = await option({
+    chatId: ctx?.chatId ?? "",
+    turn: ctx?.turn ?? 0,
+    continuation: ctx?.continuation ?? false,
+    clientData: ctx?.clientData,
+  });
+  locals.set(chatResolvedToolsKey, resolved);
+}
+
 /**
  * Read the current compaction state. Returns the summary and base message count
  * if compaction has occurred in this turn, or `undefined` if not.
@@ -4250,6 +4345,7 @@ export type ChatAgentOptions<
   TClientDataSchema extends TaskSchema | undefined = undefined,
   TUIMessage extends UIMessage = UIMessage,
   TActionSchema extends TaskSchema | undefined = undefined,
+  TTools extends ToolSet = ToolSet,
 > = Omit<
   TaskOptions<
     TIdentifier,
@@ -4360,6 +4456,41 @@ export type ChatAgentOptions<
     >
   ) => Promise<unknown> | unknown;
 
+  /**
+   * The tools available to this agent.
+   *
+   * `chat.agent` doesn't call the model for you. Your tools still go to
+   * `streamText({ tools })` inside `run()`. Declaring them here additionally
+   * lets the SDK re-run each tool's
+   * [`toModelOutput`](https://ai-sdk.dev/docs/ai-sdk-core/tools-and-tool-calling#tomodeloutput)
+   * when it re-converts persisted history on later turns. Without this, the
+   * AI SDK has no `tools` to look up `toModelOutput` against, so a tool's
+   * transformed result (e.g. raw image bytes → an image content part, or a
+   * sub-agent summary) silently degrades to its raw JSON output from turn 2
+   * onward.
+   *
+   * Only `inputSchema` and `toModelOutput` are read during conversion (never
+   * `execute`), so you may pass a lightweight map if you keep heavy execute
+   * deps out of this module.
+   *
+   * Pass either a static `ToolSet` or a function of per-turn context (for
+   * tools that depend on the user, a feature flag, etc.). The resolved set is
+   * available on the `run()` payload as `tools`.
+   *
+   * @example
+   * ```ts
+   * const tools = { read_file, search };
+   * chat.agent({
+   *   tools,
+   *   run: async ({ messages, tools, signal }) =>
+   *     streamText({ model, messages, tools, abortSignal: signal }),
+   * });
+   * ```
+   */
+  tools?:
+    | TTools
+    | ((event: ResolveToolsEvent<inferSchemaOut<TClientDataSchema>>) => TTools | Promise<TTools>);
+
   /**
    * The run function for the chat task.
    *
@@ -4370,7 +4501,9 @@ export type ChatAgentOptions<
    * **Auto-piping:** If this function returns a value with `.toUIMessageStream()`,
    * the stream is automatically piped to the frontend.
    */
-  run: (payload: ChatTaskRunPayload<inferSchemaOut<TClientDataSchema>>) => Promise<unknown>;
+  run: (
+    payload: ChatTaskRunPayload<inferSchemaOut<TClientDataSchema>, TTools>
+  ) => Promise<unknown>;
 
   /**
    * Called once at the start of every run boot — for the initial run, for
@@ -4951,8 +5084,9 @@ function chatAgent<
   TClientDataSchema extends TaskSchema | undefined = undefined,
   TUIMessage extends UIMessage = UIMessage,
   TActionSchema extends TaskSchema | undefined = undefined,
+  TTools extends ToolSet = ToolSet,
 >(
-  options: ChatAgentOptions<TIdentifier, TClientDataSchema, TUIMessage, TActionSchema>
+  options: ChatAgentOptions<TIdentifier, TClientDataSchema, TUIMessage, TActionSchema, TTools>
 ): Task<TIdentifier, ChatTaskWirePayload<TUIMessage, inferSchemaIn<TClientDataSchema>>, unknown> {
   const {
     run: userRun,
@@ -4971,6 +5105,7 @@ function chatAgent<
     compaction,
     pendingMessages: pendingMessagesConfig,
     prepareMessages,
+    tools: toolsOption,
     onTurnComplete,
     maxTurns = 100,
     turnTimeout = "1h",
@@ -5049,6 +5184,25 @@ function chatAgent<
         locals.set(chatPrepareMessagesKey, prepareMessages);
       }
 
+      if (toolsOption) {
+        // Cast: the option's function form is typed against the parsed
+        // `clientData` (`ResolveToolsEvent<inferSchemaOut<...>>`), but the
+        // locals key uses the erased `ResolveToolsEvent<unknown>`. The runtime
+        // value is identical; this mirrors how `prepareMessages` is stored.
+        locals.set(
+          chatToolsOptionKey,
+          toolsOption as
+            | ToolSet
+            | ((event: ResolveToolsEvent<unknown>) => ToolSet | Promise<ToolSet>)
+        );
+        // Static tools are usable immediately. The function form is resolved
+        // just before the boot history conversion (with the payload's
+        // clientData) and again per-turn (see resolveTurnTools).
+        if (typeof toolsOption !== "function") {
+          locals.set(chatResolvedToolsKey, toolsOption);
+        }
+      }
+
       if (compaction) {
         locals.set(
           chatAgentCompactionKey,
@@ -5438,6 +5592,29 @@ function chatAgent<
         }
 
         if (accumulatedUIMessages.length > 0) {
+          // Resolve a function-form `tools` with the run/continuation payload's
+          // clientData so this conversion of the restored history applies each
+          // tool's toModelOutput (static tools were already seeded above). This
+          // only re-renders saved history, so it fails open: a resolver hiccup
+          // logs and converts without tools rather than blocking the resume.
+          // Per-turn resolveTurnTools still fails closed for live turns.
+          if (typeof toolsOption === "function") {
+            try {
+              await resolveTurnTools({
+                chatId: payload.chatId,
+                turn: 0,
+                continuation: payload.continuation ?? false,
+                clientData: parseClientData
+                  ? await parseClientData(payload.metadata)
+                  : payload.metadata,
+              });
+            } catch (error) {
+              logger.warn(
+                "chat.agent: tools() resolver threw at boot; restored history converted without toModelOutput",
+                { error: error instanceof Error ? error.message : String(error) }
+              );
+            }
+          }
           try {
             accumulatedMessages = await toModelMessages(accumulatedUIMessages);
           } catch (error) {
@@ -5958,6 +6135,11 @@ function chatAgent<
                     clientData,
                   });
 
+                  // Resolve the per-turn `tools` set now that turn context
+                  // (incl. parsed clientData) exists, so every toModelMessages
+                  // call this turn can re-apply tool `toModelOutput`.
+                  await resolveTurnTools();
+
                   // Per-turn stop controller (reset each turn)
                   const stopController = new AbortController();
                   currentStopController = stopController;
@@ -6613,6 +6795,7 @@ function chatAgent<
                         previousTurnUsage,
                         totalUsage: cumulativeUsage,
                         ctx,
+                        tools: locals.get(chatResolvedToolsKey) ?? {},
                         signal: combinedSignal,
                         cancelSignal,
                         stopSignal,
@@ -7512,11 +7695,11 @@ export interface ChatBuilder<
    * (backwards compatible).
    */
   agent: [TClientDataSchema] extends [undefined]
-  ? <TId extends string, TInfer extends TaskSchema | undefined = undefined, TAction extends TaskSchema | undefined = undefined>(
-    options: ChatAgentOptions<TId, TInfer, TUIMessage, TAction>
+  ? <TId extends string, TInfer extends TaskSchema | undefined = undefined, TAction extends TaskSchema | undefined = undefined, TTools extends ToolSet = ToolSet>(
+    options: ChatAgentOptions<TId, TInfer, TUIMessage, TAction, TTools>
   ) => Task<TId, ChatTaskWirePayload<TUIMessage, inferSchemaIn<TInfer>>, unknown>
-  : <TId extends string, TAction extends TaskSchema | undefined = undefined>(
-    options: Omit<ChatAgentOptions<TId, TClientDataSchema, TUIMessage, TAction>, "clientDataSchema">
+  : <TId extends string, TAction extends TaskSchema | undefined = undefined, TTools extends ToolSet = ToolSet>(
+    options: Omit<ChatAgentOptions<TId, TClientDataSchema, TUIMessage, TAction, TTools>, "clientDataSchema">
   ) => Task<TId, ChatTaskWirePayload<TUIMessage, inferSchemaIn<TClientDataSchema>>, unknown>;
 
   /**
@@ -9145,7 +9328,11 @@ function chatLocal<T extends Record<string, unknown>>(options: { id: string }):
 // the browser graph. Re-exported here so `@trigger.dev/sdk/ai` consumers
 // still see them.
 import type { InferChatClientData, InferChatUIMessage } from "./ai-shared.js";
-export type { InferChatClientData, InferChatUIMessage } from "./ai-shared.js";
+export type {
+  InferChatClientData,
+  InferChatUIMessage,
+  InferChatUIMessageFromTools,
+} from "./ai-shared.js";
 
 /**
  * Options for {@link createChatStartSessionAction}.
diff --git a/packages/trigger-sdk/test/mockChatAgent.test.ts b/packages/trigger-sdk/test/mockChatAgent.test.ts
index 3832e64b848..7245622b6e4 100644
--- a/packages/trigger-sdk/test/mockChatAgent.test.ts
+++ b/packages/trigger-sdk/test/mockChatAgent.test.ts
@@ -2082,3 +2082,139 @@ describe("mockChatAgent", () => {
     });
   });
 });
+
+describe("mockChatAgent tools / toModelOutput (TRI-10149)", () => {
+  // A tool whose raw `execute`/output never contains the marker; the marker
+  // lives ONLY in `toModelOutput`. If the SDK re-converts prior-turn history
+  // without threading tools, `toModelOutput` is skipped and the marker is lost.
+  const makeVault = () =>
+    tool({
+      description: "Vault.",
+      inputSchema: z.object({}),
+      toModelOutput: () => ({ type: "text" as const, value: "MARKER-XYZ" }),
+    });
+
+  // Seed a prior assistant turn that already carries a resolved vault tool
+  // result whose raw output has NO marker.
+  const seedAssistantWithToolResult = {
+    id: "a-vault",
+    role: "assistant" as const,
+    parts: [
+      {
+        type: "tool-vault" as const,
+        toolCallId: "tc_vault",
+        state: "output-available" as const,
+        input: {},
+        output: { bytes: "raw-no-marker" },
+      },
+    ],
+  };
+
+  it("re-applies tool.toModelOutput when re-converting prior-turn history (static tools)", async () => {
+    const vault = makeVault();
+    const model = new MockLanguageModelV3({
+      doStream: async () => ({ stream: textStream("ok") }),
+    });
+
+    const seenToolResults: string[] = [];
+    const agent = chat.agent({
+      id: "mockChatAgent.toModelOutput-static",
+      tools: { vault },
+      run: async ({ messages, tools, signal }) => {
+        // REUSE A: `tools` is threaded onto the run payload (typed concretely,
+        // not the broad `ToolSet`). The static-form type inference is validated
+        // end-to-end by the references/ai-chat typecheck; here we exercise the
+        // runtime behavior. (test/ is not part of the package's tsc pass.)
+        for (const m of messages) {
+          if (m.role === "tool") seenToolResults.push(JSON.stringify(m.content));
+        }
+        return streamText({ model, messages, tools, abortSignal: signal });
+      },
+    });
+
+    const harness = mockChatAgent(agent, { chatId: "test-tmo-static" });
+    try {
+      // Turn 1 seeds the tool result; turn 2 forces a re-conversion of history.
+      await harness.sendMessage(seedAssistantWithToolResult as any);
+      await harness.sendMessage(userMessage("recall"));
+      await new Promise((r) => setTimeout(r, 20));
+
+      const all = seenToolResults.join("|");
+      // toModelOutput ran → transformed value present, raw output gone.
+      expect(all).toContain("MARKER-XYZ");
+      expect(all).not.toContain("raw-no-marker");
+    } finally {
+      await harness.close();
+    }
+  });
+
+  it("resolves the per-turn function form of tools and re-applies toModelOutput", async () => {
+    const vault = makeVault();
+    const model = new MockLanguageModelV3({
+      doStream: async () => ({ stream: textStream("ok") }),
+    });
+
+    const seenToolResults: string[] = [];
+    let resolverCalls = 0;
+    const agent = chat.agent({
+      id: "mockChatAgent.toModelOutput-fn",
+      tools: () => {
+        resolverCalls++;
+        return { vault };
+      },
+      run: async ({ messages, tools, signal }) => {
+        for (const m of messages) {
+          if (m.role === "tool") seenToolResults.push(JSON.stringify(m.content));
+        }
+        return streamText({ model, messages, tools, abortSignal: signal });
+      },
+    });
+
+    const harness = mockChatAgent(agent, { chatId: "test-tmo-fn" });
+    try {
+      await harness.sendMessage(seedAssistantWithToolResult as any);
+      await harness.sendMessage(userMessage("recall"));
+      await new Promise((r) => setTimeout(r, 20));
+
+      const all = seenToolResults.join("|");
+      expect(all).toContain("MARKER-XYZ");
+      expect(all).not.toContain("raw-no-marker");
+      // The resolver runs per turn (once each), not per conversion call.
+      expect(resolverCalls).toBeGreaterThanOrEqual(2);
+    } finally {
+      await harness.close();
+    }
+  });
+
+  it("leaves conversion unchanged when no tools are declared (raw output passes through)", async () => {
+    const model = new MockLanguageModelV3({
+      doStream: async () => ({ stream: textStream("ok") }),
+    });
+
+    const seenToolResults: string[] = [];
+    const agent = chat.agent({
+      id: "mockChatAgent.toModelOutput-none",
+      run: async ({ messages, signal }) => {
+        for (const m of messages) {
+          if (m.role === "tool") seenToolResults.push(JSON.stringify(m.content));
+        }
+        return streamText({ model, messages, abortSignal: signal });
+      },
+    });
+
+    const harness = mockChatAgent(agent, { chatId: "test-tmo-none" });
+    try {
+      await harness.sendMessage(seedAssistantWithToolResult as any);
+      await harness.sendMessage(userMessage("recall"));
+      await new Promise((r) => setTimeout(r, 20));
+
+      // No tools declared → no toModelOutput lookup → raw output stringified
+      // (the pre-feature behavior, preserved for backward compatibility).
+      const all = seenToolResults.join("|");
+      expect(all).toContain("raw-no-marker");
+      expect(all).not.toContain("MARKER-XYZ");
+    } finally {
+      await harness.close();
+    }
+  });
+});
diff --git a/references/ai-chat/src/components/chat-sidebar.tsx b/references/ai-chat/src/components/chat-sidebar.tsx
index 9707b61ac36..f49e20ec2d2 100644
--- a/references/ai-chat/src/components/chat-sidebar.tsx
+++ b/references/ai-chat/src/components/chat-sidebar.tsx
@@ -119,6 +119,7 @@ export function ChatSidebar({
             <option value="upgrade-test">upgrade-test (requestUpgrade after 3 turns)</option>
             <option value="stress-emit">stress-emit (UI stress test)</option>
             <option value="cf-trust-test">cf-trust-test (Cloudflare proxy trust)</option>
+            <option value="tool-model-output-test">tool-model-output-test (toModelOutput cross-turn)</option>
           </select>
         </div>
         <label
diff --git a/references/ai-chat/src/lib/chat-tools-schemas.ts b/references/ai-chat/src/lib/chat-tools-schemas.ts
index 2def5f68553..bfd1a86d4b2 100644
--- a/references/ai-chat/src/lib/chat-tools-schemas.ts
+++ b/references/ai-chat/src/lib/chat-tools-schemas.ts
@@ -29,6 +29,7 @@
  */
 import { tool } from "ai";
 import type { InferUITools, UIDataTypes, UIMessage } from "ai";
+import type { InferChatUIMessageFromTools } from "@trigger.dev/sdk/ai";
 import { z } from "zod";
 
 export const inspectEnvironment = tool({
@@ -186,4 +187,5 @@ export const headStartTools = {
 
 type ChatToolSet = typeof headStartTools;
 export type ChatUiTools = InferUITools<ChatToolSet>;
-export type ChatUiMessage = UIMessage<unknown, UIDataTypes, ChatUiTools>;
+// Equivalent to `UIMessage<unknown, UIDataTypes, ChatUiTools>`, via the SDK helper.
+export type ChatUiMessage = InferChatUIMessageFromTools<ChatToolSet>;
diff --git a/references/ai-chat/src/trigger/chat.ts b/references/ai-chat/src/trigger/chat.ts
index 3f2e5900480..00f27465086 100644
--- a/references/ai-chat/src/trigger/chat.ts
+++ b/references/ai-chat/src/trigger/chat.ts
@@ -10,7 +10,8 @@ import {
   createProviderRegistry,
   validateUIMessages,
 } from "ai";
-import type { LanguageModel, LanguageModelUsage, UIMessage } from "ai";
+import type { LanguageModel, LanguageModelUsage, ModelMessage, UIMessage } from "ai";
+import { tool } from "ai";
 import { openai } from "@ai-sdk/openai";
 import { anthropic } from "@ai-sdk/anthropic";
 import { z } from "zod";
@@ -184,6 +185,12 @@ export const aiChat = chat
     idleTimeoutInSeconds: 60,
     chatAccessTokenTTL: "1h",
 
+    // Declare tools on the config: the SDK threads them into its internal
+    // convertToModelMessages, so each tool's `toModelOutput` is re-applied when
+    // prior-turn history is re-converted. The resolved set comes back, typed,
+    // on the run payload (used below instead of referencing `chatTools` again).
+    tools: chatTools,
+
     // #region Compaction — automatic context window management
     compaction: {
       shouldCompact: ({ totalTokens }) => (totalTokens ?? 0) > COMPACT_AFTER_TOKENS,
@@ -537,7 +544,7 @@ export const aiChat = chat
     // #endregion
 
     // #region run — just return streamText(), chat.agent handles everything else
-    run: async ({ messages, clientData, stopSignal }) => {
+    run: async ({ messages, clientData, stopSignal, tools }) => {
       userContext.messageCount++;
       if (clientData?.model) {
         userContext.preferredModel = clientData.model;
@@ -550,7 +557,8 @@ export const aiChat = chat
         ...chat.toStreamTextOptions({
           registry,
           telemetry: clientData?.userId ? { userId: clientData.userId } : undefined,
-          tools: chatTools,
+          // `tools` is the same `chatTools` set, handed back typed on the payload.
+          tools,
         }),
         model: languageModelForChatTurn(modelOverride),
         messages: messages,
@@ -1074,3 +1082,114 @@ export const cfTrustTestAgent = chat
       });
     },
   });
+
+// ============================================================================
+// tool-model-output-test: TRI-10149 regression
+//
+// A tool whose `toModelOutput` rewrites the result into a marker phrase that
+// the *raw* tool output never contains. The model only ever learns the
+// codeword through `toModelOutput`.
+//
+//   Turn 1: the model calls `vault`, sees the codeword via `toModelOutput`,
+//           but is told to reply with exactly "ACK", so the codeword never
+//           enters the assistant's text. The tool result is its only home in
+//           the persisted history.
+//   Turn 2: the model is asked to recall the codeword. The SDK re-converts the
+//           accumulated UIMessage history into the `messages: ModelMessage[]`
+//           handed to `run()`. If `tools` is threaded into that internal
+//           `convertToModelMessages` call, `toModelOutput` runs again and the
+//           model (and the `messages` we receive) see "GIRAFFE-7731". If not
+//           (the bug), the raw output is JSON-stringified and the codeword is
+//           gone.
+//
+// `run()` inspects its OWN incoming `messages` each turn and logs whether the
+// prior-turn tool result still carries the marker. That log is the
+// deterministic, model-independent assertion point (this array is the literal
+// output of the `toModelMessages` wrapper under test). The model's turn-2
+// recall is a secondary, user-facing signal.
+// ============================================================================
+
+const VAULT_CODEWORD = "GIRAFFE-7731";
+// Test tool: `execute` output omits the codeword; only `toModelOutput` text contains it.
+const vaultTool = tool({
+  description:
+    "Open the vault. You MUST call this tool to learn the codeword. The raw " +
+    "tool result is opaque bytes; only your model-side view reveals the codeword.",
+  inputSchema: z.object({}),
+  // Raw output deliberately omits the codeword. This is what streams to the
+  // frontend AND what gets JSON-stringified into the prompt when the history
+  // is re-converted WITHOUT tools (the bug this test guards against).
+  execute: async () => ({
+    kind: "vault-blob",
+    bytes: "9f3a8c1d7e2b40960aa5510fbe33cc77",
+    note: "raw vault bytes, not human readable",
+  }),
+  // Model-side view: the ONLY place the codeword appears. Skipped on turn 2+
+  // unless the SDK threads `tools` through its internal convertToModelMessages.
+  toModelOutput: () => ({
+    type: "text" as const,
+    value: `VAULT CONTENTS: the codeword is ${VAULT_CODEWORD}.`,
+  }),
+});
+
+/**
+ * Emit one log entry per `tool`-role message in `messages`, recording
+ * whether its JSON-serialized content contains `VAULT_CODEWORD` (plus the
+ * first 500 characters of that JSON). `run()` passes in the `messages` it
+ * received, so `containsCodeword` is the per-turn TRI-10149 verdict.
+ */
+function logVaultProbe(messages: ModelMessage[]) {
+  const toolMessages = messages.filter((msg) => msg.role === "tool");
+  toolMessages.forEach((msg) => {
+    const json = JSON.stringify(msg.content);
+    logger.info("tool-model-output-test: incoming tool result", {
+      messageCount: messages.length,
+      containsCodeword: json.includes(VAULT_CODEWORD),
+      serialized: json.slice(0, 500),
+    });
+  });
+}
+// System prompt shared by both tool-model-output test agents below.
+const vaultSystemPrompt =
+  "You are a vault assistant. Follow the user's formatting instructions exactly. " +
+  "When the user asks for the codeword, answer with it directly.";
+
+export const toolModelOutputTest = chat.agent({
+  id: "tool-model-output-test",
+  idleTimeoutInSeconds: 60,
+  // TRI-10149: config-level `tools` feed the SDK's internal convertToModelMessages, so
+  // `toModelOutput` re-runs on prior-turn history; `run` receives the same set, typed.
+  tools: { vault: vaultTool },
+  run: async (turn) => {
+    const { messages, tools, signal } = turn;
+    logVaultProbe(messages);
+    return streamText({
+      model: openai("gpt-4o-mini"),
+      system: vaultSystemPrompt,
+      messages,
+      tools,
+      stopWhen: stepCountIs(5),
+      abortSignal: signal,
+    });
+  },
+});
+
+// Same test via the function form of `tools` (resolver path): resolved per turn, and
+// at boot with clientData so a continuation's restored history gets toModelOutput too.
+export const toolModelOutputFnTest = chat.agent({
+  id: "tool-model-output-fn-test",
+  idleTimeoutInSeconds: 60,
+  tools: () => ({ vault: vaultTool }),
+  run: async (turn) => {
+    const { messages, tools, signal } = turn;
+    logVaultProbe(messages);
+    return streamText({
+      model: openai("gpt-4o-mini"),
+      system: vaultSystemPrompt,
+      messages,
+      tools,
+      stopWhen: stepCountIs(5),
+      abortSignal: signal,
+    });
+  },
+});