diff --git a/.changeset/chat-agent-tools.md b/.changeset/chat-agent-tools.md new file mode 100644 index 00000000000..1d44ea2a659 --- /dev/null +++ b/.changeset/chat-agent-tools.md @@ -0,0 +1,15 @@ +--- +"@trigger.dev/sdk": patch +--- + +Add a `tools` option to `chat.agent`. Declaring your tools here threads them into the SDK's internal `convertToModelMessages`, so each tool's `toModelOutput` is re-applied when prior-turn history is re-converted. + +```ts +chat.agent({ + tools: { readFile, search }, + run: async ({ messages, tools, signal }) => + streamText({ model, messages, tools, abortSignal: signal }), +}); +``` + +Also exports `InferChatUIMessageFromTools` to derive the chat `UIMessage` type (typed tool parts) directly from a tool set. diff --git a/.changeset/coerce-concurrency-key-to-string.md b/.changeset/coerce-concurrency-key-to-string.md new file mode 100644 index 00000000000..faccf7a48bf --- /dev/null +++ b/.changeset/coerce-concurrency-key-to-string.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/core": patch +--- + +Coerce numeric `concurrencyKey` values to string at the API boundary across `tasks.trigger`, `tasks.batchTrigger`, and the Phase-2 streaming batch endpoint. diff --git a/.changeset/mollifier-buffer-extensions.md b/.changeset/mollifier-buffer-extensions.md new file mode 100644 index 00000000000..c2a3b1a0e8e --- /dev/null +++ b/.changeset/mollifier-buffer-extensions.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/redis-worker": minor +--- + +Mollifier buffer extensions: idempotency dedup, an atomic `mutateSnapshot` API, metadata CAS, claim primitives, and a `MollifierSnapshot` type. The buffer's Redis client now reconnects with jittered backoff so a fleet of clients doesn't stampede Redis in lockstep after a blip. 
diff --git a/.changeset/mollifier-buffer-pipeline-list-entries.md b/.changeset/mollifier-buffer-pipeline-list-entries.md new file mode 100644 index 00000000000..2c55d9b18a8 --- /dev/null +++ b/.changeset/mollifier-buffer-pipeline-list-entries.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/redis-worker": patch +--- + +Pipeline the per-entry `HGETALL` fetches in `MollifierBuffer.listEntriesForEnv`. The previous serial implementation issued one Redis round-trip per runId returned by `LRANGE`, which dominated stale-sweep wall-time at any meaningful backlog (at the sweep's default maxCount=1000, this is ~1000 RTTs per env per pass). Behaviour is unchanged — entries are still skipped when the entry hash has been torn down by a concurrent drainer ack/fail between the LRANGE and the HGETALL. diff --git a/.changeset/mollifier-drainer-terminal-failure-callback.md b/.changeset/mollifier-drainer-terminal-failure-callback.md new file mode 100644 index 00000000000..e0ac3400ff3 --- /dev/null +++ b/.changeset/mollifier-drainer-terminal-failure-callback.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/redis-worker": minor +--- + +Add `onTerminalFailure` callback to `MollifierDrainerOptions` so the customer's run lands a SYSTEM_FAILURE PG row even when the drainer exhausts `maxAttempts` on a retryable PG error. Previously, retryable-error exhaustion called `buffer.fail()` directly, which atomically marks FAILED + DELs the entry hash with no PG write — silent data loss when PG was unreachable across the full retry budget. The callback fires before `buffer.fail()` on any terminal path (`cause: "non-retryable"` or `"max-attempts-exhausted"`); throwing a retryable error from the callback causes the drainer to requeue rather than fail. 
diff --git a/.changeset/mollifier-tag-cap.md b/.changeset/mollifier-tag-cap.md new file mode 100644 index 00000000000..b9057664fa7 --- /dev/null +++ b/.changeset/mollifier-tag-cap.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/redis-worker": patch +--- + +Mollifier `mutateSnapshot` now enforces a tag cap: an `append_tags` patch carrying `maxTags` returns `"limit_exceeded"` (writing nothing) when the deduped tag count would exceed the limit, so a buffered run can't accumulate more tags via the tags API than the trigger validator allows at creation. diff --git a/.server-changes/mollifier-dashboard.md b/.server-changes/mollifier-dashboard.md new file mode 100644 index 00000000000..1aad107063e --- /dev/null +++ b/.server-changes/mollifier-dashboard.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Mollifier dashboard surface: run-detail page renders buffered runs via synthetic trace, header, and span shapes; admin-only "Buffered" indicator and drainer LOG event in the trace tree. diff --git a/.server-changes/mollifier-drainer-replay.md b/.server-changes/mollifier-drainer-replay.md new file mode 100644 index 00000000000..fb2c9dd37bc --- /dev/null +++ b/.server-changes/mollifier-drainer-replay.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Mollifier drainer replay: replay buffered entries into `engine.trigger`, stale-entry sweep, a drainer-health gauge, and run-engine cancelled/failed run APIs. Known limitation: stale-sweep runs per-webapp instance, so stale-entry counter metrics multiply by N webapps in HA until a distributed lease lands as follow-up. diff --git a/.server-changes/mollifier-mutations.md b/.server-changes/mollifier-mutations.md new file mode 100644 index 00000000000..d0d5a969cbc --- /dev/null +++ b/.server-changes/mollifier-mutations.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Mollifier API mutations on buffered runs: tag, metadata, replay, reschedule, cancel, and idempotency-key reset via a buffer-snapshot fallback. 
When a mutation races a mid-drain run, the wait-and-bounce loop watches the buffer entry in Redis (cheap) and reads the primary exactly once for the actual mutation, instead of polling the writer on a fixed cadence; polls use jittered exponential backoff. diff --git a/.server-changes/mollifier-reads.md b/.server-changes/mollifier-reads.md new file mode 100644 index 00000000000..320310be1ee --- /dev/null +++ b/.server-changes/mollifier-reads.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Mollifier API read-fallback: serve buffered runs from synthetic run/trace/span data on the retrieve, trace, spans, and events endpoints. diff --git a/.server-changes/mollifier-trigger.md b/.server-changes/mollifier-trigger.md new file mode 100644 index 00000000000..a289972ef87 --- /dev/null +++ b/.server-changes/mollifier-trigger.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Mollifier trigger-time decisions: gate `engine.trigger`, mollify bursts into the buffer, claim idempotency keys, and read-fallback for buffered runs. diff --git a/apps/webapp/app/components/runs/v3/CancelRunDialog.tsx b/apps/webapp/app/components/runs/v3/CancelRunDialog.tsx index facff746c5e..566bc787daa 100644 --- a/apps/webapp/app/components/runs/v3/CancelRunDialog.tsx +++ b/apps/webapp/app/components/runs/v3/CancelRunDialog.tsx @@ -10,9 +10,18 @@ import { SpinnerWhite } from "~/components/primitives/Spinner"; type CancelRunDialogProps = { runFriendlyId: string; redirectPath: string; + // Fired on submit so the parent can close the Radix Dialog without + // wrapping the submit button in `DialogClose` — that wrapper races + // submit (close fires first, unmounts the form, and the cancel POST + // never lands). Optional so existing call sites still type-check. 
+ onCancelSubmitted?: () => void; }; -export function CancelRunDialog({ runFriendlyId, redirectPath }: CancelRunDialogProps) { +export function CancelRunDialog({ + runFriendlyId, + redirectPath, + onCancelSubmitted, +}: CancelRunDialogProps) { const navigation = useNavigation(); const formAction = `/resources/taskruns/${runFriendlyId}/cancel`; @@ -27,7 +36,11 @@ export function CancelRunDialog({ runFriendlyId, redirectPath }: CancelRunDialog +
 onCancelSubmitted?.()} + > - - - + )} @@ -587,6 +691,35 @@ function TraceView({ ); } +// Controlled wrapper around the cancel dialog. Owns the Radix open state +// so the dialog closes itself once the cancel action transitions through +// submission. We can't `<DialogClose>`-wrap the submit button +// because Radix's onClick handler swallows the button's name=value pair +// that the form action depends on for `redirectUrl`. +function ControlledCancelRunDialog({ + runFriendlyId, + redirectPath, +}: { + runFriendlyId: string; + redirectPath: string; +}) { + const [open, setOpen] = useState(false); + return ( + + + + + setOpen(false)} + /> + + ); +} + function NoLogsView({ run, resizable }: Pick) { const plan = useCurrentPlan(); const organization = useOrganization(); @@ -616,6 +749,11 @@ function NoLogsView({ run, resizable }: Pick) { >
{daysSinceCompleted === undefined ? ( + // NoLogsView only renders when the loader returns no trace. + // Buffered runs always carry a synthetic trace (see + // buildSyntheticTraceForBufferedRun) so they never reach + // this branch — the message here is the pre-mollifier + // copy for runs with no completedAt and no logs. We tidy up older logs to keep things running smoothly. diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.events.ts b/apps/webapp/app/routes/api.v1.runs.$runId.events.ts index bfa3cab971b..42468f67604 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.events.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.events.ts @@ -38,6 +38,16 @@ export const loader = createLoaderApiRoute( }, }, async ({ resource: run, authentication }) => { + // Short-circuit for mollifier-buffered runs. The drainer hasn't + // materialised execution events yet (the gate intercepts before + // any trace event is written), so a ClickHouse round-trip is + // guaranteed to come back empty. `findRun` now sets `isBuffered` + // explicitly on its return value — gate on that rather than + // probing surrogate fields like `traceId === ""`. 
+ if (run.isBuffered) { + return json({ events: [] }, { status: 200 }); + } + const eventRepository = await getEventRepositoryForStore( run.taskEventStore, authentication.environment.organization.id diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts index f27a9c13f98..f9c815f6ef1 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts @@ -1,15 +1,161 @@ +import type { LoaderFunctionArgs } from "@remix-run/server-runtime"; import { json } from "@remix-run/server-runtime"; import { tryCatch } from "@trigger.dev/core/utils"; +import type { RunMetadataChangeOperation } from "@trigger.dev/core/v3/schemas"; import { UpdateMetadataRequestBody } from "@trigger.dev/core/v3"; import { z } from "zod"; +import { $replica } from "~/db.server"; +// Aliased to avoid shadowing the local `env: AuthenticatedEnvironment` +// parameter the route handler and `routeOperationsToRun` use. +import { env as appEnv } from "~/env.server"; +import type { AuthenticatedEnvironment } from "~/services/apiAuth.server"; +import { authenticateApiRequest } from "~/services/apiAuth.server"; +import { logger } from "~/services/logger.server"; import { updateMetadataService } from "~/services/metadata/updateMetadataInstance.server"; import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; import { ServiceValidationError } from "~/v3/services/common.server"; +import { applyMetadataMutationToBufferedRun } from "~/v3/mollifier/applyMetadataMutation.server"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; const ParamsSchema = z.object({ runId: z.string(), }); +// GET handler added to fix the pre-existing route bug where this URL +// returned a Remix "no loader" 400 — only PUT (update) was exported, so +// GET had no handler. 
Returns `{ metadata, metadataType }` from either +// the Postgres row or the mollifier buffer snapshot. +export async function loader({ request, params }: LoaderFunctionArgs) { + const authenticationResult = await authenticateApiRequest(request); + if (!authenticationResult) { + return json({ error: "Invalid or Missing API Key" }, { status: 401 }); + } + + const parsed = ParamsSchema.safeParse(params); + if (!parsed.success) { + return json({ error: "Invalid or missing run ID" }, { status: 400 }); + } + + const env = authenticationResult.environment; + + const pgRun = await $replica.taskRun.findFirst({ + where: { friendlyId: parsed.data.runId, runtimeEnvironmentId: env.id }, + select: { metadata: true, metadataType: true }, + }); + if (pgRun) { + return json({ metadata: pgRun.metadata, metadataType: pgRun.metadataType }, { status: 200 }); + } + + const buffered = await findRunByIdWithMollifierFallback({ + runId: parsed.data.runId, + environmentId: env.id, + organizationId: env.organizationId, + }); + if (buffered) { + return json( + { + metadata: buffered.metadata ?? null, + metadataType: buffered.metadataType ?? "application/json", + }, + { status: 200 } + ); + } + + return json({ error: "Run not found" }, { status: 404 }); +} + +// Route parent/root operations to the existing PG service by directly +// invoking it against the parent/root runId. The service ingests via +// its batching worker, which targets PG by id. If the parent/root is +// itself buffered we recurse through our buffered-mutation helper. +// `_ingestion_only` flag: a synthetic body that has the operations +// promoted to top-level `operations` so the service applies them to +// `targetRunId` directly. +// Exported so the silent-failure logging behaviour can be unit-tested. 
+// The route handler itself isn't an attractive test target (createActionApiRoute +// wraps it in auth + body parsing + error-handler middleware), but the +// fan-out helper carries the load-bearing logic — including the ops- +// visibility branch this change adds. +export async function routeOperationsToRun( + targetRunId: string | undefined, + operations: RunMetadataChangeOperation[] | undefined, + env: AuthenticatedEnvironment +): Promise { + if (!targetRunId || !operations || operations.length === 0) return; + + // Try PG first via the existing service (this is how parent/root + // operations have always landed; preserve that). Accepts the full + // AuthenticatedEnvironment so we don't have to recover the unsafe + // `as unknown` cast that the previous narrowed `{ id, organizationId }` + // signature forced on us. + // + // Two non-success outcomes from `call`: + // * throws — PG threw (e.g. "Cannot update metadata for a completed + // run", or a transient PG outage). + // * resolves with undefined — PG row didn't exist (the target may be + // buffered, not yet materialised). + // Either way we want to try the buffer fallback below; treating the + // undefined-return as success would make the fallback unreachable. + const [error, result] = await tryCatch( + updateMetadataService.call(targetRunId, { operations }, env) + ); + if (!error && result !== undefined) return; + + if (error) { + // PG threw — auxiliary op, stay best-effort and don't surface this + // to the caller (the caller's primary mutation already landed). But + // warn so a genuine PG outage on these ops isn't invisible. + logger.warn("metadata route: parent/root PG op failed", { + targetRunId, + error: error instanceof Error ? error.message : String(error), + }); + } + + // Buffer fallback only makes sense for friendlyId-keyed entries. The + // PG-side parent/root IDs are internal cuids; the buffer keys entries + // by friendlyId, so passing the internal id would silently no-op. 
+ // Skip explicitly — a buffered child's parent is always materialised + // in PG already (a buffered run hasn't executed, so it can't have + // triggered the child), so the buffered-parent branch isn't actually + // reachable. Treating the no-op as intentional rather than incidental. + if (!targetRunId.startsWith("run_")) return; + + // Best-effort buffer fallback. Wrap so a transient Redis throw on + // this auxiliary op can't 500 the request after the primary mutation + // already succeeded. + const [bufferError, bufferOutcome] = await tryCatch( + applyMetadataMutationToBufferedRun({ + runId: targetRunId, + environmentId: env.id, + organizationId: env.organizationId, + maximumSize: appEnv.TASK_RUN_METADATA_MAXIMUM_SIZE, + body: { operations }, + }) + ); + if (bufferError) { + logger.warn("metadata route: buffer fallback for parent/root op failed", { + targetRunId, + error: bufferError instanceof Error ? bufferError.message : String(bufferError), + }); + return; + } + // `applyMetadataMutationToBufferedRun` reports non-throw failures via + // its returned outcome kind: `not_found`, `busy`, `version_exhausted`, + // `metadata_too_large`. Without inspecting `.kind`, the parent/root + // operation can silently disappear — no PG row landed it (handled + // above) and the buffer rejected it for one of these reasons but the + // helper returned cleanly. Surface a warn log per non-success branch + // so ops can trace why a parent/root op went missing. The customer's + // primary mutation has already succeeded by this point; this remains + // best-effort, so we still don't bubble these to the response. 
+ if (bufferOutcome && bufferOutcome.kind !== "applied") { + logger.warn("metadata route: parent/root buffer op did not apply", { + targetRunId, + kind: bufferOutcome.kind, + }); + } +} + const { action } = createActionApiRoute( { params: ParamsSchema, @@ -18,23 +164,104 @@ const { action } = createActionApiRoute( method: "PUT", }, async ({ authentication, body, params }) => { - const [error, result] = await tryCatch( - updateMetadataService.call(params.runId, body, authentication.environment) - ); + const env = authentication.environment; + const runId = params.runId; - if (error) { - if (error instanceof ServiceValidationError) { - return json({ error: error.message }, { status: error.status ?? 422 }); + // PG-canonical path. If the run is in PG, the existing service + // owns the full request shape including parent/root operations, + // metadataVersion CAS, batching, validation — none of which the + // buffer side needs to reimplement. + const [pgError, pgResult] = await tryCatch( + updateMetadataService.call(runId, body, env) + ); + if (pgError) { + if (pgError instanceof ServiceValidationError) { + return json({ error: pgError.message }, { status: pgError.status ?? 422 }); } - return json({ error: "Internal Server Error" }, { status: 500 }); } + if (pgResult) { + return json(pgResult, { status: 200 }); + } - if (!result) { + // PG miss. Target run is either buffered or genuinely absent. + const bufferOutcome = await applyMetadataMutationToBufferedRun({ + runId, + environmentId: env.id, + organizationId: env.organizationId, + maximumSize: appEnv.TASK_RUN_METADATA_MAXIMUM_SIZE, + body: { metadata: body.metadata, operations: body.operations }, + }); + + if (bufferOutcome.kind === "not_found") { return json({ error: "Task Run not found" }, { status: 404 }); } + if (bufferOutcome.kind === "metadata_too_large") { + // Mirror PG's `MetadataTooLargeError` (413). 
+ return json( + { + error: `Metadata exceeds maximum size of ${bufferOutcome.maximumSize} bytes`, + }, + { status: 413 } + ); + } + if (bufferOutcome.kind === "busy") { + // Entry is materialising, so the PG row may be visible shortly. + // Retrying the PG call inline would be the ideal path, but this + // handler does not do that: it hands back 503 with a retry hint + // and relies on the SDK retry policy to converge. A bare 503 is + // still customer-visible for bursty workloads if callers don't retry. + return json({ error: "Run materialising, retry shortly" }, { status: 503 }); + } + if (bufferOutcome.kind === "version_exhausted") { + // Pathological contention — many concurrent metadata writers on + // the same buffered runId. Surface as 503 rather than silently + // dropping the request. + return json({ error: "Metadata write contention; retry shortly" }, { status: 503 }); + } + + // Buffered metadata mutation succeeded. Fan parent/root operations + // out to their respective runs (parent/root are typically PG- + // materialised by the time the child is buffered, so the existing + // service handles them; if they're also buffered, the helper + // recurses through the buffered mutation path). + // + // Use the parent/root friendlyIds the buffered mutation captured + // during its internal read — NOT a second `findRunByIdWithMollifierFallback` + // call here. The drainer's terminal-failure path DELetes the entry + // hash atomically, so if it fires between the primary mutation + // landing and our route's second read, `bufferedEntry` would come + // back null and the route would silently drop `parentOperations` / + // `rootOperations` after the customer's primary mutation already + // landed on the snapshot. Capturing the ids in the helper's first + // CAS read closes that race. + // + // Self-fallback to `runId` matches PG semantics: the PG service + // routes to `taskRun.parentTaskRun?.id ?? taskRun.id` and + // `taskRun.rootTaskRun?.id ?? 
taskRun.id`, so a top-level run's + // parent/root ops land on itself rather than being silently + // dropped. + await Promise.all([ + routeOperationsToRun( + bufferOutcome.parentTaskRunFriendlyId ?? runId, + body.parentOperations, + env, + ), + routeOperationsToRun( + bufferOutcome.rootTaskRunFriendlyId ?? runId, + body.rootOperations, + env, + ), + ]); - return json(result, { status: 200 }); + // Wire-shape parity with the PG branch. `UpdateMetadataService.call` + // returns `{ metadata: }` (see `updateMetadata.server.ts:356-358`), + // sourced from `applyResults.newMetadata` / `parsePacket(metadataPacket)` + // — both parsed `Record`. `bufferOutcome.newMetadata` + // is typed identically (`applyMetadataMutation.server.ts:27`). SDK + // consumers see the same response shape regardless of which branch + // serves the request. + return json({ metadata: bufferOutcome.newMetadata }, { status: 200 }); } ); diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts b/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts index be0d12087b6..a5250e5b850 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts @@ -9,33 +9,69 @@ import { } from "~/services/routeBuilders/apiBuilder.server"; import { getEventRepositoryForStore } from "~/v3/eventRepository/index.server"; import { getTaskEventStoreTableForRun } from "~/v3/taskEventStore.server"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; +import { buildSyntheticSpanDetailBody } from "~/v3/mollifier/syntheticApiResponses.server"; const ParamsSchema = z.object({ runId: z.string(), spanId: z.string(), }); +// Resolve the run from either Postgres or the mollifier buffer. +// Buffered runs only have one valid spanId (the queued span recorded at +// gate time and reused as the run's root spanId when the drainer +// materialises). 
Any other spanId returns a deterministic 404; the queued +// span returns a minimal synthesised shape so the customer's SDK sees the +// same 200 contract they'd get for a freshly-triggered run. +type ResolvedRun = + | { source: "pg"; run: Awaited> & {} } + | { source: "buffer"; run: NonNullable>> }; + +async function findPgRun(runId: string, environmentId: string) { + return $replica.taskRun.findFirst({ + where: { friendlyId: runId, runtimeEnvironmentId: environmentId }, + }); +} + export const loader = createLoaderApiRoute( { params: ParamsSchema, allowJWT: true, corsStrategy: "all", - findResource: (params, auth) => { - return $replica.taskRun.findFirst({ - where: { - friendlyId: params.runId, - runtimeEnvironmentId: auth.environment.id, - }, + findResource: async (params, auth): Promise => { + const pgRun = await findPgRun(params.runId, auth.environment.id); + if (pgRun) return { source: "pg", run: pgRun }; + + const buffered = await findRunByIdWithMollifierFallback({ + runId: params.runId, + environmentId: auth.environment.id, + organizationId: auth.environment.organizationId, }); + if (buffered) return { source: "buffer", run: buffered }; + + return null; }, shouldRetryNotFound: true, authorization: { action: "read", - resource: (run) => { + resource: (resolved) => { + if (resolved.source === "pg") { + const run = resolved.run; + const resources = [ + { type: "runs", id: run.friendlyId }, + { type: "tasks", id: run.taskIdentifier }, + ...run.runTags.map((tag) => ({ type: "tags", id: tag })), + ]; + if (run.batchId) { + resources.push({ type: "batch", id: BatchId.toFriendlyId(run.batchId) }); + } + return anyResource(resources); + } + const run = resolved.run; const resources = [ { type: "runs", id: run.friendlyId }, - { type: "tasks", id: run.taskIdentifier }, - ...run.runTags.map((tag) => ({ type: "tags", id: tag })), + ...(run.taskIdentifier ? 
[{ type: "tasks", id: run.taskIdentifier }] : []), + ...run.tags.map((tag) => ({ type: "tags", id: tag })), ]; if (run.batchId) { resources.push({ type: "batch", id: BatchId.toFriendlyId(run.batchId) }); @@ -44,7 +80,20 @@ export const loader = createLoaderApiRoute( }, }, }, - async ({ params, resource: run, authentication }) => { + async ({ params, resource: resolved, authentication }) => { + if (resolved.source === "buffer") { + // Buffered runs have exactly one valid spanId — the queued span the + // mollifier gate recorded at trigger time, which becomes the run's + // root spanId once the drainer materialises. Any other spanId is a + // deterministic 404. The matching spanId returns a minimal shape + // representing "span exists, no execution data yet." + if (resolved.run.spanId !== params.spanId) { + return json({ error: "Span not found" }, { status: 404 }); + } + return json(buildSyntheticSpanDetailBody(resolved.run), { status: 200 }); + } + + const run = resolved.run; const eventRepository = await getEventRepositoryForStore( run.taskEventStore, authentication.environment.organization.id diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts index eae94375b9f..ef7f3180bf3 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts @@ -1,22 +1,39 @@ import { type ActionFunctionArgs, json } from "@remix-run/server-runtime"; import { AddTagsRequestBody } from "@trigger.dev/core/v3"; +import type { BufferEntry } from "@trigger.dev/redis-worker"; import { z } from "zod"; import { prisma } from "~/db.server"; import { MAX_TAGS_PER_RUN } from "~/models/taskRunTag.server"; import { authenticateApiRequest } from "~/services/apiAuth.server"; +import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; import { logger } from "~/services/logger.server"; +import { mutateWithFallback } from "~/v3/mollifier/mutateWithFallback.server"; + +// 
Pull the existing tags out of a buffer entry's serialised payload so +// the buffer-path response can dedup against them, matching the +// PG-path's `newTags.length` count rather than the pre-dedup input +// count. Returns null on any parse failure / shape mismatch so the +// caller can fall back gracefully. +function parseSnapshotTags(entry: BufferEntry | null): string[] | null { + if (!entry) return null; + try { + const snapshot = JSON.parse(entry.payload) as { tags?: unknown }; + if (!Array.isArray(snapshot.tags)) return null; + return snapshot.tags.filter((t): t is string => typeof t === "string"); + } catch { + return null; + } +} const ParamsSchema = z.object({ runId: z.string(), }); export async function action({ request, params }: ActionFunctionArgs) { - // Ensure this is a POST request if (request.method.toUpperCase() !== "POST") { return { status: 405, body: "Method Not Allowed" }; } - // Authenticate the request const authenticationResult = await authenticateApiRequest(request); if (!authenticationResult) { return json({ error: "Invalid or Missing API Key" }, { status: 401 }); @@ -32,59 +49,89 @@ export async function action({ request, params }: ActionFunctionArgs) { try { const anyBody = await request.json(); - const body = AddTagsRequestBody.safeParse(anyBody); if (!body.success) { return json({ error: "Invalid request body", issues: body.error.issues }, { status: 400 }); } - - const run = await prisma.taskRun.findFirst({ - where: { - friendlyId: parsedParams.data.runId, - runtimeEnvironmentId: authenticationResult.environment.id, - }, - select: { - runTags: true, - }, - }); - - const existingTags = run?.runTags ?? []; - - //remove duplicate tags from the new tags const bodyTags = typeof body.data.tags === "string" ? 
[body.data.tags] : body.data.tags; - const newTags = bodyTags.filter((tag) => { - if (tag.trim().length === 0) return false; - return !existingTags.includes(tag); - }); - - if (existingTags.length + newTags.length > MAX_TAGS_PER_RUN) { - return json( - { - error: `Runs can only have ${MAX_TAGS_PER_RUN} tags, you're trying to set ${ - existingTags.length + newTags.length - }. These tags have not been set: ${newTags.map((t) => `'${t}'`).join(", ")}.`, - }, - { status: 422 } - ); - } + const nonEmptyTags = bodyTags.filter((t) => t.trim().length > 0); - if (newTags.length === 0) { + if (nonEmptyTags.length === 0) { return json({ message: "No new tags to add" }, { status: 200 }); } - await prisma.taskRun.update({ - where: { - friendlyId: parsedParams.data.runId, - runtimeEnvironmentId: authenticationResult.environment.id, + const env = authenticationResult.environment; + const outcome = await mutateWithFallback({ + runId: parsedParams.data.runId, + environmentId: env.id, + organizationId: env.organizationId, + bufferPatch: { type: "append_tags", tags: nonEmptyTags, maxTags: MAX_TAGS_PER_RUN }, + pgMutation: async (taskRun) => { + const existing = taskRun.runTags ?? []; + const newTags = nonEmptyTags.filter((t) => !existing.includes(t)); + + if (existing.length + newTags.length > MAX_TAGS_PER_RUN) { + return json( + { + error: `Runs can only have ${MAX_TAGS_PER_RUN} tags, you're trying to set ${ + existing.length + newTags.length + }. These tags have not been set: ${newTags.map((t) => `'${t}'`).join(", ")}.`, + }, + { status: 422 } + ); + } + if (newTags.length === 0) { + return json({ message: "No new tags to add" }, { status: 200 }); + } + await prisma.taskRun.update({ + where: { + id: taskRun.id, + runtimeEnvironmentId: env.id, + }, + data: { runTags: { push: newTags } }, + }); + return json({ message: `Successfully set ${newTags.length} new tags.` }, { status: 200 }); }, - data: { - runTags: { - push: newTags, - }, + // Buffer-applied patch path. 
The mutateSnapshot Lua deduplicates + // against existing snapshot tags atomically and enforces + // MAX_TAGS_PER_RUN via the `maxTags` we pass in `bufferPatch` — + // matching the PG-path cap above so a buffered run can't exceed the + // limit the trigger validator applies at creation. + // + // Dedup the success-count off the pre-mutation entry (already + // fetched by mutateWithFallback's env-auth pre-check, so no extra + // Redis read) so the message reports the same `newTags.length` the + // PG path reports — not the pre-dedup request count, which would + // give an inconsistent number across the buffered/materialised + // boundary for the same input. + synthesisedResponse: ({ bufferEntry }) => { + const existing = parseSnapshotTags(bufferEntry); + const newTagsCount = existing + ? nonEmptyTags.filter((t) => !existing.includes(t)).length + : nonEmptyTags.length; + return json( + { message: `Successfully set ${newTagsCount} new tags.` }, + { status: 200 } + ); }, + // Buffer rejected the append because it would exceed the cap. We + // don't know the exact deduped overflow count here (the Lua does), + // so report the limit rather than a precise "trying to set N". + rejectedResponse: () => + json( + { error: `Runs can only have ${MAX_TAGS_PER_RUN} tags.` }, + { status: 422 } + ), + abortSignal: getRequestAbortSignal(), }); - return json({ message: `Successfully set ${newTags.length} new tags.` }, { status: 200 }); + if (outcome.kind === "not_found") { + return json({ error: "Run not found" }, { status: 404 }); + } + if (outcome.kind === "timed_out") { + return json({ error: "Run materialisation timed out" }, { status: 503 }); + } + return outcome.response; } catch (error) { logger.error("Failed to add run tags", { error }); return json({ error: "Something went wrong, please try again." 
}, { status: 500 }); diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.trace.ts b/apps/webapp/app/routes/api.v1.runs.$runId.trace.ts index 77e6a4df043..04ae398194f 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.trace.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.trace.ts @@ -8,32 +8,68 @@ import { } from "~/services/routeBuilders/apiBuilder.server"; import { getEventRepositoryForStore } from "~/v3/eventRepository/index.server"; import { getTaskEventStoreTableForRun } from "~/v3/taskEventStore.server"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; +import { buildSyntheticTraceBody } from "~/v3/mollifier/syntheticApiResponses.server"; const ParamsSchema = z.object({ runId: z.string(), // This is the run friendly ID }); +// Discriminator on the resolved resource — `pg` is the real Prisma TaskRun +// row, `buffer` is a synthesised shape from the mollifier buffer for runs +// whose drainer hasn't yet materialised them. The handler renders an empty +// trace for buffered runs so the customer sees the same 200 shape they'd +// get for a freshly-triggered PG run with no spans yet (matches the +// pass-through control case in scripts/mollifier-api-parity.sh). 
+type ResolvedRun = + | { source: "pg"; run: Awaited> & {} } + | { source: "buffer"; run: NonNullable>> }; + +async function findPgRun(runId: string, environmentId: string) { + return $replica.taskRun.findFirst({ + where: { friendlyId: runId, runtimeEnvironmentId: environmentId }, + }); +} + export const loader = createLoaderApiRoute( { params: ParamsSchema, allowJWT: true, corsStrategy: "all", - findResource: (params, auth) => { - return $replica.taskRun.findFirst({ - where: { - friendlyId: params.runId, - runtimeEnvironmentId: auth.environment.id, - }, + findResource: async (params, auth): Promise => { + const pgRun = await findPgRun(params.runId, auth.environment.id); + if (pgRun) return { source: "pg", run: pgRun }; + + const buffered = await findRunByIdWithMollifierFallback({ + runId: params.runId, + environmentId: auth.environment.id, + organizationId: auth.environment.organizationId, }); + if (buffered) return { source: "buffer", run: buffered }; + + return null; }, shouldRetryNotFound: true, authorization: { action: "read", - resource: (run) => { + resource: (resolved) => { + if (resolved.source === "pg") { + const run = resolved.run; + const resources = [ + { type: "runs", id: run.friendlyId }, + { type: "tasks", id: run.taskIdentifier }, + ...run.runTags.map((tag) => ({ type: "tags", id: tag })), + ]; + if (run.batchId) { + resources.push({ type: "batch", id: BatchId.toFriendlyId(run.batchId) }); + } + return anyResource(resources); + } + const run = resolved.run; const resources = [ { type: "runs", id: run.friendlyId }, - { type: "tasks", id: run.taskIdentifier }, - ...run.runTags.map((tag) => ({ type: "tags", id: tag })), + ...(run.taskIdentifier ? 
[{ type: "tasks", id: run.taskIdentifier }] : []), + ...run.tags.map((tag) => ({ type: "tags", id: tag })), ]; if (run.batchId) { resources.push({ type: "batch", id: BatchId.toFriendlyId(run.batchId) }); @@ -42,7 +78,17 @@ export const loader = createLoaderApiRoute( }, }, }, - async ({ resource: run, authentication }) => { + async ({ resource: resolved, authentication }) => { + if (resolved.source === "buffer") { + // Buffered runs have no events ingested yet — the drainer hasn't + // materialised the PG row and the worker hasn't started executing. + // The helper synthesises a single root span that satisfies the SDK's + // RetrieveRunTraceResponseBody schema (rootSpan is non-nullable) and + // reflects the buffered terminal state. + return json(buildSyntheticTraceBody(resolved.run), { status: 200 }); + } + + const run = resolved.run; const eventRepository = await getEventRepositoryForStore( run.taskEventStore, authentication.environment.organization.id diff --git a/apps/webapp/app/routes/api.v1.runs.$runParam.replay.ts b/apps/webapp/app/routes/api.v1.runs.$runParam.replay.ts index 72ad202467d..4bb5922997f 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runParam.replay.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runParam.replay.ts @@ -1,10 +1,12 @@ import type { ActionFunctionArgs } from "@remix-run/server-runtime"; import { json } from "@remix-run/server-runtime"; +import type { TaskRun } from "@trigger.dev/database"; import { z } from "zod"; import { prisma } from "~/db.server"; import { authenticateApiRequest } from "~/services/apiAuth.server"; import { logger } from "~/services/logger.server"; import { ReplayTaskRunService } from "~/v3/services/replayTaskRun.server"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; import { sanitizeTriggerSource } from "~/utils/triggerSource"; const ParamsSchema = z.object({ @@ -12,6 +14,39 @@ const ParamsSchema = z.object({ runParam: z.string(), }); +// Subset of TaskRun fields that 
ReplayTaskRunService.call actually +// reads from `existingTaskRun`. Validate the buffered fallback against +// this before casting to TaskRun so a buffer-format drift surfaces as a +// 404 here rather than as a silent NaN/undefined deep inside +// replay. The full TaskRun type has many more fields the service never +// touches; we only assert the ones it reads. +const BufferedReplayInputSchema = z.object({ + id: z.string(), + friendlyId: z.string(), + runtimeEnvironmentId: z.string(), + taskIdentifier: z.string(), + payload: z.string(), + payloadType: z.string(), + queue: z.string(), + isTest: z.boolean(), + traceId: z.string(), + spanId: z.string(), + engine: z.string(), + runTags: z.array(z.string()), + // Nullable / optional fields the service tolerates via `??` fallbacks. + concurrencyKey: z.string().nullable().optional(), + workerQueue: z.string().nullable().optional(), + machinePreset: z.string().nullable().optional(), + realtimeStreamsVersion: z.string().nullable().optional(), + // ReplayTaskRunService.getExistingMetadata reads these to preserve + // the original run's metadata on replay. Without them in the schema + // they'd be stripped by Zod's default unknown-key stripping behaviour, and + // a buffered-source replay would silently lose metadata that a + // PG-source replay carries over. + seedMetadata: z.string().nullable().optional(), + seedMetadataType: z.string().nullable().optional(), +}); + export async function action({ request, params }: ActionFunctionArgs) { // Ensure this is a POST request if (request.method.toUpperCase() !== "POST") { @@ -32,12 +67,57 @@ export async function action({ request, params }: ActionFunctionArgs) { const { runParam } = parsed.data; try { - const taskRun = await prisma.taskRun.findUnique({ + const env = authenticationResult.environment; + // PG-first.
Replay works on any status per audit — no + // filter beyond friendlyId is the existing semantic; findFirst with + // env scoping tightens it minimally without changing behaviour for + // a correctly-authed caller. + let taskRun: TaskRun | null = await prisma.taskRun.findFirst({ where: { friendlyId: runParam, + runtimeEnvironmentId: env.id, }, }); + if (!taskRun) { + // Buffered fallback. SyntheticRun carries every field + // ReplayTaskRunService reads from a TaskRun. Validate the subset of + // fields the service consumes (BufferedReplayInputSchema above) + // before casting; a schema mismatch surfaces as a 404 here rather + // than as a silent undefined deep inside the service. + const buffered = await findRunByIdWithMollifierFallback({ + runId: runParam, + environmentId: env.id, + organizationId: env.organizationId, + }); + if (buffered) { + const parsed = BufferedReplayInputSchema.safeParse(buffered); + if (parsed.success) { + // Manual sync point: `BufferedReplayInputSchema` covers only + // the subset of `TaskRun` fields `ReplayTaskRunService.call` + // currently reads from `existingTaskRun`. The cast is `as + // unknown as TaskRun` because the full `TaskRun` type carries + // ~40 fields the service never touches; mirroring all of them + // on a synthetic snapshot would be misleading. If a future + // change to `ReplayTaskRunService` reads an additional + // `existingTaskRun` field, **add it to the schema above** — + // otherwise the buffered path will silently feed the service + // `undefined` for that field while the PG-source replay + // works. The `safeParse` + warn-log + 404 below is the + // run-time fail-safe; this comment is the design fail-safe. 
+ taskRun = parsed.data as unknown as TaskRun; + } else { + logger.warn("replay: buffered fallback failed schema validation", { + runParam, + issues: parsed.error.issues.map((issue) => ({ + path: issue.path.join("."), + code: issue.code, + })), + }); + } + } + } + if (!taskRun) { return json({ error: "Run not found" }, { status: 404 }); } diff --git a/apps/webapp/app/routes/api.v1.runs.$runParam.reschedule.ts b/apps/webapp/app/routes/api.v1.runs.$runParam.reschedule.ts index 0ac8aec8351..cbdd9807d8b 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runParam.reschedule.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runParam.reschedule.ts @@ -3,90 +3,162 @@ import { json } from "@remix-run/server-runtime"; import { RescheduleRunRequestBody } from "@trigger.dev/core/v3/schemas"; import { z } from "zod"; import { getApiVersion } from "~/api/versions"; -import { prisma } from "~/db.server"; import { ApiRetrieveRunPresenter } from "~/presenters/v3/ApiRetrieveRunPresenter.server"; import { authenticateApiRequest } from "~/services/apiAuth.server"; +import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; import { logger } from "~/services/logger.server"; import { ServiceValidationError } from "~/v3/services/baseService.server"; import { RescheduleTaskRunService } from "~/v3/services/rescheduleTaskRun.server"; +import { mutateWithFallback } from "~/v3/mollifier/mutateWithFallback.server"; +import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server"; +import { parseDelay } from "~/utils/delays"; const ParamsSchema = z.object({ runParam: z.string(), }); export async function action({ request, params }: ActionFunctionArgs) { - // Ensure this is a POST request if (request.method.toUpperCase() !== "POST") { return { status: 405, body: "Method Not Allowed" }; } - // Authenticate the request const authenticationResult = await authenticateApiRequest(request); - if (!authenticationResult) { return json({ error: "Invalid or missing API Key" }, { 
status: 401 }); } const parsed = ParamsSchema.safeParse(params); - if (!parsed.success) { return json({ error: "Invalid or missing run ID" }, { status: 400 }); } - const { runParam } = parsed.data; - - const taskRun = await prisma.taskRun.findUnique({ - where: { - friendlyId: runParam, - runtimeEnvironmentId: authenticationResult.environment.id, - }, - }); - - if (!taskRun) { - return json({ error: "Run not found" }, { status: 404 }); - } - const anyBody = await request.json(); - const body = RescheduleRunRequestBody.safeParse(anyBody); - if (!body.success) { return json({ error: "Invalid request body" }, { status: 400 }); } - const service = new RescheduleTaskRunService(); + const env = authenticationResult.environment; + // Pre-resolve the absolute Date the buffer snapshot should encode. + // RescheduleTaskRunService expects this to be present on the body for + // its PG-side flow; for the buffer-side patch we encode the same + // wall-clock value so the drainer's engine.trigger sees the intended + // delayUntil after materialisation. + // + // Wire-compat: pre-PR the validation lived inside + // `RescheduleTaskRunService.call` (rescheduleTaskRun.server.ts:14-18), + // which throws `ServiceValidationError("Invalid delay: …")`. The + // route's catch block below converts that to status **400** (not + // 422 — `ServiceValidationError` defaults to 422 but this route's + // catch block has always returned 400). Mirror that 400 + message + // shape here so SDK consumers keying retry/classification logic on + // 400 see no behavioural drift now that the parse is hoisted to the + // route layer. 
+ const delayUntil = await parseDelay(body.data.delay); + if (!delayUntil) { + return json({ error: `Invalid delay: ${body.data.delay}` }, { status: 400 }); + } try { - const updatedRun = await service.call(taskRun, body.data); - - if (!updatedRun) { - return json({ error: "An unknown error occurred" }, { status: 500 }); + // PG-side `RescheduleTaskRunService.call` enforces + // `taskRun.status !== "DELAYED"` and 422s otherwise — without an + // equivalent guard the buffer path would happily inject a + // `delayUntil` into the snapshot of a non-delayed buffered run, and + // the drainer would materialise it with an unintended delay. The + // SyntheticRun type doesn't carry a "DELAYED" enum value because + // it's not a terminal status the trace API needs to express; the + // buffered analogue is `delayUntil` set in the snapshot. Gate on + // that. + // + // Only apply the guard when the buffer entry is NOT yet + // materialised. Post-materialise the entry sticks around for a + // 30s grace TTL with `materialised: true`, but the PG row is now + // canonical — its DELAYED state may differ from what the snapshot + // recorded at trigger time (e.g. a prior reschedule via the PG + // path, or a delay set by the engine through another mechanism). + // Reading from the stale snapshot would 422 a legitimately-DELAYED + // PG row. When `materialised` we let `mutateWithFallback` route to + // PG, which runs its own canonical DELAYED check. + const buffer = getMollifierBuffer(); + const entry = buffer ? await buffer.getEntry(parsed.data.runParam) : null; + const isLiveBuffered = + entry !== null && + entry.materialised !== true && + entry.envId === env.id && + entry.orgId === env.organizationId; + if (isLiveBuffered) { + const snapshot = JSON.parse(entry.payload) as Record; + const snapshotDelayUntil = + typeof snapshot.delayUntil === "string" ? 
snapshot.delayUntil : undefined; + if (!snapshotDelayUntil) { + return json( + { error: "Cannot reschedule a run that is not delayed" }, + { status: 422 }, + ); + } } - const run = await ApiRetrieveRunPresenter.findRun( - updatedRun.friendlyId, - authenticationResult.environment - ); - - if (!run) { + const outcome = await mutateWithFallback({ + runId: parsed.data.runParam, + environmentId: env.id, + organizationId: env.organizationId, + bufferPatch: { + type: "set_delay", + delayUntil: delayUntil.toISOString(), + }, + pgMutation: async (taskRun) => { + const service = new RescheduleTaskRunService(); + const updatedRun = await service.call(taskRun, body.data); + if (!updatedRun) { + return json({ error: "An unknown error occurred" }, { status: 500 }); + } + + const run = await ApiRetrieveRunPresenter.findRun(updatedRun.friendlyId, env); + if (!run) { + return json({ error: "Run not found" }, { status: 404 }); + } + const apiVersion = getApiVersion(request); + const presenter = new ApiRetrieveRunPresenter(apiVersion); + const result = await presenter.call(run, env); + if (!result) { + return json({ error: "Run not found" }, { status: 404 }); + } + return json(result); + }, + // Buffered snapshot has been patched. Run it through the same + // ApiRetrieveRunPresenter the PG branch uses (it falls back to + // the buffer for the SyntheticRun lookup) so the response shape + // matches `RetrieveRunResponse` — that's what the SDK's + // `rescheduleRun` zod-validates against. Returning a stripped + // `{ id, delayUntil }` object fails the SDK schema on every + // existing SDK version. 
+ synthesisedResponse: async () => { + const run = await ApiRetrieveRunPresenter.findRun(parsed.data.runParam, env); + if (!run) { + return json({ error: "Run not found" }, { status: 404 }); + } + const apiVersion = getApiVersion(request); + const presenter = new ApiRetrieveRunPresenter(apiVersion); + const result = await presenter.call(run, env); + if (!result) { + return json({ error: "Run not found" }, { status: 404 }); + } + return json(result); + }, + abortSignal: getRequestAbortSignal(), + }); + + if (outcome.kind === "not_found") { return json({ error: "Run not found" }, { status: 404 }); } - - const apiVersion = getApiVersion(request); - - const presenter = new ApiRetrieveRunPresenter(apiVersion); - const result = await presenter.call(run, authenticationResult.environment); - - if (!result) { - return json({ error: "Run not found" }, { status: 404 }); + if (outcome.kind === "timed_out") { + return json({ error: "Run materialisation timed out" }, { status: 503 }); } - - return json(result); + return outcome.response; } catch (error) { if (error instanceof ServiceValidationError) { return json({ error: error.message }, { status: 400 }); } - logger.error("Failed to reschedule run", { error }); return json({ error: "Something went wrong, please try again." }, { status: 500 }); } diff --git a/apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts b/apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts index 8206a90f320..1f8a42af08c 100644 --- a/apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts +++ b/apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts @@ -134,7 +134,20 @@ const { action, loader } = createActionApiRoute( return json({ error: "Task not found" }, { status: 404 }); } - await saveRequestIdempotency(requestIdempotencyKey, "trigger", result.run.id); + // Skip request-idempotency caching when the gate diverted to the + // mollifier buffer. 
`result.run.id` is a synthesised cuid with no + // corresponding PG row, so a lost-response SDK retry that reaches + // `handleRequestIdempotency` would lookup that id, miss in PG, and + // fall through to a fresh trigger — producing a duplicate buffer + // entry for triggers without a task-level idempotency key (the + // task-level path still dedupes via the buffer's SETNX in + // `findBufferedRunWithIdempotency`). Accepting the retry-as-fresh- + // trigger semantics here is bounded by the drainer's eventual + // materialisation: once the run lands in PG, normal request- + // idempotency from that point forward works as usual. + if (!result.isMollified) { + await saveRequestIdempotency(requestIdempotencyKey, "trigger", result.run.id); + } const $responseHeaders = await responseHeaders(result.run, authentication); diff --git a/apps/webapp/app/routes/api.v2.runs.$runParam.cancel.ts b/apps/webapp/app/routes/api.v2.runs.$runParam.cancel.ts index a636ca0cc1d..f02b058b272 100644 --- a/apps/webapp/app/routes/api.v2.runs.$runParam.cancel.ts +++ b/apps/webapp/app/routes/api.v2.runs.$runParam.cancel.ts @@ -1,8 +1,13 @@ import { json } from "@remix-run/server-runtime"; import { z } from "zod"; -import { $replica } from "~/db.server"; import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; +import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; import { CancelTaskRunService } from "~/v3/services/cancelTaskRun.server"; +import { mutateWithFallback } from "~/v3/mollifier/mutateWithFallback.server"; +import { + resolveRunForMutation, + type ResolvedRunForMutation, +} from "~/v3/mollifier/resolveRunForMutation.server"; const ParamsSchema = z.object({ runParam: z.string(), @@ -17,29 +22,55 @@ const { action } = createActionApiRoute( action: "write", resource: (params) => ({ type: "runs", id: params.runParam }), }, - findResource: async (params, auth) => { - return $replica.taskRun.findFirst({ - where: { - friendlyId: params.runParam, 
- runtimeEnvironmentId: auth.environment.id, - }, - }); - }, + // PG-or-buffer resolver. Returning null here would 404 BEFORE the + // action runs (`apiBuilder.server.ts:321`), so buffered cancels need + // a buffer check at this layer too. Logic lives in a helper so the + // three paths (PG hit, buffer hit, both miss) are unit-tested + // independently of the route builder. The action's mutateWithFallback + // call repeats the lookup atomically — slightly redundant but keeps + // wait-and-bounce semantics intact. + findResource: async (params, auth): Promise => + resolveRunForMutation({ + runParam: params.runParam, + environmentId: auth.environment.id, + organizationId: auth.environment.organizationId, + }), }, - async ({ resource }) => { - if (!resource) { - return json({ error: "Run not found" }, { status: 404 }); - } + async ({ params, authentication }) => { + const runId = params.runParam; + const env = authentication.environment; + const cancelledAt = new Date(); + const cancelReason = "Canceled by user"; - const service = new CancelTaskRunService(); + const outcome = await mutateWithFallback({ + runId, + environmentId: env.id, + organizationId: env.organizationId, + bufferPatch: { + type: "mark_cancelled", + cancelledAt: cancelledAt.toISOString(), + cancelReason, + }, + pgMutation: async (taskRun) => { + const service = new CancelTaskRunService(); + try { + await service.call(taskRun); + } catch { + return json({ error: "Internal Server Error" }, { status: 500 }); + } + return json({ id: taskRun.friendlyId }, { status: 200 }); + }, + synthesisedResponse: () => json({ id: runId }, { status: 200 }), + abortSignal: getRequestAbortSignal(), + }); - try { - await service.call(resource); - } catch (error) { - return json({ error: "Internal Server Error" }, { status: 500 }); + if (outcome.kind === "not_found") { + return json({ error: "Run not found" }, { status: 404 }); } - - return json({ id: resource.friendlyId }, { status: 200 }); + if (outcome.kind === 
"timed_out") { + return json({ error: "Run materialisation timed out" }, { status: 503 }); + } + return outcome.response; } ); diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.spans.$spanParam/route.tsx b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.spans.$spanParam/route.tsx index 09f3f33fcb3..7e825fe303d 100644 --- a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.spans.$spanParam/route.tsx +++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.spans.$spanParam/route.tsx @@ -120,6 +120,7 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { try { const result = await presenter.call({ projectSlug: projectParam, + envSlug: envParam, spanId: spanParam, runFriendlyId: runParam, userId, @@ -1021,6 +1022,10 @@ function RunBody({ Admin only + + Buffered + {run.isBuffered ? "Yes" : "No"} + Worker queue {run.workerQueue} @@ -1096,7 +1101,7 @@ function RunBody({ {run.isCached ? "Jump to original run" : "Focus on run"} )} - + {!run.isBuffered && }
{run.logsDeletedAt === null ? ( diff --git a/apps/webapp/app/routes/resources.runs.$runParam.logs.download.ts b/apps/webapp/app/routes/resources.runs.$runParam.logs.download.ts index 5c7725c510b..c2a6fa9590c 100644 --- a/apps/webapp/app/routes/resources.runs.$runParam.logs.download.ts +++ b/apps/webapp/app/routes/resources.runs.$runParam.logs.download.ts @@ -9,6 +9,8 @@ import { formatDurationMilliseconds } from "@trigger.dev/core/v3/utils/durations import { getTaskEventStoreTableForRun } from "~/v3/taskEventStore.server"; import { TaskEventKind } from "@trigger.dev/database"; import { getEventRepositoryForStore } from "~/v3/eventRepository/index.server"; +import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server"; +import { deserialiseMollifierSnapshot } from "~/v3/mollifier/mollifierSnapshot.server"; export async function loader({ params, request }: LoaderFunctionArgs) { const user = await requireUser(request); @@ -30,6 +32,67 @@ export async function loader({ params, request }: LoaderFunctionArgs) { }); if (!run || !run.organizationId) { + // Buffered run? It hasn't executed, so there are no events to + // stream — but a 404 is wrong: the run does exist, the customer's + // "Download logs" button on the run-detail page generates this + // exact URL, and a 404 reads as "your run vanished" rather than + // "no logs yet". Verify the entry exists in the buffer (with the + // user as a member of the entry's org), and if so stream a single + // informational line in the same ` + // ` shape `formatRunEvent` uses below — so a downstream + // log viewer / grep over the downloaded file produces a + // meaningful explanation, not a 0-byte mystery. 
+ const buffer = getMollifierBuffer(); + if (buffer) { + const entry = await buffer.getEntry(parsedParams.runParam); + if (entry) { + const member = await prisma.orgMember.findFirst({ + where: { userId: user.id, organizationId: entry.orgId }, + select: { id: true }, + }); + if (member) { + let taskIdentifier: string | undefined; + try { + // Use the shared webapp wrapper rather than raw JSON.parse so + // every read-side module shares a single deserialisation path + // (see contract comment in `mollifierSnapshot.server.ts` and + // `syntheticRedirectInfo.server.ts`). Keeps behaviour + // consistent if the snapshot encoding ever changes. + const snapshot = deserialiseMollifierSnapshot(entry.payload) as { + taskIdentifier?: unknown; + }; + if (typeof snapshot.taskIdentifier === "string") { + taskIdentifier = snapshot.taskIdentifier; + } + } catch { + // Fall through — taskIdentifier stays undefined. + } + const placeholderParts = [ + entry.createdAt.toISOString(), + ...(taskIdentifier ? [taskIdentifier] : []), + "INFO", + "Run is queued, has not started executing yet — no logs to download.", + ]; + const placeholder = placeholderParts.join(" ") + "\n"; + const placeholderReadable = new Readable({ + read() { + this.push(placeholder); + this.push(null); + }, + }); + const gzipStream = createGzip(); + const compressed = placeholderReadable.pipe(gzipStream); + return new Response(compressed as any, { + status: 200, + headers: { + "Content-Type": "application/octet-stream", + "Content-Disposition": `attachment; filename="${parsedParams.runParam}.log"`, + "Content-Encoding": "gzip", + }, + }); + } + } + } return new Response("Not found", { status: 404 }); } diff --git a/apps/webapp/app/routes/resources.taskruns.$runParam.cancel.ts b/apps/webapp/app/routes/resources.taskruns.$runParam.cancel.ts index 240d7d3d8ed..fa6ee29f3db 100644 --- a/apps/webapp/app/routes/resources.taskruns.$runParam.cancel.ts +++ b/apps/webapp/app/routes/resources.taskruns.$runParam.cancel.ts @@ -6,6 
+6,7 @@ import { redirectWithErrorMessage, redirectWithSuccessMessage } from "~/models/m import { logger } from "~/services/logger.server"; import { requireUserId } from "~/services/session.server"; import { CancelTaskRunService } from "~/v3/services/cancelTaskRun.server"; +import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server"; export const cancelSchema = z.object({ redirectUrl: z.string(), @@ -42,15 +43,56 @@ export const action: ActionFunction = async ({ request, params }) => { }, }); - if (!taskRun) { + if (taskRun) { + const cancelRunService = new CancelTaskRunService(); + await cancelRunService.call(taskRun); + return redirectWithSuccessMessage(submission.value.redirectUrl, request, `Canceled run`); + } + + // PG miss — try the mollifier buffer. The customer can hit cancel + // on a buffered run from the dashboard during the burst window. + // Snapshot a `mark_cancelled` patch; the drainer's + // bifurcation routes the run to `engine.createCancelledRun` on + // next pop. + const buffer = getMollifierBuffer(); + const entry = buffer ? await buffer.getEntry(runParam) : null; + if (!entry) { submission.error = { runParam: ["Run not found"] }; return json(submission); } - const cancelRunService = new CancelTaskRunService(); - await cancelRunService.call(taskRun); + // Dashboard auth: verify the requesting user is a member of the + // buffered run's org. The API path scopes by env id from the + // authenticated request; the dashboard route uses org-membership + // because the URL doesn't carry an envId. 
+ const member = await prisma.orgMember.findFirst({ + where: { userId, organizationId: entry.orgId }, + select: { id: true }, + }); + if (!member) { + submission.error = { runParam: ["Run not found"] }; + return json(submission); + } - return redirectWithSuccessMessage(submission.value.redirectUrl, request, `Canceled run`); + const result = await buffer!.mutateSnapshot(runParam, { + type: "mark_cancelled", + cancelledAt: new Date().toISOString(), + cancelReason: "Canceled by user", + }); + if (result === "applied_to_snapshot") { + return redirectWithSuccessMessage(submission.value.redirectUrl, request, `Canceled run`); + } + // "not_found" or "busy" — both indicate the drainer raced us between + // the getEntry check above and mutateSnapshot. On "not_found" the + // entry was just popped and the PG row is in flight; on "busy" the + // drainer is mid-materialisation. Either way the customer should + // retry — by then the PG row exists and the regular cancel path at + // the top of this action takes over. 
+ return redirectWithErrorMessage( + submission.value.redirectUrl, + request, + "Run is materialising — retry in a moment" + ); } catch (error) { if (error instanceof Error) { logger.error("Failed to cancel run", { diff --git a/apps/webapp/app/routes/resources.taskruns.$runParam.replay.ts b/apps/webapp/app/routes/resources.taskruns.$runParam.replay.ts index 8a22822d06b..507d3cc706f 100644 --- a/apps/webapp/app/routes/resources.taskruns.$runParam.replay.ts +++ b/apps/webapp/app/routes/resources.taskruns.$runParam.replay.ts @@ -11,6 +11,12 @@ import { requireUser } from "~/services/session.server"; import { sortEnvironments } from "~/utils/environmentSort"; import { v3RunSpanPath } from "~/utils/pathBuilder"; import { ReplayTaskRunService } from "~/v3/services/replayTaskRun.server"; +import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; +import { + buildSyntheticReplayTaskRun, + type SyntheticReplayTaskRun, +} from "~/v3/mollifier/syntheticReplayTaskRun.server"; import parseDuration from "parse-duration"; import { findCurrentWorkerDeployment } from "~/v3/models/workerDeployment.server"; import { queueTypeFromType } from "~/presenters/v3/QueueRetrievePresenter.server"; @@ -33,7 +39,7 @@ export async function loader({ request, params }: LoaderFunctionArgs) { Object.fromEntries(new URL(request.url).searchParams) ); - const run = await $replica.taskRun.findFirst({ + let run = await $replica.taskRun.findFirst({ select: { payload: true, payloadType: true, @@ -88,6 +94,83 @@ export async function loader({ request, params }: LoaderFunctionArgs) { where: { friendlyId: runParam, project: { organization: { members: { some: { userId } } } } }, }); + let synthetic: + | (Awaited> & { __synth: true }) + | undefined; + if (!run) { + // Buffered fallback: read the snapshot and look up the env list via + // the snapshot's organizationId. 
Without this the Replay dialog + // 404s for runs queued in the mollifier buffer, which dumps the + // user back to the task list. + const buffer = getMollifierBuffer(); + const entry = buffer ? await buffer.getEntry(runParam) : null; + if (!entry) throw new Response("Not Found", { status: 404 }); + const member = await prisma.orgMember.findFirst({ + where: { userId, organizationId: entry.orgId }, + select: { id: true }, + }); + if (!member) throw new Response("Not Found", { status: 404 }); + const buffered = await findRunByIdWithMollifierFallback({ + runId: runParam, + environmentId: entry.envId, + organizationId: entry.orgId, + }); + if (!buffered) throw new Response("Not Found", { status: 404 }); + synthetic = Object.assign(buffered, { __synth: true as const }); + // Scope the project lookup to the buffer entry's org as well as the + // env id. The prior `orgMember.findFirst` above confirms the user + // belongs to `entry.orgId`; pinning `organizationId` here means a + // malformed entry whose envId resolves to a different org can't leak + // that project's data through this loader. Mirrors the PG path's + // `project.organization.members.some.userId` scoping (lines 42-95) + // — the env filter and select shape are kept identical so the Replay + // dialog renders the same dropdown either way. + const orgProject = await $replica.project.findFirst({ + where: { + organizationId: entry.orgId, + environments: { some: { id: entry.envId } }, + }, + select: { + slug: true, + environments: { + select: { + id: true, + type: true, + slug: true, + branchName: true, + orgMember: { select: { user: true } }, + }, + where: { + archivedAt: null, + OR: [ + { type: { in: ["PREVIEW", "STAGING", "PRODUCTION"] } }, + { type: "DEVELOPMENT", orgMember: { userId } }, + ], + }, + }, + }, + }); + if (!orgProject) throw new Response("Not Found", { status: 404 }); + run = { + payload: buffered.payload, + payloadType: buffered.payloadType ?? 
"application/json", + seedMetadata: buffered.seedMetadata ?? null, + seedMetadataType: buffered.seedMetadataType ?? null, + runtimeEnvironmentId: entry.envId, + concurrencyKey: buffered.concurrencyKey ?? null, + maxAttempts: buffered.maxAttempts ?? null, + maxDurationInSeconds: buffered.maxDurationInSeconds ?? null, + machinePreset: buffered.machinePreset ?? null, + workerQueue: buffered.workerQueue ?? null, + ttl: buffered.ttl ?? null, + idempotencyKey: buffered.idempotencyKey ?? null, + runTags: buffered.runTags, + queue: buffered.queue ?? "task/", + taskIdentifier: buffered.taskIdentifier ?? "", + project: orgProject, + } as unknown as typeof run; + } + if (!run) { throw new Response("Not Found", { status: 404 }); } @@ -164,6 +247,15 @@ export async function loader({ request, params }: LoaderFunctionArgs) { } export const action: ActionFunction = async ({ request, params }) => { + // Dashboard auth: identical pattern to resources.taskruns.$runParam.cancel.ts. + // The loader above this action already gates with `requireUser`, but + // Remix's action runs independently — without this call any request + // with a valid runParam could submit a replay. The PG findFirst below + // also adds the org-membership filter so a PAT can't replay another + // org's run, and the buffered fallback verifies org membership via + // orgMember.findFirst against the snapshot's orgId. 
+ const user = await requireUser(request); + const userId = user.id; const { runParam } = ParamSchema.parse(params); const formData = await request.formData(); @@ -174,9 +266,18 @@ export const action: ActionFunction = async ({ request, params }) => { } try { - const taskRun = await prisma.taskRun.findFirst({ + const pgRun = await prisma.taskRun.findFirst({ where: { friendlyId: runParam, + project: { + organization: { + members: { + some: { + userId, + }, + }, + }, + }, }, include: { runtimeEnvironment: { @@ -192,6 +293,50 @@ export const action: ActionFunction = async ({ request, params }) => { }, }); + // Mollifier read-fallback: if the original isn't in PG yet, + // synthesise a TaskRun from the buffered snapshot. The B4-extended + // SyntheticRun carries every field ReplayTaskRunService reads. We + // also need projectSlug + orgSlug + envSlug for the redirect path, + // so look those up via the snapshot's runtimeEnvironmentId. + let taskRun: SyntheticReplayTaskRun | null = pgRun ?? null; + if (!taskRun) { + const buffer = getMollifierBuffer(); + const entry = buffer ? await buffer.getEntry(runParam) : null; + if (entry) { + // Same org-membership gate as the PG path above. Without this + // any authenticated user who knows a runId could replay the + // buffered run across orgs. 
+ const member = await prisma.orgMember.findFirst({ + where: { userId, organizationId: entry.orgId }, + select: { id: true }, + }); + if (!member) { + return redirectWithErrorMessage( + submission.value.failedRedirect, + request, + "Run not found" + ); + } + const synthetic = await findRunByIdWithMollifierFallback({ + runId: runParam, + environmentId: entry.envId, + organizationId: entry.orgId, + }); + if (synthetic) { + const envRow = await prisma.runtimeEnvironment.findFirst({ + where: { id: entry.envId }, + select: { + slug: true, + project: { select: { slug: true, organization: { select: { slug: true } } } }, + }, + }); + if (envRow) { + taskRun = buildSyntheticReplayTaskRun({ synthetic, envRow }); + } + } + } + } + if (!taskRun) { return redirectWithErrorMessage(submission.value.failedRedirect, request, "Run not found"); } diff --git a/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts b/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts index a6fe5babe2c..55cb3311441 100644 --- a/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts +++ b/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts @@ -2,13 +2,50 @@ import { RunId } from "@trigger.dev/core/v3/isomorphic"; import type { PrismaClientOrTransaction, TaskRun } from "@trigger.dev/database"; import { logger } from "~/services/logger.server"; import { resolveIdempotencyKeyTTL } from "~/utils/idempotencyKeys.server"; +import { ServiceValidationError } from "~/v3/services/common.server"; import type { RunEngine } from "~/v3/runEngine.server"; import { shouldIdempotencyKeyBeCleared } from "~/v3/taskStatus"; +import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server"; +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; +import { claimOrAwait } from "~/v3/mollifier/idempotencyClaim.server"; +import { makeResolveMollifierFlag } from "~/v3/mollifier/mollifierGate.server"; import type { TraceEventConcern, TriggerTaskRequest } from 
"../types"; +// In-memory per-org mollifier-enabled check, shared with `evaluateGate` +// (same `Organization.featureFlags` JSON, no DB read). Used to gate the +// pre-gate claim's Redis round-trip so non-mollifier orgs don't pay it +// during staged rollout — see the comment above the claim block in +// handleTriggerRequest. +const resolveOrgMollifierFlag = makeResolveMollifierFlag(); + +// Claim ownership context returned to the caller when the +// IdempotencyKeyConcern won a pre-gate claim. Caller MUST publish the +// winning runId on pipeline success (`publishClaim`) or release the +// claim on failure (`releaseClaim`). +export type ClaimedIdempotency = { + envId: string; + taskIdentifier: string; + idempotencyKey: string; + // Ownership token from `claimOrAwait`. The caller's trigger pipeline + // MUST thread this into publishClaim/releaseClaim so the buffer's + // compare-and-act protects the slot against a stale predecessor. + token: string; +}; + export type IdempotencyKeyConcernResult = | { isCached: true; run: TaskRun } - | { isCached: false; idempotencyKey?: string; idempotencyKeyExpiresAt?: Date }; + | { + isCached: false; + idempotencyKey?: string; + idempotencyKeyExpiresAt?: Date; + // Set when this trigger holds a pre-gate claim. The caller's + // trigger pipeline MUST resolve the claim by either publishing + // the runId on success or releasing on failure. Undefined when + // the request has no idempotency key, when the buffer is + // unavailable, or when the request is a triggerAndWait (claim + // path skipped per plan doc). + claim?: ClaimedIdempotency; + }; export class IdempotencyKeyConcern { constructor( @@ -17,6 +54,86 @@ export class IdempotencyKeyConcern { private readonly traceEventConcern: TraceEventConcern ) {} + // Buffer-side idempotency dedup. Resolves an idempotency key against the + // mollifier buffer when PG missed. 
Returns a SyntheticRun cast to + // TaskRun so the route handler (which only reads run.id / run.friendlyId) + // can echo the buffered run's friendlyId as a cached hit. Returns null + // for any failure or miss — buffer outages must not 500 the trigger + // hot path; we fail open to "no cache hit" and let the request through. + private async findBufferedRunWithIdempotency( + environmentId: string, + organizationId: string, + taskIdentifier: string, + idempotencyKey: string, + ): Promise { + const buffer = getMollifierBuffer(); + if (!buffer) return null; + + let bufferedRunId: string | null; + try { + bufferedRunId = await buffer.lookupIdempotency({ + envId: environmentId, + taskIdentifier, + idempotencyKey, + }); + } catch (err) { + logger.error("IdempotencyKeyConcern: buffer lookupIdempotency failed", { + environmentId, + taskIdentifier, + err: err instanceof Error ? err.message : String(err), + }); + return null; + } + if (!bufferedRunId) return null; + + const synthetic = await findRunByIdWithMollifierFallback({ + runId: bufferedRunId, + environmentId, + organizationId, + }); + if (!synthetic) return null; + // PG-resident path enforces idempotency-key expiry below + // (`existingRun.idempotencyKeyExpiresAt < new Date()` clears the key + // and lets a new run go through). The buffer path needs the same + // check — without it a customer who passes `idempotencyKeyTTL: "2s"` + // gets the cached buffered runId returned indefinitely, because the + // buffer entry persists for its own (hours-long) TTL independent of + // the customer's key TTL. + // + // Returning null isn't enough on its own: the trigger pipeline then + // proceeds to `mollifyTrigger`, whose `buffer.accept` Lua dedupes by + // `(envId, taskIdentifier, idempotencyKey)` via SETNX on the same + // `mollifier:idempotency:*` key and would echo the stale runId as + // `duplicate_idempotency`. 
Clear the buffer-side idempotency + // binding (both the lookup and any in-flight claim) so the next + // accept goes through as a fresh trigger. Mirrors what + // `ResetIdempotencyKeyService` does for the explicit + // reset-via-API path. + if ( + synthetic.idempotencyKeyExpiresAt && + synthetic.idempotencyKeyExpiresAt < new Date() + ) { + const buffer = getMollifierBuffer(); + if (buffer) { + try { + await buffer.resetIdempotency({ + envId: environmentId, + taskIdentifier, + idempotencyKey, + }); + } catch (err) { + logger.warn("IdempotencyKeyConcern: failed to reset expired buffer idempotency", { + envId: environmentId, + taskIdentifier, + err: err instanceof Error ? err.message : String(err), + }); + } + } + return null; + } + return synthetic as unknown as TaskRun; + } + async handleTriggerRequest( request: TriggerTaskRequest, parentStore: string | undefined @@ -44,6 +161,25 @@ export class IdempotencyKeyConcern { }) : undefined; + // Buffer fallback per the mollifier-idempotency design. PG missed — + // the same key may belong to a buffered run that hasn't materialised + // yet. Skipped when `resumeParentOnCompletion` is set: blocking a + // parent on a buffered child via waitpoint requires a PG row that + // doesn't exist yet. The follow-up accept's SETNX in mollifyTrigger + // still dedupes the trigger itself; the waitpoint just doesn't fire + // for this rare race window. 
+ if (!existingRun && idempotencyKey && !request.body.options?.resumeParentOnCompletion) { + const buffered = await this.findBufferedRunWithIdempotency( + request.environment.id, + request.environment.organizationId, + request.taskId, + idempotencyKey, + ); + if (buffered) { + return { isCached: true, run: buffered }; + } + } + if (existingRun) { // The idempotency key has expired if (existingRun.idempotencyKeyExpiresAt && existingRun.idempotencyKeyExpiresAt < new Date()) { @@ -133,6 +269,133 @@ export class IdempotencyKeyConcern { return { isCached: true, run: existingRun }; } + // Pre-gate claim — closes the PG+buffer race during gate transition. + // All same-key triggers serialise here before evaluateGate decides + // PG-pass-through vs mollify. Skipped for triggerAndWait + // (resumeParentOnCompletion) — that path bypasses the gate entirely + // and its existing PG-side dedup is sufficient. + // + // Also gated on the same per-org mollifier flag the gate uses: when + // `TRIGGER_MOLLIFIER_ENABLED=1` globally for staged rollout, the buffer + // singleton is constructed and `claimOrAwait` would otherwise issue a + // Redis SETNX for EVERY idempotency-keyed trigger — including orgs + // that haven't opted in. Those orgs never enter the mollify branch + // (the gate always returns pass_through for them), so there's no + // buffer activity to serialise against; PG's unique constraint + // already deduplicates concurrent same-key races. Resolving the org + // flag is a pure in-memory read of `Organization.featureFlags` — no + // DB query, same predicate the gate uses — keeping the claim's Redis + // RTT off the hot path for non-opted-in orgs during incremental + // rollout. + // Match the gate's bypass list (`mollifierGate.server.ts:158-175`). + // debounce + oneTimeUseToken triggers always return pass_through from + // the gate, so claiming a Redis SETNX here is wasted RTT on the + // trigger hot path. 
Excluding them keeps the claim aligned with the + // gate — if the gate would never mollify the request, there's no + // buffer to serialise against. + const claimEligible = + !request.body.options?.resumeParentOnCompletion && + !request.body.options?.debounce && + !request.options?.oneTimeUseToken && + (await resolveOrgMollifierFlag({ + envId: request.environment.id, + orgId: request.environment.organizationId, + taskId: request.taskId, + orgFeatureFlags: + ((request.environment.organization?.featureFlags as + | Record + | null + | undefined) ?? null), + })); + if (claimEligible) { + const ttlSeconds = Math.max( + 1, + Math.min( + 30, + Math.ceil((idempotencyKeyExpiresAt.getTime() - Date.now()) / 1000), + ), + ); + const outcome = await claimOrAwait({ + envId: request.environment.id, + taskIdentifier: request.taskId, + idempotencyKey, + ttlSeconds, + }); + if (outcome.kind === "resolved") { + // Another concurrent trigger committed first. Re-resolve via the + // existing checks: writer-side PG findFirst first (defeats + // replica lag), then buffer fallback for the buffered case. + const writerRun = await this.prisma.taskRun.findFirst({ + where: { + runtimeEnvironmentId: request.environment.id, + idempotencyKey, + taskIdentifier: request.taskId, + }, + include: { associatedWaitpoint: true }, + }); + if (writerRun) { + return { isCached: true, run: writerRun }; + } + const buffered = await this.findBufferedRunWithIdempotency( + request.environment.id, + request.environment.organizationId, + request.taskId, + idempotencyKey, + ); + if (buffered) { + return { isCached: true, run: buffered }; + } + // Claim resolved to a runId nothing can find — the run was + // genuinely lost (claimant errored after publish, drain failed, + // or both the PG row and buffer entry TTL'd out). 
This is + // terminal, not transient: `lookupIdempotency` self-heals a + // dangling pointer, and `ack` keeps the entry hash as a + // read-fallback past the PG write, so re-polling cannot conjure + // a run that is gone. Falling through to a fresh trigger is the + // correct recovery. + // + // Why falling through claimless is safe (no duplicate runs): + // concurrent triggers that also fall through here converge on a + // single run via the same dedup backstops the claim layer relies + // on — the PG unique constraint on the idempotency key + // (RunDuplicateIdempotencyKeyError → retry resolves to the + // winner) for the pass-through path, and `accept`'s idempotency + // SETNX (`duplicate_idempotency`) for the mollify path. Once the + // first fall-through commits a run, later callers find it via the + // writer-PG / buffer lookups above despite the stale `resolved:` + // slot, which the slot's TTL clears within ~30s. The residual + // cost is a few redundant (deduped) trigger attempts in that + // window, not duplicate runs. + logger.warn("idempotency claim resolved but runId not findable", { + envId: request.environment.id, + taskIdentifier: request.taskId, + claimedRunId: outcome.runId, + }); + } + if (outcome.kind === "timed_out") { + throw new ServiceValidationError( + "Idempotency claim resolution timed out", + 503, + ); + } + if (outcome.kind === "claimed") { + // Caller MUST publish/release. Signalled via the result's + // `claim` field, including the ownership token so the buffer + // can compare-and-act on the slot we now own. 
+ return { + isCached: false, + idempotencyKey, + idempotencyKeyExpiresAt, + claim: { + envId: request.environment.id, + taskIdentifier: request.taskId, + idempotencyKey, + token: outcome.token, + }, + }; + } + } + return { isCached: false, idempotencyKey, idempotencyKeyExpiresAt }; } } diff --git a/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts b/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts index 5f985b684c1..a8a7cbf0f3b 100644 --- a/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts +++ b/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts @@ -6,6 +6,7 @@ import type { PrismaClientOrTransaction } from "@trigger.dev/database"; import type { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { logger } from "~/services/logger.server"; import { getEventRepository } from "~/v3/eventRepository/index.server"; +import { PerformTaskRunAlertsService } from "~/v3/services/alerts/performTaskRunAlerts.server"; import { DefaultQueueManager } from "../concerns/queues.server"; import type { TriggerTaskRequest } from "../types"; @@ -176,6 +177,14 @@ export class TriggerFailedTaskService { event.setAttribute("runId", failedRunFriendlyId); event.failWithError(taskRunError); + // `emitRunFailedEvent: false` because this call site owns the + // trace-event lifecycle via the outer `traceEvent({ + // incomplete: false, isError: true })`. Letting the engine + // emit `runFailed` here would race the + // `completeFailedRunEvent` listener against the outer trace + // event's own completion write for the same (traceId, spanId). + // We re-trigger the alerts side directly after the trace + // event closes, below. 
return await this.engine.createFailedTaskRun({ friendlyId: failedRunFriendlyId, environment: { @@ -200,12 +209,30 @@ export class TriggerFailedTaskService { spanId: event.spanId, traceContext: traceContext as Record, taskEventStore: store, + emitRunFailedEvent: false, ...(queueName !== undefined && { queue: queueName }), ...(lockedQueueId !== undefined && { lockedQueueId }), }); } ); + // Alerts side of `runFailed` — the engine emit was suppressed + // above so the trace-event completion isn't double-written; we + // still need the alert pipeline to fire so customers' ERROR + // channels see the failure. Best-effort: a failed enqueue logs + // but doesn't block returning the friendlyId, mirroring the + // engine handler's behaviour at runEngineHandlers.server.ts:81. + try { + await PerformTaskRunAlertsService.enqueue(failedRun.id); + } catch (alertsError) { + logger.warn("TriggerFailedTaskService: alert enqueue failed", { + taskId: request.taskId, + friendlyId: failedRun.friendlyId, + error: + alertsError instanceof Error ? alertsError.message : String(alertsError), + }); + } + return failedRun.friendlyId; } catch (createError) { const createErrorMsg = @@ -264,7 +291,7 @@ export class TriggerFailedTaskService { } } - await this.engine.createFailedTaskRun({ + const failedRun = await this.engine.createFailedTaskRun({ friendlyId: failedRunFriendlyId, environment: { id: opts.environmentId, @@ -286,8 +313,32 @@ export class TriggerFailedTaskService { depth, resumeParentOnCompletion: opts.resumeParentOnCompletion, batch: opts.batch, + // Suppress the engine's `runFailed` bus emit — the listener + // (`runEngineHandlers.server.ts` `runFailed`) calls + // `completeFailedRunEvent`, which writes a ClickHouse trace event + // row keyed on (traceId, spanId). This caller has no trace + // context (the method name is literally `callWithoutTraceEvents`) + // so the emit would write a row with empty traceId/spanId — + // orphan event in the store. 
We still want alert coverage, + // though, so enqueue directly below. + emitRunFailedEvent: false, }); + // Alerts side of `runFailed` — the engine emit was suppressed + // above so we don't create an orphan trace event; enqueue the + // alert directly so customers' ERROR channels still see the + // failure. Best-effort, mirroring the `call()` path. + try { + await PerformTaskRunAlertsService.enqueue(failedRun.id); + } catch (alertsError) { + logger.warn("TriggerFailedTaskService.callWithoutTraceEvents: alert enqueue failed", { + taskId: opts.taskId, + friendlyId: failedRun.friendlyId, + error: + alertsError instanceof Error ? alertsError.message : String(alertsError), + }); + } + return failedRunFriendlyId; } catch (createError) { logger.error("TriggerFailedTaskService: failed to create pre-failed TaskRun (no trace)", { diff --git a/apps/webapp/app/runEngine/services/triggerTask.server.ts b/apps/webapp/app/runEngine/services/triggerTask.server.ts index 2d9eeec0943..0049968e06a 100644 --- a/apps/webapp/app/runEngine/services/triggerTask.server.ts +++ b/apps/webapp/app/runEngine/services/triggerTask.server.ts @@ -30,7 +30,14 @@ import type { TriggerTaskServiceResult, } from "../../v3/services/triggerTask.server"; import { clampMaxDuration } from "../../v3/utils/maxDuration"; -import { IdempotencyKeyConcern } from "../concerns/idempotencyKeys.server"; +import { + IdempotencyKeyConcern, + type ClaimedIdempotency, +} from "../concerns/idempotencyKeys.server"; +import { + publishClaim as publishMollifierClaim, + releaseClaim as releaseMollifierClaim, +} from "~/v3/mollifier/idempotencyClaim.server"; import type { PayloadProcessor, QueueManager, @@ -50,8 +57,8 @@ import { getMollifierBuffer as defaultGetMollifierBuffer, type MollifierGetBuffer, } from "~/v3/mollifier/mollifierBuffer.server"; -import { buildBufferedTriggerPayload } from "~/v3/mollifier/bufferedTriggerPayload.server"; -import { serialiseSnapshot } from "@trigger.dev/redis-worker"; +import { mollifyTrigger 
} from "~/v3/mollifier/mollifierMollify.server"; +import { type MollifierBuffer } from "@trigger.dev/redis-worker"; import { QueueSizeLimitExceededError, ServiceValidationError } from "~/v3/services/common.server"; class NoopTriggerRacepointSystem implements TriggerRacepointSystem { @@ -124,474 +131,657 @@ export class RunEngineTriggerTaskService { options?: TriggerTaskServiceOptions; attempt?: number; }): Promise { - return await startSpan(this.tracer, "RunEngineTriggerTaskService.call()", async (span) => { - span.setAttribute("taskId", taskId); - span.setAttribute("attempt", attempt); - - const runFriendlyId = options?.runFriendlyId ?? RunId.generate().friendlyId; - const triggerRequest = { - taskId, - friendlyId: runFriendlyId, - environment, - body, - options, - } satisfies TriggerTaskRequest; - - // Validate max attempts - const maxAttemptsValidation = this.validator.validateMaxAttempts({ - taskId, - attempt, - }); - - if (!maxAttemptsValidation.ok) { - throw maxAttemptsValidation.error; - } + // Pre-gate idempotency-claim ownership. Set inside the span when + // `IdempotencyKeyConcern.handleTriggerRequest` returns `claim: + // {...}`. The try/catch below resolves it once the span finishes. + let idempotencyClaim: ClaimedIdempotency | undefined; + try { + const result = await startSpan( + this.tracer, + "RunEngineTriggerTaskService.call()", + async (span) => { + span.setAttribute("taskId", taskId); + span.setAttribute("attempt", attempt); + + const runFriendlyId = options?.runFriendlyId ?? 
RunId.generate().friendlyId; + const triggerRequest = { + taskId, + friendlyId: runFriendlyId, + environment, + body, + options, + } satisfies TriggerTaskRequest; - // Validate tags - const tagValidation = this.validator.validateTags({ - tags: body.options?.tags, - }); + // Validate max attempts + const maxAttemptsValidation = this.validator.validateMaxAttempts({ + taskId, + attempt, + }); - if (!tagValidation.ok) { - throw tagValidation.error; - } + if (!maxAttemptsValidation.ok) { + throw maxAttemptsValidation.error; + } - // Validate entitlement (unless skipChecks is enabled) - let planType: string | undefined; + // Validate tags + const tagValidation = this.validator.validateTags({ + tags: body.options?.tags, + }); - if (!options.skipChecks) { - const entitlementValidation = await this.validator.validateEntitlement({ - environment, - }); + if (!tagValidation.ok) { + throw tagValidation.error; + } - if (!entitlementValidation.ok) { - throw entitlementValidation.error; - } + // Validate entitlement (unless skipChecks is enabled) + let planType: string | undefined; - // Extract plan type from entitlement response - planType = entitlementValidation.plan?.type; - } else { - // When skipChecks is enabled, planType should be passed via options - planType = options.planType; + if (!options.skipChecks) { + const entitlementValidation = await this.validator.validateEntitlement({ + environment, + }); - if (!planType) { - logger.warn("Plan type not set but skipChecks is enabled", { + if (!entitlementValidation.ok) { + throw entitlementValidation.error; + } + + // Extract plan type from entitlement response + planType = entitlementValidation.plan?.type; + } else { + // When skipChecks is enabled, planType should be passed via options + planType = options.planType; + + if (!planType) { + logger.warn("Plan type not set but skipChecks is enabled", { + taskId, + environment: { + id: environment.id, + type: environment.type, + projectId: environment.projectId, + organizationId: 
environment.organizationId, + }, + }); + } + } + + // Parse delay from either explicit delay option or debounce.delay + const delaySource = body.options?.delay ?? body.options?.debounce?.delay; + const [parseDelayError, delayUntil] = await tryCatch(parseDelay(delaySource)); + + if (parseDelayError) { + throw new ServiceValidationError(`Invalid delay ${delaySource}`); + } + + // Validate debounce options + if (body.options?.debounce) { + if (!delayUntil) { + throw new ServiceValidationError( + `Debounce requires a valid delay duration. Provided: ${body.options.debounce.delay}` + ); + } + + // Always validate debounce.delay separately since it's used for rescheduling + // This catches the case where options.delay is valid but debounce.delay is invalid + const [debounceDelayError, debounceDelayUntil] = await tryCatch( + parseDelay(body.options.debounce.delay) + ); + + if (debounceDelayError || !debounceDelayUntil) { + throw new ServiceValidationError( + `Invalid debounce delay: ${body.options.debounce.delay}. ` + + `Supported formats: {number}s, {number}m, {number}h, {number}d, {number}w` + ); + } + } + + // Get parent run if specified + const parentRun = body.options?.parentRunId + ? await this.prisma.taskRun.findFirst({ + where: { + id: RunId.fromFriendlyId(body.options.parentRunId), + runtimeEnvironmentId: environment.id, + }, + }) + : undefined; + + // Validate parent run + const parentRunValidation = this.validator.validateParentRun({ taskId, - environment: { - id: environment.id, - type: environment.type, - projectId: environment.projectId, - organizationId: environment.organizationId, - }, + parentRun: parentRun ?? undefined, + resumeParentOnCompletion: body.options?.resumeParentOnCompletion, }); - } - } - - // Parse delay from either explicit delay option or debounce.delay - const delaySource = body.options?.delay ?? 
body.options?.debounce?.delay; - const [parseDelayError, delayUntil] = await tryCatch(parseDelay(delaySource)); - if (parseDelayError) { - throw new ServiceValidationError(`Invalid delay ${delaySource}`); - } + if (!parentRunValidation.ok) { + throw parentRunValidation.error; + } - // Validate debounce options - if (body.options?.debounce) { - if (!delayUntil) { - throw new ServiceValidationError( - `Debounce requires a valid delay duration. Provided: ${body.options.debounce.delay}` - ); - } - - // Always validate debounce.delay separately since it's used for rescheduling - // This catches the case where options.delay is valid but debounce.delay is invalid - const [debounceDelayError, debounceDelayUntil] = await tryCatch( - parseDelay(body.options.debounce.delay) - ); - - if (debounceDelayError || !debounceDelayUntil) { - throw new ServiceValidationError( - `Invalid debounce delay: ${body.options.debounce.delay}. ` + - `Supported formats: {number}s, {number}m, {number}h, {number}d, {number}w` + const idempotencyKeyConcernResult = await this.idempotencyKeyConcern.handleTriggerRequest( + triggerRequest, + parentRun?.taskEventStore ); - } - } - // Get parent run if specified - const parentRun = body.options?.parentRunId - ? await this.prisma.taskRun.findFirst({ - where: { - id: RunId.fromFriendlyId(body.options.parentRunId), - runtimeEnvironmentId: environment.id, - }, - }) - : undefined; - - // Validate parent run - const parentRunValidation = this.validator.validateParentRun({ - taskId, - parentRun: parentRun ?? 
undefined, - resumeParentOnCompletion: body.options?.resumeParentOnCompletion, - }); - - if (!parentRunValidation.ok) { - throw parentRunValidation.error; - } + if (idempotencyKeyConcernResult.isCached) { + return idempotencyKeyConcernResult; + } - const idempotencyKeyConcernResult = await this.idempotencyKeyConcern.handleTriggerRequest( - triggerRequest, - parentRun?.taskEventStore - ); + const { idempotencyKey, idempotencyKeyExpiresAt, claim: claimResult } = + idempotencyKeyConcernResult; + + // If we own an idempotency claim, the trigger pipeline below MUST + // resolve it — publish on success so waiters see our runId, + // release on error so the next claimant can retry. Stored in an + // outer scope so the try/catch at the bottom of `callV2` can act + // on whichever return path or throw the pipeline takes. + idempotencyClaim = claimResult; + + if (idempotencyKey) { + await this.triggerRacepointSystem.waitForRacepoint({ + racepoint: "idempotencyKey", + id: idempotencyKey, + }); + } - if (idempotencyKeyConcernResult.isCached) { - return idempotencyKeyConcernResult; - } + const lockedToBackgroundWorker = body.options?.lockToVersion + ? await this.prisma.backgroundWorker.findFirst({ + where: { + projectId: environment.projectId, + runtimeEnvironmentId: environment.id, + version: body.options?.lockToVersion, + }, + select: { + id: true, + version: true, + sdkVersion: true, + cliVersion: true, + }, + }) + : undefined; - const { idempotencyKey, idempotencyKeyExpiresAt } = idempotencyKeyConcernResult; + const { queueName, lockedQueueId, taskTtl, taskKind } = + await this.queueConcern.resolveQueueProperties( + triggerRequest, + lockedToBackgroundWorker ?? 
undefined + ); - if (idempotencyKey) { - await this.triggerRacepointSystem.waitForRacepoint({ - racepoint: "idempotencyKey", - id: idempotencyKey, - }); - } + // Resolve TTL with precedence: per-trigger > task-level > dev default + let ttl: string | undefined; - const lockedToBackgroundWorker = body.options?.lockToVersion - ? await this.prisma.backgroundWorker.findFirst({ - where: { - projectId: environment.projectId, - runtimeEnvironmentId: environment.id, - version: body.options?.lockToVersion, - }, - select: { - id: true, - version: true, - sdkVersion: true, - cliVersion: true, - }, - }) - : undefined; - - const { queueName, lockedQueueId, taskTtl, taskKind } = - await this.queueConcern.resolveQueueProperties( - triggerRequest, - lockedToBackgroundWorker ?? undefined - ); - - // Resolve TTL with precedence: per-trigger > task-level > dev default - let ttl: string | undefined; - - if (body.options?.ttl !== undefined) { - ttl = - typeof body.options.ttl === "number" - ? stringifyDuration(body.options.ttl) - : body.options.ttl; - } else { - ttl = taskTtl ?? (environment.type === "DEVELOPMENT" ? "10m" : undefined); - } + if (body.options?.ttl !== undefined) { + ttl = + typeof body.options.ttl === "number" + ? stringifyDuration(body.options.ttl) + : body.options.ttl; + } else { + ttl = taskTtl ?? (environment.type === "DEVELOPMENT" ? "10m" : undefined); + } - if (!options.skipChecks) { - const queueSizeGuard = await this.queueConcern.validateQueueLimits( - environment, - queueName - ); - - if (!queueSizeGuard.ok) { - throw new QueueSizeLimitExceededError( - `Cannot trigger ${taskId} as the queue size limit for this environment has been reached. The maximum size is ${queueSizeGuard.maximumSize}`, - queueSizeGuard.maximumSize ?? 
0, - undefined, - "warn" + if (!options.skipChecks) { + const queueSizeGuard = await this.queueConcern.validateQueueLimits( + environment, + queueName + ); + + if (!queueSizeGuard.ok) { + throw new QueueSizeLimitExceededError( + `Cannot trigger ${taskId} as the queue size limit for this environment has been reached. The maximum size is ${queueSizeGuard.maximumSize}`, + queueSizeGuard.maximumSize ?? 0, + undefined, + "warn" + ); + } + } + + const metadataPacket = body.options?.metadata + ? handleMetadataPacket( + body.options?.metadata, + body.options?.metadataType ?? "application/json", + this.metadataMaximumSize + ) + : undefined; + + const tags = ( + body.options?.tags + ? typeof body.options.tags === "string" + ? [body.options.tags] + : body.options.tags + : [] + ).filter((tag) => tag.trim().length > 0); + + const depth = parentRun ? parentRun.depth + 1 : 0; + + const workerQueueResult = await this.queueConcern.getWorkerQueue( + environment, + body.options?.region ); - } - } + const workerQueue = workerQueueResult?.masterQueue; + const enableFastPath = workerQueueResult?.enableFastPath ?? false; + + // Build annotations for this run + const triggerSource = options.triggerSource ?? "api"; + const triggerAction = options.triggerAction ?? "trigger"; + const parentAnnotations = RunAnnotations.safeParse(parentRun?.annotations).data; + const annotations = { + triggerSource, + triggerAction, + rootTriggerSource: parentAnnotations?.rootTriggerSource ?? triggerSource, + rootScheduleId: parentAnnotations?.rootScheduleId || options.scheduleId || undefined, + taskKind: taskKind ?? 
"STANDARD", + }; + + try { + return await this.traceEventConcern.traceRun( + triggerRequest, + parentRun?.taskEventStore, + async (event, store) => { + event.setAttribute("queueName", queueName); + span.setAttribute("queueName", queueName); + event.setAttribute("runId", runFriendlyId); + span.setAttribute("runId", runFriendlyId); + + // Short-circuit when mollifier is globally off (the default + // for every deployment that hasn't opted in). Avoids the + // GateInputs allocation, the deps spread inside `evaluateGate`, + // and the `mollifier.decisions{outcome=pass_through}` OTel + // increment on every trigger — `triggerTask` is the + // highest-throughput code path in the system. The check goes + // through a DI'd predicate so unit tests that inject a custom + // `evaluateGate` can also override the gate-on check (the + // default reads `env.TRIGGER_MOLLIFIER_ENABLED`, which is "0" + // in CI where no .env file is present). + // + // Batch items bypass the mollifier gate entirely. + // + // The mollify path returns a stripped run-shape `{ id, + // friendlyId, spanId }` with no PG row written. Batch + // tracking relies on `BatchTaskRunItem`, a join row whose + // `taskRunId` column has a NOT NULL FK to `TaskRun.id` — + // creating that join at trigger-time (in + // `batchTriggerV3.server.ts:871`) fails with FK violation + // for any mollified item, and skipping it at trigger-time + // would silently drop the batch↔run link forever because + // the drainer's materialise path doesn't (yet) create + // `BatchTaskRunItem`. Either side alone is wrong: + // - skip at trigger-time only → batch progress + // under-reports forever, `batchTriggerAndWait` parent + // stays parked + // - mollify at trigger-time only → FK violation, 500 + // + // The proper end state is a drainer-side + // `BatchTaskRunItem` create-on-materialise (the snapshot + // already carries `batch: { id, index }` so the drainer + // has the info). That belongs in the drainer / replay PR, + // not here. 
Until that lands, batch triggers pass-through + // — they lose the burst-protection benefit, but the path + // works end-to-end. + const skipMollifierForBatch = !!options.batchId; + const mollifierOutcome: GateOutcome | null = + this.isMollifierGloballyEnabled() && !skipMollifierForBatch + ? await this.evaluateGate({ + envId: environment.id, + orgId: environment.organizationId, + taskId, + orgFeatureFlags: + (environment.organization.featureFlags as Record | null) ?? + null, + options: { + debounce: body.options?.debounce, + oneTimeUseToken: options.oneTimeUseToken, + parentTaskRunId: body.options?.parentRunId, + resumeParentOnCompletion: body.options?.resumeParentOnCompletion, + }, + }) + : null; + + // When the gate says mollify, write the engine.trigger input + // snapshot into the Redis buffer and return a synthesised + // TriggerTaskServiceResult. The customer never waits on + // Postgres; the drainer materialises the run later by replaying + // engine.trigger against the snapshot. The run span has already + // been opened by traceRun above (PARTIAL event in ClickHouse), + // so its traceId/spanId live in the snapshot and the drainer's + // `mollifier.drained` span parents on the same trace — buffered + // runs become visible in the dashboard's trace view immediately, + // not only after the drainer fires. 
+ if (mollifierOutcome?.action === "mollify") { + const mollifierBuffer = this.getMollifierBuffer(); + if (mollifierBuffer && !body.options?.debounce) { + event.setAttribute("mollifier.reason", mollifierOutcome.decision.reason); + event.setAttribute("mollifier.count", String(mollifierOutcome.decision.count)); + event.setAttribute( + "mollifier.threshold", + String(mollifierOutcome.decision.threshold) + ); + event.setAttribute("taskRunId", runFriendlyId); + + const payloadPacket = await this.payloadProcessor.process(triggerRequest); + + const engineTriggerInput = this.#buildEngineTriggerInput({ + runFriendlyId, + environment, + idempotencyKey, + idempotencyKeyExpiresAt, + body, + options, + queueName, + lockedQueueId, + workerQueue, + enableFastPath, + lockedToBackgroundWorker: lockedToBackgroundWorker ?? undefined, + delayUntil, + ttl, + metadataPacket, + tags, + depth, + parentRun: parentRun ?? undefined, + annotations, + planType, + taskId, + payloadPacket, + traceContext: this.#propagateExternalTraceContext( + event.traceContext, + parentRun?.traceContext, + event.traceparent?.spanId + ), + traceId: event.traceId, + spanId: event.spanId, + parentSpanId: + options.parentAsLinkType === "replay" + ? undefined + : event.traceparent?.spanId, + taskEventStore: store, + }); + + const result = await mollifyTrigger({ + runFriendlyId, + environmentId: environment.id, + organizationId: environment.organizationId, + engineTriggerInput, + decision: mollifierOutcome.decision, + buffer: mollifierBuffer, + // Idempotency-key triple wires the buffer's SETNX into + // the trigger-time dedup symmetric with PG. + idempotencyKey, + taskIdentifier: taskId, + }); + + logger.debug("mollifier.buffered", { + runId: runFriendlyId, + envId: environment.id, + orgId: environment.organizationId, + taskId, + reason: mollifierOutcome.decision.reason, + }); + + // Synthetic result is structurally narrower than the full + // TaskRun; the route handler only reads + // `result.run.friendlyId`. 
traceRun flushes the PARTIAL + // run-span event to ClickHouse on callback return. + // `isMollified` flags the route to skip the request- + // idempotency cache write — see the field's contract on + // `TriggerTaskServiceResult`. + return { + ...(result as unknown as TriggerTaskServiceResult), + isMollified: true, + }; + } + if (!mollifierBuffer) { + logger.warn( + "mollifier gate said mollify but buffer is null — falling through to pass-through" + ); + } + } - const metadataPacket = body.options?.metadata - ? handleMetadataPacket( - body.options?.metadata, - body.options?.metadataType ?? "application/json", - this.metadataMaximumSize - ) - : undefined; - - const tags = ( - body.options?.tags - ? typeof body.options.tags === "string" - ? [body.options.tags] - : body.options.tags - : [] - ).filter((tag) => tag.trim().length > 0); - - const depth = parentRun ? parentRun.depth + 1 : 0; - - const workerQueueResult = await this.queueConcern.getWorkerQueue( - environment, - body.options?.region - ); - const workerQueue = workerQueueResult?.masterQueue; - const enableFastPath = workerQueueResult?.enableFastPath ?? false; - - // Build annotations for this run - const triggerSource = options.triggerSource ?? "api"; - const triggerAction = options.triggerAction ?? "trigger"; - const parentAnnotations = RunAnnotations.safeParse(parentRun?.annotations).data; - const annotations = { - triggerSource, - triggerAction, - rootTriggerSource: parentAnnotations?.rootTriggerSource ?? triggerSource, - rootScheduleId: parentAnnotations?.rootScheduleId || options.scheduleId || undefined, - taskKind: taskKind ?? "STANDARD", - }; - - // Short-circuit before the gate when mollifier is globally off (the - // default for every deployment that hasn't opted in). 
Avoids the - // GateInputs allocation, the deps spread inside `evaluateGate`, and - // the `mollifier.decisions{outcome=pass_through}` OTel increment on - // every trigger — `triggerTask` is the highest-throughput code path - // in the system. The check goes through a DI'd predicate so unit - // tests that inject a custom `evaluateGate` can also override the - // gate-on check (the default reads `env.TRIGGER_MOLLIFIER_ENABLED`, - // which is "0" in CI where no .env file is present). - const mollifierOutcome: GateOutcome | null = this.isMollifierGloballyEnabled() - ? await this.evaluateGate({ - envId: environment.id, - orgId: environment.organizationId, - taskId, - orgFeatureFlags: - (environment.organization.featureFlags as Record | null) ?? null, - }) - : null; - - try { - return await this.traceEventConcern.traceRun( - triggerRequest, - parentRun?.taskEventStore, - async (event, store) => { - event.setAttribute("queueName", queueName); - span.setAttribute("queueName", queueName); - event.setAttribute("runId", runFriendlyId); - span.setAttribute("runId", runFriendlyId); - - const payloadPacket = await this.payloadProcessor.process(triggerRequest); - - // Phase 1 dual-write: if the org has the mollifier feature flag - // enabled and the per-env trip evaluator says divert, write the - // canonical replay payload to the buffer AND continue through - // engine.trigger as normal. The buffer entry is an audit/preview - // copy; the drainer's no-op handler consumes it to prove the - // dequeue mechanism works. Phase 2 will replace engine.trigger - // (below) with a synthesised 200 response and rely on the - // drainer to perform the Postgres write via replay. 
- if (mollifierOutcome?.action === "mollify") { - const buffer = this.getMollifierBuffer(); - if (buffer) { - const canonicalPayload = buildBufferedTriggerPayload({ + const payloadPacket = await this.payloadProcessor.process(triggerRequest); + + const baseEngineInput = this.#buildEngineTriggerInput({ runFriendlyId, - taskId, - envId: environment.id, - envType: environment.type, - envSlug: environment.slug, - orgId: environment.organizationId, - orgSlug: environment.organization.slug, - projectId: environment.projectId, - projectRef: environment.project.externalRef, + environment, + idempotencyKey, + idempotencyKeyExpiresAt, body, - idempotencyKey: idempotencyKey ?? null, - idempotencyKeyExpiresAt: idempotencyKey - ? idempotencyKeyExpiresAt ?? null - : null, + options, + queueName, + lockedQueueId, + workerQueue, + enableFastPath, + lockedToBackgroundWorker: lockedToBackgroundWorker ?? undefined, + delayUntil, + ttl, + metadataPacket, tags, - parentRunFriendlyId: parentRun?.friendlyId ?? null, - traceContext: event.traceContext, - triggerSource, - triggerAction, - serviceOptions: options, - createdAt: new Date(), + depth, + parentRun: parentRun ?? undefined, + annotations, + planType, + taskId, + payloadPacket, + traceContext: this.#propagateExternalTraceContext( + event.traceContext, + parentRun?.traceContext, + event.traceparent?.spanId + ), + traceId: event.traceId, + spanId: event.spanId, + parentSpanId: + options.parentAsLinkType === "replay" ? undefined : event.traceparent?.spanId, + taskEventStore: store, }); - try { - const serialisedPayload = serialiseSnapshot(canonicalPayload); - await buffer.accept({ - runId: runFriendlyId, - envId: environment.id, - orgId: environment.organizationId, - payload: serialisedPayload, - }); - // Light log on the hot path — keep this synchronous work - // O(1) per trigger. The drainer computes the payload hash - // off-path; operators correlate `mollifier.buffered` → - // `mollifier.drained` by runId. 
- logger.debug("mollifier.buffered", { - runId: runFriendlyId, - envId: environment.id, - orgId: environment.organizationId, - taskId, - payloadBytes: serialisedPayload.length, - }); - } catch (err) { - // Fail-open: buffer write must never block the customer's - // trigger. engine.trigger below is the primary write path - // in Phase 1 — the customer still gets a valid run. - logger.error("mollifier.buffer_accept_failed", { - runId: runFriendlyId, - envId: environment.id, - taskId, - err: err instanceof Error ? err.message : String(err), - }); + const taskRun = await this.engine.trigger( + { + ...baseEngineInput, + // onDebounced is a closure over webapp state (triggerRequest + + // traceEventConcern) and can't be serialised into the mollifier + // snapshot. The pass-through path attaches it here; the drainer + // path replays without it. The debounce and triggerAndWait gate + // bypasses ensure neither reaches the mollify branch. + onDebounced: + body.options?.debounce && body.options?.resumeParentOnCompletion + ? async ({ existingRun, waitpoint, debounceKey }) => { + return await this.traceEventConcern.traceDebouncedRun( + triggerRequest, + parentRun?.taskEventStore, + { + existingRun, + debounceKey, + incomplete: waitpoint.status === "PENDING", + isError: waitpoint.outputIsError, + }, + async (spanEvent) => { + const spanId = + options?.parentAsLinkType === "replay" + ? spanEvent.spanId + : spanEvent.traceparent?.spanId + ? `${spanEvent.traceparent.spanId}:${spanEvent.spanId}` + : spanEvent.spanId; + return spanId; + } + ); + } + : undefined, + }, + this.prisma + ); + + // If the returned run has a different friendlyId, it was debounced. + // For triggerAndWait: stop the outer span since a replacement debounced span was created via onDebounced. + // For regular trigger: let the span complete normally - no replacement span needed since the + // original run already has its span from when it was first created. 
+ if ( + taskRun.friendlyId !== runFriendlyId && + body.options?.debounce && + body.options?.resumeParentOnCompletion + ) { + event.stop(); } - } - } - const taskRun = await this.engine.trigger( - { - friendlyId: runFriendlyId, - environment: environment, - idempotencyKey, - idempotencyKeyExpiresAt: idempotencyKey ? idempotencyKeyExpiresAt : undefined, - idempotencyKeyOptions: body.options?.idempotencyKeyOptions, - taskIdentifier: taskId, - payload: payloadPacket.data ?? "", - payloadType: payloadPacket.dataType, - context: body.context, - traceContext: this.#propagateExternalTraceContext( - event.traceContext, - parentRun?.traceContext, - event.traceparent?.spanId - ), - traceId: event.traceId, - spanId: event.spanId, - parentSpanId: - options.parentAsLinkType === "replay" ? undefined : event.traceparent?.spanId, - replayedFromTaskRunFriendlyId: options.replayedFromTaskRunFriendlyId, - lockedToVersionId: lockedToBackgroundWorker?.id, - taskVersion: lockedToBackgroundWorker?.version, - sdkVersion: lockedToBackgroundWorker?.sdkVersion, - cliVersion: lockedToBackgroundWorker?.cliVersion, - concurrencyKey: body.options?.concurrencyKey, - queue: queueName, - lockedQueueId, - workerQueue, - enableFastPath, - isTest: body.options?.test ?? false, - delayUntil, - queuedAt: delayUntil ? undefined : new Date(), - maxAttempts: body.options?.maxAttempts, - taskEventStore: store, - ttl, - tags, - oneTimeUseToken: options.oneTimeUseToken, - parentTaskRunId: parentRun?.id, - rootTaskRunId: parentRun?.rootTaskRunId ?? parentRun?.id, - batch: options?.batchId - ? { - id: options.batchId, - index: options.batchIndex ?? 0, - } - : undefined, - resumeParentOnCompletion: body.options?.resumeParentOnCompletion, - depth, - metadata: metadataPacket?.data, - metadataType: metadataPacket?.dataType, - seedMetadata: metadataPacket?.data, - seedMetadataType: metadataPacket?.dataType, - maxDurationInSeconds: body.options?.maxDuration - ? 
clampMaxDuration(body.options.maxDuration) - : undefined, - machine: body.options?.machine, - priorityMs: body.options?.priority ? body.options.priority * 1_000 : undefined, - queueTimestamp: - options.queueTimestamp ?? - (parentRun && body.options?.resumeParentOnCompletion - ? parentRun.queueTimestamp ?? undefined - : undefined), - scheduleId: options.scheduleId, - scheduleInstanceId: options.scheduleInstanceId, - createdAt: options.overrideCreatedAt, - bulkActionId: body.options?.bulkActionId, - planType, - realtimeStreamsVersion: options.realtimeStreamsVersion, - streamBasinName: environment.organization.streamBasinName, - debounce: body.options?.debounce, - annotations, - // When debouncing with triggerAndWait, create a span for the debounced trigger - onDebounced: - body.options?.debounce && body.options?.resumeParentOnCompletion - ? async ({ existingRun, waitpoint, debounceKey }) => { - return await this.traceEventConcern.traceDebouncedRun( - triggerRequest, - parentRun?.taskEventStore, - { - existingRun, - debounceKey, - incomplete: waitpoint.status === "PENDING", - isError: waitpoint.outputIsError, - }, - async (spanEvent) => { - const spanId = - options?.parentAsLinkType === "replay" - ? spanEvent.spanId - : spanEvent.traceparent?.spanId - ? `${spanEvent.traceparent.spanId}:${spanEvent.spanId}` - : spanEvent.spanId; - return spanId; - } - ); - } - : undefined, - }, - this.prisma - ); + const error = taskRun.error ? TaskRunError.parse(taskRun.error) : undefined; - // If the returned run has a different friendlyId, it was debounced. - // For triggerAndWait: stop the outer span since a replacement debounced span was created via onDebounced. - // For regular trigger: let the span complete normally - no replacement span needed since the - // original run already has its span from when it was first created. 
- if ( - taskRun.friendlyId !== runFriendlyId && - body.options?.debounce && - body.options?.resumeParentOnCompletion - ) { - event.stop(); - } + if (error) { + event.failWithError(error); + } - const error = taskRun.error ? TaskRunError.parse(taskRun.error) : undefined; + const result = { run: taskRun, error, isCached: false }; - if (error) { - event.failWithError(error); - } + if (result?.error) { + throw new ServiceValidationError( + taskRunErrorToString(taskRunErrorEnhancer(result.error)) + ); + } - const result = { run: taskRun, error, isCached: false }; + return result; + } + ); + } catch (error) { + if (error instanceof RunDuplicateIdempotencyKeyError) { + //retry calling this function, because this time it will return the idempotent run + return await this.call({ + taskId, + environment, + body, + options: { ...options, runFriendlyId }, + attempt: attempt + 1, + }); + } - if (result?.error) { + if (error instanceof RunOneTimeUseTokenError) { throw new ServiceValidationError( - taskRunErrorToString(taskRunErrorEnhancer(result.error)) + `Cannot trigger ${taskId} with a one-time use token as it has already been used.` ); } - return result; + throw error; } - ); - } catch (error) { - if (error instanceof RunDuplicateIdempotencyKeyError) { - //retry calling this function, because this time it will return the idempotent run - return await this.call({ - taskId, - environment, - body, - options: { ...options, runFriendlyId }, - attempt: attempt + 1, - }); - } - - if (error instanceof RunOneTimeUseTokenError) { - throw new ServiceValidationError( - `Cannot trigger ${taskId} with a one-time use token as it has already been used.` - ); - } - - throw error; + }, + ); + // Pipeline returned successfully — publish the claim if we held + // one. Waiters polling for our key resolve to this runId. 
+      if (idempotencyClaim && result?.run?.friendlyId) {
+        await publishMollifierClaim({
+          envId: idempotencyClaim.envId,
+          taskIdentifier: idempotencyClaim.taskIdentifier,
+          idempotencyKey: idempotencyClaim.idempotencyKey,
+          token: idempotencyClaim.token,
+          runId: result.run.friendlyId,
+        });
+      }
+      return result;
+    } catch (err) {
+      // Pipeline threw — release the claim so the next claimant can
+      // retry. Re-throw so the caller sees the original error.
+      if (idempotencyClaim) {
+        await releaseMollifierClaim(idempotencyClaim);
       }
-    });
+      throw err;
+    }
+  }
+
+  // Build the engine.trigger() input object from the values gathered during
+  // this.call(). Extracted so the mollify path can construct the
+  // same input shape without re-entering the trace-run span. The pass-through
+  // path spreads this result and attaches `onDebounced` inline; the mollify
+  // path serialises it into the buffer for drainer replay.
+  #buildEngineTriggerInput(args: {
+    runFriendlyId: string;
+    environment: AuthenticatedEnvironment;
+    idempotencyKey?: string;
+    idempotencyKeyExpiresAt?: Date;
+    body: TriggerTaskRequest["body"];
+    options: TriggerTaskServiceOptions;
+    queueName: string;
+    lockedQueueId?: string;
+    workerQueue?: string;
+    enableFastPath: boolean;
+    lockedToBackgroundWorker?: { id: string; version: string; sdkVersion: string; cliVersion: string };
+    delayUntil?: Date;
+    ttl?: string;
+    metadataPacket?: { data?: string; dataType: string };
+    tags: string[];
+    depth: number;
+    parentRun?: { id: string; rootTaskRunId?: string | null; queueTimestamp?: Date | null; taskEventStore?: string };
+    annotations: {
+      triggerSource: string;
+      triggerAction: string;
+      rootTriggerSource: string;
+      rootScheduleId?: string | undefined;
+    };
+    planType?: string;
+    taskId: string;
+    payloadPacket: { data?: string; dataType: string };
+    traceContext: TriggerTraceContext;
+    traceId: string;
+    spanId: string;
+    parentSpanId: string | undefined;
+    taskEventStore: string;
+  }) {
+    return {
+      friendlyId: args.runFriendlyId,
+      environment: args.environment,
+      idempotencyKey: args.idempotencyKey,
+      idempotencyKeyExpiresAt: args.idempotencyKey ? args.idempotencyKeyExpiresAt : undefined, // expiry is only forwarded when a key is present
+      idempotencyKeyOptions: args.body.options?.idempotencyKeyOptions,
+      taskIdentifier: args.taskId,
+      payload: args.payloadPacket.data ?? "", // a packet without data becomes an empty string
+      payloadType: args.payloadPacket.dataType,
+      context: args.body.context,
+      traceContext: args.traceContext,
+      traceId: args.traceId,
+      spanId: args.spanId,
+      parentSpanId: args.parentSpanId,
+      replayedFromTaskRunFriendlyId: args.options.replayedFromTaskRunFriendlyId,
+      lockedToVersionId: args.lockedToBackgroundWorker?.id,
+      taskVersion: args.lockedToBackgroundWorker?.version,
+      sdkVersion: args.lockedToBackgroundWorker?.sdkVersion,
+      cliVersion: args.lockedToBackgroundWorker?.cliVersion,
+      // Schema-level coercion now lands `body.options.concurrencyKey` as
+      // `string` on the API path, but the BatchQueue worker rebuilds
+      // body.options from Redis-stored items (Record),
+      // which can still carry the pre-fix shape from in-flight batches.
+      concurrencyKey:
+        typeof args.body.options?.concurrencyKey === "number"
+          ? String(args.body.options.concurrencyKey)
+          : args.body.options?.concurrencyKey,
+      queue: args.queueName,
+      lockedQueueId: args.lockedQueueId,
+      workerQueue: args.workerQueue,
+      enableFastPath: args.enableFastPath,
+      isTest: args.body.options?.test ?? false,
+      delayUntil: args.delayUntil,
+      queuedAt: args.delayUntil ? undefined : new Date(), // only non-delayed runs are stamped with a queue time here
+      maxAttempts: args.body.options?.maxAttempts,
+      taskEventStore: args.taskEventStore,
+      ttl: args.ttl,
+      tags: args.tags,
+      oneTimeUseToken: args.options.oneTimeUseToken,
+      parentTaskRunId: args.parentRun?.id,
+      rootTaskRunId: args.parentRun?.rootTaskRunId ?? args.parentRun?.id, // a parent with no root of its own is used as the root
+      batch: args.options?.batchId
+        ? { id: args.options.batchId, index: args.options.batchIndex ?? 0 }
+        : undefined,
+      resumeParentOnCompletion: args.body.options?.resumeParentOnCompletion,
+      depth: args.depth,
+      metadata: args.metadataPacket?.data,
+      metadataType: args.metadataPacket?.dataType,
+      seedMetadata: args.metadataPacket?.data, // seed* fields are filled from the same packet as metadata*
+      seedMetadataType: args.metadataPacket?.dataType,
+      maxDurationInSeconds: args.body.options?.maxDuration
+        ? clampMaxDuration(args.body.options.maxDuration)
+        : undefined,
+      machine: args.body.options?.machine,
+      priorityMs: args.body.options?.priority ? args.body.options.priority * 1_000 : undefined, // falsy priority (0 included) yields undefined
+      queueTimestamp: // explicit option wins; otherwise inherit the parent's only when resuming that parent
+        args.options.queueTimestamp ??
+        (args.parentRun && args.body.options?.resumeParentOnCompletion
+          ? args.parentRun.queueTimestamp ?? undefined
+          : undefined),
+      scheduleId: args.options.scheduleId,
+      scheduleInstanceId: args.options.scheduleInstanceId,
+      createdAt: args.options.overrideCreatedAt,
+      bulkActionId: args.body.options?.bulkActionId,
+      planType: args.planType,
+      realtimeStreamsVersion: args.options.realtimeStreamsVersion,
+      streamBasinName: args.environment.organization.streamBasinName,
+      debounce: args.body.options?.debounce,
+      annotations: args.annotations,
+    };
   }
   #propagateExternalTraceContext(
diff --git a/apps/webapp/app/v3/mollifier/applyMetadataMutation.server.ts b/apps/webapp/app/v3/mollifier/applyMetadataMutation.server.ts
new file mode 100644
index 00000000000..ee6419cc381
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/applyMetadataMutation.server.ts
@@ -0,0 +1,204 @@
+import { applyMetadataOperations } from "@trigger.dev/core/v3";
+import type { FlushedRunMetadata } from "@trigger.dev/core/v3/schemas";
+import { RunId } from "@trigger.dev/core/v3/isomorphic";
+import type { MollifierBuffer } from "@trigger.dev/redis-worker";
+import { logger } from "~/services/logger.server";
+import { getMollifierBuffer } from "./mollifierBuffer.server";
+
+// On `applied` we surface the parent/root friendlyIds captured during
+// the snapshot read.
Callers that fan parent/root metadata operations
+// out to their respective runs can use these without a second
+// `findRunByIdWithMollifierFallback` round trip — and, more importantly,
+// without racing the drainer's terminal-failure path (which atomically
+// DELetes the entry hash). Without these on the outcome the second
+// read can come back null mid-route, silently dropping the caller's
+// parentOperations / rootOperations after the primary mutation already
+// landed on the snapshot.
+//
+// FriendlyIds (not internal cuids) because the consuming
+// `routeOperationsToRun` helper gates on the `run_…` prefix to decide
+// whether to attempt the buffer fallback; cuids would skip that path.
+// The snapshot's `parentTaskRunId` / `rootTaskRunId` are engine-side
+// cuids, so we convert via `RunId.toFriendlyId` here — identical to
+// what `readFallback.server.ts` does when assembling its SyntheticRun.
+export type ApplyMetadataMutationOutcome =
+  | {
+      kind: "applied";
+      newMetadata: Record<string, unknown>;
+      parentTaskRunFriendlyId: string | undefined;
+      rootTaskRunFriendlyId: string | undefined;
+    }
+  | { kind: "not_found" } // no buffer, no entry, or the entry belongs to another env/org
+  | { kind: "busy" } // entry is no longer QUEUED, or is already materialised
+  | { kind: "version_exhausted" } // every CAS attempt lost to a concurrent writer
+  // Mirrors the PG-side `MetadataTooLargeError` (status 413). Carries
+  // the limit + observed size so the route can produce a useful body.
+  | { kind: "metadata_too_large"; maximumSize: number; observedSize: number };
+
+// Apply a metadata PUT (body.operations deltas, or else a body.metadata
+// replace) to a buffered run's snapshot. Mirrors the PG-side
+// `UpdateMetadataService.#updateRunMetadataWithOperations` retry loop:
+// read snapshot → apply operations in JS → CAS-write back with the
+// observed `metadataVersion`. Retries on conflict; bounded by
+// `maxRetries`. The Lua CAS is the atomicity primitive — concurrent
+// callers never lose an increment / append / set.
+export async function applyMetadataMutationToBufferedRun(input: {
+  runId: string;
+  // Env+org scoping closes a cross-environment write gap on the buffer
+  // path: the route's PG path is already env-scoped via Prisma filters,
+  // and this helper now enforces the same isolation before any buffer
+  // write so a caller authed in env A can't mutate a buffered run that
+  // belongs to env B.
+  environmentId: string;
+  organizationId: string;
+  // Byte-size cap on the resulting metadata payload, mirroring the
+  // PG-side `UpdateMetadataService.maximumSize` (sourced from
+  // `env.TASK_RUN_METADATA_MAXIMUM_SIZE`). Required so the buffer path
+  // doesn't silently allow writes the PG path would have rejected.
+  maximumSize: number;
+  body: Pick<FlushedRunMetadata, "metadata" | "operations">;
+  buffer?: MollifierBuffer | null;
+  maxRetries?: number;
+}): Promise<ApplyMetadataMutationOutcome> {
+  const buffer = input.buffer ?? getMollifierBuffer();
+  if (!buffer) return { kind: "not_found" };
+
+  // Default retry budget tuned for buffered-window concurrency. The
+  // PG-side `UpdateMetadataService` uses 3, which is fine when the only
+  // writer is the executing task itself. For a buffered run the writers
+  // are external API callers, and N parallel writers exhaust 3 retries
+  // quickly under contention. Bumping to 12 covers ~50-way concurrency
+  // with sub-percent failure probability; the cost is bounded (each
+  // retry is one Redis Lua call ~1ms).
+  const maxRetries = input.maxRetries ?? 12;
+  for (let attempt = 0; attempt <= maxRetries; attempt++) {
+    const entry = await buffer.getEntry(input.runId);
+    if (!entry) return { kind: "not_found" };
+    // Env+org check: an entry from a different env is treated as a
+    // miss (not 403) so existence in other envs doesn't leak.
+    if (
+      entry.envId !== input.environmentId ||
+      entry.orgId !== input.organizationId
+    ) {
+      return { kind: "not_found" };
+    }
+    if (entry.status !== "QUEUED" || entry.materialised) {
+      return { kind: "busy" };
+    }
+
+    const snapshot = JSON.parse(entry.payload) as Record<string, unknown>; // NOTE(review): throws on a corrupt payload — confirm the buffer only stores valid JSON
+    const currentMetadataType =
+      typeof snapshot.metadataType === "string" ? snapshot.metadataType : "application/json";
+
+    // Capture parent/root ids during this read so the caller can fan
+    // parent/root operations out without a second buffer.getEntry. If
+    // the drainer's terminal-failure path runs between our CAS-write
+    // below and the route's follow-up, the entry hash would be DELd
+    // and a second read would return null — silently dropping the
+    // caller's `body.parentOperations` / `body.rootOperations`. The ids
+    // themselves are immutable for a run, so capturing them on any
+    // loop iteration is fine.
+    const snapshotParentTaskRunInternalId =
+      typeof snapshot.parentTaskRunId === "string" ? snapshot.parentTaskRunId : undefined;
+    const snapshotParentTaskRunFriendlyId = snapshotParentTaskRunInternalId
+      ? RunId.toFriendlyId(snapshotParentTaskRunInternalId)
+      : undefined;
+    const snapshotRootTaskRunInternalId =
+      typeof snapshot.rootTaskRunId === "string" ? snapshot.rootTaskRunId : undefined;
+    const snapshotRootTaskRunFriendlyId = snapshotRootTaskRunInternalId
+      ? RunId.toFriendlyId(snapshotRootTaskRunInternalId)
+      : undefined;
+
+    // Match PG semantics: `body.operations` and `body.metadata` are
+    // mutually exclusive on a single request. The PG service
+    // (`UpdateMetadataService.#updateRunMetadata`) branches on
+    // `Array.isArray(body.operations)` — if operations are present it
+    // applies them on top of the EXISTING metadata and ignores
+    // `body.metadata` entirely; otherwise `body.metadata` is the new
+    // full value. Doing both here would make a request like
+    // `{ metadata: {b:2}, operations: [set c=3] }` produce
+    // `{b:2,c:3}` on the buffer vs `{a:1,c:3}` on PG, which silently
+    // changes semantics across the buffered/materialised boundary.
+    const parseSnapshotMetadata = (): Record<string, unknown> => {
+      if (typeof snapshot.metadata !== "string") return {};
+      try {
+        return JSON.parse(snapshot.metadata) as Record<string, unknown>;
+      } catch {
+        return {};
+      }
+    };
+
+    let metadataObject: Record<string, unknown>;
+    // Use `Array.isArray` (the PG service's predicate) instead of a
+    // truthy length check. For `{ metadata, operations: [] }` PG sees
+    // Array.isArray([])=true and no-ops on existing metadata; a
+    // `.length` check would treat the empty array as falsy and fall
+    // through to the `body.metadata` branch, replacing metadata —
+    // exactly the cross-boundary drift the comment above warns
+    // against.
+    if (Array.isArray(input.body.operations)) {
+      // Operations take precedence: apply on top of existing snapshot
+      // metadata; ignore `body.metadata` to match PG behaviour.
+      metadataObject = applyMetadataOperations(
+        parseSnapshotMetadata(),
+        input.body.operations,
+      ).newMetadata;
+    } else if (input.body.metadata !== undefined) {
+      // No operations — full replace.
+      metadataObject = input.body.metadata as Record<string, unknown>;
+    } else {
+      // Neither — write back existing snapshot metadata (no-op shape).
+      metadataObject = parseSnapshotMetadata();
+    }
+
+    const newMetadataStr = JSON.stringify(metadataObject);
+
+    // Size cap — match PG (`handleMetadataPacket` throws
+    // `MetadataTooLargeError` (413) when the JSON-encoded packet
+    // exceeds the configured cap). Reject in-loop, before CAS, so a
+    // single oversize write doesn't churn the retry budget.
+    const observedSize = Buffer.byteLength(newMetadataStr, "utf8");
+    if (observedSize > input.maximumSize) {
+      return {
+        kind: "metadata_too_large",
+        maximumSize: input.maximumSize,
+        observedSize,
+      };
+    }
+
+    const cas = await buffer.casSetMetadata({
+      runId: input.runId,
+      expectedVersion: entry.metadataVersion,
+      newMetadata: newMetadataStr,
+      newMetadataType: currentMetadataType,
+    });
+
+    if (cas.kind === "applied") {
+      return {
+        kind: "applied",
+        newMetadata: metadataObject,
+        parentTaskRunFriendlyId: snapshotParentTaskRunFriendlyId,
+        rootTaskRunFriendlyId: snapshotRootTaskRunFriendlyId,
+      };
+    }
+    if (cas.kind === "not_found") return { kind: "not_found" };
+    if (cas.kind === "busy") return { kind: "busy" };
+    // version_conflict — another caller wrote between our read + CAS.
+    // Small jittered backoff so a thundering herd of N retriers doesn't
+    // all re-read + re-CAS at exactly the same moment.
+    logger.debug("applyMetadataMutationToBufferedRun: version_conflict, retrying", {
+      runId: input.runId,
+      attempt,
+      observedVersion: entry.metadataVersion,
+      currentVersion: cas.currentVersion,
+    });
+    const backoffMs = Math.floor(Math.random() * (5 + attempt * 5));
+    if (attempt < maxRetries) await new Promise((resolve) => setTimeout(resolve, backoffMs)); // no sleep after the last attempt: nothing is left to retry
+  }
+
+  logger.warn("applyMetadataMutationToBufferedRun: retries exhausted", {
+    runId: input.runId,
+    maxRetries,
+  });
+  return { kind: "version_exhausted" };
+}
diff --git a/apps/webapp/app/v3/mollifier/bufferedTriggerPayload.server.ts b/apps/webapp/app/v3/mollifier/bufferedTriggerPayload.server.ts
index d251e9f98e8..287b5bf9bcb 100644
--- a/apps/webapp/app/v3/mollifier/bufferedTriggerPayload.server.ts
+++ b/apps/webapp/app/v3/mollifier/bufferedTriggerPayload.server.ts
@@ -2,17 +2,17 @@ import type { TriggerTaskRequestBody } from "@trigger.dev/core/v3";
 import type { TriggerTaskServiceOptions } from "~/v3/services/triggerTask.server";
 // Canonical payload shape written to the mollifier buffer when the gate
-// decides to mollify
a trigger. Phase 1 ALSO calls engine.trigger directly -// (dual-write) so this is currently an audit/preview record. Phase 2 will -// make the buffer the primary write path: the drainer's handler will read -// this payload and replay it through engine.trigger to create the run in -// Postgres, and read-fallback endpoints will synthesise a Run view from it -// while it is still QUEUED. +// decides to mollify a trigger. At this stage the call site ALSO calls +// engine.trigger directly (dual-write), so this is currently an +// audit/preview record. A later change makes the buffer the primary write +// path: the drainer's handler reads this payload and replays it through +// engine.trigger to create the run in Postgres, and read-fallback +// endpoints synthesise a Run view from it while it is still QUEUED. // -// CONTRACT: this shape must contain everything needed for Phase 2's -// drainer-replay to reconstruct an equivalent engine.trigger call. Phase 1 -// emits it to logs; Phase 2 will serialise it into Redis and rebuild it on -// the drain side. Keep it serialisable — no functions, no class instances. +// CONTRACT: this shape must contain everything the drainer-replay needs to +// reconstruct an equivalent engine.trigger call. Today it is emitted to +// logs; later it is serialised into Redis and rebuilt on the drain side. +// Keep it serialisable — no functions, no class instances. 
export type BufferedTriggerPayload = { runFriendlyId: string; diff --git a/apps/webapp/app/v3/mollifier/idempotencyClaim.server.ts b/apps/webapp/app/v3/mollifier/idempotencyClaim.server.ts new file mode 100644 index 00000000000..47c9733c927 --- /dev/null +++ b/apps/webapp/app/v3/mollifier/idempotencyClaim.server.ts @@ -0,0 +1,218 @@ +import { randomUUID } from "node:crypto"; +import type { + IdempotencyClaimResult, + IdempotencyLookupInput, + MollifierBuffer, +} from "@trigger.dev/redis-worker"; +import { logger } from "~/services/logger.server"; +import { getMollifierBuffer } from "./mollifierBuffer.server"; + +// Tunables. The TTL on the claim key is bounded by typical trigger-pipeline +// dwell; long enough that a slow PG insert doesn't expire mid-flight, +// short enough that a crashed claimant unblocks waiters quickly. +export const DEFAULT_CLAIM_TTL_SECONDS = 30; +// safetyNetMs caps how long a waiter blocks before returning timed_out. +// Matches the mutateWithFallback safety net so SDK retry policies don't +// have to special-case this path. +export const DEFAULT_CLAIM_WAIT_MS = 5_000; +export const DEFAULT_CLAIM_POLL_MS = 25; + +export type ClaimOrAwaitOutcome = + // We own the claim. `token` MUST be passed to publishClaim/releaseClaim + // so the buffer can compare-and-act against our ownership marker — a + // late release from a previous claimant whose TTL expired cannot + // erase our slot. + | { kind: "claimed"; token: string } + | { kind: "resolved"; runId: string } // someone else's runId; caller returns isCached:true + | { kind: "timed_out" }; + +export type ClaimOrAwaitInput = IdempotencyLookupInput & { + ttlSeconds?: number; + safetyNetMs?: number; + pollStepMs?: number; + abortSignal?: AbortSignal; + // Test injection. + buffer?: MollifierBuffer | null; + now?: () => number; + sleep?: (ms: number) => Promise; + // Test override for the ownership-token generator. Defaults to + // `crypto.randomUUID()`. 
Tests pass a deterministic value so they + // can assert publish/release pass-through. + generateToken?: () => string; +}; + +// Pre-gate Redis claim. All same-key triggers serialise through here +// before the trigger pipeline runs. Returning `resolved` short-circuits +// the trigger entirely — the caller responds with the cached runId. +// Returning `claimed` means we own the claim and MUST publish the +// winning runId on success (`publishClaim`) or release the claim on +// failure (`releaseClaim`). +// +// Failure modes: +// - Redis down at claim time: returns `claimed` (fail open, no +// coordination). Customer is no worse than today's race; the +// PG unique constraint is the eventual arbiter. +// - Claimant crashes mid-pipeline: claim TTL expires, waiters +// eventually time out, SDK retries. +// - PG/buffer publish failure: waiters time out and SDK retries; next +// attempt sees the eventual PG/buffer state via existing +// IdempotencyKeyConcern PG-first lookup. +export async function claimOrAwait(input: ClaimOrAwaitInput): Promise { + const buffer = input.buffer === undefined ? getMollifierBuffer() : input.buffer; + if (!buffer) { + // Mollifier disabled / buffer construction failed. Fall open — + // caller proceeds with the trigger pipeline (PG unique constraint + // backstop). The token is never read in this case (publish/release + // are buffer-null no-ops downstream), so we skip the default + // `randomUUID()` to keep the mollifier-OFF hot path allocation-free + // for idempotency-keyed triggers — `triggerTask` is the + // highest-throughput code path in the system. A test-injected + // generator is still honoured for deterministic assertions. + return { kind: "claimed", token: input.generateToken ? input.generateToken() : "" }; + } + const generateToken = input.generateToken ?? 
randomUUID; + // Generate the ownership token up front so the retry loop reuses it + // — we're the same logical claimant across attempts; only the slot + // owner changes between releases. + const token = generateToken(); + const ttlSeconds = input.ttlSeconds ?? DEFAULT_CLAIM_TTL_SECONDS; + const safetyNetMs = input.safetyNetMs ?? DEFAULT_CLAIM_WAIT_MS; + const pollStepMs = input.pollStepMs ?? DEFAULT_CLAIM_POLL_MS; + const now = input.now ?? Date.now; + const sleep = input.sleep ?? defaultSleep; + + const lookupInput: IdempotencyLookupInput = { + envId: input.envId, + taskIdentifier: input.taskIdentifier, + idempotencyKey: input.idempotencyKey, + }; + + // Initial claim attempt. Most production-path calls resolve here on + // the first call (either we win, or the key is already resolved from + // a prior burst). + let result: IdempotencyClaimResult; + try { + result = await buffer.claimIdempotency({ ...lookupInput, token, ttlSeconds }); + } catch (err) { + logger.warn("idempotency claim failed (fail-open)", { + envId: input.envId, + taskIdentifier: input.taskIdentifier, + err: err instanceof Error ? err.message : String(err), + }); + return { kind: "claimed", token }; + } + + if (result.kind === "claimed") return { kind: "claimed", token }; + if (result.kind === "resolved") return result; + + // result.kind === "pending" — wait/poll loop. May see the value flip + // to "resolved" (winner published), the key vanish (winner released + // on error → retry claim), or stay "pending" until the safety net. + const deadline = now() + safetyNetMs; + while (now() < deadline) { + if (input.abortSignal?.aborted) return { kind: "timed_out" }; + await sleep(pollStepMs); + + let current: IdempotencyClaimResult | null; + try { + current = await buffer.readClaim(lookupInput); + } catch (err) { + // Transient read failure — keep polling until deadline. + logger.warn("idempotency claim read failed mid-poll", { + err: err instanceof Error ? 
err.message : String(err), + }); + continue; + } + + if (current === null) { + // Claimant released on error. Re-attempt the claim — one of the + // waiters will win, the rest see "pending" again. Reuse our token: + // we're still the same logical claimant, just contending for a + // freshly empty slot. + try { + const retry = await buffer.claimIdempotency({ ...lookupInput, token, ttlSeconds }); + if (retry.kind === "claimed") return { kind: "claimed", token }; + if (retry.kind === "resolved") return retry; + // "pending" again → keep polling. + } catch (err) { + logger.warn("idempotency claim retry failed", { + err: err instanceof Error ? err.message : String(err), + }); + return { kind: "claimed", token }; + } + continue; + } + if (current.kind === "resolved") return current; + // current.kind === "pending" → keep polling. + } + return { kind: "timed_out" }; +} + +// Publish the winning runId so waiters resolve. Best-effort: failure +// here means waiters will time out and the SDK will retry, which will +// then find the row via the existing IdempotencyKeyConcern PG-first +// check. +export async function publishClaim(input: { + envId: string; + taskIdentifier: string; + idempotencyKey: string; + // Ownership token from the `claimed` outcome. Buffer compare-and-sets + // on this so a publish from a stale claimant (TTL expired, another + // claimant moved in) is a no-op rather than overwriting their claim. + token: string; + runId: string; + ttlSeconds?: number; + buffer?: MollifierBuffer | null; +}): Promise { + const buffer = input.buffer === undefined ? getMollifierBuffer() : input.buffer; + if (!buffer) return; + const ttlSeconds = input.ttlSeconds ?? 
DEFAULT_CLAIM_TTL_SECONDS; + try { + await buffer.publishClaim({ + envId: input.envId, + taskIdentifier: input.taskIdentifier, + idempotencyKey: input.idempotencyKey, + token: input.token, + runId: input.runId, + ttlSeconds, + }); + } catch (err) { + logger.warn("idempotency claim publish failed", { + envId: input.envId, + taskIdentifier: input.taskIdentifier, + err: err instanceof Error ? err.message : String(err), + }); + } +} + +// Release on pipeline failure. Best-effort. If the DEL fails, the claim +// TTL is the safety net — waiters time out, SDK retries. +export async function releaseClaim(input: { + envId: string; + taskIdentifier: string; + idempotencyKey: string; + // Ownership token from the `claimed` outcome. Buffer compare-and- + // deletes on this so a release from a stale claimant whose TTL + // expired can't wipe a new owner's claim. + token: string; + buffer?: MollifierBuffer | null; +}): Promise { + const buffer = input.buffer === undefined ? getMollifierBuffer() : input.buffer; + if (!buffer) return; + try { + await buffer.releaseClaim({ + envId: input.envId, + taskIdentifier: input.taskIdentifier, + idempotencyKey: input.idempotencyKey, + token: input.token, + }); + } catch (err) { + logger.warn("idempotency claim release failed", { + err: err instanceof Error ? err.message : String(err), + }); + } +} + +function defaultSleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} diff --git a/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts b/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts index 9c8917623e4..09b52aa9da3 100644 --- a/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierBuffer.server.ts @@ -22,7 +22,6 @@ function initializeMollifierBuffer(): MollifierBuffer { enableAutoPipelining: true, ...(env.TRIGGER_MOLLIFIER_REDIS_TLS_DISABLED === "true" ? 
{} : { tls: {} }), }, - entryTtlSeconds: env.TRIGGER_MOLLIFIER_ENTRY_TTL_S, }); } diff --git a/apps/webapp/app/v3/mollifier/mollifierDrainer.server.ts b/apps/webapp/app/v3/mollifier/mollifierDrainer.server.ts index 139aeaf9a6e..26ac60f180f 100644 --- a/apps/webapp/app/v3/mollifier/mollifierDrainer.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierDrainer.server.ts @@ -1,10 +1,16 @@ -import { createHash } from "node:crypto"; -import { MollifierDrainer, serialiseSnapshot } from "@trigger.dev/redis-worker"; +import { MollifierDrainer } from "@trigger.dev/redis-worker"; +import { prisma } from "~/db.server"; import { env } from "~/env.server"; +import { engine as runEngine } from "~/v3/runEngine.server"; import { logger } from "~/services/logger.server"; import { singleton } from "~/utils/singleton"; import { getMollifierBuffer } from "./mollifierBuffer.server"; -import type { BufferedTriggerPayload } from "./bufferedTriggerPayload.server"; +import { + createDrainerHandler, + createDrainerTerminalFailureHandler, + isRetryablePgError, +} from "./mollifierDrainerHandler.server"; +import type { MollifierSnapshot } from "./mollifierSnapshot.server"; // Distinct error class for the deterministic "fail loud at boot" throws // below. The bootstrap in `mollifierDrainerWorker.server.ts` catches @@ -25,7 +31,7 @@ export class MollifierConfigurationError extends Error { } } -function initializeMollifierDrainer(): MollifierDrainer { +function initializeMollifierDrainer(): MollifierDrainer { const buffer = getMollifierBuffer(); if (!buffer) { // Unreachable in normal config: getMollifierDrainer() gates on the @@ -68,40 +74,14 @@ function initializeMollifierDrainer(): MollifierDrainer maxAttempts: env.TRIGGER_MOLLIFIER_DRAIN_MAX_ATTEMPTS, }); - // Phase 1 handler: no-op ack. The trigger has ALREADY been written to - // Postgres via engine.trigger (dual-write at the call site). 
Popping + - // acking here proves the dequeue mechanism works end-to-end without - // duplicating the work. Phase 2 will replace this with an engine.trigger - // replay that performs the actual Postgres write. - const drainer = new MollifierDrainer({ + const drainer = new MollifierDrainer({ buffer, - handler: async (input) => { - // Hash the (re-serialised, canonical) payload on the drain side rather - // than on the trigger hot path. Burst-time CPU stays with engine.trigger; - // the drainer is the natural place for the audit-equivalence checksum. - // Re-serialisation is identity for the BufferedTriggerPayload shape - // (only strings/numbers/plain objects), so this hash matches what the - // call site wrote into Redis. - const reserialised = serialiseSnapshot(input.payload); - const payloadHash = createHash("sha256").update(reserialised).digest("hex"); - logger.info("mollifier.drained", { - runId: input.runId, - envId: input.envId, - orgId: input.orgId, - taskId: input.payload.taskId, - attempts: input.attempts, - ageMs: Date.now() - input.createdAt.getTime(), - payloadBytes: reserialised.length, - payloadHash, - }); - }, + handler: createDrainerHandler({ engine: runEngine, prisma }), + onTerminalFailure: createDrainerTerminalFailureHandler({ engine: runEngine, prisma }), concurrency: env.TRIGGER_MOLLIFIER_DRAIN_CONCURRENCY, maxAttempts: env.TRIGGER_MOLLIFIER_DRAIN_MAX_ATTEMPTS, maxOrgsPerTick: env.TRIGGER_MOLLIFIER_DRAIN_MAX_ORGS_PER_TICK, - // A no-op handler shouldn't throw, but if something does (e.g. an - // unexpected deserialise failure), don't loop — let it FAIL terminally - // so the entry is observable in metrics. - isRetryable: () => false, + isRetryable: isRetryablePgError, }); return drainer; @@ -114,7 +94,7 @@ function initializeMollifierDrainer(): MollifierDrainer // handler registration, leaving a narrow window where a SIGTERM landing // between `start()` and `process.once("SIGTERM", ...)` would skip the // graceful stop. 
The split is intentional. -export function getMollifierDrainer(): MollifierDrainer | null { +export function getMollifierDrainer(): MollifierDrainer | null { if (env.TRIGGER_MOLLIFIER_ENABLED !== "1") return null; return singleton("mollifierDrainer", initializeMollifierDrainer); } diff --git a/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts new file mode 100644 index 00000000000..6e829baa575 --- /dev/null +++ b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts @@ -0,0 +1,403 @@ +import { context, trace, TraceFlags } from "@opentelemetry/api"; +import type { RunEngine } from "@internal/run-engine"; +import type { PrismaClientOrTransaction } from "@trigger.dev/database"; +import { RunId } from "@trigger.dev/core/v3/isomorphic"; +import type { + MollifierDrainerHandler, + MollifierDrainerTerminalFailureHandler, +} from "@trigger.dev/redis-worker"; +import { logger } from "~/services/logger.server"; +import { recordRunDebugLog } from "~/v3/eventRepository/index.server"; +import { PerformTaskRunAlertsService } from "~/v3/services/alerts/performTaskRunAlerts.server"; +import { startSpan } from "~/v3/tracing.server"; +import type { MollifierSnapshot } from "./mollifierSnapshot.server"; + +const tracer = trace.getTracer("mollifier-drainer"); + +export function isRetryablePgError(err: unknown): boolean { + if (!(err instanceof Error)) return false; + const msg = err.message ?? ""; + // Prisma surfaces P1001 ("Can't reach database server") via two + // different error classes — `PrismaClientKnownRequestError` exposes + // it as `err.code`, `PrismaClientInitializationError` exposes it as + // `err.errorCode`. Check both so reconnection-time errors retry + // regardless of which class fires. 
+ const code = (err as { code?: string }).code; + const errorCode = (err as { errorCode?: string }).errorCode; + if (code === "P2024") return true; + if (code === "P1001" || errorCode === "P1001") return true; + if (msg.includes("Can't reach database server")) return true; + if (msg.includes("Connection lost")) return true; + if (msg.includes("ECONNRESET")) return true; + return false; +} + +export function createDrainerHandler(deps: { + engine: RunEngine; + prisma: PrismaClientOrTransaction; +}): MollifierDrainerHandler { + return async (input) => { + const dwellMs = Date.now() - input.createdAt.getTime(); + + // Re-attach to the trace started by the caller's mollifier.queued span + // (its traceId + spanId were captured into the snapshot at buffer time). + // Without this the drainer would emit mollifier.drained in a brand-new + // trace and the engine.trigger instrumentation would inherit an empty + // active context — leaving the run-detail page with only the root span. + const snapshotTraceId = + typeof input.payload.traceId === "string" ? input.payload.traceId : undefined; + const snapshotSpanId = + typeof input.payload.spanId === "string" ? input.payload.spanId : undefined; + + const parentContext = + snapshotTraceId && snapshotSpanId + ? trace.setSpanContext(context.active(), { + traceId: snapshotTraceId, + spanId: snapshotSpanId, + traceFlags: TraceFlags.SAMPLED, + isRemote: true, + }) + : context.active(); + + // Cancel-wins-over-trigger. If a cancel API call landed on this + // entry while it was QUEUED, the snapshot carries `cancelledAt` + + // `cancelReason`. Skip the normal materialise path and write a + // CANCELED PG row directly. The `runCancelled` bus emit is + // suppressed here because a buffered-only run never had a primary + // trace event written for it — the runCancelled handler's + // `cancelRunEvent` lookup would fail and log noise per cancel. + const cancelledAtStr = + typeof input.payload.cancelledAt === "string" ? 
input.payload.cancelledAt : undefined; + if (cancelledAtStr) { + const cancelReason = + typeof input.payload.cancelReason === "string" + ? input.payload.cancelReason + : "Canceled by user"; + await context.with(parentContext, async () => { + await startSpan(tracer, "mollifier.drained.cancelled", async (span) => { + span.setAttribute("mollifier.drained", true); + span.setAttribute("mollifier.dwell_ms", dwellMs); + span.setAttribute("mollifier.attempts", input.attempts); + span.setAttribute("mollifier.run_friendly_id", input.runId); + span.setAttribute("mollifier.cancel_bifurcation", true); + span.setAttribute("taskRunId", input.runId); + try { + await deps.engine.createCancelledRun( + { + snapshot: input.payload as any, + cancelledAt: new Date(cancelledAtStr), + cancelReason, + emitRunCancelledEvent: false, + }, + deps.prisma, + ); + } catch (err) { + // createCancelledRun throws a conflict when the normal trigger + // replay path won the race and already materialised a live + // (non-CANCELED) row for this friendlyId. Its contract leaves + // the resolution to us: honour the cancel by actually + // cancelling the now-live run. Letting the conflict propagate + // would instead reach the drainer's terminal-failure path + // (isRetryablePgError() is false for it), buffer.fail() the + // entry, and silently lose the cancellation while the run + // keeps executing. + const isConflict = + err instanceof Error && err.message.startsWith("createCancelledRun conflict"); + if (!isConflict) { + // Mirror the SYSTEM_FAILURE fallback the non-cancelled + // trigger path uses below. Without this branch, a + // non-retryable createCancelledRun failure rethrows, the + // drainer's onTerminalFailure handler skips because it + // gates on `cause === "max-attempts-exhausted"` (and the + // outer drainer classifies non-retryable failures with + // `cause: "non-retryable"`), and buffer.fail() deletes + // the entry — leaving NO PG row. 
The cancellation + // disappears silently from the customer's dashboard. + // Writing a SYSTEM_FAILURE row gives the run a terminal, + // visible state. + if (isRetryablePgError(err)) { + throw err; + } + span.setAttribute("mollifier.cancel_terminal_failure_reason", + err instanceof Error ? err.message : String(err)); + try { + const wrote = await writeMollifierTerminalFailureRow(deps, { + friendlyId: input.runId, + snapshot: input.payload as Record, + reason: err instanceof Error ? err.message : String(err), + }); + if (wrote) return; + } catch (writeErr) { + if (isRetryablePgError(writeErr)) { + span.setAttribute("mollifier.cancel_terminal_write_retryable", true); + throw writeErr; + } + span.setAttribute( + "mollifier.cancel_terminal_write_error", + writeErr instanceof Error ? writeErr.message : String(writeErr) + ); + } + throw err; + } + span.setAttribute("mollifier.cancel_conflict", true); + const friendlyId = + typeof input.payload.friendlyId === "string" + ? input.payload.friendlyId + : input.runId; + await deps.engine.cancelRun({ + runId: RunId.fromFriendlyId(friendlyId), + completedAt: new Date(cancelledAtStr), + reason: cancelReason, + }); + } + }); + }); + return; + } + + await context.with(parentContext, async () => { + await startSpan(tracer, "mollifier.drained", async (span) => { + span.setAttribute("mollifier.drained", true); + span.setAttribute("mollifier.dwell_ms", dwellMs); + span.setAttribute("mollifier.attempts", input.attempts); + span.setAttribute("mollifier.run_friendly_id", input.runId); + span.setAttribute("taskRunId", input.runId); + + let triggerSucceeded = false; + try { + await deps.engine.trigger(input.payload as any, deps.prisma); + triggerSucceeded = true; + } catch (err) { + // The retryable-PG class re-throws so the drainer's outer + // worker loop can `buffer.requeue` (handled in + // `MollifierDrainer.drainOne`). 
For non-retryable failures we + // write a terminal SYSTEM_FAILURE row to PG via the engine's + // existing `createFailedTaskRun` (used by batch-trigger for + // the same purpose) so the customer sees the run in their + // dashboard / SDK instead of silently losing it when the + // buffer entry TTLs out. If THAT insert also fails (PG truly + // unreachable), rethrow so the drainer's outer catch falls + // through to its existing `buffer.fail` terminal-marker path. + if (isRetryablePgError(err)) { + throw err; + } + const reason = err instanceof Error ? err.message : String(err); + span.setAttribute("mollifier.terminal_failure_reason", reason); + try { + const wrote = await writeMollifierTerminalFailureRow(deps, { + friendlyId: input.runId, + snapshot: input.payload as Record, + reason, + }); + if (!wrote) { + // Snapshot too malformed to even construct a TaskRun row. + // Drainer's outer catch will buffer.fail this entry. + throw err; + } + } catch (writeErr) { + // The terminal SYSTEM_FAILURE write itself failed. If it + // failed because PG is transiently unreachable, rethrow the + // *write* error so the drainer requeues — buffer.fail()ing on + // the original non-retryable error would lose the run with no + // PG row ever landing. Once PG recovers the requeued entry + // writes its failure row and the customer sees it. + if (isRetryablePgError(writeErr)) { + span.setAttribute("mollifier.terminal_write_retryable", true); + throw writeErr; + } + // PG reachable but the write was rejected for another reason + // (genuinely bad snapshot). Rethrow the original trigger error + // so the drainer falls back to buffer.fail. + span.setAttribute( + "mollifier.terminal_write_error", + writeErr instanceof Error ? writeErr.message : String(writeErr) + ); + throw err; + } + } + + // Admin-only audit trail emitted once engine.trigger has + // landed a PG row. 
`recordRunDebugLog` flips this to the + // admin-gated debug kind (TaskEventKind.LOG in the PG store / + // DEBUG_EVENT in the ClickHouse store) which the trace view + + // logs download already strip for non-admins + // (`eventRepository.server.ts:108`, + // `resources.runs.$runParam.logs.download.ts:118`). + // + // Placement: emit as a zero-duration marker AT materialisation + // time, not as a back-dated bar spanning the buffered window. + // `engine.trigger` rewrites the run's root span at + // materialisation (it adopts the synth root via traceId/spanId + // carryover but updates start_time to "now"), so the trace + // renderer treats materialisation time as t=0. A back-dated + // event with startTime = bufferedAt would land before that t=0 + // and get clipped from the tree. Same pattern as the + // `[engine] QUEUED` markers. The window itself is preserved + // in metadata so admins can read it off the span detail pane. + // + // Best-effort: `recordRunDebugLog` has its own try/catch and + // returns a result, so it never throws into the materialisation + // path. Failures are logged but not surfaced because the + // customer-visible run has already landed. 
+ if (triggerSucceeded) { + const debugResult = await recordRunDebugLog( + RunId.fromFriendlyId(input.runId), + `Mollifier buffered ${dwellMs}ms before materialising`, + { + attributes: { + runId: input.runId, + metadata: { + "mollifier.bufferedAt": input.createdAt.toISOString(), + "mollifier.materialisedAt": new Date().toISOString(), + "mollifier.dwellMs": dwellMs, + "mollifier.attempts": input.attempts, + }, + }, + parentId: snapshotSpanId, + } + ); + if (!debugResult.success && debugResult.code !== "RUN_NOT_FOUND") { + logger.warn("mollifier drainer: failed to record admin debug log", { + runId: input.runId, + code: debugResult.code, + }); + } + } + }); + }); + }; +} + +// Shared SYSTEM_FAILURE construction used by both terminal paths: +// - non-retryable failure inside the handler (above) +// - retryable failure after maxAttempts inside the drainer's +// `processEntry` (via `createDrainerTerminalFailureHandler`) +// +// Suppresses `runFailed` and enqueues the alert manually — the engine's +// `runFailed` handler calls `completeFailedRunEvent`, which looks up +// the run's primary span. Buffered-only runs never had a primary trace +// event written (the mollifier gate intercepts BEFORE +// `repository.traceEvent` runs), so the lookup always fails and the +// handler logs a systematic `[runFailed] Failed to complete failed +// run event` error per terminal failure. `TriggerFailedTaskService` +// handles the identical situation the same way (see triggerFailedTask +// .server.ts:212 and 324) — pass `emitRunFailedEvent: false` to the +// engine and call `PerformTaskRunAlertsService.enqueue(...)` directly +// so customers' ERROR channels still fire. Alert enqueue is +// best-effort; an alert-side failure is logged but does not bubble up +// (the SYSTEM_FAILURE row landing is the load-bearing customer-visible +// outcome). 
+// +// Returns the new `TaskRun` on success or `null` when the snapshot was +// so malformed it couldn't even produce an environment — caller decides +// whether to escalate that to `buffer.fail` directly. Throws on any +// other failure so the drainer's retryable/non-retryable disposition +// logic can own the decision. +async function writeMollifierTerminalFailureRow( + deps: { engine: RunEngine; prisma: PrismaClientOrTransaction }, + args: { friendlyId: string; snapshot: Record; reason: string }, +) { + const { snapshot } = args; + const env = snapshot.environment as + | { + id: string; + type: any; + project: { id: string }; + organization: { id: string }; + } + | undefined; + if (!env) return null; + // Extract batch association from the snapshot if present. Without this + // a SYSTEM_FAILURE row for a buffered batch child won't be linked to + // its batch, and the batch parent's completion tracking can hang + // indefinitely waiting on a child that landed but isn't visible to + // the batch. + const rawBatch = snapshot.batch; + const batch = + rawBatch && + typeof rawBatch === "object" && + "id" in rawBatch && + typeof (rawBatch as { id: unknown }).id === "string" && + "index" in rawBatch && + typeof (rawBatch as { index: unknown }).index === "number" + ? (rawBatch as { id: string; index: number }) + : undefined; + const failedRun = await deps.engine.createFailedTaskRun({ + friendlyId: args.friendlyId, + environment: env, + taskIdentifier: String(snapshot.taskIdentifier ?? ""), + payload: typeof snapshot.payload === "string" ? snapshot.payload : undefined, + payloadType: typeof snapshot.payloadType === "string" ? snapshot.payloadType : undefined, + error: { + type: "STRING_ERROR", + raw: `Mollifier drainer terminal failure: ${args.reason}`, + }, + parentTaskRunId: + typeof snapshot.parentTaskRunId === "string" ? snapshot.parentTaskRunId : undefined, + rootTaskRunId: + typeof snapshot.rootTaskRunId === "string" ? 
snapshot.rootTaskRunId : undefined, + depth: typeof snapshot.depth === "number" ? snapshot.depth : 0, + resumeParentOnCompletion: snapshot.resumeParentOnCompletion === true, + batch, + traceId: typeof snapshot.traceId === "string" ? snapshot.traceId : undefined, + spanId: typeof snapshot.spanId === "string" ? snapshot.spanId : undefined, + taskEventStore: + typeof snapshot.taskEventStore === "string" ? snapshot.taskEventStore : undefined, + queue: typeof snapshot.queue === "string" ? snapshot.queue : undefined, + lockedQueueId: + typeof snapshot.lockedQueueId === "string" ? snapshot.lockedQueueId : undefined, + emitRunFailedEvent: false, + }); + // Alerts side of `runFailed` — the engine emit was suppressed above + // so we don't create an orphan trace event; enqueue the alert + // directly so customers' ERROR channels still see the failure. + // Best-effort, mirroring TriggerFailedTaskService. + try { + await PerformTaskRunAlertsService.enqueue(failedRun.id); + } catch (alertsError) { + logger.warn("writeMollifierTerminalFailureRow: alert enqueue failed", { + friendlyId: args.friendlyId, + error: alertsError instanceof Error ? alertsError.message : String(alertsError), + }); + } + return failedRun; +} + +// Drainer-side terminal-failure callback. Fires from +// `MollifierDrainer.processEntry` BEFORE `buffer.fail()` on any path +// where the in-handler write didn't already land — currently the +// `cause: "max-attempts-exhausted"` case for retryable PG errors. Writes +// the same SYSTEM_FAILURE row the non-retryable handler path writes +// inline (via the shared `writeMollifierTerminalFailureRow` helper) so +// the customer-visible behaviour is identical regardless of how the +// failure was classified. +// +// Re-throws retryable PG errors so the drainer requeues — buffer.fail()ing +// here would still lose the run if PG is genuinely unreachable. 
Throwing +// anything else falls through to buffer.fail to avoid an infinite loop on +// a genuinely bad snapshot (the drainer logs it). +export function createDrainerTerminalFailureHandler(deps: { + engine: RunEngine; + prisma: PrismaClientOrTransaction; +}): MollifierDrainerTerminalFailureHandler { + return async (input) => { + // The handler's own non-retryable terminal path has already written + // the SYSTEM_FAILURE row before it throws non-retryable. Only the + // retryable-exhausted path reaches us with no row written yet — gate + // on `cause` to avoid double-writing for non-retryable failures. + if (input.cause !== "max-attempts-exhausted") return; + await startSpan(tracer, "mollifier.drained.terminal_failure", async (span) => { + span.setAttribute("mollifier.drained", false); + span.setAttribute("mollifier.attempts", input.attempts); + span.setAttribute("mollifier.run_friendly_id", input.runId); + span.setAttribute("mollifier.terminal_failure_cause", input.cause); + span.setAttribute("mollifier.terminal_failure_reason", input.error.message); + span.setAttribute("taskRunId", input.runId); + await writeMollifierTerminalFailureRow(deps, { + friendlyId: input.runId, + snapshot: input.payload as Record, + reason: input.error.message, + }); + }); + }; +} diff --git a/apps/webapp/app/v3/mollifier/mollifierGate.server.ts b/apps/webapp/app/v3/mollifier/mollifierGate.server.ts index 28b0a7f88cf..63146b4c323 100644 --- a/apps/webapp/app/v3/mollifier/mollifierGate.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierGate.server.ts @@ -46,6 +46,17 @@ export type GateInputs = { // the pattern used by `canAccessAi`, `canAccessPrivateConnections`, and the // compute-template beta gate. orgFeatureFlags: Record | null; + // Trigger options that drive the debounce / OTU / triggerAndWait + // bypasses. 
The mollify path can't + // serialise stateful callbacks (debounce), can't safely break OTU's + // synchronous-rejection contract, and shouldn't intercept single + // triggerAndWait (batchTriggerAndWait still funnels through per item). + options?: { + debounce?: unknown; + oneTimeUseToken?: string; + parentTaskRunId?: string; + resumeParentOnCompletion?: boolean; + }; }; export type TripEvaluator = (inputs: GateInputs) => Promise; @@ -73,7 +84,7 @@ export type GateDependencies = { }; // `options` is a thunk so env reads happen per-evaluation, not at module load. -// Don't "simplify" to a plain object — Phase 2 dynamic config relies on the +// Don't "simplify" to a plain object — dynamic config relies on the // gate observing whichever env values are live at trigger time. const defaultEvaluator = createRealTripEvaluator({ getBuffer: () => getMollifierBuffer(), @@ -141,6 +152,28 @@ export async function evaluateGate( ): Promise { const d = { ...defaultGateDependencies, ...deps }; + // Debounce bypass. onDebounced is a closure over webapp state and + // can't be snapshotted into the buffer for drainer replay. Skip before the + // trip evaluator so debounce traffic is never counted against the rate. + if (inputs.options?.debounce) { + d.recordDecision("pass_through"); + return { action: "pass_through" }; + } + // OneTimeUseToken bypass. OTU is a security feature on the PUBLIC_JWT + // auth path; its synchronous-rejection contract is materially worse to + // break than the idempotency-key contract. + if (inputs.options?.oneTimeUseToken) { + d.recordDecision("pass_through"); + return { action: "pass_through" }; + } + // Single triggerAndWait bypass. batchTriggerAndWait still funnels + // through TriggerTaskService.call per item so the dominant burst pattern + // remains covered. 
+ if (inputs.options?.parentTaskRunId && inputs.options?.resumeParentOnCompletion) { + d.recordDecision("pass_through"); + return { action: "pass_through" }; + } + if (!d.isMollifierEnabled()) { d.recordDecision("pass_through"); return { action: "pass_through" }; diff --git a/apps/webapp/app/v3/mollifier/mollifierMollify.server.ts b/apps/webapp/app/v3/mollifier/mollifierMollify.server.ts new file mode 100644 index 00000000000..a8b0b151115 --- /dev/null +++ b/apps/webapp/app/v3/mollifier/mollifierMollify.server.ts @@ -0,0 +1,98 @@ +import { RunId } from "@trigger.dev/core/v3/isomorphic"; +import type { MollifierBuffer } from "@trigger.dev/redis-worker"; +import { serialiseMollifierSnapshot, type MollifierSnapshot } from "./mollifierSnapshot.server"; +import type { TripDecision } from "./mollifierGate.server"; + +export type MollifyNotice = { + code: "mollifier.queued"; + message: string; + docs: string; +}; + +export type MollifySyntheticResult = { + // `id` is the canonical TaskRun primary key derived from `friendlyId` + // via `RunId.fromFriendlyId`. Downstream consumers in the trigger + // route — notably `saveRequestIdempotency` — index the request- + // idempotency cache by this id; without it the cache stores + // `undefined` and Prisma's `findFirst({ where: { id: undefined } })` + // on retry strips the predicate and returns an arbitrary TaskRun + // (potential cross-tenant leak). Always populated. + // + // `spanId` is the root-span id allocated at gate-accept time and + // stored in the snapshot. Callers like the dashboard's Test action + // use it to build a `v3RunSpanPath` URL that auto-opens the right + // details panel — without it, the buffered run lands on the + // run-detail page with no span selected (parity gap with PG runs). 
+ run: { id: string; friendlyId: string; spanId: string }; + error: undefined; + // The race-loser path: if accept's SETNX hit an existing + // buffered run with the same (env, task, idempotencyKey), the + // response echoes the winner's runId with isCached=true. The + // mollifier-queued notice is only attached for the happy accept. + isCached: boolean; + notice?: MollifyNotice; +}; + +const NOTICE: MollifyNotice = { + code: "mollifier.queued", + message: + "Trigger accepted into burst buffer. Consider batchTrigger for fan-outs of 100+.", + docs: "https://trigger.dev/docs/management/tasks/batch-trigger", +}; + +export async function mollifyTrigger(args: { + runFriendlyId: string; + environmentId: string; + organizationId: string; + engineTriggerInput: MollifierSnapshot; + decision: Extract; + buffer: MollifierBuffer; + // Optional idempotency context. When both are passed, accept SETNXes + // the lookup so the buffered window participates in trigger-time + // dedup symmetrically with PG. + idempotencyKey?: string; + taskIdentifier?: string; +}): Promise { + const result = await args.buffer.accept({ + runId: args.runFriendlyId, + envId: args.environmentId, + orgId: args.organizationId, + payload: serialiseMollifierSnapshot(args.engineTriggerInput), + idempotencyKey: args.idempotencyKey, + taskIdentifier: args.taskIdentifier, + }); + + if (result.kind === "duplicate_idempotency") { + // Race loser. Echo the winner's runId so the SDK's response shape + // matches PG-side idempotency cache hits. The winner's spanId isn't + // readily available without a second buffer fetch; an empty string + // causes `v3RunSpanPath` to omit the `?span=` param, which matches + // current behaviour for cached PG responses. 
+ return { + run: { + id: RunId.fromFriendlyId(result.existingRunId), + friendlyId: result.existingRunId, + spanId: "", + }, + error: undefined, + isCached: true, + }; + } + + // Both "accepted" and "duplicate_run_id" produce the same customer- + // visible response: a buffered-trigger acknowledgement. The duplicate + // runId case is unreachable in practice (runIds are server-generated + // and unique) but is silently idempotent at the buffer layer either way. + const rawSpanId = args.engineTriggerInput.spanId; + const spanId = typeof rawSpanId === "string" ? rawSpanId : ""; + return { + run: { + id: RunId.fromFriendlyId(args.runFriendlyId), + friendlyId: args.runFriendlyId, + spanId, + }, + error: undefined, + isCached: false, + notice: NOTICE, + }; +} diff --git a/apps/webapp/app/v3/mollifier/mollifierSnapshot.server.ts b/apps/webapp/app/v3/mollifier/mollifierSnapshot.server.ts new file mode 100644 index 00000000000..a0732a3542e --- /dev/null +++ b/apps/webapp/app/v3/mollifier/mollifierSnapshot.server.ts @@ -0,0 +1,16 @@ +import { serialiseSnapshot, deserialiseSnapshot } from "@trigger.dev/redis-worker"; + +// MollifierSnapshot is the JSON-serialisable shape of the input that would be +// passed to engine.trigger(). The drainer deserialises and replays it. +// Kept as Record at this layer — the engine.trigger call site +// casts it to the engine's typed input. This keeps the mollifier subdirectory +// from depending on @internal/run-engine internals. 
+export type MollifierSnapshot = Record; + +export function serialiseMollifierSnapshot(input: MollifierSnapshot): string { + return serialiseSnapshot(input); +} + +export function deserialiseMollifierSnapshot(serialised: string): MollifierSnapshot { + return deserialiseSnapshot(serialised); +} diff --git a/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts new file mode 100644 index 00000000000..d135824032c --- /dev/null +++ b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts @@ -0,0 +1,256 @@ +import type { MollifierBuffer } from "@trigger.dev/redis-worker"; +import { logger as defaultLogger } from "~/services/logger.server"; +import { getMollifierBuffer } from "./mollifierBuffer.server"; +import { MollifierStaleSweepState, type StaleSweepStateStore } from "./mollifierStaleSweepState.server"; +import { + recordStaleEntry as defaultRecordStaleEntry, + reportStaleEntrySnapshot as defaultReportStaleEntrySnapshot, +} from "./mollifierTelemetry.server"; + +// One pass of the sweep scans a bounded slice of orgs from the buffer's +// queue LIST, identified by a durable cursor in Redis. Per-env entry +// scan is also bounded so a single pathological env can't extend the +// pass. +const DEFAULT_MAX_ENTRIES_PER_ENV = 1000; +// Max orgs visited per tick. Together with `maxEntriesPerEnv` this +// caps Redis traffic per pass. One "cycle" (visiting every org once) +// takes `ceil(N_orgs / cap)` ticks, after which the cursor wraps and a +// fresh org list is taken. +const DEFAULT_MAX_ORGS_PER_PASS = 100; + +export type StaleSweepConfig = { + // Entries whose dwell exceeds this threshold are flagged stale. Set + // it well below `entryTtlSeconds * 1000` so ops have lead time before + // TTL-induced silent loss; the default (half of entryTtlSeconds) + // matches the cadence in the plan doc. + staleThresholdMs: number; + maxEntriesPerEnv?: number; + // Hard cap on orgs visited per tick. 
Bounds the per-pass Redis traffic + // and wall-time. Default 100 — at typical fleet sizes one or two + // ticks cover everyone; under incident-scale fan-out a full cycle + // takes a handful of ticks (~minutes) which is still well below the + // staleness signal latency that ops cares about. + maxOrgsPerPass?: number; +}; + +export type StaleSweepDeps = { + getBuffer?: () => MollifierBuffer | null; + // Durable cursor + per-env counts hash. Required: the sweep is + // useless without persistent state across ticks. The webapp wires up + // a real `MollifierStaleSweepState`; tests pass one constructed + // against the test container. + state: StaleSweepStateStore; + // No `envId` arg — `envId` is a high-cardinality metric attribute and + // is intentionally not emitted as a metric label. The structured warn + // log below carries envId for forensic drill-down. + recordStaleEntry?: () => void; + reportStaleEntrySnapshot?: (snapshot: Map) => void; + logger?: { warn: (message: string, fields: Record) => void }; + now?: () => number; +}; + +export type StaleSweepResult = { + orgsScanned: number; + envsScanned: number; + entriesScanned: number; + staleCount: number; +}; + +// Walks a bounded slice of `orgs → envs → entries`, emitting an OTel +// counter tick and a structured warning log for each buffer entry whose +// dwell exceeds the stale threshold. Read-only on the buffer's own +// state; writes only to the sweep's four dedicated keys +// (`mollifier:stale_sweep:*`). The sweep does NOT remove or salvage +// buffer entries; that decision is deferred to a separate retention- +// policy change. The signal here exists so ops sees the drainer falling +// behind well before TTL-induced loss kicks in. +// +// Sharding contract: +// - Cursor starts at 0. On cursor=0 the org list is refreshed by +// snapshotting `buffer.listOrgs()` into the durable LIST — that is +// the cycle's frozen view of orgs to visit. 
+// - Each tick consumes up to `maxOrgsPerPass` orgs from the LIST, +// advances the cursor, and persists. +// - When the cursor reaches the end of the LIST it wraps to 0; the next +// tick rebuilds the org list, capturing any orgs that joined the +// buffer mid-cycle. +// - The per-env counts HASH carries over across ticks: an env visited +// on tick N and not revisited until tick N+M keeps its last-known +// stale count in the gauge for that window. This is the price of +// sharding — accepted because the alternative (re-scan everything +// every tick) does not bound work. +export async function runStaleSweepOnce( + config: StaleSweepConfig, + deps: StaleSweepDeps, +): Promise { + const getBuffer = deps.getBuffer ?? getMollifierBuffer; + const recordStale = deps.recordStaleEntry ?? defaultRecordStaleEntry; + const reportSnapshot = + deps.reportStaleEntrySnapshot ?? defaultReportStaleEntrySnapshot; + const log = deps.logger ?? defaultLogger; + const now = (deps.now ?? Date.now)(); + const maxEntries = config.maxEntriesPerEnv ?? DEFAULT_MAX_ENTRIES_PER_ENV; + const maxOrgsPerPass = config.maxOrgsPerPass ?? DEFAULT_MAX_ORGS_PER_PASS; + + const buffer = getBuffer(); + if (!buffer) { + // Replace any previous snapshot with empty so a previously-paging + // env doesn't stay latched if mollifier is turned off mid-flight. + // Also clear the durable state so a re-enable starts from a clean + // slate instead of resuming on a stale cursor. + await deps.state.clearAll(); + reportSnapshot(new Map()); + return { orgsScanned: 0, envsScanned: 0, entriesScanned: 0, staleCount: 0 }; + } + + let cursor = await deps.state.readCursor(); + if (cursor === 0) { + // Fresh cycle — capture the current set of orgs into the frozen + // LIST. Any orgs that join after this snapshot wait until the next + // cycle to be visited. 
Acceptable for an observational sweep; the + // staleness signal would only fire on entries that have been + // dwelling for `staleThresholdMs` anyway, so they're not new. + const orgs = await buffer.listOrgs(); + await deps.state.rebuildOrgList(orgs); + } + + const { orgs: slice, total } = await deps.state.readOrgListSlice( + cursor, + maxOrgsPerPass, + ); + + let envsScanned = 0; + let entriesScanned = 0; + let staleCount = 0; + + for (const orgId of slice) { + const envs = await buffer.listEnvsForOrg(orgId); + for (const envId of envs) { + envsScanned += 1; + let envStale = 0; + const entries = await buffer.listEntriesForEnv(envId, maxEntries); + for (const entry of entries) { + entriesScanned += 1; + const dwellMs = now - entry.createdAt.getTime(); + if (dwellMs > config.staleThresholdMs) { + recordStale(); + log.warn("mollifier.stale_entry", { + runId: entry.runId, + envId, + orgId, + dwellMs, + staleThresholdMs: config.staleThresholdMs, + }); + envStale += 1; + } + } + // Persist the per-env count to the durable hash. HSET when stale + // > 0, HDEL when it dropped back to zero — the hash is the source + // of truth for the gauge snapshot below. + await deps.state.setEnvStaleCount(envId, envStale); + // Track that this env was visited during the current cycle. The + // reconcile step at cycle wrap uses this to HDEL counts hash + // entries for envs that fully drained mid-cycle (they disappear + // from listEnvsForOrg, so the inner loop above never reaches them + // and never HDELs their hash field — without reconcile the gauge + // would stay elevated forever). + await deps.state.markEnvVisited(envId); + staleCount += envStale; + } + } + + // Advance the cursor. If the slice consumed the end of the LIST, wrap + // to 0 so the next tick rebuilds the org list and starts a new cycle. + const advanced = cursor + slice.length; + const wrapped = advanced >= total; + const newCursor = wrapped ? 
0 : advanced; + await deps.state.writeCursor(newCursor); + + if (wrapped) { + // Cycle ended. HDEL any env still in the counts hash that didn't + // appear in any tick of the just-completed cycle — these are envs + // that fully drained from the buffer mid-cycle and would otherwise + // hold their stale gauge value forever. Also DELs the visited set + // so the next cycle starts clean. + await deps.state.reconcileVisited(); + } + + // Emit the snapshot from the durable hash, which carries values for + // envs visited in earlier ticks too. This is what makes the gauge + // stable across ticks (and across webapp restarts). + const snapshot = await deps.state.readAllEnvStaleCounts(); + reportSnapshot(snapshot); + + return { orgsScanned: slice.length, envsScanned, entriesScanned, staleCount }; +} + +export type StaleSweepIntervalHandle = { + stop: () => Promise; +}; + +// Production wrapper: schedule `runStaleSweepOnce` on a fixed interval. +// One pass at a time — if a sweep is still running when the timer fires +// the next tick is skipped (a backed-up Redis would otherwise queue +// overlapping sweeps that all log the same stale entries). +export function startStaleSweepInterval( + config: StaleSweepConfig & { intervalMs: number }, + deps: StaleSweepDeps, +): StaleSweepIntervalHandle { + let stopped = false; + let inFlight = false; + // Tracks the current tick so `stop()` can await it before closing the + // state's Redis client. Without this, a tick that's already past the + // `stopped` guard at entry would continue making `state.*` calls + // against an ioredis client that `stop()` has already `quit()`ed, + // raising errors that the tick's own try/catch then logs as + // `mollifier.stale_sweep.failed` warnings — spurious noise on every + // graceful shutdown. 
+ let currentTick: Promise | null = null; + + const tick = async () => { + if (stopped || inFlight) return; + inFlight = true; + const run = (async () => { + try { + await runStaleSweepOnce(config, deps); + } catch (err) { + const log = deps.logger ?? defaultLogger; + log.warn("mollifier.stale_sweep.failed", { + err: err instanceof Error ? err.message : String(err), + }); + } finally { + inFlight = false; + currentTick = null; + } + })(); + currentTick = run; + await run; + }; + + const timer = setInterval(() => { + void tick(); + }, config.intervalMs); + + return { + stop: async () => { + stopped = true; + clearInterval(timer); + // Drain any tick that started before `stopped` flipped. Its + // `state.*` calls must land before we close the Redis client. + if (currentTick) { + try { + await currentTick; + } catch { + // tick has its own catch — this await is just to ensure + // ordering, not to surface errors that have already been + // logged inside the tick. + } + } + // Close the state's underlying resource. The `close()` method is + // part of the `StaleSweepStateStore` contract — production's + // `MollifierStaleSweepState` shuts down its ioredis client; fake + // test states implement a no-op. + await deps.state.close(); + }, + }; +} diff --git a/apps/webapp/app/v3/mollifier/mollifierStaleSweepState.server.ts b/apps/webapp/app/v3/mollifier/mollifierStaleSweepState.server.ts new file mode 100644 index 00000000000..4fce5ad9ee4 --- /dev/null +++ b/apps/webapp/app/v3/mollifier/mollifierStaleSweepState.server.ts @@ -0,0 +1,188 @@ +import { createRedisClient, type Redis, type RedisOptions } from "@internal/redis"; +import { Logger } from "@trigger.dev/core/logger"; + +// Durable per-tick state for the sharded stale sweep. 
Four Redis keys, +// all in the `mollifier:` namespace alongside the buffer's own state: +// +// mollifier:stale_sweep:cursor STRING next position in org_list (0 = fresh cycle) +// mollifier:stale_sweep:org_list LIST org IDs frozen at the start of the cycle +// mollifier:stale_sweep:counts HASH envId -> last-known stale count +// mollifier:stale_sweep:visited SET envIds visited during the current cycle +// +// The state survives webapp restarts: a restarted process picks up the +// cursor where the previous one left off and re-emits the last-known +// gauge values immediately, rather than blinking to zero until the next +// cycle visits each env. +// +// The `visited` set exists to GC the `counts` hash at cycle wrap: an env +// that drains completely between sweep ticks disappears from +// `buffer.listEnvsForOrg`, so the sweep's inner loop never revisits it +// and never HDELs its counts entry. Without the visited-set GC the +// counts hash retains the env's last-known stale count forever and the +// gauge stays permanently elevated. At cursor wrap we diff the hash +// against the cycle's visited set and HDEL the difference. +// +// Storage is owned by this class rather than added to MollifierBuffer +// because the keys are sweep-internal — the buffer abstracts the +// drainer/queue state, this abstracts sweep state. They share a +// namespace prefix but no API surface. + +export interface StaleSweepStateStore { + readCursor(): Promise; + writeCursor(value: number): Promise; + /** Replaces the cycle's frozen org_list. Called at cursor=0. */ + rebuildOrgList(orgs: string[]): Promise; + /** Returns up to `count` org IDs starting at `start`, plus the LIST's total length. */ + readOrgListSlice(start: number, count: number): Promise<{ orgs: string[]; total: number }>; + /** HSET when count > 0, HDEL when count === 0 (so the snapshot reflects current truth). 
*/ + setEnvStaleCount(envId: string, count: number): Promise; + readAllEnvStaleCounts(): Promise>; + /** SADD `envId` to the current cycle's visited set. Called once per env scanned per tick. */ + markEnvVisited(envId: string): Promise; + /** + * HDEL every env in the counts hash that is NOT in the visited set, then + * DEL the visited set. Called when the cursor wraps (cycle ends) so + * envs that fully drained mid-cycle get cleaned out of the gauge. + */ + reconcileVisited(): Promise; + clearAll(): Promise; + close(): Promise; +} + +const CURSOR_KEY = "mollifier:stale_sweep:cursor"; +const ORG_LIST_KEY = "mollifier:stale_sweep:org_list"; +const COUNTS_KEY = "mollifier:stale_sweep:counts"; +const VISITED_KEY = "mollifier:stale_sweep:visited"; + +export class MollifierStaleSweepState implements StaleSweepStateStore { + private readonly redis: Redis; + private readonly logger: Logger; + + constructor(options: { redisOptions: RedisOptions; logger?: Logger }) { + this.logger = options.logger ?? new Logger("MollifierStaleSweepState", "debug"); + this.redis = createRedisClient( + { ...options.redisOptions, maxRetriesPerRequest: 20 }, + { + onError: (error) => { + this.logger.error("MollifierStaleSweepState redis client error:", { error }); + }, + }, + ); + } + + async readCursor(): Promise { + const raw = await this.redis.get(CURSOR_KEY); + if (raw === null) return 0; + const n = Number.parseInt(raw, 10); + return Number.isFinite(n) && n >= 0 ? n : 0; + } + + async writeCursor(value: number): Promise { + await this.redis.set(CURSOR_KEY, String(value)); + } + + async rebuildOrgList(orgs: string[]): Promise { + // DEL + RPUSH in a pipeline — close enough to atomic for an + // observational sweep (the inFlight guard at startStaleSweepInterval + // serialises sweep passes; nothing else writes these keys). 
+ const pipeline = this.redis.pipeline(); + pipeline.del(ORG_LIST_KEY); + if (orgs.length > 0) { + pipeline.rpush(ORG_LIST_KEY, ...orgs); + } + await pipeline.exec(); + } + + async readOrgListSlice( + start: number, + count: number, + ): Promise<{ orgs: string[]; total: number }> { + const pipeline = this.redis.pipeline(); + pipeline.lrange(ORG_LIST_KEY, start, start + count - 1); + pipeline.llen(ORG_LIST_KEY); + const results = await pipeline.exec(); + // `pipeline.exec()` returning null is the abort-on-broken-pipe path. + // Surface it as a thrown error — the previous `return { orgs: [], total: 0 }` + // looked indistinguishable from a genuinely empty org list to the + // caller (`runStaleSweepOnce`), which then wrote cursor=0, reconciled + // visited envs against the empty result, and cleared the stale-entry + // gauge. That hid real Redis problems and silenced the alerts the + // sweep exists to raise. + if (!results) { + throw new Error("MollifierStaleSweepState.readOrgListSlice: pipeline.exec returned null"); + } + const [lrangeErr, lrangeRes] = results[0] as [Error | null, string[] | null]; + const [llenErr, llenRes] = results[1] as [Error | null, number | null]; + if (lrangeErr || llenErr) { + this.logger.error("MollifierStaleSweepState.readOrgListSlice failed", { + lrangeErr: lrangeErr?.message, + llenErr: llenErr?.message, + }); + // Same reasoning as the null-result path above — propagate the + // failure so the sweep's interval wrapper records a failed cycle + // and the durable cursor / counts hash stay untouched. + throw lrangeErr ?? llenErr ?? new Error("MollifierStaleSweepState.readOrgListSlice failed"); + } + return { orgs: lrangeRes ?? [], total: llenRes ?? 
0 }; + } + + async setEnvStaleCount(envId: string, count: number): Promise { + if (count > 0) { + await this.redis.hset(COUNTS_KEY, envId, String(count)); + } else { + await this.redis.hdel(COUNTS_KEY, envId); + } + } + + async readAllEnvStaleCounts(): Promise> { + const raw = await this.redis.hgetall(COUNTS_KEY); + const out = new Map(); + for (const [envId, value] of Object.entries(raw)) { + const n = Number.parseInt(value, 10); + if (Number.isFinite(n)) out.set(envId, n); + } + return out; + } + + async markEnvVisited(envId: string): Promise { + await this.redis.sadd(VISITED_KEY, envId); + } + + async reconcileVisited(): Promise { + // HKEYS + SMEMBERS in a pipeline, then HDEL the difference locally. + // For typical fleet sizes (counts and visited both bounded by the + // count of buffered envs) this is well within a single RTT plus one + // small HDEL. + const pipeline = this.redis.pipeline(); + pipeline.hkeys(COUNTS_KEY); + pipeline.smembers(VISITED_KEY); + const results = await pipeline.exec(); + if (!results) return; + const [hkeysErr, hkeysRes] = results[0] as [Error | null, string[] | null]; + const [smembersErr, smembersRes] = results[1] as [Error | null, string[] | null]; + if (hkeysErr || smembersErr) { + this.logger.error("MollifierStaleSweepState.reconcileVisited failed", { + hkeysErr: hkeysErr?.message, + smembersErr: smembersErr?.message, + }); + return; + } + const hashEnvs = hkeysRes ?? []; + const visited = new Set(smembersRes ?? 
[]); + const orphans = hashEnvs.filter((envId) => !visited.has(envId)); + const cleanup = this.redis.pipeline(); + if (orphans.length > 0) { + cleanup.hdel(COUNTS_KEY, ...orphans); + } + cleanup.del(VISITED_KEY); + await cleanup.exec(); + } + + async clearAll(): Promise { + await this.redis.del(CURSOR_KEY, ORG_LIST_KEY, COUNTS_KEY, VISITED_KEY); + } + + async close(): Promise { + await this.redis.quit(); + } +} diff --git a/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts b/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts index 0fe302584ce..f9c7ca72f1f 100644 --- a/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts @@ -15,3 +15,87 @@ export function recordDecision(outcome: DecisionOutcome, reason?: DecisionReason ...(reason ? { reason } : {}), }); } + +// Counts subscriptions hitting `/realtime/v1/runs/` for a run that +// lives only in the mollifier buffer (no PG row yet). The route opens +// the Electric stream anyway so the eventual drainer-INSERT propagates +// to the client; this counter is the signal of how often customers +// subscribe inside the buffered window. +export const realtimeBufferedSubscriptionsCounter = meter.createCounter( + "mollifier.realtime_subscriptions.buffered", + { + description: + "Realtime subscriptions opened against a runId that exists only in the mollifier buffer", + }, +); + +// No `envId` attribute — `envId` is a banned high-cardinality metric +// label per the repo's OTel rules. The structured warn log emitted +// alongside the counter tick (in `mollifierStaleSweep.server.ts`) +// carries the envId / orgId / runId for forensic drill-down; the +// metric stays an aggregate. +export function recordRealtimeBufferedSubscription(): void { + realtimeBufferedSubscriptionsCounter.add(1); +} + +// Counts buffer entries that have been waiting in the queue ZSET longer +// than the configured stale threshold. 
Useful for historical "stale +// events over time" views, but not directly alertable on its own — a +// single stuck entry observed by N sweep ticks adds N to the counter, +// so `rate()` over an alerting window reflects (entries × ticks), not +// "entries that are stale right now". +export const staleEntriesCounter = meter.createCounter( + "mollifier.stale_entries", + { + description: + "Mollifier buffer entries whose dwell exceeds the stale threshold (per sweep pass)", + }, +); + +// No `envId` attribute — see comment above. +export function recordStaleEntry(): void { + staleEntriesCounter.add(1); +} + +// Alertable signal: the total count of stale entries observed by the +// latest sweep. The sweep snapshots the full picture on each pass so +// the gauge drops back to 0 when the drainer catches up instead of +// staying latched. Recommended alert: +// mollifier_stale_entries_current > 0 for 5m +export const staleEntriesGauge = meter.createObservableGauge( + "mollifier.stale_entries.current", + { + description: + "Buffer entries whose dwell exceeds the stale threshold, as observed by the latest sweep pass", + }, +); + +let latestStaleTotal = 0; + +export function reportStaleEntrySnapshot(snapshot: Map): void { + // Sum across envs. Per-env breakdown is intentionally NOT emitted as + // a metric label (high-cardinality); the structured warn log lines + // from the sweep carry per-env detail for ops to drill down. + let total = 0; + for (const count of snapshot.values()) { + total += count; + } + latestStaleTotal = total; +} + +meter.addBatchObservableCallback( + (result) => { + result.observe(staleEntriesGauge, latestStaleTotal); + }, + [staleEntriesGauge], +); + +// Electric SQL's shape-stream protocol adds a `handle=` query param on +// every reconnect after the initial GET. 
Gating the realtime-buffered +// log/counter on its absence keeps the signal at one tick per +// subscription instead of one tick per ~20s live-poll iteration — +// without it the counter would over-count by the long-poll factor. +export function isInitialBufferedSubscriptionRequest(url: string | URL): boolean { + const u = typeof url === "string" ? new URL(url) : url; + return !u.searchParams.has("handle"); +} diff --git a/apps/webapp/app/v3/mollifier/mollifierTripEvaluator.server.ts b/apps/webapp/app/v3/mollifier/mollifierTripEvaluator.server.ts index 4bd9a34d412..9032467d200 100644 --- a/apps/webapp/app/v3/mollifier/mollifierTripEvaluator.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierTripEvaluator.server.ts @@ -35,8 +35,8 @@ export function createRealTripEvaluator(deps: CreateRealTripEvaluatorDeps): Trip } catch (err) { // Deliberate: no error counter here. Shadow mode means a silent miss is // harmless — fail-open is the safe direction. The error log + Sentry - // capture is sufficient operability for Phase 1. Revisit in Phase 2 - // when buffer writes are the primary path and a missed evaluation has cost. + // capture is sufficient operability while this runs in shadow mode. Revisit + // once buffer writes are the primary path and a missed evaluation has cost. logger.error("mollifier trip evaluator: fail-open on error", { envId: inputs.envId, err: err instanceof Error ? 
err.message : String(err),
diff --git a/apps/webapp/app/v3/mollifier/mutateWithFallback.server.ts b/apps/webapp/app/v3/mollifier/mutateWithFallback.server.ts
new file mode 100644
index 00000000000..9de8f64b3e9
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/mutateWithFallback.server.ts
@@ -0,0 +1,246 @@
+import type {
+  BufferEntry,
+  MollifierBuffer,
+  MutateSnapshotResult,
+  SnapshotPatch,
+} from "@trigger.dev/redis-worker";
+import type { TaskRun } from "@trigger.dev/database";
+import { prisma, $replica } from "~/db.server";
+import { logger } from "~/services/logger.server";
+import { getMollifierBuffer } from "./mollifierBuffer.server";
+
+// Wait/retry knobs. Exported for tests.
+export const DEFAULT_SAFETY_NET_MS = 2_000;
+// Initial gap between buffer polls; grows by BACKOFF_FACTOR up to
+// DEFAULT_MAX_POLL_STEP_MS so a slow drain doesn't poll at a tight fixed
+// cadence for the whole safety-net budget.
+export const DEFAULT_POLL_STEP_MS = 20;
+export const DEFAULT_MAX_POLL_STEP_MS = 250;
+const BACKOFF_FACTOR = 1.7;
+
+export type MutateWithFallbackInput<TResponse> = {
+  runId: string;
+  environmentId: string;
+  organizationId: string;
+  bufferPatch: SnapshotPatch;
+  // Called when a PG row exists (either replica-hit or post-wait writer-hit).
+  // Receives the full TaskRun shape and returns the customer-visible body.
+  pgMutation: (pgRow: TaskRun) => Promise<TResponse>;
+  // Called when the patch landed cleanly on the buffer snapshot. The
+  // drainer will see the patched payload on its next pop. Receives the
+  // pre-mutation snapshot entry (the one fetched for the env auth
+  // check above) so the caller can compute response details that
+  // depend on the prior state — e.g. the tags route needs to dedup
+  // against the existing tags to report an accurate `newTags` count
+  // matching the PG path, without an extra Redis round-trip.
+  // `bufferEntry` is `null` in the rare race where the entry didn't
+  // exist at pre-check time but appeared before `mutateSnapshot`.
+  synthesisedResponse: (ctx: {
+    bufferEntry: BufferEntry | null;
+  }) => TResponse | Promise<TResponse>;
+  // Called when the buffer rejected the patch as invalid (e.g. an
+  // `append_tags` patch carrying `maxTags` would exceed the cap). Required
+  // only by callers that send a rejectable patch; the helper throws if the
+  // buffer reports a rejection and no builder was supplied. Receives the
+  // same `bufferEntry` context as `synthesisedResponse` so a rejection
+  // message can reference the prior state if useful.
+  rejectedResponse?: (ctx: {
+    bufferEntry: BufferEntry | null;
+  }) => TResponse | Promise<TResponse>;
+  abortSignal?: AbortSignal;
+  // Override defaults for tests.
+  safetyNetMs?: number;
+  pollStepMs?: number;
+  maxPollStepMs?: number;
+  // Test injection.
+  getBuffer?: () => MollifierBuffer | null;
+  prismaWriter?: TaskRunReader;
+  prismaReplica?: TaskRunReader;
+  sleep?: (ms: number) => Promise<void>;
+  now?: () => number;
+  // Jitter source; defaults to Math.random. Inject `() => 0` for
+  // deterministic poll timing in tests.
+  random?: () => number;
+};
+
+export type MutateWithFallbackOutcome<TResponse> =
+  | { kind: "pg"; response: TResponse }
+  | { kind: "snapshot"; response: TResponse }
+  | { kind: "rejected"; response: TResponse }
+  | { kind: "not_found" }
+  | { kind: "timed_out" };
+
+// PG-first → buffer mutateSnapshot → wait-and-bounce. The
+// caller decides how to translate the outcome into an HTTP response —
+// this helper never throws Response objects so it remains route-agnostic
+// and unit-testable in isolation.
+export async function mutateWithFallback<TResponse>(
+  input: MutateWithFallbackInput<TResponse>,
+): Promise<MutateWithFallbackOutcome<TResponse>> {
+  const replica = input.prismaReplica ?? $replica;
+  const writer = input.prismaWriter ?? prisma;
+  const buffer = (input.getBuffer ?? getMollifierBuffer)();
+  const sleep = input.sleep ?? defaultSleep;
+  const now = input.now ?? Date.now;
+
+  // Path 1 — PG is already canonical.
+  const replicaRow = await findRunInPg(replica, input.runId, input.environmentId);
+  if (replicaRow) {
+    const response = await input.pgMutation(replicaRow);
+    return { kind: "pg", response };
+  }
+
+  if (!buffer) {
+    // No buffer configured (mollifier disabled or boot-time error). The
+    // pre-PR mutation routes read from the writer directly, so a freshly-
+    // created PG row was always visible regardless of replication lag.
+    // Now that the read moved to the replica (Path 1 above) for the offload,
+    // a `!buffer` short-circuit would regress: a real PG row + replica
+    // lag would return 404. Mirror the writer-disambiguation block below
+    // (the buffer-says-`not_found` branch of Path 2) so degraded mode
+    // (mollifier disabled) still matches pre-PR mutation behaviour.
+    const writerRow = await findRunInPg(writer, input.runId, input.environmentId);
+    if (writerRow) {
+      const response = await input.pgMutation(writerRow);
+      return { kind: "pg", response };
+    }
+    return { kind: "not_found" };
+  }
+
+  // Env-scoped authorization for the buffer path. The replica/writer
+  // lookups above are already env-scoped via findRunInPg; this closes
+  // the same gap on the buffer side so a caller authed in env A can't
+  // mutate a buffered run that belongs to env B (or a different org)
+  // by guessing its friendlyId. Non-atomic w.r.t. the mutateSnapshot
+  // call below, but the TOCTOU is benign: runIds are globally unique,
+  // so a cross-env entry can't suddenly appear after a same-env check.
+  // A genuinely-missing entry (entry === null) falls through and is
+  // handled by the existing not_found / writer-recovery path below.
+  const entryForAuth = await buffer.getEntry(input.runId);
+  if (
+    entryForAuth &&
+    (entryForAuth.envId !== input.environmentId ||
+      entryForAuth.orgId !== input.organizationId)
+  ) {
+    // Hide existence on env mismatch: return not_found, same shape as
+    // a true miss, rather than 403 which would leak that the runId
+    // exists in some other env.
+    return { kind: "not_found" };
+  }
+
+  // Path 2 — buffer snapshot mutation.
+  const result: MutateSnapshotResult = await buffer.mutateSnapshot(
+    input.runId,
+    input.bufferPatch,
+  );
+
+  if (result === "applied_to_snapshot") {
+    return {
+      kind: "snapshot",
+      response: await input.synthesisedResponse({ bufferEntry: entryForAuth }),
+    };
+  }
+
+  if (result === "limit_exceeded") {
+    // The buffer refused the patch (e.g. tag cap). Nothing was written.
+    // Surface the caller's rejection body; a missing builder means the
+    // caller sent a rejectable patch without handling the rejection.
+    if (!input.rejectedResponse) {
+      throw new Error(
+        "mutateWithFallback: buffer returned 'limit_exceeded' but no rejectedResponse was provided",
+      );
+    }
+    return {
+      kind: "rejected",
+      response: await input.rejectedResponse({ bufferEntry: entryForAuth }),
+    };
+  }
+
+  if (result === "not_found") {
+    // Disambiguate a genuine 404 from a replica-lag miss: ask the writer
+    // directly. If the row just appeared post-drain we route through the
+    // PG mutation path.
+    const writerRow = await findRunInPg(writer, input.runId, input.environmentId);
+    if (writerRow) {
+      const response = await input.pgMutation(writerRow);
+      return { kind: "pg", response };
+    }
+    return { kind: "not_found" };
+  }
+
+  // result === "busy" — the entry is mid-handoff (DRAINING) or already
+  // materialised. We do NOT poll the primary for the row to appear: that
+  // piles read load onto the writer at exactly the moment mollifier exists
+  // to shed it. Instead we watch the buffer entry itself (cheap Redis
+  // reads). The drainer writes the PG row BEFORE it acks (sets
+  // `materialised`) or fails (deletes the entry), so the entry's own state
+  // is an authoritative, already-in-Redis signal for "is the row in PG
+  // yet?". Only once it resolves do we touch the primary — exactly once,
+  // for the real mutation.
+  const safetyNetMs = input.safetyNetMs ?? DEFAULT_SAFETY_NET_MS;
+  const maxPollStepMs = input.maxPollStepMs ?? DEFAULT_MAX_POLL_STEP_MS;
+  const random = input.random ?? Math.random;
+  const deadline = now() + safetyNetMs;
+  let step = input.pollStepMs ?? DEFAULT_POLL_STEP_MS;
+
+  while (now() < deadline) {
+    if (input.abortSignal?.aborted) {
+      return { kind: "timed_out" };
+    }
+
+    const entry = await buffer.getEntry(input.runId);
+    // Resolved when the entry is gone (`fail` deleted it after writing a
+    // terminal SYSTEM_FAILURE row) or materialised (`ack` after a
+    // successful trigger / cancel write). In both cases the PG row is now
+    // committed on the primary, so read it once and route through the
+    // canonical PG mutation path.
+    if (entry === null || entry.materialised === true) {
+      const row = await findRunInPg(writer, input.runId, input.environmentId);
+      if (row) {
+        const response = await input.pgMutation(row);
+        return { kind: "pg", response };
+      }
+      // Entry gone with no PG row: the drainer's terminal write itself
+      // failed (PG unreachable). Nothing to mutate.
+      return { kind: "not_found" };
+    }
+    // Still QUEUED (requeued after a retryable drain error) or DRAINING —
+    // the run hasn't reached PG. Back off with jitter so concurrent
+    // waiters on the same draining run don't requery in lockstep.
+    if (now() >= deadline) break;
+    const jittered = step + Math.floor(random() * step);
+    await sleep(jittered);
+    step = Math.min(Math.ceil(step * BACKOFF_FACTOR), maxPollStepMs);
+  }
+
+  logger.warn("mollifier mutate-with-fallback: drainer resolution timed out", {
+    runId: input.runId,
+    safetyNetMs,
+  });
+  return { kind: "timed_out" };
+}
+
+// Structural reader interface — accepts both the writer (`prisma`) and the
+// replica (`$replica`), which differ slightly in their generated Prisma
+// types but share the findFirst surface used here.
+type TaskRunReader = {
+  taskRun: {
+    findFirst(args: {
+      where: { friendlyId: string; runtimeEnvironmentId: string };
+    }): Promise<TaskRun | null>;
+  };
+};
+
+async function findRunInPg(
+  client: TaskRunReader,
+  friendlyId: string,
+  environmentId: string,
+): Promise<TaskRun | null> {
+  return client.taskRun.findFirst({
+    where: { friendlyId, runtimeEnvironmentId: environmentId },
+  });
+}
+
+function defaultSleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
diff --git a/apps/webapp/app/v3/mollifier/readFallback.server.ts b/apps/webapp/app/v3/mollifier/readFallback.server.ts
index 34a8b48f970..21dd6c23957 100644
--- a/apps/webapp/app/v3/mollifier/readFallback.server.ts
+++ b/apps/webapp/app/v3/mollifier/readFallback.server.ts
@@ -1,4 +1,10 @@
+import type { MollifierBuffer } from "@trigger.dev/redis-worker";
+import { RunId } from "@trigger.dev/core/v3/isomorphic";
+import { IdempotencyKeyOptionsSchema } from "@trigger.dev/core/v3/schemas";
+import type { z } from "zod";
 import { logger } from "~/services/logger.server";
+import { deserialiseMollifierSnapshot } from "./mollifierSnapshot.server";
+import { getMollifierBuffer } from "./mollifierBuffer.server";
 
 export type ReadFallbackInput = {
   runId: string;
@@ -6,11 +12,255 @@ export type ReadFallbackInput = {
   organizationId: string;
 };
 
+export type SyntheticRun = {
+  // Snapshot-derived TaskRun primary key. Used by ReplayTaskRunService
+  // for logging and by callers passing this object where a TaskRun is
+  // expected (cast). Derived deterministically from `friendlyId`.
+  id: string;
+  friendlyId: string;
+  status: "QUEUED" | "FAILED" | "CANCELED";
+  // Set when the customer cancelled the run via the dashboard or API
+  // while it was buffered. The drainer's cancel bifurcation reads this
+  // on next pop and writes a CANCELED PG row directly (skipping
+  // materialisation). Reflected back into the UI by the synthesised
+  // SpanRun so the run-detail page shows the cancelled state even before
+  // the drainer materialises it.
+  cancelledAt: Date | undefined;
+  cancelReason: string | undefined;
+  // Reschedule patch (`set_delay`) writes `delayUntil` into the snapshot.
+  // Surfacing it on SyntheticRun lets the retrieve-run shape reflect the
+  // pending delay before the drainer materialises the PG row.
+  delayUntil: Date | undefined;
+  taskIdentifier: string | undefined;
+  createdAt: Date;
+
+  payload: unknown;
+  payloadType: string | undefined;
+  metadata: unknown;
+  metadataType: string | undefined;
+  // Seed-metadata mirrors what `triggerTask.server.ts` writes into the
+  // snapshot: the original metadataPacket data preserved separately from
+  // any later customer mutations. ReplayTaskRunService uses these to
+  // rebuild the replay's metadata.
+  seedMetadata: string | undefined;
+  seedMetadataType: string | undefined;
+
+  idempotencyKey: string | undefined;
+  // Surfaced for the cached-hit expiration check in IdempotencyKeyConcern.
+  // The PG-resident path enforces this (clears key, allows new run when
+  // expired). For buffered runs the snapshot carries the same field — we
+  // expose it here so the cached-hit branch can apply the same check
+  // rather than indefinitely returning the buffered run's id.
+  idempotencyKeyExpiresAt: Date | undefined;
+  // `{ key, scope }` object form, matching how the SDK serialises and PG
+  // stores it. Previously typed as `string[]` (legacy/incorrect — Prisma
+  // is `Json?` carrying the schema-shaped object). `getUserProvidedIdempotencyKey`
+  // and `extractIdempotencyKeyScope` both parse via the same Zod schema;
+  // they returned `undefined` for the array-shape, which silently
+  // demoted the response to surface the hash instead of the user-
+  // provided key for buffered runs — a contract divergence from
+  // PG-resident runs. See the regression test in `mollifierReadFallback.test.ts`.
+  idempotencyKeyOptions: z.infer<typeof IdempotencyKeyOptionsSchema> | undefined;
+  isTest: boolean;
+  depth: number;
+  ttl: string | undefined;
+  tags: string[];
+  // Mirror of `tags` under the PG field name. ReplayTaskRunService reads
+  // `existingTaskRun.runTags`; both names are kept here so a synthetic
+  // run can be passed wherever the PG-shape `runTags` is expected.
+  runTags: string[];
+  lockedToVersion: string | undefined;
+  resumeParentOnCompletion: boolean;
+  parentTaskRunId: string | undefined;
+
+  // Allocated at gate-accept time and embedded in the snapshot so the run's
+  // trace is continuous from QUEUED-in-buffer through executing post-drain.
+  traceId: string | undefined;
+  spanId: string | undefined;
+  parentSpanId: string | undefined;
+
+  // Replay-relevant fields populated from the engine-trigger snapshot.
+  // ReplayTaskRunService reads each of these from the existing TaskRun;
+  // when the original lives in the buffer we synthesise them here.
+  runtimeEnvironmentId: string | undefined;
+  engine: "V2";
+  workerQueue: string | undefined;
+  queue: string | undefined;
+  concurrencyKey: string | undefined;
+  machinePreset: string | undefined;
+  realtimeStreamsVersion: string | undefined;
+
+  // Additional snapshot-sourced fields used when synthesising a SpanRun
+  // for the dashboard's right-side details panel. All optional because
+  // older snapshots may not carry them.
+  maxAttempts: number | undefined;
+  maxDurationInSeconds: number | undefined;
+  replayedFromTaskRunFriendlyId: string | undefined;
+  annotations: unknown;
+  traceContext: unknown;
+  scheduleId: string | undefined;
+  batchId: string | undefined;
+  parentTaskRunFriendlyId: string | undefined;
+  rootTaskRunFriendlyId: string | undefined;
+
+  error?: { code: string; message: string };
+};
+
+export type ReadFallbackDeps = {
+  getBuffer?: () => MollifierBuffer | null;
+};
+
+function asString(value: unknown): string | undefined {
+  return typeof value === "string" ? value : undefined;
+}
+
+function asStringArray(value: unknown): string[] {
+  return Array.isArray(value) && value.every((v) => typeof v === "string") ? (value as string[]) : [];
+}
+
+function asDate(value: unknown): Date | undefined {
+  const raw = asString(value);
+  if (!raw) return undefined;
+  const parsed = new Date(raw);
+  return Number.isNaN(parsed.getTime()) ? undefined : parsed;
+}
+
+// Snapshot ids are written by engine.trigger as INTERNAL ids (cuids); the
+// SyntheticRun contract exposes friendlyIds. The synthetic run's own `id`
+// is derived with `RunId.fromFriendlyId`; use the inverse, `toFriendlyId`,
+// for parent/root so consumers see the same shape as the PG path.
+function internalRunIdToFriendlyId(internalId: string | undefined): string | undefined {
+  if (!internalId) return undefined;
+  return RunId.toFriendlyId(internalId);
+}
+
 export async function findRunByIdWithMollifierFallback(
   input: ReadFallbackInput,
-): Promise {
-  logger.debug("mollifier read-fallback called (phase 1 stub)", {
-    runId: input.runId,
-  });
-  return null;
+  deps: ReadFallbackDeps = {},
+): Promise<SyntheticRun | null> {
+  const buffer = (deps.getBuffer ?? getMollifierBuffer)();
+  if (!buffer) return null;
+
+  try {
+    const entry = await buffer.getEntry(input.runId);
+    if (!entry) return null;
+
+    if (entry.envId !== input.environmentId || entry.orgId !== input.organizationId) {
+      logger.warn("mollifier read-fallback auth mismatch", {
+        runId: input.runId,
+        callerEnvId: input.environmentId,
+        callerOrgId: input.organizationId,
+      });
+      return null;
+    }
+
+    const snapshot = deserialiseMollifierSnapshot(entry.payload);
+    // Parse via the canonical schema (`{ key: string, scope: "run" |
+    // "attempt" | "global" }`) rather than the legacy Array.isArray
+    // check. The SDK and Prisma both store this as an object; the array
+    // form never matches, so a buffered run's response previously fell
+    // back to the server-side hash in `getUserProvidedIdempotencyKey`
+    // instead of the customer-supplied key — diverging from how
+    // materialised runs render the same field.
+    const idempotencyKeyOptionsParsed = IdempotencyKeyOptionsSchema.safeParse(
+      snapshot.idempotencyKeyOptions,
+    );
+    const idempotencyKeyOptions = idempotencyKeyOptionsParsed.success
+      ? idempotencyKeyOptionsParsed.data
+      : undefined;
+
+    const tags = asStringArray(snapshot.tags);
+    const environment =
+      snapshot.environment && typeof snapshot.environment === "object"
+        ? (snapshot.environment as Record<string, unknown>)
+        : undefined;
+
+    const cancelledAt = asDate(snapshot.cancelledAt);
+    const cancelReason = asString(snapshot.cancelReason);
+    let status: SyntheticRun["status"] = "QUEUED";
+    if (cancelledAt) {
+      status = "CANCELED";
+    } else if (entry.status === "FAILED") {
+      status = "FAILED";
+    }
+    const delayUntil = asDate(snapshot.delayUntil);
+
+    return {
+      id: RunId.fromFriendlyId(entry.runId),
+      friendlyId: entry.runId,
+      status,
+      cancelledAt,
+      cancelReason,
+      delayUntil,
+      taskIdentifier: asString(snapshot.taskIdentifier),
+      createdAt: entry.createdAt,
+
+      payload: snapshot.payload,
+      payloadType: asString(snapshot.payloadType),
+      metadata: snapshot.metadata,
+      metadataType: asString(snapshot.metadataType),
+      seedMetadata: asString(snapshot.seedMetadata),
+      seedMetadataType: asString(snapshot.seedMetadataType),
+
+      idempotencyKey: asString(snapshot.idempotencyKey),
+      idempotencyKeyExpiresAt: asDate(snapshot.idempotencyKeyExpiresAt),
+      idempotencyKeyOptions,
+      isTest: snapshot.isTest === true,
+      depth: typeof snapshot.depth === "number" ? snapshot.depth : 0,
+      ttl: asString(snapshot.ttl),
+      tags,
+      runTags: tags,
+      lockedToVersion: asString(snapshot.taskVersion),
+      resumeParentOnCompletion: snapshot.resumeParentOnCompletion === true,
+      parentTaskRunId: asString(snapshot.parentTaskRunId),
+
+      traceId: asString(snapshot.traceId),
+      spanId: asString(snapshot.spanId),
+      parentSpanId: asString(snapshot.parentSpanId),
+
+      runtimeEnvironmentId:
+        asString(environment?.id) ?? entry.envId,
+      engine: "V2",
+      workerQueue: asString(snapshot.workerQueue),
+      queue: asString(snapshot.queue),
+      concurrencyKey: asString(snapshot.concurrencyKey),
+      machinePreset: asString(snapshot.machine),
+      realtimeStreamsVersion: asString(snapshot.realtimeStreamsVersion),
+
+      maxAttempts: typeof snapshot.maxAttempts === "number" ? snapshot.maxAttempts : undefined,
+      maxDurationInSeconds:
+        typeof snapshot.maxDurationInSeconds === "number"
+          ? snapshot.maxDurationInSeconds
+          : undefined,
+      replayedFromTaskRunFriendlyId: asString(snapshot.replayedFromTaskRunFriendlyId),
+      annotations: snapshot.annotations,
+      traceContext: snapshot.traceContext,
+      scheduleId: asString(snapshot.scheduleId),
+      // The engine.trigger input embeds the batch as `{ id, index }` (see
+      // triggerTask.server.ts #buildEngineTriggerInput), not as a flat
+      // `batchId`. The nested `id` is the batch's internal cuid — the same
+      // value PG stores in `TaskRun.batchId` — so callers reconstruct the
+      // friendly id via `BatchId.toFriendlyId` exactly as the PG path does.
+      batchId: asString((snapshot.batch as { id?: unknown } | undefined)?.id),
+      // The snapshot only carries the INTERNAL parent/root ids
+      // (`parentTaskRunId` / `rootTaskRunId` — what engine.trigger consumes),
+      // not the friendlyIds the SyntheticRun contract expects. Convert
+      // internal → friendly here so consumers don't have to special-case
+      // the buffered path.
+      parentTaskRunFriendlyId: internalRunIdToFriendlyId(
+        asString(snapshot.parentTaskRunId)
+      ),
+      rootTaskRunFriendlyId: internalRunIdToFriendlyId(
+        asString(snapshot.rootTaskRunId)
+      ),
+
+      error: entry.lastError,
+    };
+  } catch (err) {
+    logger.error("mollifier read-fallback errored — fail-open to null", {
+      runId: input.runId,
+      err: err instanceof Error ? err.message : String(err),
+    });
+    return null;
+  }
 }
diff --git a/apps/webapp/app/v3/mollifier/resolveRunForMutation.server.ts b/apps/webapp/app/v3/mollifier/resolveRunForMutation.server.ts
new file mode 100644
index 00000000000..b3db81368b9
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/resolveRunForMutation.server.ts
@@ -0,0 +1,82 @@
+import type { MollifierBuffer } from "@trigger.dev/redis-worker";
+import { $replica as defaultReplica, prisma as defaultWriter } from "~/db.server";
+import { getMollifierBuffer as defaultGetBuffer } from "./mollifierBuffer.server";
+
+// Discriminated-union resolver used by mutation routes' `findResource`.
+// The route builder treats a null return from `findResource` as a 404
+// BEFORE the action handler runs (`apiBuilder.server.ts:321`), so we
+// must check BOTH the PG canonical store and the mollifier buffer here
+// — otherwise a buffered run can't be cancelled / mutated even though
+// the underlying mutateWithFallback flow would handle it correctly.
+//
+// (Regression: before extracting this helper the cancel route had
+// `findResource: async () => null`, which made every cancel 404 before
+// the action ran. The helper makes the lookup unit-testable.)
+export type ResolvedRunForMutation =
+  | { source: "pg"; friendlyId: string }
+  | { source: "buffer"; friendlyId: string };
+
+type PrismaTaskRunFindFirst = {
+  taskRun: {
+    findFirst(args: {
+      where: { friendlyId: string; runtimeEnvironmentId: string };
+      select: { friendlyId: true };
+    }): Promise<{ friendlyId: string } | null>;
+  };
+};
+
+export type ResolveRunForMutationDeps = {
+  prismaReplica?: PrismaTaskRunFindFirst;
+  prismaWriter?: PrismaTaskRunFindFirst;
+  getBuffer?: () => MollifierBuffer | null;
+};
+
+export async function resolveRunForMutation(input: {
+  runParam: string;
+  environmentId: string;
+  organizationId: string;
+  deps?: ResolveRunForMutationDeps;
+}): Promise<ResolvedRunForMutation | null> {
+  const replica = input.deps?.prismaReplica ?? defaultReplica;
+  const writer = input.deps?.prismaWriter ?? defaultWriter;
+  const getBuffer = input.deps?.getBuffer ?? defaultGetBuffer;
+
+  const pgRun = await replica.taskRun.findFirst({
+    where: { friendlyId: input.runParam, runtimeEnvironmentId: input.environmentId },
+    select: { friendlyId: true },
+  });
+  if (pgRun) return { source: "pg", friendlyId: pgRun.friendlyId };
+
+  const buffer = getBuffer();
+
+  if (buffer) {
+    const entry = await buffer.getEntry(input.runParam);
+    if (
+      entry &&
+      entry.envId === input.environmentId &&
+      entry.orgId === input.organizationId
+    ) {
+      return { source: "buffer", friendlyId: input.runParam };
+    }
+  }
+
+  // Replica + buffer both missed. Before declaring "not found" (which the
+  // route builder converts to a hard 404 *before* the action handler runs,
+  // so the downstream `mutateWithFallback` writer-recovery never gets a
+  // chance to fire), do one final probe against the writer. This catches
+  // two cases:
+  //   1. Replica lag on a freshly-created PG row.
+  //   2. A buffered run that materialised in the window between the
+  //      replica read and our buffer check (the entry was ack'd and the
+  //      hash is mid-grace-TTL but our getEntry returned null due to
+  //      lookup-by-friendlyId timing).
+  // Without this, the resolver returns null in degraded states that the
+  // downstream mutateWithFallback flow would otherwise handle correctly.
+  const writerRun = await writer.taskRun.findFirst({
+    where: { friendlyId: input.runParam, runtimeEnvironmentId: input.environmentId },
+    select: { friendlyId: true },
+  });
+  if (writerRun) return { source: "pg", friendlyId: writerRun.friendlyId };
+
+  return null;
+}
diff --git a/apps/webapp/app/v3/mollifier/syntheticApiResponses.server.ts b/apps/webapp/app/v3/mollifier/syntheticApiResponses.server.ts
new file mode 100644
index 00000000000..02c63fe91f1
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/syntheticApiResponses.server.ts
@@ -0,0 +1,73 @@
+import type { SyntheticRun } from "./readFallback.server";
+
+// Buffered runs have no execution data — the drainer hasn't materialised
+// the PG row and the worker hasn't started. The SDK-facing read routes
+// still need to return a span/trace shape that satisfies their response
+// schemas; these helpers build that minimal shape from the buffered
+// SyntheticRun.
+//
+// CANCELED and FAILED are terminal states: a FAILED buffered run is
+// errored (drainer exhausted retries or the gate rejected it) and must
+// not signal "still in progress." The flags below mirror
+// syntheticTrace.server.ts so the SDK contract stays consistent across
+// the three read paths (spans, trace, dashboard trace presenter).
+
+function deriveTerminalFlags(status: SyntheticRun["status"]): {
+  isError: boolean;
+  isPartial: boolean;
+  isCancelled: boolean;
+} {
+  const isCancelled = status === "CANCELED";
+  const isFailed = status === "FAILED";
+  return {
+    isError: isFailed,
+    isPartial: !isCancelled && !isFailed,
+    isCancelled,
+  };
+}
+
+// Body for GET /api/v1/runs/:runId/spans/:spanId when the run is buffered
+// and `:spanId` has already been verified against `buffered.spanId` by the
+// route. Pure function so the route layer just authenticates, resolves
+// the run, validates the spanId, and forwards the buffered run here.
+export function buildSyntheticSpanDetailBody(buffered: SyntheticRun) {
+  const flags = deriveTerminalFlags(buffered.status);
+  return {
+    spanId: buffered.spanId,
+    parentId: buffered.parentSpanId ?? null,
+    runId: buffered.friendlyId,
+    message: buffered.taskIdentifier ?? "",
+    ...flags,
+    level: "TRACE" as const,
+    startTime: buffered.createdAt,
+    durationMs: 0,
+  };
+}
+
+// Body for GET /api/v1/runs/:runId/trace when the run is buffered.
+// Returns the `{ trace: { traceId, rootSpan } }` envelope expected by the
+// SDK's RetrieveRunTraceResponseBody schema.
+export function buildSyntheticTraceBody(buffered: SyntheticRun) {
+  const flags = deriveTerminalFlags(buffered.status);
+  return {
+    trace: {
+      traceId: buffered.traceId ?? "",
+      rootSpan: {
+        id: buffered.spanId ?? "",
+        runId: buffered.friendlyId,
+        data: {
+          message: buffered.taskIdentifier ?? "",
+          taskSlug: buffered.taskIdentifier ?? undefined,
+          events: [] as unknown[],
+          startTime: buffered.createdAt,
+          duration: 0,
+          ...flags,
+          level: "TRACE" as const,
+          queueName: buffered.queue ?? undefined,
+          machinePreset: buffered.machinePreset ?? undefined,
+        },
+        children: [] as unknown[],
+      },
+    },
+  };
+}
diff --git a/apps/webapp/app/v3/mollifier/syntheticRedirectInfo.server.ts b/apps/webapp/app/v3/mollifier/syntheticRedirectInfo.server.ts
new file mode 100644
index 00000000000..e316846d708
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/syntheticRedirectInfo.server.ts
@@ -0,0 +1,119 @@
+import type { MollifierBuffer } from "@trigger.dev/redis-worker";
+import type { PrismaClientOrTransaction } from "@trigger.dev/database";
+import { z } from "zod";
+import { prisma } from "~/db.server";
+import { logger } from "~/services/logger.server";
+import { getMollifierBuffer } from "./mollifierBuffer.server";
+// Use the webapp-side wrapper (not `deserialiseSnapshot` from
+// @trigger.dev/redis-worker directly) so this file shares a single
+// deserialisation path with readFallback.server.ts. The two are
+// behaviourally identical today (both wrap `JSON.parse`), but pinning
+// the shared helper keeps the two read-side modules from drifting if
+// snapshot encoding ever changes.
+import { deserialiseMollifierSnapshot } from "./mollifierSnapshot.server";
+
+// Validated subset of a mollifier snapshot — just the fields needed to
+// rebuild a canonical run-detail URL for a buffered run. Anything else
+// in the payload is ignored. `safeParse` against this schema replaces
+// the ad-hoc `as Record<string, unknown>` + `typeof === "string"` checks
+// that the redirect path used to do by hand; missing or wrong-typed
+// fields collapse into a single `parsed.success === false` branch.
+const BufferedSnapshotSchema = z.object({
+  spanId: z.string().optional(),
+  environment: z.object({
+    slug: z.string(),
+    project: z.object({ slug: z.string() }),
+    organization: z.object({ slug: z.string() }),
+  }),
+});
+
+export type BufferedRunRedirectInfo = {
+  organizationSlug: string;
+  projectSlug: string;
+  environmentSlug: string;
+  spanId: string | undefined;
+};
+
+export type FindBufferedRunRedirectInfoDeps = {
+  getBuffer?: () => MollifierBuffer | null;
+  prismaClient?: PrismaClientOrTransaction;
+};
+
+// Resolve the org/project/env slugs needed to build the canonical run-detail
+// URL for a buffered run. Used by the short-URL redirect routes
+// (`runs.$runParam`, `@.runs.$runParam`, `projects.v3.$projectRef.runs.$runParam`)
+// so a customer clicking the trigger-API-returned run link doesn't 404
+// during the buffered window.
+//
+// Authorisation: PG query confirms the requesting user belongs to the
+// organisation the buffer entry says owns the run. Without this check a
+// known runId would leak slugs.
+export async function findBufferedRunRedirectInfo(
+  args: {
+    runFriendlyId: string;
+    userId: string;
+    // Admin impersonation paths bypass org-membership; mirrors the existing
+    // PG-side admin route behaviour (`@.runs.$runParam` doesn't filter by
+    // org membership in the PG query either).
+    skipOrgMembershipCheck?: boolean;
+  },
+  deps: FindBufferedRunRedirectInfoDeps = {},
+): Promise<BufferedRunRedirectInfo | null> {
+  const buffer = (deps.getBuffer ?? getMollifierBuffer)();
+  const prismaClient = deps.prismaClient ?? prisma;
+  if (!buffer) return null;
+
+  let entry;
+  try {
+    entry = await buffer.getEntry(args.runFriendlyId);
+  } catch (err) {
+    logger.warn("buffered redirect: buffer.getEntry failed", {
+      runFriendlyId: args.runFriendlyId,
+      err: err instanceof Error ? err.message : String(err),
+    });
+    return null;
+  }
+  if (!entry) return null;
+
+  if (!args.skipOrgMembershipCheck) {
+    const member = await prismaClient.orgMember.findFirst({
+      where: { userId: args.userId, organizationId: entry.orgId },
+      select: { id: true },
+    });
+    if (!member) return null;
+  }
+
+  let raw: unknown;
+  try {
+    raw = deserialiseMollifierSnapshot(entry.payload);
+  } catch (err) {
+    logger.warn("buffered redirect: snapshot deserialise failed", {
+      runFriendlyId: args.runFriendlyId,
+      err: err instanceof Error ? err.message : String(err),
+    });
+    return null;
+  }
+
+  const parsed = BufferedSnapshotSchema.safeParse(raw);
+  if (!parsed.success) {
+    // Either the snapshot is from a different writer that doesn't carry
+    // environment slugs (in which case we genuinely can't build a URL)
+    // or a buffer-format drift snuck through. Log at debug; the caller
+    // 404s and the user sees the standard not-found page, not a 500.
+    logger.debug("buffered redirect: snapshot shape mismatch", {
+      runFriendlyId: args.runFriendlyId,
+      issues: parsed.error.issues.map((issue) => ({
+        path: issue.path.join("."),
+        code: issue.code,
+      })),
+    });
+    return null;
+  }
+
+  return {
+    organizationSlug: parsed.data.environment.organization.slug,
+    projectSlug: parsed.data.environment.project.slug,
+    environmentSlug: parsed.data.environment.slug,
+    spanId: parsed.data.spanId,
+  };
+}
diff --git a/apps/webapp/app/v3/mollifier/syntheticReplayTaskRun.server.ts b/apps/webapp/app/v3/mollifier/syntheticReplayTaskRun.server.ts
new file mode 100644
index 00000000000..01962cf7890
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/syntheticReplayTaskRun.server.ts
@@ -0,0 +1,51 @@
+import type { TaskRun } from "@trigger.dev/database";
+import type { SyntheticRun } from "./readFallback.server";
+
+export type SyntheticReplayTaskRun = TaskRun & {
+  project: { slug: string; organization: { slug: string } };
+  runtimeEnvironment: { slug: string };
+};
+
+// Adapt a buffered-run snapshot into the TaskRun-shaped input that
+// `ReplayTaskRunService.call` expects. ReplayTaskRunService builds the
+// new run's traceparent as `00-${existingTaskRun.traceId}-${existingTaskRun.spanId}-01`
+// without guarding for undefined, so a synthetic with missing traceId
+// or spanId (older snapshots — both fields are documented optional on
+// `SyntheticRun`) would produce `00-undefined-undefined-01`, an invalid
+// W3C traceparent that OTel silently drops, severing the replay's trace
+// link to the original run.
+//
+// Returns null when those fields are missing — the caller surfaces this
+// as "Run not found" so the customer retries once the drainer has
+// materialised the PG row, where traceId/spanId are guaranteed present.
+export function buildSyntheticReplayTaskRun(args: {
+  synthetic: SyntheticRun;
+  envRow: {
+    slug: string;
+    project: { slug: string; organization: { slug: string } };
+  };
+}): SyntheticReplayTaskRun | null {
+  const { synthetic, envRow } = args;
+  if (!synthetic.traceId || !synthetic.spanId) return null;
+  return {
+    // The double `as unknown as TaskRun` cast is load-bearing — a direct
+    // `synthetic as TaskRun` won't compile. `SyntheticRun` carries the
+    // subset of fields that `ReplayTaskRunService.call` actually reads
+    // (the contract is enumerated on the SyntheticRun type comment in
+    // readFallback.server.ts), but its shape is not structurally
+    // assignable to the full Prisma `TaskRun` row: optional vs required
+    // fields diverge, several PG columns (number, batchId variants,
+    // status enum widening) are deliberately absent or narrower on the
+    // synthetic. Routing it through `unknown` is the explicit "we know
+    // this is a subset, we've audited which fields are read" signal,
+    // and the traceId/spanId guard above prevents the only field
+    // ReplayTaskRunService consumes that would corrupt downstream
+    // behaviour (the OTel traceparent) when undefined.
+    ...(synthetic as unknown as TaskRun),
+    project: {
+      slug: envRow.project.slug,
+      organization: { slug: envRow.project.organization.slug },
+    },
+    runtimeEnvironment: { slug: envRow.slug },
+  };
+}
diff --git a/apps/webapp/app/v3/mollifier/syntheticRunHeader.server.ts b/apps/webapp/app/v3/mollifier/syntheticRunHeader.server.ts
new file mode 100644
index 00000000000..9b137f87fb3
--- /dev/null
+++ b/apps/webapp/app/v3/mollifier/syntheticRunHeader.server.ts
@@ -0,0 +1,75 @@
+import type { SyntheticRun } from "./readFallback.server";
+
+// Synthesise the run-detail page's `run` header shape (the NavBar +
+// status badge + Cancel-button gate) from a buffered run snapshot. The
+// shape matches `RunPresenter.getRun`'s `runData` — keep this in sync
+// when fields are added there.
+//
+// CANCELED and FAILED state is reflected back from
+// `SyntheticRun.cancelledAt` / `status` so terminal buffered runs show
+// the correct status in the NavBar + isFinished:true (which collapses
+// the Cancel button on the page header) before the drainer materialises
+// the PG row. This mirrors what `buildSyntheticSpanRun` does for the
+// right-side details panel — the SyntheticRun.cancelledAt contract
+// comment in readFallback.server.ts names this exact UI surface.
+//
+// FAILED status maps to `SYSTEM_FAILURE` to match the drainer's
+// non-retryable terminal path, which is what `buildSyntheticSpanRun`
+// uses too. Symmetric across the header + span-detail panel so an
+// admin doesn't see "Pending" + "FAILED" simultaneously on the same
+// run.
+export function buildSyntheticRunHeader(args: { + run: SyntheticRun; + environment: { + id: string; + organizationId: string; + type: "PRODUCTION" | "DEVELOPMENT" | "STAGING" | "PREVIEW"; + slug: string; + }; +}) { + const { run, environment } = args; + const isCancelled = run.status === "CANCELED"; + const isFailed = run.status === "FAILED"; + + return { + // `id` mirrors RunPresenter.getRun's runData (the PG path), which + // is the internal cuid — not the friendlyId. SyntheticRun.id is + // already the cuid (RunId.fromFriendlyId(entry.runId) in + // readFallback.server.ts) so the admin debug tooltip on the run + // detail page shows the same format for buffered + materialised + // runs. + id: run.id, + number: 1, + friendlyId: run.friendlyId, + traceId: run.traceId ?? "", + spanId: run.spanId ?? "", + status: isCancelled + ? ("CANCELED" as const) + : isFailed + ? ("SYSTEM_FAILURE" as const) + : ("PENDING" as const), + isFinished: isCancelled || isFailed, + startedAt: null, + // Symmetric with `buildSyntheticSpanRun` and the + // `ApiRetrieveRunPresenter` synth path. The run-detail route + // derives `isCompleted` from `completedAt !== null` and gates SSE + // live-reloading on it (`route.tsx:459`, `:551`); leaving + // `completedAt` null for FAILED would keep a terminal buffered run + // live-reloading forever. PG-resident SYSTEM_FAILURE rows always + // have completedAt set, so fall back to createdAt (the buffer + // entry has no separate failedAt — closest proxy for when the + // terminal state landed). + completedAt: run.cancelledAt ?? (isFailed ? 
run.createdAt : null), + logsDeletedAt: null, + rootTaskRun: null, + parentTaskRun: null, + environment: { + id: environment.id, + organizationId: environment.organizationId, + type: environment.type, + slug: environment.slug, + userId: undefined, + userName: undefined, + }, + }; +} diff --git a/apps/webapp/app/v3/mollifier/syntheticSpanRun.server.ts b/apps/webapp/app/v3/mollifier/syntheticSpanRun.server.ts new file mode 100644 index 00000000000..ae274aac3d5 --- /dev/null +++ b/apps/webapp/app/v3/mollifier/syntheticSpanRun.server.ts @@ -0,0 +1,197 @@ +import { prettyPrintPacket, RunAnnotations } from "@trigger.dev/core/v3"; +import { getMaxDuration } from "@trigger.dev/core/v3/isomorphic"; +import { + extractIdempotencyKeyScope, + getUserProvidedIdempotencyKey, +} from "@trigger.dev/core/v3/serverOnly"; +import { MachinePresetName } from "@trigger.dev/core/v3/schemas"; +import type { SpanRun } from "~/presenters/v3/SpanPresenter.server"; +import type { SyntheticRun } from "./readFallback.server"; + +// `SyntheticRun.machinePreset` is sourced from the snapshot payload as +// a plain string, but `SpanRun.machinePreset` is the narrowed enum. +// Validate against the canonical enum so an unknown / stale preset +// string collapses to undefined rather than fighting the type checker. +function narrowMachinePreset(value: string | undefined): SpanRun["machinePreset"] { + if (value === undefined) return undefined; + const parsed = MachinePresetName.safeParse(value); + return parsed.success ? parsed.data : undefined; +} + +// Synthesise a SpanRun-shaped object from a buffered run so the run-detail +// page's right-side details panel renders identically to a PG-resident +// run. 
The shape matches `SpanPresenter.getRun`'s return value; +// buffered-irrelevant fields (output, attempts, schedule, session, +// region, batch) are filled with sensible defaults, while terminal state +// (CANCELED / FAILED) is reflected into `status`, `isFinished`, `isError` +// and `error` so a finished buffered run does not render as PENDING. +// +// Pretty-printing for payload and metadata mirrors SpanPresenter so the +// UI receives data in the same shape. Buffered runs cannot use the +// `application/store` packet path (no R2 object yet) so we treat raw +// snapshot fields as inline packets. +export async function buildSyntheticSpanRun(args: { + run: SyntheticRun; + environment: { id: string; slug: string; type: "PRODUCTION" | "DEVELOPMENT" | "STAGING" | "PREVIEW" }; +}): Promise<SpanRun> { + const { run, environment } = args; + + const payload = + typeof run.payload !== "undefined" && run.payload !== null + ? await prettyPrintPacket(run.payload, run.payloadType ?? undefined) + : undefined; + + // Nullish check, not truthy — matches the payload branch above so an + // intentionally-empty packet (e.g. metadata: "") still gets handed to + // `prettyPrintPacket` and renders consistently. A truthy check would + // drop the empty-string case and the two paths would diverge. + const metadata = + typeof run.metadata !== "undefined" && run.metadata !== null + ? await prettyPrintPacket(run.metadata, run.metadataType, { + filteredKeys: ["$$streams", "$$streamsVersion", "$$streamsBaseUrl"], + }) + : undefined; + + const idempotencyShape = { + idempotencyKey: run.idempotencyKey ?? null, + idempotencyKeyExpiresAt: null, + idempotencyKeyOptions: run.idempotencyKeyOptions ?? null, + }; + + const idempotencyKey = getUserProvidedIdempotencyKey(idempotencyShape); + const idempotencyKeyScope = extractIdempotencyKeyScope(idempotencyShape); + const idempotencyKeyStatus: SpanRun["idempotencyKeyStatus"] = idempotencyKey + ? "active" + : idempotencyKeyScope + ? 
"inactive" + : undefined; + + const taskKind = RunAnnotations.safeParse(run.annotations).data?.taskKind; + const isAgentRun = taskKind === "AGENT"; + + const queueName = run.queue ?? "task/"; + const isCancelled = run.status === "CANCELED"; + const isFailed = run.status === "FAILED"; + + // The run-detail panel derives terminal/error state from `status`, + // `isFinished` and `isError` (SpanPresenter.getRun -> isFinalRunStatus / + // isFailedRunStatus). Buffered FAILED runs surface as SYSTEM_FAILURE to + // match ApiRetrieveRunPresenter.bufferedStatusToTaskRunStatus; both + // CANCELED and SYSTEM_FAILURE are final run statuses, and SYSTEM_FAILURE + // is also a failed status. + const status: SpanRun["status"] = isCancelled + ? "CANCELED" + : isFailed + ? "SYSTEM_FAILURE" + : "PENDING"; + + // Mirror ApiRetrieveRunPresenter's STRING_ERROR synthesis so the panel + // shows why a buffered run failed instead of an empty error block. + const error: SpanRun["error"] = + isFailed && run.error + ? { type: "STRING_ERROR", raw: `${run.error.code}: ${run.error.message}` } + : undefined; + + return { + id: run.id, + friendlyId: run.friendlyId, + status, + statusReason: isCancelled + ? run.cancelReason ?? undefined + : isFailed + ? run.error?.message ?? undefined + : undefined, + createdAt: run.createdAt, + startedAt: null, + executedAt: null, + updatedAt: run.cancelledAt ?? run.createdAt, + delayUntil: run.delayUntil ?? null, + expiredAt: null, + // Symmetric with `ApiRetrieveRunPresenter` — FAILED buffered runs + // must surface a non-null `completedAt` so the run-detail panel + // (and any caller checking `isFinished && completedAt`) doesn't + // render a finished run with no completion timestamp. PG-resident + // SYSTEM_FAILURE rows always have completedAt set; the buffer + // entry has no separate failedAt, so we fall back to createdAt + // as the best proxy for when the terminal state landed. + completedAt: run.cancelledAt ?? (isFailed ? 
run.createdAt : null), + logsDeletedAt: null, + ttl: run.ttl ?? null, + taskIdentifier: run.taskIdentifier ?? "", + version: undefined, + sdkVersion: undefined, + runtime: undefined, + runtimeVersion: undefined, + isTest: run.isTest, + replayedFromTaskRunFriendlyId: run.replayedFromTaskRunFriendlyId ?? null, + environmentId: environment.id, + idempotencyKey, + idempotencyKeyExpiresAt: null, + idempotencyKeyScope, + idempotencyKeyStatus, + debounce: null, + schedule: undefined, + queue: { + name: queueName, + isCustomQueue: !queueName.startsWith("task/"), + concurrencyKey: run.concurrencyKey ?? null, + }, + tags: run.runTags, + baseCostInCents: 0, + costInCents: 0, + totalCostInCents: 0, + usageDurationMs: 0, + isFinished: isCancelled || isFailed, + isRunning: false, + isError: isFailed, + isAgentRun, + payload, + payloadType: run.payloadType ?? "application/json", + output: undefined, + outputType: "application/json", + error, + // The snapshot only carries the root/parent friendly IDs, not the + // spanId or taskIdentifier that SpanPresenter sources from the joined + // PG rows. Emitting them with empty-string stubs renders a blank task + // name and a misleading `?span=` jump target, so we omit the + // relationships until the drainer materialises the row (a transient + // window). Top-level buffered runs have no relationships regardless. + relationships: { + root: undefined, + parent: undefined, + }, + context: JSON.stringify( + { + task: { + id: run.taskIdentifier ?? "", + }, + run: { + id: run.friendlyId, + createdAt: run.createdAt, + isTest: run.isTest, + }, + environment: { + id: environment.id, + slug: environment.slug, + type: environment.type, + }, + }, + null, + 2, + ), + metadata, + maxDurationInSeconds: getMaxDuration(run.maxDurationInSeconds), + batch: undefined, + session: undefined, + engine: "V2", + region: null, + workerQueue: run.workerQueue ?? "", + traceId: run.traceId ?? "", + spanId: run.spanId ?? 
"", + isCached: false, + isBuffered: true, + machinePreset: narrowMachinePreset(run.machinePreset), + taskEventStore: "taskEvent", + externalTraceId: undefined, + }; +} diff --git a/apps/webapp/app/v3/mollifier/syntheticTrace.server.ts b/apps/webapp/app/v3/mollifier/syntheticTrace.server.ts new file mode 100644 index 00000000000..ee0d518e2e7 --- /dev/null +++ b/apps/webapp/app/v3/mollifier/syntheticTrace.server.ts @@ -0,0 +1,76 @@ +import { millisecondsToNanoseconds } from "@trigger.dev/core/v3"; +import { createTreeFromFlatItems, flattenTree } from "~/components/primitives/TreeView/TreeView"; +import { createTimelineSpanEventsFromSpanEvents } from "~/utils/timelineSpanEvents"; +import type { SpanSummary } from "~/v3/eventRepository/eventRepository.types"; +import type { SyntheticRun } from "./readFallback.server"; + +// Build a single-span trace for a buffered run so the run-detail page +// renders a meaningful timeline before the drainer materialises the +// row. Mirrors the shape produced by `RunPresenter` when its trace +// store lookup returns no spans, so the dashboard consumer treats the +// buffered run identically to a freshly enqueued PG run that hasn't +// emitted any events yet. +export function buildSyntheticTraceForBufferedRun(run: SyntheticRun) { + const spanId = run.spanId ?? ""; + const isCancelled = run.status === "CANCELED"; + const isFailed = run.status === "FAILED"; + const span: SpanSummary = { + id: spanId, + parentId: run.parentSpanId, + runId: run.friendlyId, + data: { + message: run.taskIdentifier ?? "Task", + style: { icon: "task", variant: "primary" }, + events: [], + startTime: run.createdAt, + duration: 0, + isError: isFailed, + // CANCELED and FAILED are terminal; only a still-queued buffered run + // is partial. A partial failed span would otherwise render as + // "executing" forever in the timeline. 
+ isPartial: !isCancelled && !isFailed, + isCancelled, + isDebug: false, + level: "TRACE", + }, + }; + + const tree = createTreeFromFlatItems([span], spanId); + const treeRootStartTimeMs = tree?.data.startTime.getTime() ?? 0; + const totalDuration = Math.max(tree?.data.duration ?? 0, millisecondsToNanoseconds(1)); + + const events = tree + ? flattenTree(tree).map((n) => { + const offset = millisecondsToNanoseconds( + n.data.startTime.getTime() - treeRootStartTimeMs + ); + return { + ...n, + data: { + ...n.data, + timelineEvents: createTimelineSpanEventsFromSpanEvents(n.data.events, false, treeRootStartTimeMs), + duration: n.data.isPartial ? null : n.data.duration, + offset, + isRoot: n.id === spanId, + }, + }; + }) + : []; + + return { + // Matches RunPresenter's derivation: failed root span -> "failed", + // otherwise a terminal (non-partial) span -> "completed", else + // "executing". CANCELED is terminal-but-not-error, so "completed". + rootSpanStatus: (isFailed ? "failed" : isCancelled ? "completed" : "executing") as + | "executing" + | "completed" + | "failed", + events, + duration: totalDuration, + rootStartedAt: tree?.data.startTime, + startedAt: null, + queuedDuration: undefined, + overridesBySpanId: undefined, + linkedRunIdBySpanId: {} as Record<string, string>, + }; +} diff --git a/apps/webapp/app/v3/mollifierDrainerWorker.server.ts b/apps/webapp/app/v3/mollifierDrainerWorker.server.ts index 313e9af6719..e571344141d 100644 --- a/apps/webapp/app/v3/mollifierDrainerWorker.server.ts +++ b/apps/webapp/app/v3/mollifierDrainerWorker.server.ts @@ -97,8 +97,8 @@ export function initMollifierDrainerWorker( // Deterministic misconfig (shutdown-timeout vs GRACEFUL_SHUTDOWN_TIMEOUT, // missing buffer client) is a deploy-time mistake the operator must // see immediately — rethrow so the process crashes, health checks - // fail, and the orchestrator rolls the deploy back. 
Phase 1 is - // monitoring-only and the silent-fallback was tempting, but Phase 2/3 + // fail, and the orchestrator rolls the deploy back. The drainer is currently + // monitoring-only and the silent-fallback was tempting, but later phases // make the drainer the source of truth for diverted triggers, where a // silently-disabled drainer means data loss. Better to fail loud now // than retrofit later. diff --git a/apps/webapp/app/v3/mollifierStaleSweepWorker.server.ts b/apps/webapp/app/v3/mollifierStaleSweepWorker.server.ts new file mode 100644 index 00000000000..de05ab24671 --- /dev/null +++ b/apps/webapp/app/v3/mollifierStaleSweepWorker.server.ts @@ -0,0 +1,73 @@ +import { env } from "~/env.server"; +import { logger } from "~/services/logger.server"; +import { signalsEmitter } from "~/services/signals.server"; +import { + startStaleSweepInterval, + type StaleSweepIntervalHandle, +} from "./mollifier/mollifierStaleSweep.server"; +import { MollifierStaleSweepState } from "./mollifier/mollifierStaleSweepState.server"; + +declare global { + // eslint-disable-next-line no-var + var __mollifierStaleSweepRegistered__: boolean | undefined; + // eslint-disable-next-line no-var + var __mollifierStaleSweepHandle__: StaleSweepIntervalHandle | undefined; +} + +/** + * Bootstraps the mollifier stale-entry sweep. + * + * Independent of the drainer — its purpose is to alert when entries are + * piling up despite the drainer being supposedly healthy, so it runs + * any time the mollifier itself is enabled (gated separately from + * `TRIGGER_MOLLIFIER_DRAINER_ENABLED`). The sweep is read-only: it + * counts and logs stale entries but does not remove or salvage them. + * + * The Remix dev server re-evaluates `entry.server.tsx` on every change, + * so the registration guard + handle cache make the bootstrap + * idempotent across hot reloads. 
+ */ +export function initMollifierStaleSweepWorker(): void { + if (env.TRIGGER_MOLLIFIER_STALE_SWEEP_ENABLED !== "1") return; + if (global.__mollifierStaleSweepRegistered__) return; + + logger.debug("Initializing mollifier stale-entry sweep", { + intervalMs: env.TRIGGER_MOLLIFIER_STALE_SWEEP_INTERVAL_MS, + staleThresholdMs: env.TRIGGER_MOLLIFIER_STALE_SWEEP_THRESHOLD_MS, + }); + + // Construct the sweep's durable-state Redis client using the same + // mollifier-Redis credentials as the buffer. Keeping this client + // separate from the buffer's own client keeps state ownership clean: + // the buffer abstracts queue/entry state, this abstracts sweep state. + const state = new MollifierStaleSweepState({ + redisOptions: { + keyPrefix: "", + host: env.TRIGGER_MOLLIFIER_REDIS_HOST, + port: env.TRIGGER_MOLLIFIER_REDIS_PORT, + username: env.TRIGGER_MOLLIFIER_REDIS_USERNAME, + password: env.TRIGGER_MOLLIFIER_REDIS_PASSWORD, + enableAutoPipelining: true, + ...(env.TRIGGER_MOLLIFIER_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} }), + }, + }); + + const handle = startStaleSweepInterval( + { + intervalMs: env.TRIGGER_MOLLIFIER_STALE_SWEEP_INTERVAL_MS, + staleThresholdMs: env.TRIGGER_MOLLIFIER_STALE_SWEEP_THRESHOLD_MS, + }, + { state }, + ); + + // `handle.stop` is now async (it closes the Redis client). The signals + // emitter swallows promise rejections from listeners, so wrap it in a + // void-returning shim to be explicit about discarding the promise. 
+ const onShutdown = (): void => { + void handle.stop(); + }; + signalsEmitter.on("SIGTERM", onShutdown); + signalsEmitter.on("SIGINT", onShutdown); + global.__mollifierStaleSweepRegistered__ = true; + global.__mollifierStaleSweepHandle__ = handle; +} diff --git a/apps/webapp/app/v3/services/resetIdempotencyKey.server.ts b/apps/webapp/app/v3/services/resetIdempotencyKey.server.ts index 95684999303..8273d8c9d97 100644 --- a/apps/webapp/app/v3/services/resetIdempotencyKey.server.ts +++ b/apps/webapp/app/v3/services/resetIdempotencyKey.server.ts @@ -1,6 +1,7 @@ import type { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { BaseService, ServiceValidationError } from "./baseService.server"; import { logger } from "~/services/logger.server"; +import { getMollifierBuffer } from "~/v3/mollifier/mollifierBuffer.server"; export class ResetIdempotencyKeyService extends BaseService { public async call( @@ -8,7 +9,7 @@ export class ResetIdempotencyKeyService extends BaseService { taskIdentifier: string, authenticatedEnv: AuthenticatedEnvironment ): Promise<{ id: string }> { - const { count } = await this._prisma.taskRun.updateMany({ + const { count: pgCount } = await this._prisma.taskRun.updateMany({ where: { idempotencyKey, taskIdentifier, @@ -20,7 +21,77 @@ export class ResetIdempotencyKeyService extends BaseService { }, }); - if (count === 0) { + // Buffer-side reset: the key may belong to a buffered run that + // hasn't materialised yet. The PG updateMany above can't see it. + // resetIdempotency clears both the snapshot fields and the Redis + // lookup atomically. Returns null when nothing was bound there. + const buffer = getMollifierBuffer(); + let bufferResetFailed = false; + const bufferResult = buffer + ? await buffer + .resetIdempotency({ + envId: authenticatedEnv.id, + taskIdentifier, + idempotencyKey, + }) + .catch((err) => { + // Don't drop a buffer outage on the floor. 
We log + flag so + // the 404 branch below can distinguish "no record anywhere" + // (legitimate not-found) from "PG cleared nothing AND we + // couldn't see the buffer" (partial outage — caller should + // retry, not be told "doesn't exist"). + bufferResetFailed = true; + logger.error("ResetIdempotencyKeyService: buffer reset failed", { + idempotencyKey, + taskIdentifier, + err: err instanceof Error ? err.message : String(err), + }); + return { clearedRunId: null }; + }) + : { clearedRunId: null }; + + const totalCount = pgCount + (bufferResult.clearedRunId ? 1 : 0); + + if (pgCount === 0 && bufferResetFailed) { + // PG saw nothing AND the buffer is unreachable. We can't truthfully + // say "not found" — there may be a buffered run we can't observe. + // Surface as 503 so the caller retries instead of being misled. + throw new ServiceValidationError( + "Unable to verify buffered idempotency state right now; please retry", + 503 + ); + } + + if (totalCount === 0) { + // PG↔buffer handoff re-check. Between the initial `pg.updateMany` + // and the buffer reset above, a buffered run can materialise into + // PG: the drainer's `engine.trigger` writes the row with the + // original idempotencyKey, then `buffer.ack` clears the Redis + // idempotency lookup (per ack's contract on + // `packages/redis-worker/src/mollifier/buffer.ts`). Both surfaces + // now report "nothing", but the key still lives on the freshly- + // materialised PG row. One more conditional updateMany catches + // that row before we 404 the customer. Cost: a single indexed + // lookup against the writer when there's nothing to find; + // otherwise the exact write the customer asked for (i.e., not + // duplicative — without it the reset is silently lost). 
+ const { count: handoffPgCount } = await this._prisma.taskRun.updateMany({ + where: { + idempotencyKey, + taskIdentifier, + runtimeEnvironmentId: authenticatedEnv.id, + }, + data: { + idempotencyKey: null, + idempotencyKeyExpiresAt: null, + }, + }); + if (handoffPgCount > 0) { + logger.info( + `Reset idempotency key via handoff re-check: ${idempotencyKey} for task: ${taskIdentifier} in env: ${authenticatedEnv.id}, affected ${handoffPgCount} run(s)` + ); + return { id: idempotencyKey }; + } throw new ServiceValidationError( `No runs found with idempotency key: ${idempotencyKey} and task: ${taskIdentifier}`, 404 @@ -28,7 +99,7 @@ export class ResetIdempotencyKeyService extends BaseService { } logger.info( - `Reset idempotency key: ${idempotencyKey} for task: ${taskIdentifier} in env: ${authenticatedEnv.id}, affected ${count} run(s)` + `Reset idempotency key: ${idempotencyKey} for task: ${taskIdentifier} in env: ${authenticatedEnv.id}, affected ${totalCount} run(s) (pg=${pgCount}, buffered=${bufferResult.clearedRunId ? 1 : 0})` ); return { id: idempotencyKey }; diff --git a/apps/webapp/app/v3/services/triggerTask.server.ts b/apps/webapp/app/v3/services/triggerTask.server.ts index 96712c36cc4..7bbaa0dd99b 100644 --- a/apps/webapp/app/v3/services/triggerTask.server.ts +++ b/apps/webapp/app/v3/services/triggerTask.server.ts @@ -46,6 +46,14 @@ export class OutOfEntitlementError extends Error { export type TriggerTaskServiceResult = { run: TaskRun; isCached: boolean; + // True when the mollifier gate diverted the trigger to the Redis + // buffer and `run` is a synthesised record (no PG row exists yet). + // The trigger route reads this to skip `saveRequestIdempotency` — + // caching the synth runId would mean a lost-response SDK retry hits + // a PG-miss in `handleRequestIdempotency` and falls through to a + // fresh trigger, producing a duplicate buffer entry for trigger + // calls that don't carry a task-level idempotency key. 
+ isMollified?: boolean; }; export const MAX_ATTEMPTS = 2; diff --git a/apps/webapp/test/engine/triggerTask.test.ts b/apps/webapp/test/engine/triggerTask.test.ts index d07909d2907..8613ba429c3 100644 --- a/apps/webapp/test/engine/triggerTask.test.ts +++ b/apps/webapp/test/engine/triggerTask.test.ts @@ -68,17 +68,31 @@ class MockTriggerTaskValidator implements TriggerTaskValidator { } } +// Mirror the production ClickhouseEventRepository.traceEvent shape so +// callers that read `event.traceContext.traceparent` (e.g. the +// mollifier branch seeding the snapshot) get the same W3C-formatted +// value they'd get against a real event repository. +const MOCK_TRACE_ID = "0123456789abcdef0123456789abcdef"; +const MOCK_SPAN_ID = "fedcba9876543210"; +const MOCK_TRACEPARENT = `00-${MOCK_TRACE_ID}-${MOCK_SPAN_ID}-01`; + class MockTraceEventConcern implements TraceEventConcern { + // Records the start time of the most recent traceRun callback entry. + // Used by ordering assertions that verify traceRun fires before + // downstream side effects (e.g. mollifier buffer writes). + public traceRunEnteredAt: number | undefined; + async traceRun<T>( request: TriggerTaskRequest, parentStore: string | undefined, callback: (span: TracedEventSpan, store: string) => Promise<T> ): Promise<T> { + this.traceRunEnteredAt = Date.now(); return await callback( { - traceId: "test", - spanId: "test", - traceContext: {}, + traceId: MOCK_TRACE_ID, + spanId: MOCK_SPAN_ID, + traceContext: { traceparent: MOCK_TRACEPARENT }, traceparent: undefined, setAttribute: () => { }, failWithError: () => { }, @@ -253,6 +267,76 @@ describe("RunEngineTriggerTaskService", () => { await engine.quit(); }); + // The BatchQueue worker rebuilds body.options from Redis-stored items + // (Record<string, unknown>), so the Phase-2 schema coercion doesn't apply + // to in-flight items enqueued before the schema fix. 
The defensive + // `typeof === "number"` coercion at the engine.trigger call site is what + // prevents these from failing at prisma.taskRun.create with + // "Argument concurrencyKey: Expected String or Null, provided Int". + containerTest( + "coerces a numeric concurrencyKey to a string at the engine.trigger boundary", + async ({ prisma, redisOptions }) => { + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { redis: redisOptions }, + runLock: { redis: redisOptions }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const taskIdentifier = "test-task"; + await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); + + const triggerTaskService = new RunEngineTriggerTaskService({ + engine, + prisma, + payloadProcessor: new MockPayloadProcessor(), + queueConcern: new DefaultQueueManager(prisma, engine), + idempotencyKeyConcern: new IdempotencyKeyConcern( + prisma, + engine, + new MockTraceEventConcern() + ), + validator: new MockTriggerTaskValidator(), + traceEventConcern: new MockTraceEventConcern(), + tracer: trace.getTracer("test", "0.0.0"), + metadataMaximumSize: 1024 * 1024 * 1, + }); + + const result = await triggerTaskService.call({ + taskId: taskIdentifier, + environment: authenticatedEnvironment, + // Cast through `any` to simulate the in-flight Redis batch-item shape + // (Record<string, unknown>) that bypasses the BatchItemNDJSON schema. 
+ body: { payload: { userId: 51262 }, options: { concurrencyKey: 51262 as any } }, + }); + + expect(result).toBeDefined(); + const run = await prisma.taskRun.findUnique({ where: { id: result!.run.id } }); + expect(run?.concurrencyKey).toBe("51262"); + + await engine.quit(); + } + ); + containerTest("should handle idempotency keys correctly", async ({ prisma, redisOptions }) => { const engine = new RunEngine({ prisma, @@ -1269,8 +1353,17 @@ describe("RunEngineTriggerTaskService", () => { ); containerTest( - "mollifier · mollify action triggers dual-write (buffer.accept + engine.trigger)", + "mollifier · mollify action writes to buffer and returns synthetic result (no Postgres row)", async ({ prisma, redisOptions }) => { + // When the gate decides mollify, the call site + // invokes `mollifyTrigger` which writes the engine.trigger snapshot + // to the buffer and returns a synthesised `MollifySyntheticResult` + // (run.friendlyId + notice + isCached:false). `engine.trigger` is + // NEVER invoked on this path — the run materialises in Postgres + // later, when the drainer replays the snapshot. The replay is + // covered by `mollifierDrainerHandler.test.ts`; this test pins the + // call-site integration: synthetic result + buffer write + no + // Postgres side effect. const engine = new RunEngine({ prisma, worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 }, @@ -1288,7 +1381,24 @@ describe("RunEngineTriggerTaskService", () => { const taskIdentifier = "test-task"; await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); - const buffer = new CapturingMollifierBuffer(); + // Buffer override records the time of the accept call so we can + // assert that traceRun fired strictly before the buffer was + // touched. If a future change re-introduces the "skip traceRun on + // mollify" shortcut, traceConcern.traceRunEnteredAt stays + // undefined and the ordering assertion fails. 
+ class TimestampedBuffer extends CapturingMollifierBuffer { + public acceptedAt: number | undefined; + override async accept(input: { + runId: string; + envId: string; + orgId: string; + payload: string; + }) { + this.acceptedAt = Date.now(); + return await super.accept(input); + } + } + const buffer = new TimestampedBuffer(); const trippedDecision = { divert: true as const, reason: "per_env_rate" as const, @@ -1297,6 +1407,7 @@ describe("RunEngineTriggerTaskService", () => { windowMs: 200, holdMs: 500, }; + const traceConcern = new MockTraceEventConcern(); const triggerTaskService = new RunEngineTriggerTaskService({ engine, @@ -1305,7 +1416,7 @@ describe("RunEngineTriggerTaskService", () => { queueConcern: new DefaultQueueManager(prisma, engine), idempotencyKeyConcern: new IdempotencyKeyConcern(prisma, engine, new MockTraceEventConcern()), validator: new MockTriggerTaskValidator(), - traceEventConcern: new MockTraceEventConcern(), + traceEventConcern: traceConcern, tracer: trace.getTracer("test", "0.0.0"), metadataMaximumSize: 1024 * 1024, evaluateGate: async () => ({ action: "mollify", decision: trippedDecision }), @@ -1319,25 +1430,93 @@ describe("RunEngineTriggerTaskService", () => { body: { payload: { hello: "world" } }, }); - // engine.trigger ran — Postgres has the run + // Pre-modifier span creation: traceRun must run BEFORE the buffer + // is touched. Customer-visible effect — the run span lands in + // ClickHouse from the moment the trigger returns, even when the + // drainer is offline, so buffered runs are visible in the trace + // view immediately rather than only after drain. 
+ expect(traceConcern.traceRunEnteredAt).toBeDefined(); + expect(buffer.acceptedAt).toBeDefined(); + expect(traceConcern.traceRunEnteredAt!).toBeLessThanOrEqual(buffer.acceptedAt!); + + // Synthetic result is returned with the `mollifier.queued` notice + // (the call-site casts the synthetic shape to `TriggerTaskServiceResult`; + // at runtime the `notice` and `isCached: false` fields are present + // and read by the api.v1.tasks.$taskId.trigger.ts route handler). expect(result).toBeDefined(); expect(result?.run.friendlyId).toBeDefined(); - const pgRun = await prisma.taskRun.findFirst({ where: { id: result!.run.id } }); - expect(pgRun).not.toBeNull(); - expect(pgRun!.friendlyId).toBe(result!.run.friendlyId); - - // buffer.accept ran — Redis has the audit copy under the same friendlyId + const synthetic = result as unknown as { + run: { friendlyId: string }; + isCached: false; + notice: { code: string; message: string; docs: string }; + }; + expect(synthetic.isCached).toBe(false); + expect(synthetic.notice.code).toBe("mollifier.queued"); + expect(synthetic.notice.message).toBeTypeOf("string"); + expect(synthetic.notice.docs).toBeTypeOf("string"); + + // The mollify branch must flag `isMollified: true` on the result so + // the trigger route can skip `saveRequestIdempotency`. Caching the + // synthetic runId in the request-idempotency table would mean a + // lost-response SDK retry (same `x-trigger-request-idempotency-key` + // header) hits a PG miss in `handleRequestIdempotency` and falls + // through to a fresh trigger — producing a duplicate buffer entry + // for trigger calls without a task-level idempotency key. The + // bounded behaviour (accept retry-as-fresh-trigger during the + // buffer window) is the deliberate choice; a stale-cache lookup + // returning null is not. + expect(result?.isMollified).toBe(true); + + // buffer.accept ran — Redis has the canonical engine.trigger snapshot + // under the synthesised friendlyId. 
The drainer will read this and + // replay it through engine.trigger to materialise the run. expect(buffer.accepted).toHaveLength(1); expect(buffer.accepted[0]!.runId).toBe(result!.run.friendlyId); expect(buffer.accepted[0]!.envId).toBe(authenticatedEnvironment.id); expect(buffer.accepted[0]!.orgId).toBe(authenticatedEnvironment.organizationId); + // Payload is a JSON-serialised MollifierSnapshot (the engine.trigger + // input). Schema is internal to the engine, so we only assert that + // it parses and references the friendlyId — anything more specific + // would couple the mollifier-layer test to engine-layer fields. + const snapshot = JSON.parse(buffer.accepted[0]!.payload) as { + traceId?: string; + spanId?: string; + traceContext?: { traceparent?: string }; + }; - // payload is the canonical replay shape - const payload = JSON.parse(buffer.accepted[0]!.payload); - expect(payload.runFriendlyId).toBe(result!.run.friendlyId); - expect(payload.taskId).toBe(taskIdentifier); - expect(payload.envId).toBe(authenticatedEnvironment.id); - expect(payload.body).toEqual({ payload: { hello: "world" } }); + // Regression guard for the dashboard trace-tree bug: the mollifier + // snapshot MUST carry a W3C `traceparent` in `traceContext`, + // seeded from the same span traceRun opened. Without it, the + // drainer replays through engine.trigger with empty traceContext + // and every downstream `recordRunDebugLog` + // (QUEUED/EXECUTING/FINISHED/run:notify…) gets a fresh traceId + + // null parentId — the run-detail page can only show the root + // span. Both the mollify and pass-through paths now flow through + // `traceEventConcern.traceRun`; this assertion pins the + // seeding-from-the-run-span contract. 
+ expect(snapshot.traceContext?.traceparent).toMatch( + /^00-[0-9a-f]{32}-[0-9a-f]{16}-[0-9a-f]{2}$/ + ); + expect(snapshot.traceContext!.traceparent).toContain(snapshot.traceId); + expect(snapshot.traceContext!.traceparent).toContain(snapshot.spanId); + // The snapshot inherits the *run span's* traceId/spanId (from the + // event handed in by traceRun), not a separately-generated OTel + // span. This is what lets the drainer's `mollifier.drained` span + // and downstream engine.trigger materialisation parent on the + // same ClickHouse trace the customer sees from the moment trigger + // returns. + expect(snapshot.traceId).toBe(MOCK_TRACE_ID); + expect(snapshot.spanId).toBe(MOCK_SPAN_ID); + + // Postgres has NOT been written: engine.trigger was never called on + // the mollify path. The run materialises only when the drainer + // replays the snapshot. Regression intent: if a future change makes + // the mollify branch fall through to engine.trigger (re-introducing + // phase-1 dual-write), this assertion fails loudly. + const pgRun = await prisma.taskRun.findFirst({ + where: { friendlyId: result!.run.friendlyId }, + }); + expect(pgRun).toBeNull(); await engine.quit(); }, @@ -1393,108 +1572,12 @@ describe("RunEngineTriggerTaskService", () => { // getMollifierBuffer must not be called either — the call site short-circuits // before touching the singleton when the gate says pass_through. expect(getBufferSpy).not.toHaveBeenCalled(); - - await engine.quit(); - }, - ); - - containerTest( - "mollifier · engine.trigger throwing AFTER buffer.accept leaves an orphan entry (documented behaviour)", - async ({ prisma, redisOptions }) => { - // SCENARIO: dual-write where buffer.accept succeeds but engine.trigger - // throws. 
The throw propagates to the caller (correct: customer sees - // the same 4xx as today), and the buffer entry remains as an "orphan" - // — Phase 1's no-op drainer will pop+ack it on its next poll, so the - // orphan is bounded (~drainer pollIntervalMs) but observable in the - // audit trail (mollifier.buffered with no matching TaskRun). - // - // Why engine.trigger can throw post-buffer: - // - RunDuplicateIdempotencyKeyError (Prisma P2002 on idempotencyKey): - // a concurrent non-mollified trigger with the same idempotencyKey - // wins the DB UNIQUE constraint between IdempotencyKeyConcern's - // pre-check and engine.trigger's INSERT. - // - RunOneTimeUseTokenError (Prisma P2002 on oneTimeUseToken). - // - Transient Prisma errors (FK constraint, connection drop, etc.). - // - // Why we don't "fix" this race in Phase 1: - // The customer correctly gets the error. State eventually converges - // (drainer pops the orphan). The audit-trail explicitly surfaces - // "buffered without TaskRun" entries to operators. A real fix is - // Phase 2's responsibility once the buffer becomes the primary write - // — at that point we add the mollifier-specific idempotency index. - // - // This test pins the current ordering: buffer.accept fires synchronously - // BEFORE engine.trigger, and engine.trigger failure does NOT roll back - // the buffer write. Any future change that reverses the order or adds - // a silent rollback will fail this assertion and force a design - // decision rather than a silent behaviour change. 
- - const engine = new RunEngine({ - prisma, - worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 }, - queue: { redis: redisOptions }, - runLock: { redis: redisOptions }, - machines: { - defaultMachine: "small-1x", - machines: { "small-1x": { name: "small-1x" as const, cpu: 0.5, memory: 0.5, centsPerMs: 0.0001 } }, - baseCostInCents: 0.0005, - }, - tracer: trace.getTracer("test", "0.0.0"), - }); - - const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); - const taskIdentifier = "test-task"; - await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); - - const buffer = new CapturingMollifierBuffer(); - - // Force engine.trigger to throw on this single call. We spy AFTER - // setupBackgroundWorker so the worker setup still uses the real - // engine.trigger (which has its own engine.trigger-ish calls for - // worker bootstrap — though in practice setupBackgroundWorker doesn't - // call trigger). - const simulatedFailure = new Error("simulated engine.trigger failure post-buffer"); - vi.spyOn(engine, "trigger").mockRejectedValueOnce(simulatedFailure); - - const triggerTaskService = new RunEngineTriggerTaskService({ - engine, - prisma, - payloadProcessor: new MockPayloadProcessor(), - queueConcern: new DefaultQueueManager(prisma, engine), - idempotencyKeyConcern: new IdempotencyKeyConcern(prisma, engine, new MockTraceEventConcern()), - validator: new MockTriggerTaskValidator(), - traceEventConcern: new MockTraceEventConcern(), - tracer: trace.getTracer("test", "0.0.0"), - metadataMaximumSize: 1024 * 1024, - evaluateGate: async () => ({ - action: "mollify", - decision: { - divert: true, - reason: "per_env_rate", - count: 150, - threshold: 100, - windowMs: 200, - holdMs: 500, - }, - }), - getMollifierBuffer: () => buffer as never, - isMollifierGloballyEnabled: () => true, - }); - - await expect( - triggerTaskService.call({ - taskId: taskIdentifier, - environment: 
authenticatedEnvironment, - body: { payload: { test: "x" } }, - }), - ).rejects.toThrow(/simulated engine.trigger failure post-buffer/); - - // The buffer write happened BEFORE engine.trigger threw. The orphan - // remains; the audit-trail will surface it (mollifier.buffered with - // no matching TaskRun row). Phase 1's no-op drainer cleans it up. - expect(buffer.accepted).toHaveLength(1); - const orphanPayload = JSON.parse(buffer.accepted[0]!.payload); - expect(orphanPayload.taskId).toBe(taskIdentifier); + // Pass-through must NOT set `isMollified` — `result.run` is a real + // PG row, and the trigger route's `saveRequestIdempotency` is + // safe to call. Setting the flag here would silently skip the + // request-idempotency cache for every non-mollified trigger on a + // mollifier-enabled org, breaking lost-response retry dedup. + expect(result?.isMollified).toBeFalsy(); await engine.quit(); }, @@ -1607,143 +1690,6 @@ describe("RunEngineTriggerTaskService", () => { }, ); - containerTest( - "mollifier · debounce match produces an orphan buffer entry (documented behaviour)", - async ({ prisma, redisOptions }) => { - // SCENARIO: a trigger with a debounce key arrives while a matching - // debounced run already exists. `debounceSystem.handleDebounce` runs - // INSIDE `engine.trigger` (line ~514 of run-engine/src/engine/index.ts), - // AFTER buffer.accept has already written the new friendlyId. The - // service correctly returns the existing run id to the customer, but - // the buffer is left with an orphan entry for the new friendlyId. - // - // Why this is acceptable in Phase 1: - // - Customer-facing behaviour is unchanged from today: they receive - // the existing run id, same as the non-mollified path. - // - The orphan is bounded — the drainer's no-op-ack handler pops - // and acks it on its next poll. - // - The audit-trail surfaces it: a `mollifier.buffered` log line - // with `runId` that has no matching TaskRun in Postgres. 
- // - // Why Phase 2 cares: - // - When the buffer becomes the primary write path, debounce can - // no longer be allowed to run AFTER buffer.accept. The drainer's - // engine.trigger replay would observe "existing" and skip the - // persist — the customer's synthesised 200 (with the new - // friendlyId) would never get a TaskRun, and the audit-trail - // divergence becomes a real data-loss bug. - // - Phase 2 must lift `handleDebounce` into the call site BEFORE - // buffer.accept: - // 1. handleDebounce → if existing, return existing run; do NOT - // touch the buffer. - // 2. Otherwise, accept with `claimId` threaded into the - // canonical payload so the drainer's replay can - // `registerDebouncedRun` after persisting. - // - // This test pins the current ordering. A future change that "fixes" - // it by lifting handleDebounce upfront will fail the orphan - // assertion below and force an explicit choice (update the test, - // remove this scenario, or stage the lift behind a flag). - - const engine = new RunEngine({ - prisma, - worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 }, - queue: { redis: redisOptions }, - runLock: { redis: redisOptions }, - machines: { - defaultMachine: "small-1x", - machines: { "small-1x": { name: "small-1x" as const, cpu: 0.5, memory: 0.5, centsPerMs: 0.0001 } }, - baseCostInCents: 0.0005, - }, - tracer: trace.getTracer("test", "0.0.0"), - }); - - const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); - const taskIdentifier = "test-task"; - await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); - - const idempotencyKeyConcern = new IdempotencyKeyConcern( - prisma, - engine, - new MockTraceEventConcern(), - ); - - // Setup: trigger with debounce — creates the existing run + Redis claim. 
- const baseline = new RunEngineTriggerTaskService({ - engine, - prisma, - payloadProcessor: new MockPayloadProcessor(), - queueConcern: new DefaultQueueManager(prisma, engine), - idempotencyKeyConcern, - validator: new MockTriggerTaskValidator(), - traceEventConcern: new MockTraceEventConcern(), - tracer: trace.getTracer("test", "0.0.0"), - metadataMaximumSize: 1024 * 1024, - }); - const first = await baseline.call({ - taskId: taskIdentifier, - environment: authenticatedEnvironment, - body: { - payload: { test: "x" }, - options: { debounce: { key: "regression-debounce-6", delay: "30s" } }, - }, - }); - expect(first?.run.friendlyId).toBeDefined(); - - // Action: same debounce key, mollify-stub gate. - const buffer = new CapturingMollifierBuffer(); - const mollifierService = new RunEngineTriggerTaskService({ - engine, - prisma, - payloadProcessor: new MockPayloadProcessor(), - queueConcern: new DefaultQueueManager(prisma, engine), - idempotencyKeyConcern, - validator: new MockTriggerTaskValidator(), - traceEventConcern: new MockTraceEventConcern(), - tracer: trace.getTracer("test", "0.0.0"), - metadataMaximumSize: 1024 * 1024, - evaluateGate: async () => ({ - action: "mollify", - decision: { - divert: true, - reason: "per_env_rate", - count: 150, - threshold: 100, - windowMs: 200, - holdMs: 500, - }, - }), - getMollifierBuffer: () => buffer as never, - isMollifierGloballyEnabled: () => true, - }); - - const debounced = await mollifierService.call({ - taskId: taskIdentifier, - environment: authenticatedEnvironment, - body: { - payload: { test: "x" }, - options: { debounce: { key: "regression-debounce-6", delay: "30s" } }, - }, - }); - - // Customer-facing behaviour: the existing run is returned (correct). 
- expect(debounced).toBeDefined(); - expect(debounced?.run.friendlyId).toBe(first?.run.friendlyId); - - // Orphan: buffer.accept fired with the new friendlyId we generated - // upfront, and that friendlyId has no matching TaskRun in Postgres - // because engine.trigger returned the existing run via debounce. - expect(buffer.accepted).toHaveLength(1); - expect(buffer.accepted[0]!.runId).not.toBe(first?.run.friendlyId); - const orphanFriendlyId = buffer.accepted[0]!.runId; - const orphanRow = await prisma.taskRun.findFirst({ - where: { friendlyId: orphanFriendlyId }, - }); - expect(orphanRow).toBeNull(); - - await engine.quit(); - }, - ); }); describe("DefaultQueueManager task metadata cache", () => { diff --git a/apps/webapp/test/metadataRouteOperationsLogging.test.ts b/apps/webapp/test/metadataRouteOperationsLogging.test.ts new file mode 100644 index 00000000000..ab96c9b9b23 --- /dev/null +++ b/apps/webapp/test/metadataRouteOperationsLogging.test.ts @@ -0,0 +1,132 @@ +import { describe, expect, it, vi } from "vitest"; + +// `vi.mock` factories are hoisted above regular top-level `const`s, so +// any cross-references between the spy/mock fns and the factories have +// to live inside `vi.hoisted`. See `mollifierDrainerHandler.test.ts` +// for the same pattern. +const { warnSpy, applyMetadataMutationToBufferedRunMock } = vi.hoisted(() => ({ + warnSpy: vi.fn(), + applyMetadataMutationToBufferedRunMock: vi.fn(), +})); + +// The route module's import graph (createActionApiRoute, the env, the +// services singleton) is heavier than the helper actually needs. Stub +// the leaf modules so only the helper under test executes; the route's +// top-level `createActionApiRoute(...)` call runs against the stubbed +// builder and never touches platform.v3.server / prisma. 
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+vi.mock("~/env.server", () => ({
+  env: { TASK_RUN_METADATA_MAXIMUM_SIZE: 256 * 1024 },
+}));
+vi.mock("~/services/routeBuilders/apiBuilder.server", () => ({
+  createActionApiRoute: () => ({ action: vi.fn() }),
+}));
+vi.mock("~/services/apiAuth.server", () => ({
+  authenticateApiRequest: vi.fn(),
+}));
+vi.mock("~/v3/services/common.server", () => ({
+  // Stand-in error class that carries `message` plus an optional `status`.
+  ServiceValidationError: class extends Error {
+    constructor(public override message: string, public status?: number) {
+      super(message);
+    }
+  },
+}));
+vi.mock("~/services/metadata/updateMetadataInstance.server", () => ({
+  updateMetadataService: { call: vi.fn(async () => undefined) },
+}));
+vi.mock("~/v3/mollifier/applyMetadataMutation.server", () => ({
+  applyMetadataMutationToBufferedRun: applyMetadataMutationToBufferedRunMock,
+}));
+vi.mock("~/v3/mollifier/readFallback.server", () => ({
+  findRunByIdWithMollifierFallback: vi.fn(),
+}));
+vi.mock("~/services/logger.server", () => ({
+  logger: {
+    warn: warnSpy,
+    info: vi.fn(),
+    error: vi.fn(),
+    debug: vi.fn(),
+  },
+}));
+
+import { routeOperationsToRun } from "~/routes/api.v1.runs.$runId.metadata";
+import type { AuthenticatedEnvironment } from "~/services/apiAuth.server";
+
+// Minimal environment fixture: only `id` and `organizationId` are set; the
+// double cast stands in for the rest of `AuthenticatedEnvironment`.
+const env = {
+  id: "env_a",
+  organizationId: "org_1",
+} as unknown as AuthenticatedEnvironment;
+
+// A single `set` operation, typed as the second parameter of
+// `routeOperationsToRun` so the fixture follows the helper's signature.
+const opsFixture = [{ type: "set", key: "k", value: "v" }] as Parameters<
+  typeof routeOperationsToRun
+>[1];
+
+describe("routeOperationsToRun — non-throw buffer outcome logging", () => {
+  // Each non-success outcome `applyMetadataMutationToBufferedRun` can
+  // return (`not_found`, `busy`, `version_exhausted`, `metadata_too_large`)
+  // must produce a warn log so ops can trace silent drops. Without this
+  // branch the parent/root operation would disappear with no record —
+  // `tryCatch` only catches throws, and the outcome object was
+  // previously ignored.
+  for (const kind of ["not_found", "busy", "version_exhausted", "metadata_too_large"] as const) {
+    it(`warn-logs when buffer outcome is { kind: "${kind}" }`, async () => {
+      warnSpy.mockClear();
+      applyMetadataMutationToBufferedRunMock.mockResolvedValueOnce({ kind });
+
+      await routeOperationsToRun("run_buffered_1", opsFixture, env);
+
+      expect(warnSpy).toHaveBeenCalledWith(
+        "metadata route: parent/root buffer op did not apply",
+        expect.objectContaining({ targetRunId: "run_buffered_1", kind }),
+      );
+    });
+  }
+
+  it("does NOT warn on the happy path (kind: 'applied')", async () => {
+    warnSpy.mockClear();
+    applyMetadataMutationToBufferedRunMock.mockResolvedValueOnce({
+      kind: "applied",
+      newMetadata: { k: "v" },
+      parentTaskRunFriendlyId: undefined,
+      rootTaskRunFriendlyId: undefined,
+    });
+
+    await routeOperationsToRun("run_buffered_1", opsFixture, env);
+
+    expect(warnSpy).not.toHaveBeenCalledWith(
+      "metadata route: parent/root buffer op did not apply",
+      expect.anything(),
+    );
+  });
+
+  it("warn-logs once when the helper throws (the pre-existing throw branch keeps working)", async () => {
+    warnSpy.mockClear();
+    applyMetadataMutationToBufferedRunMock.mockRejectedValueOnce(new Error("ECONNRESET"));
+
+    await routeOperationsToRun("run_buffered_1", opsFixture, env);
+
+    // Pre-existing branch — the catch logs `buffer fallback for parent/root
+    // op failed`. The new non-throw branch must NOT also fire (we return
+    // early on bufferError).
+    expect(warnSpy).toHaveBeenCalledWith(
+      "metadata route: buffer fallback for parent/root op failed",
+      expect.objectContaining({ targetRunId: "run_buffered_1" }),
+    );
+    expect(warnSpy).not.toHaveBeenCalledWith(
+      "metadata route: parent/root buffer op did not apply",
+      expect.anything(),
+    );
+  });
+
+  it("skips both PG and buffer when targetRunId is missing or operations is empty", async () => {
+    // Clear both mocks so the negative assertions below only see this test's calls.
+    warnSpy.mockClear();
+    applyMetadataMutationToBufferedRunMock.mockClear();
+
+    await routeOperationsToRun(undefined, opsFixture, env);
+    await routeOperationsToRun("run_x", undefined, env);
+    await routeOperationsToRun("run_x", [], env);
+
+    expect(applyMetadataMutationToBufferedRunMock).not.toHaveBeenCalled();
+    expect(warnSpy).not.toHaveBeenCalled();
+  });
+});
diff --git a/apps/webapp/test/mollifierApplyMetadataMutation.test.ts b/apps/webapp/test/mollifierApplyMetadataMutation.test.ts
new file mode 100644
index 00000000000..5995f6969f3
--- /dev/null
+++ b/apps/webapp/test/mollifierApplyMetadataMutation.test.ts
@@ -0,0 +1,352 @@
+import { describe, expect, it, vi } from "vitest";
+
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+
+import { applyMetadataMutationToBufferedRun } from "~/v3/mollifier/applyMetadataMutation.server";
+import type { BufferEntry, MollifierBuffer, CasSetMetadataResult } from "@trigger.dev/redis-worker";
+import { RunId } from "@trigger.dev/core/v3/isomorphic";
+
+// Regression for a CAS retry-exhaustion bug: the default `maxRetries`
+// was 3, matching the PG-side service, but that exhausts fast when N
+// external API writers race the same buffered run's metadata. Bumped
+// to 12 + jittered backoff. These tests simulate version_conflict
+// races and assert (a) every delta lands and (b) the retry budget is
+// sized for realistic concurrency.
+
+const NOW = new Date("2026-05-21T10:00:00Z");
+
+// Pairs the stubbed buffer with the mutable in-memory state behind it, so
+// a test can both inject conflicts (`pendingConflictsForNextN`) and assert
+// on what was written (`version`, `metadata`).
+// NOTE(review): the generic type arguments in this file (`Record<…>`,
+// `Omit<…>`, `Promise<…>`) were reconstructed from usage — confirm against
+// the `@trigger.dev/redis-worker` type declarations.
+type BufferStub = {
+  buffer: MollifierBuffer;
+  state: {
+    version: number;
+    metadata: Record<string, unknown>;
+    pendingConflictsForNextN: number;
+  };
+};
+
+// Build a stub MollifierBuffer that simulates Lua-CAS semantics
+// in-memory. The first `pendingConflictsForNextN` casSetMetadata calls
+// from any worker will return version_conflict (then the version
+// bumps); subsequent calls succeed.
+//
+// `initialPayload` is spread into the JSON payload that `getEntry`
+// returns; when it carries a `metadata` field (a JSON string) that value
+// seeds `state.metadata`. Returns the stub together with its backing
+// `state` object.
+function makeBufferStub(initialPayload: Record<string, unknown> = {}): BufferStub {
+  const state = {
+    version: 0,
+    metadata: initialPayload.metadata
+      ? (JSON.parse(initialPayload.metadata as string) as Record<string, unknown>)
+      : {},
+    pendingConflictsForNextN: 0,
+  };
+  const entryTemplate: Omit<BufferEntry, "payload"> = {
+    runId: "run_1",
+    envId: "env_a",
+    orgId: "org_1",
+    status: "QUEUED",
+    attempts: 0,
+    createdAt: NOW,
+    createdAtMicros: 1747044000000000,
+    materialised: false,
+    idempotencyLookupKey: "",
+    metadataVersion: 0,
+  };
+
+  const buffer: MollifierBuffer = {
+    // Every read reflects the current state: latest version and the
+    // latest metadata re-serialised into the payload.
+    getEntry: vi.fn(async (): Promise<BufferEntry | null> => ({
+      ...entryTemplate,
+      metadataVersion: state.version,
+      payload: JSON.stringify({ ...initialPayload, metadata: JSON.stringify(state.metadata) }),
+    })),
+    casSetMetadata: vi.fn(
+      async (input: {
+        runId: string;
+        expectedVersion: number;
+        newMetadata: string;
+        newMetadataType: string;
+      }): Promise<CasSetMetadataResult> => {
+        // Inject a controlled number of conflicts to simulate races.
+        if (state.pendingConflictsForNextN > 0) {
+          state.pendingConflictsForNextN -= 1;
+          // Bump version as if some other writer just landed.
+          state.version += 1;
+          return { kind: "version_conflict", currentVersion: state.version };
+        }
+        // Organic conflict: the caller read a version that is no longer current.
+        if (input.expectedVersion !== state.version) {
+          return { kind: "version_conflict", currentVersion: state.version };
+        }
+        state.metadata = JSON.parse(input.newMetadata) as Record<string, unknown>;
+        state.version += 1;
+        return { kind: "applied", newVersion: state.version };
+      },
+    ),
+  } as unknown as MollifierBuffer;
+
+  return { buffer, state };
+}
+
+describe("applyMetadataMutationToBufferedRun — retry behaviour", () => {
+  it("succeeds when CAS lands on the first try (no contention)", async () => {
+    const { buffer, state } = makeBufferStub();
+    const result = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: { metadata: { counter: 1 } },
+      buffer,
+    });
+    expect(result.kind).toBe("applied");
+    expect(state.metadata).toEqual({ counter: 1 });
+    expect(state.version).toBe(1);
+  });
+
+  it("succeeds after 5 version conflicts (default budget = 12)", async () => {
+    const { buffer, state } = makeBufferStub();
+    state.pendingConflictsForNextN = 5;
+    const result = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: { operations: [{ type: "increment", key: "counter", value: 1 }] },
+      buffer,
+    });
+    expect(result.kind).toBe("applied");
+    if (result.kind === "applied") {
+      expect(result.newMetadata.counter).toBe(1);
+    }
+  });
+
+  it("succeeds after 11 version conflicts (one under the default budget)", async () => {
+    // Use a single stub: the conflicts must be injected into the same
+    // state object that backs the buffer handed to the helper.
+    const stub = makeBufferStub();
+    stub.state.pendingConflictsForNextN = 11;
+    const result = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: { operations: [{ type: "increment", key: "counter", value: 1 }] },
+      buffer: stub.buffer,
+    });
+    expect(result.kind).toBe("applied");
+  });
+
+  it("returns version_exhausted after retries are spent", async () => {
+    const stub = makeBufferStub();
+    // 99 conflicts ≫ default budget of 12. With maxRetries 3 (the
+    // pre-fix value), this would have exhausted after 4 attempts.
+    stub.state.pendingConflictsForNextN = 99;
+    const result = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: { operations: [{ type: "increment", key: "counter", value: 1 }] },
+      buffer: stub.buffer,
+      maxRetries: 12,
+    });
+    expect(result.kind).toBe("version_exhausted");
+  });
+
+  it("regression: 3 retries are NOT enough under 50-way concurrency simulation", async () => {
+    // The pre-fix default would have lost most deltas under this
+    // contention. Asserting that the OLD budget (3) exhausts confirms
+    // the regression actually existed and the new budget addresses it.
+    const stub = makeBufferStub();
+    stub.state.pendingConflictsForNextN = 8;
+    const result = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: { operations: [{ type: "increment", key: "counter", value: 1 }] },
+      buffer: stub.buffer,
+      maxRetries: 3,
+    });
+    expect(result.kind).toBe("version_exhausted");
+  });
+
+  it("matches PG semantics when body has both metadata + operations: ops on top of EXISTING, body.metadata ignored", async () => {
+    // PG service (UpdateMetadataService.#updateRunMetadata) branches on
+    // Array.isArray(body.operations) — when present it applies ops on
+    // top of existing PG metadata and IGNORES body.metadata. The buffer
+    // helper used to merge both (replace then apply), producing different
+    // results across the buffered/materialised boundary. This regression
+    // pins the PG-matching behaviour.
+    const stub = makeBufferStub({ metadata: JSON.stringify({ a: 1 }) });
+    const result = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: {
+        // Should be ignored because `operations` is also present.
+        metadata: { b: 2 },
+        operations: [{ type: "set", key: "c", value: 3 }],
+      },
+      buffer: stub.buffer,
+    });
+    expect(result.kind).toBe("applied");
+    if (result.kind === "applied") {
+      // PG would produce {a:1, c:3}; previously the buffer produced {b:2, c:3}.
+      expect(result.newMetadata).toEqual({ a: 1, c: 3 });
+      expect(result.newMetadata).not.toHaveProperty("b");
+    }
+  });
+
+  it("returns metadata_too_large when the resulting payload exceeds maximumSize (mirrors PG 413)", async () => {
+    // PG-side `UpdateMetadataService` uses `handleMetadataPacket` to
+    // enforce TASK_RUN_METADATA_MAXIMUM_SIZE (default 256KB), throwing
+    // `MetadataTooLargeError` (413) on overflow. The buffer helper now
+    // matches that cap so a buffered run can't accept a payload PG
+    // would have rejected. Reject must fire BEFORE casSetMetadata.
+    const stub = makeBufferStub();
+    const big = "x".repeat(2048); // 2 KB string value
+    const result = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024, // 1 KB cap — strictly less than the payload
+      body: { metadata: { big } },
+      buffer: stub.buffer,
+    });
+    expect(result.kind).toBe("metadata_too_large");
+    if (result.kind === "metadata_too_large") {
+      expect(result.maximumSize).toBe(1024);
+      expect(result.observedSize).toBeGreaterThan(1024);
+    }
+    // No CAS write should have been attempted.
+    expect(stub.buffer.casSetMetadata).not.toHaveBeenCalled();
+    expect(stub.state.version).toBe(0);
+  });
+
+  it("returns not_found when the buffered entry belongs to a different env (cross-env auth gate)", async () => {
+    // Same shape as a normal apply call, but the caller's environmentId
+    // doesn't match the entry's envId. The helper must refuse the
+    // mutation and return not_found (without leaking existence) and
+    // must NOT call casSetMetadata.
+    const stub = makeBufferStub();
+    const result = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_OTHER",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: { metadata: { counter: 1 } },
+      buffer: stub.buffer,
+    });
+    expect(result.kind).toBe("not_found");
+    expect(stub.buffer.casSetMetadata).not.toHaveBeenCalled();
+    expect(stub.state.version).toBe(0);
+  });
+
+  it("returns not_found when the buffered entry belongs to a different org (cross-org auth gate)", async () => {
+    const stub = makeBufferStub();
+    const result = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_OTHER",
+      maximumSize: 1024 * 1024,
+      body: { metadata: { counter: 1 } },
+      buffer: stub.buffer,
+    });
+    expect(result.kind).toBe("not_found");
+    expect(stub.buffer.casSetMetadata).not.toHaveBeenCalled();
+  });
+
+  it("surfaces parent/root friendlyIds on `applied` so the route can fan parent/root ops without a second buffer read", async () => {
+    // Regression: the metadata route used to do a SECOND
+    // `findRunByIdWithMollifierFallback` after the primary CAS to
+    // obtain parent/root friendlyIds for `routeOperationsToRun`.
+    // If the drainer's terminal-failure path ran between the CAS and
+    // the second read, the entry hash was DELd and the second read
+    // came back null — the route silently skipped the entire
+    // parent/root fan-out, dropping `body.parentOperations` /
+    // `body.rootOperations` after the primary mutation already
+    // landed. The helper now captures the ids inside its own read
+    // loop and surfaces them on the `applied` outcome so the route
+    // never needs a second round trip.
+    //
+    // Engine-side snapshot stores internal cuids; we expect the
+    // helper to convert via `RunId.toFriendlyId` so the outcome
+    // matches what `readFallback.server.ts` would have produced.
+    const parentFriendly = RunId.generate().friendlyId;
+    const rootFriendly = RunId.generate().friendlyId;
+    const parentInternal = RunId.fromFriendlyId(parentFriendly);
+    const rootInternal = RunId.fromFriendlyId(rootFriendly);
+    const stub = makeBufferStub({
+      parentTaskRunId: parentInternal,
+      rootTaskRunId: rootInternal,
+    });
+    const result = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: { metadata: { counter: 1 } },
+      buffer: stub.buffer,
+    });
+    expect(result.kind).toBe("applied");
+    if (result.kind === "applied") {
+      expect(result.parentTaskRunFriendlyId).toBe(parentFriendly);
+      expect(result.rootTaskRunFriendlyId).toBe(rootFriendly);
+    }
+  });
+
+  it("`applied` parent/root ids are undefined when the snapshot carries neither (top-level run)", async () => {
+    // Top-level runs (parentTaskRunId/rootTaskRunId both undefined in
+    // the engine-trigger snapshot) must surface as undefined on the
+    // outcome so the route's `?? runId` self-fallback fires —
+    // matching the PG service's `taskRun.parentTaskRun?.id ??
+    // taskRun.id` semantics.
+    const stub = makeBufferStub({});
+    const result = await applyMetadataMutationToBufferedRun({
+      runId: "run_1",
+      environmentId: "env_a",
+      organizationId: "org_1",
+      maximumSize: 1024 * 1024,
+      body: { metadata: { counter: 1 } },
+      buffer: stub.buffer,
+    });
+    expect(result.kind).toBe("applied");
+    if (result.kind === "applied") {
+      expect(result.parentTaskRunFriendlyId).toBeUndefined();
+      expect(result.rootTaskRunFriendlyId).toBeUndefined();
+    }
+  });
+
+  it("N-way concurrent applies all converge under default budget", async () => {
+    // Simulate N parallel writers against a shared state. Each writer
+    // reads, applies a delta, CAS-writes. The Lua CAS forces them to
+    // retry until they see the latest version.
+    const N = 30;
+    const sharedStub = makeBufferStub();
+    // Override the stub to model real per-attempt serialisation: each
+    // call reads the latest version, and CAS conflicts are organic
+    // (not pre-injected) when expectedVersion != current.
+    sharedStub.state.pendingConflictsForNextN = 0;
+
+    const calls = Array.from({ length: N }, () =>
+      applyMetadataMutationToBufferedRun({
+        runId: "run_1",
+        environmentId: "env_a",
+        organizationId: "org_1",
+        maximumSize: 1024 * 1024,
+        body: { operations: [{ type: "increment", key: "counter", value: 1 }] },
+        buffer: sharedStub.buffer,
+      }),
+    );
+    const results = await Promise.all(calls);
+    const applied = results.filter((r) => r.kind === "applied").length;
+    expect(applied).toBe(N);
+    expect(sharedStub.state.metadata.counter).toBe(N);
+  });
+});
diff --git a/apps/webapp/test/mollifierClaimResolution.test.ts b/apps/webapp/test/mollifierClaimResolution.test.ts
new file mode 100644
index 00000000000..f61cda0d04e
--- /dev/null
+++ b/apps/webapp/test/mollifierClaimResolution.test.ts
@@ -0,0 +1,143 @@
+import { describe, expect, it, vi } from "vitest";
+
+// Stub `~/db.server` before importing the concern — the real module
+// eagerly calls `prisma.$connect()` at singleton construction, which
+// would fail without a database. The concern under test receives its
+// prisma via the constructor, so the stub is never used by the code path.
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+
+// The IdempotencyKeyConcern resolves the pre-gate claim through the
+// global mollifier buffer (`getMollifierBuffer`), shared by both
+// `claimOrAwait` and `findBufferedRunWithIdempotency`. Control it via a
+// hoisted handle so each test can script the claim/lookup responses.
+const h = vi.hoisted(() => ({ buffer: null as unknown, orgFlag: true }));
+vi.mock("~/v3/mollifier/mollifierBuffer.server", () => ({
+  getMollifierBuffer: () => h.buffer,
+}));
+// Stub `mollifierGate.server` so loading the concern doesn't drag in
+// `env.server` (which fails to parse without a populated environment in
+// CI). The concern only uses `makeResolveMollifierFlag` to gate the
+// claim; tests flip `h.orgFlag` to cover both opted-in and opted-out
+// orgs without touching real env or feature-flag wiring.
+vi.mock("~/v3/mollifier/mollifierGate.server", () => ({
+  makeResolveMollifierFlag: () => async () => h.orgFlag,
+}));
+
+import type { MollifierBuffer } from "@trigger.dev/redis-worker";
+import { IdempotencyKeyConcern } from "~/runEngine/concerns/idempotencyKeys.server";
+import type { TriggerTaskRequest } from "~/runEngine/types";
+
+function makeConcern(prisma: { findFirst: () => Promise<unknown> }) { // builds the concern around a fake prisma exposing only taskRun.findFirst
+  return new IdempotencyKeyConcern(
+    { taskRun: { findFirst: prisma.findFirst } } as never,
+    {} as never, // engine — unused on this path
+    {} as never, // traceEventConcern — unused on this path
+  );
+}
+
+function makeRequest(): TriggerTaskRequest { // minimal request: task "my-task" in env_a/org_1 carrying idempotencyKey "k-1"
+  return {
+    taskId: "my-task",
+    environment: {
+      id: "env_a",
+      organizationId: "org_1",
+      // The pre-gate claim is gated by the per-org mollifier flag
+      // (mirroring evaluateGate's gating) so non-opted-in orgs don't pay
+      // the Redis SETNX. Tests covering the claim path must opt this
+      // fake org in, otherwise the concern skips claimOrAwait entirely
+      // and the resolution branches under test never run.
+      organization: { featureFlags: { mollifierEnabled: true } },
+    },
+    options: {},
+    body: { options: { idempotencyKey: "k-1" } },
+  } as unknown as TriggerTaskRequest;
+}
+
+describe("IdempotencyKeyConcern · claim resolution", () => {
+  it("resolved-but-unfindable falls through to a fresh trigger (no cached run, no claim held)", async () => {
+    // The claim slot holds a runId that is gone from both stores: the PG
+    // findFirst misses and the buffer lookup misses. Regression guard for
+    // the resolved-but-unfindable terminal case — the concern must fall
+    // through to a fresh trigger rather than throw, hand back a bogus
+    // cached run, or claim ownership it doesn't hold.
+    const lookupIdempotency = vi.fn(async () => null);
+    h.buffer = {
+      claimIdempotency: vi.fn(async () => ({ kind: "resolved", runId: "run_gone" })),
+      lookupIdempotency,
+    } as unknown as MollifierBuffer;
+
+    const findFirst = vi.fn(async () => null); // PG misses on every call
+    const concern = makeConcern({ findFirst });
+
+    const result = await concern.handleTriggerRequest(makeRequest(), undefined);
+
+    expect(result.isCached).toBe(false);
+    if (result.isCached === false) {
+      // No claim held — we resolved someone else's (stale) claim, we did
+      // not win one. The caller must NOT publish/release on our behalf.
+      expect(result.claim).toBeUndefined();
+      expect(result.idempotencyKey).toBe("k-1");
+    }
+    // We attempted the buffer fallback before giving up.
+    expect(lookupIdempotency).toHaveBeenCalled();
+  });
+
+  it("resolved-and-findable returns the existing run as a cached hit", async () => {
+    // Guard the happy resolved path: when the claimed runId IS findable
+    // (writer-side PG), the fall-through change must not swallow it.
+    h.buffer = {
+      claimIdempotency: vi.fn(async () => ({ kind: "resolved", runId: "run_winner" })),
+      lookupIdempotency: vi.fn(async () => null),
+    } as unknown as MollifierBuffer;
+
+    const winner = { id: "run_winner", friendlyId: "run_winner" };
+    // First findFirst (initial existingRun check) misses so we enter the
+    // claim path; the second (writer-side re-resolve) finds the winner.
+    let calls = 0;
+    const findFirst = vi.fn(async () => {
+      calls += 1;
+      return calls >= 2 ? winner : null;
+    });
+    const concern = makeConcern({ findFirst });
+
+    const result = await concern.handleTriggerRequest(makeRequest(), undefined);
+
+    expect(result.isCached).toBe(true);
+    if (result.isCached === true) {
+      expect(result.run).toBe(winner);
+    }
+  });
+
+  it("non-opted-in org skips claimOrAwait entirely (no buffer round-trip, no claim held)", async () => {
+    // Regression guard for the per-org gating that keeps the claim's
+    // Redis SETNX off the hot path for orgs that haven't opted into the
+    // mollifier — even when `TRIGGER_MOLLIFIER_ENABLED=1` globally and
+    // the buffer singleton exists. The concern should NOT touch
+    // `claimIdempotency` for these orgs; PG's unique constraint already
+    // deduplicates same-key races on the pass-through path.
+    h.orgFlag = false;
+    const claimIdempotency = vi.fn(async () => ({ kind: "claimed" as const }));
+    const lookupIdempotency = vi.fn(async () => null);
+    h.buffer = {
+      claimIdempotency,
+      lookupIdempotency,
+    } as unknown as MollifierBuffer;
+
+    const findFirst = vi.fn(async () => null);
+    const concern = makeConcern({ findFirst });
+
+    try {
+      const result = await concern.handleTriggerRequest(makeRequest(), undefined);
+      expect(result.isCached).toBe(false);
+      if (result.isCached === false) {
+        // No claim returned — the caller must NOT publish/release.
+        expect(result.claim).toBeUndefined();
+        expect(result.idempotencyKey).toBe("k-1");
+      }
+      // The headline guarantee: zero Redis claim activity for this org.
+      expect(claimIdempotency).not.toHaveBeenCalled();
+    } finally {
+      h.orgFlag = true; // restore for any later tests in this file
+    }
+  });
+});
diff --git a/apps/webapp/test/mollifierDrainerHandler.test.ts b/apps/webapp/test/mollifierDrainerHandler.test.ts
new file mode 100644
index 00000000000..085fab6418b
--- /dev/null
+++ b/apps/webapp/test/mollifierDrainerHandler.test.ts
@@ -0,0 +1,574 @@
+import { describe, expect, it, vi } from "vitest";
+import { trace } from "@opentelemetry/api";
+import { RunId } from "@trigger.dev/core/v3/isomorphic";
+
+vi.mock("~/db.server", () => ({
+  prisma: {},
+  $replica: {},
+}));
+
+// `writeMollifierTerminalFailureRow` enqueues a PerformTaskRunAlertsService
+// after writing the SYSTEM_FAILURE row (mirrors TriggerFailedTaskService).
+// In production that enqueues into the alerts redis-worker; the test
+// environment has no redis-worker, so the real call hangs the tick out
+// to its 5s vitest timeout. Stub `enqueue` to a resolved no-op so the
+// handler's best-effort try/catch sees a clean success path.
+vi.mock("~/v3/services/alerts/performTaskRunAlerts.server", () => ({
+  PerformTaskRunAlertsService: {
+    enqueue: vi.fn(async () => undefined),
+  },
+}));
+
+// The drainer calls `recordRunDebugLog` after a successful engine.trigger
+// to emit an admin-only LOG-kind event encoding the buffered window.
+// The real implementation imports the configured event repository (prisma
+// + clickhouse + env), which has heavy side-effects on first import.
+// Stub it to a vi.fn so the unit tests can assert call shape without
+// dragging the whole eventRepository graph into webapp test setup.
+// `vi.hoisted` is required because `vi.mock` factories are hoisted above
+// regular `const`s — referencing a top-level variable from inside the
+// factory otherwise fires `Cannot access 'X' before initialization`.
+const { recordRunDebugLogMock } = vi.hoisted(() => ({ // shared spy; tests that assert on it call mockClear() first
+  recordRunDebugLogMock: vi.fn(async () => ({ success: true as const })),
+}));
+vi.mock("~/v3/eventRepository/index.server", () => ({
+  recordRunDebugLog: recordRunDebugLogMock,
+}));
+
+import {
+  createDrainerHandler,
+  isRetryablePgError,
+} from "~/v3/mollifier/mollifierDrainerHandler.server";
+
+describe("isRetryablePgError", () => {
+  it("returns true for P2024 (connection pool timeout)", () => {
+    const err = Object.assign(new Error("Timed out fetching a new connection"), {
+      code: "P2024",
+    });
+    expect(isRetryablePgError(err)).toBe(true);
+  });
+
+  it("returns true for generic connection-lost messages", () => {
+    expect(isRetryablePgError(new Error("Connection lost"))).toBe(true);
+    expect(isRetryablePgError(new Error("Can't reach database server"))).toBe(true);
+  });
+
+  it("returns false for validation errors", () => {
+    expect(isRetryablePgError(new Error("Invalid payload"))).toBe(false);
+  });
+
+  it("returns false for non-Error inputs", () => {
+    expect(isRetryablePgError("string error")).toBe(false);
+    expect(isRetryablePgError({ message: "object" })).toBe(false);
+  });
+});
+
+describe("createDrainerHandler", () => {
+  it("invokes engine.trigger with the deserialised snapshot", async () => {
+    const trigger = vi.fn(async () => ({ friendlyId: "run_x" }));
+    const handler = createDrainerHandler({
+      engine: { trigger } as any, // partial engine stub: only the methods this test exercises
+      prisma: {} as any,
+    });
+
+    await handler({
+      runId: "run_x",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: { taskIdentifier: "t", payload: "{}" },
+      attempts: 0,
+      createdAt: new Date(),
+    } as any);
+
+    expect(trigger).toHaveBeenCalledOnce();
+    const callArg = trigger.mock.calls[0][0] as { taskIdentifier: string };
+    expect(callArg.taskIdentifier).toBe("t");
+  });
+
+  it("re-attaches the snapshot's traceId so engine.trigger inherits the original trace", async () => {
+    // Captures the active traceId at the moment engine.trigger is invoked.
+    // Without context propagation it would be a fresh traceId, leaving the
+    // run-detail page with only the root span.
+    let observedTraceId: string | undefined;
+    const trigger = vi.fn(async () => {
+      observedTraceId = trace.getActiveSpan()?.spanContext().traceId;
+      return { friendlyId: "run_x" };
+    });
+
+    const handler = createDrainerHandler({
+      engine: { trigger } as any,
+      prisma: {} as any,
+    });
+
+    const snapshotTraceId = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
+    const snapshotSpanId = "bbbbbbbbbbbbbbbb";
+
+    await handler({
+      runId: "run_x",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: {
+        taskIdentifier: "t",
+        traceId: snapshotTraceId,
+        spanId: snapshotSpanId,
+      },
+      attempts: 0,
+      createdAt: new Date(),
+    } as any);
+
+    expect(observedTraceId).toBe(snapshotTraceId);
+  });
+
+  it("rethrows retryable PG errors so MollifierDrainer requeues the entry", async () => {
+    const err = new Error("Can't reach database server");
+    const trigger = vi.fn(async () => {
+      throw err;
+    });
+    const createFailedTaskRun = vi.fn();
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: "run_x",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: { taskIdentifier: "t" },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any),
+    ).rejects.toThrow("Can't reach database server");
+    // Retryable: we do NOT write a SYSTEM_FAILURE row, the entry should
+    // be requeued for another shot.
+    expect(createFailedTaskRun).not.toHaveBeenCalled();
+  });
+
+  const envFixture = { // minimal environment block; tests below attach it as payload.environment
+    id: "env_a",
+    type: "DEVELOPMENT",
+    project: { id: "proj_1" },
+    organization: { id: "org_1" },
+  };
+
+  it("writes a SYSTEM_FAILURE PG row when engine.trigger fails non-retryably", async () => {
+    const trigger = vi.fn(async () => {
+      throw new Error("validation failed: payload too large");
+    });
+    const createFailedTaskRun = vi.fn(async () => ({
+      id: "internal",
+      friendlyId: "run_x",
+    }));
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: "run_x",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: { taskIdentifier: "t", environment: envFixture },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any),
+    ).resolves.toBeUndefined();
+
+    expect(trigger).toHaveBeenCalledOnce();
+    expect(createFailedTaskRun).toHaveBeenCalledOnce();
+    const arg = createFailedTaskRun.mock.calls[0][0] as { error: { raw: string } };
+    expect(arg.error.raw).toContain("validation failed");
+  });
+
+  it("propagates the batch association into createFailedTaskRun (so batch parents don't hang on missing children)", async () => {
+    // Devin's ANALYSIS report on PR #3754: the terminal-failure path
+    // extracts most snapshot fields (parentTaskRunId, rootTaskRunId,
+    // depth, etc.) but dropped `batch`. If the original trigger was
+    // part of a batch, the SYSTEM_FAILURE row isn't associated with
+    // the batch, so the batch parent's completion-tracking can hang
+    // indefinitely waiting on a child that landed but isn't linked.
+    const trigger = vi.fn(async () => {
+      throw new Error("validation failed: payload too large");
+    });
+    const createFailedTaskRun = vi.fn(async () => ({
+      id: "internal",
+      friendlyId: "run_x",
+    }));
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: "run_batched",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: {
+          taskIdentifier: "t",
+          environment: envFixture,
+          batch: { id: "batch_xyz", index: 7 },
+        },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any),
+    ).resolves.toBeUndefined();
+
+    expect(createFailedTaskRun).toHaveBeenCalledOnce();
+    const arg = createFailedTaskRun.mock.calls[0][0] as {
+      batch?: { id: string; index: number };
+    };
+    expect(arg.batch).toEqual({ id: "batch_xyz", index: 7 });
+  });
+
+  it("rethrows the original error when createFailedTaskRun also fails (PG genuinely unreachable)", async () => {
+    const triggerErr = new Error("engine rejected the snapshot");
+    const trigger = vi.fn(async () => {
+      throw triggerErr;
+    });
+    const createFailedTaskRun = vi.fn(async () => {
+      throw new Error("connection refused");
+    });
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: "run_x",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: { taskIdentifier: "t", environment: envFixture },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any),
+    ).rejects.toThrow("engine rejected the snapshot");
+    // Drainer's outer drainOne loop now decides retry vs buffer.fail.
+    expect(createFailedTaskRun).toHaveBeenCalledOnce();
+  });
+
+  it("calls createCancelledRun with emitRunCancelledEvent: false (suppresses orphan trace-event log noise)", async () => {
+    // Buffered-only runs never had a primary trace event written for
+    // them — the mollifier gate skipped `repository.traceEvent` since
+    // the run hadn't materialised in PG yet. The `runCancelled` handler
+    // would log `[runCancelled] Failed to cancel run event` for every
+    // cancelled buffered run if we let the emit fire. Suppress it.
+    const friendlyId = RunId.generate().friendlyId;
+    const createCancelledRun = vi.fn(async () => ({
+      id: "internal",
+      friendlyId,
+      status: "CANCELED",
+    }));
+    const handler = createDrainerHandler({
+      engine: { createCancelledRun } as any,
+      prisma: {} as any,
+    });
+
+    await handler({
+      runId: friendlyId,
+      envId: "env_a",
+      orgId: "org_1",
+      payload: {
+        friendlyId,
+        taskIdentifier: "t",
+        environment: envFixture,
+        cancelledAt: new Date().toISOString(),
+        cancelReason: "Canceled by user",
+      },
+      attempts: 0,
+      createdAt: new Date(),
+    } as any);
+
+    expect(createCancelledRun).toHaveBeenCalledOnce();
+    const arg = createCancelledRun.mock.calls[0][0] as {
+      emitRunCancelledEvent?: boolean;
+    };
+    expect(arg.emitRunCancelledEvent).toBe(false);
+  });
+
+  it("honours the cancel when a buffered cancel races a materialised non-CANCELED row", async () => {
+    // Cancel-wins-over-trigger. If the normal trigger
+    // replay path materialised a live PENDING row before the cancel
+    // bifurcation drained, engine.createCancelledRun throws a conflict —
+    // its documented contract is that "the caller must decide between
+    // engine.cancelRun() and skipping". The drainer handler must honour
+    // the cancel intent by actually cancelling the now-live run; otherwise
+    // the conflict propagates, isRetryablePgError() returns false, and the
+    // drainer buffer.fail()s the entry — silently losing the cancellation
+    // while the run keeps executing.
+    const friendlyId = RunId.generate().friendlyId;
+    const createCancelledRun = vi.fn(async () => {
+      throw new Error(
+        `createCancelledRun conflict: existing run ${friendlyId} has status PENDING`
+      );
+    });
+    const cancelRun = vi.fn(async () => ({ alreadyFinished: false }));
+    const handler = createDrainerHandler({
+      engine: { createCancelledRun, cancelRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: friendlyId,
+        envId: "env_a",
+        orgId: "org_1",
+        payload: {
+          friendlyId,
+          taskIdentifier: "t",
+          environment: envFixture,
+          cancelledAt: new Date().toISOString(),
+          cancelReason: "Canceled by user",
+        },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any)
+    ).resolves.toBeUndefined();
+
+    // The live run is actually cancelled, by its internal id.
+    expect(cancelRun).toHaveBeenCalledOnce();
+    expect(cancelRun.mock.calls[0][0].runId).toBe(RunId.fromFriendlyId(friendlyId));
+  });
+
+  it("requeues on a transient PG outage during the SYSTEM_FAILURE fallback write", async () => {
+    // engine.trigger failed non-retryably, so we try to write a terminal
+    // SYSTEM_FAILURE row. If THAT write fails because PG is transiently
+    // unreachable, rethrowing the *original* non-retryable error makes the
+    // drainer buffer.fail() the entry — losing the run with no PG row ever
+    // landing. Rethrow the retryable write error instead so the drainer
+    // requeues; once PG recovers the failure row lands and the customer
+    // sees it.
+    const trigger = vi.fn(async () => {
+      throw new Error("validation failed: payload too large");
+    });
+    const createFailedTaskRun = vi.fn(async () => {
+      throw new Error("Can't reach database server");
+    });
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: "run_x",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: { taskIdentifier: "t", environment: envFixture },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any)
+    ).rejects.toThrow("Can't reach database server");
+  });
+
+  it("writes a SYSTEM_FAILURE row when createCancelledRun fails non-retryably (cancel bifurcation)", async () => {
+    // Without this guard a non-conflict, non-retryable failure from
+    // createCancelledRun rethrows out of the handler. The drainer's
+    // onTerminalFailure gates on cause==="max-attempts-exhausted" and
+    // skips "non-retryable", so buffer.fail() deletes the entry with
+    // no PG row written — the cancellation disappears silently.
+    // Mirror the non-cancel path's SYSTEM_FAILURE fallback so the
+    // customer always sees a terminal row.
+    const friendlyId = RunId.generate().friendlyId;
+    const cancelErr = new Error("validation failed: bad cancel snapshot");
+    const createCancelledRun = vi.fn(async () => {
+      throw cancelErr;
+    });
+    const createFailedTaskRun = vi.fn(async () => ({ id: "internal_x" }));
+    const handler = createDrainerHandler({
+      engine: { createCancelledRun, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await handler({
+      runId: friendlyId,
+      envId: "env_a",
+      orgId: "org_1",
+      payload: {
+        friendlyId,
+        taskIdentifier: "t",
+        environment: envFixture,
+        cancelledAt: new Date().toISOString(),
+        cancelReason: "Canceled by user",
+      },
+      attempts: 0,
+      createdAt: new Date(),
+    } as any);
+
+    // SYSTEM_FAILURE row was written via the shared helper. Handler
+    // returns cleanly so the drainer ACKs the entry instead of
+    // buffer.fail()ing it.
+    expect(createFailedTaskRun).toHaveBeenCalledOnce();
+    expect(createFailedTaskRun.mock.calls[0][0].friendlyId).toBe(friendlyId);
+    expect(createFailedTaskRun.mock.calls[0][0].error.raw).toContain(
+      "validation failed: bad cancel snapshot"
+    );
+  });
+
+  it("requeues when createCancelledRun fails with a retryable PG error (cancel bifurcation)", async () => {
+    // Retryable PG failures must rethrow so the drainer requeues the
+    // entry — writing a SYSTEM_FAILURE row when PG is transiently
+    // unreachable would still fail. The drainer's existing retry loop
+    // handles the requeue.
+    const friendlyId = RunId.generate().friendlyId;
+    const cancelErr = new Error("Can't reach database server");
+    const createCancelledRun = vi.fn(async () => {
+      throw cancelErr;
+    });
+    const createFailedTaskRun = vi.fn();
+    const handler = createDrainerHandler({
+      engine: { createCancelledRun, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: friendlyId,
+        envId: "env_a",
+        orgId: "org_1",
+        payload: {
+          friendlyId,
+          taskIdentifier: "t",
+          environment: envFixture,
+          cancelledAt: new Date().toISOString(),
+          cancelReason: "Canceled by user",
+        },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any)
+    ).rejects.toThrow("Can't reach database server");
+    expect(createFailedTaskRun).not.toHaveBeenCalled();
+  });
+
+  it("rethrows the original error when the snapshot lacks an environment block", async () => {
+    const triggerErr = new Error("engine rejected the snapshot");
+    const trigger = vi.fn(async () => {
+      throw triggerErr;
+    });
+    const createFailedTaskRun = vi.fn();
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: "run_x",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: { taskIdentifier: "t" /* no environment */ },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any),
+    ).rejects.toThrow("engine rejected the snapshot");
+    expect(createFailedTaskRun).not.toHaveBeenCalled();
+  });
+
+  it("emits an admin-only LOG-kind event with the buffered window after engine.trigger succeeds", async () => {
+    // The drainer's audit trail rides the existing TaskEventKind.LOG
+    // filter pattern (`eventRepository.server.ts:108` + `logs.download.ts:118`)
+    // — admins see the buffered window in the trace; non-admins don't.
+    recordRunDebugLogMock.mockClear();
+    const trigger = vi.fn(async () => ({ friendlyId: "run_z" }));
+    const handler = createDrainerHandler({
+      engine: { trigger } as any,
+      prisma: {} as any,
+    });
+
+    const bufferedAt = new Date(Date.now() - 4_000);
+    await handler({
+      runId: "run_z",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: { taskIdentifier: "t", spanId: "snapspan", traceId: "snaptrace" },
+      attempts: 2,
+      createdAt: bufferedAt,
+    } as any);
+
+    expect(recordRunDebugLogMock).toHaveBeenCalledOnce();
+    const [callRunId, message, options] = recordRunDebugLogMock.mock.calls[0] as [
+      string,
+      string,
+      any,
+    ];
+    // Internal cuid derived from the friendlyId, mirroring what
+    // `findRunForEventCreation` queries on.
+    expect(callRunId).toBe("z");
+    expect(message).toMatch(/Mollifier buffered \d+ms before materialising/);
+    // Emitted as a marker at materialisation time (no `startTime` /
+    // `duration` overrides) — engine.trigger has just rewritten the
+    // root span's start_time to "now", so back-dating the event would
+    // clip it off-screen in the trace renderer. The historical window
+    // is preserved in metadata so admins can still read it.
+    expect(options.startTime).toBeUndefined();
+    expect(options.duration).toBeUndefined();
+    expect(options.parentId).toBe("snapspan");
+    expect(options.attributes.metadata["mollifier.bufferedAt"]).toBe(bufferedAt.toISOString());
+    expect(options.attributes.metadata["mollifier.attempts"]).toBe(2);
+    expect(options.attributes.metadata["mollifier.dwellMs"]).toBeGreaterThan(0);
+  });
+
+  it("does NOT emit the admin LOG event when engine.trigger fails non-retryably", async () => {
+    // The audit trail is for runs that actually materialised. On a
+    // terminal SYSTEM_FAILURE path the customer-visible outcome is the
+    // failure row; emitting a "buffered for Xms" event next to it would
+    // imply the buffered window completed normally.
+    recordRunDebugLogMock.mockClear();
+    const trigger = vi.fn(async () => {
+      throw new Error("engine rejected the snapshot");
+    });
+    const createFailedTaskRun = vi.fn(async () => ({ id: "internal" }));
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await handler({
+      runId: "run_z",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: { taskIdentifier: "t", environment: envFixture },
+      attempts: 0,
+      createdAt: new Date(),
+    } as any);
+
+    expect(recordRunDebugLogMock).not.toHaveBeenCalled();
+  });
+
+  it("does NOT emit the admin LOG event on the cancel-bifurcation path", async () => {
+    // Cancel-bifurcation writes a CANCELED row directly without calling
+    // engine.trigger. There's no buffered-then-materialised window to
+    // describe — the run never ran.
+    recordRunDebugLogMock.mockClear();
+    const friendlyId = RunId.generate().friendlyId;
+    const createCancelledRun = vi.fn(async () => ({
+      id: "internal",
+      friendlyId,
+      status: "CANCELED",
+    }));
+    const handler = createDrainerHandler({
+      engine: { createCancelledRun } as any,
+      prisma: {} as any,
+    });
+
+    await handler({
+      runId: friendlyId,
+      envId: "env_a",
+      orgId: "org_1",
+      payload: {
+        friendlyId,
+        taskIdentifier: "t",
+        environment: envFixture,
+        cancelledAt: new Date().toISOString(),
+        cancelReason: "Canceled by user",
+      },
+      attempts: 0,
+      createdAt: new Date(),
+    } as any);
+
+    expect(recordRunDebugLogMock).not.toHaveBeenCalled();
+  });
+});
diff --git a/apps/webapp/test/mollifierDrainerWorker.test.ts b/apps/webapp/test/mollifierDrainerWorker.test.ts
index e5f38229d8f..0d4e931fd83 100644
--- a/apps/webapp/test/mollifierDrainerWorker.test.ts
+++ b/apps/webapp/test/mollifierDrainerWorker.test.ts
@@ -1,4 +1,17 @@
-import { describe, expect, it } from "vitest";
+import { describe, expect, it, vi } from "vitest";
+
+// Importing `~/v3/mollifier/mollifierDrainer.server` (below) transitively
+// loads `~/v3/runEngine.server`, whose top-level `singleton(...)` call
+// eagerly constructs a RunEngine. That spins up Prisma + Redis workers
+// that try to connect to localhost — which in CI (no PG, no Redis)
+// produces an unhandled `PrismaClientInitializationError` that fails
+// the test run even though the assertions all pass. Mocking the
+// runEngine module short-circuits the singleton so no worker starts.
+vi.mock("~/v3/runEngine.server", () => ({ engine: {} }));
+// Same problem: the `~/db.server` module's top-level singleton tries to open a
+// PG client. The test never makes a query; an empty stub is enough.
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + import { MollifierConfigurationError } from "~/v3/mollifier/mollifierDrainer.server"; import { initMollifierDrainerWorker } from "~/v3/mollifierDrainerWorker.server"; diff --git a/apps/webapp/test/mollifierGate.test.ts b/apps/webapp/test/mollifierGate.test.ts index b81df7f0c5b..e40a29b2481 100644 --- a/apps/webapp/test/mollifierGate.test.ts +++ b/apps/webapp/test/mollifierGate.test.ts @@ -432,3 +432,82 @@ describe("evaluateGate — per-org isolation via Organization.featureFlags", () expect(unrelatedDeps.spies.evaluatorCalls).toBe(0); }); }); + +// Bypasses: the three categories of trigger that the mollifier never +// intercepts, regardless of the per-org flag or the trip-evaluator decision. +describe("evaluateGate — debounce / OTU / triggerAndWait bypasses", () => { + it("debounce triggers pass through without invoking the evaluator", async () => { + const { deps, spies } = makeDeps({ + enabled: true, + shadow: false, + flag: true, + decision: trippedDecision, + }); + const outcome = await evaluateGate( + { ...inputs, options: { debounce: { key: "k" } } }, + deps, + ); + expect(outcome).toEqual({ action: "pass_through" }); + expect(spies.evaluatorCalls).toBe(0); + }); + + it("oneTimeUseToken triggers pass through without invoking the evaluator", async () => { + const { deps, spies } = makeDeps({ + enabled: true, + shadow: false, + flag: true, + decision: trippedDecision, + }); + const outcome = await evaluateGate( + { ...inputs, options: { oneTimeUseToken: "jwt-otu" } }, + deps, + ); + expect(outcome).toEqual({ action: "pass_through" }); + expect(spies.evaluatorCalls).toBe(0); + }); + + it("single triggerAndWait (parentTaskRunId + resumeParentOnCompletion) passes through", async () => { + const { deps, spies } = makeDeps({ + enabled: true, + shadow: false, + flag: true, + decision: trippedDecision, + }); + const outcome = await evaluateGate( + { + ...inputs, + options: { parentTaskRunId: "run_parent", 
resumeParentOnCompletion: true }, + }, + deps, + ); + expect(outcome).toEqual({ action: "pass_through" }); + expect(spies.evaluatorCalls).toBe(0); + }); + + it("parentTaskRunId alone (no resumeParentOnCompletion) does NOT bypass — must be both", async () => { + const { deps, spies } = makeDeps({ + enabled: true, + shadow: false, + flag: true, + decision: trippedDecision, + }); + const outcome = await evaluateGate( + { ...inputs, options: { parentTaskRunId: "run_parent" } }, + deps, + ); + expect(outcome.action).toBe("mollify"); + expect(spies.evaluatorCalls).toBe(1); + }); + + it("bypass records pass_through decision (so observability counters stay accurate)", async () => { + const { deps, spies } = makeDeps({ + enabled: true, + shadow: false, + flag: true, + decision: trippedDecision, + }); + await evaluateGate({ ...inputs, options: { debounce: { key: "k" } } }, deps); + expect(spies.recordDecisionCalls).toHaveLength(1); + expect(spies.recordDecisionCalls[0].outcome).toBe("pass_through"); + }); +}); diff --git a/apps/webapp/test/mollifierIdempotencyClaim.test.ts b/apps/webapp/test/mollifierIdempotencyClaim.test.ts new file mode 100644 index 00000000000..87c009cb1f7 --- /dev/null +++ b/apps/webapp/test/mollifierIdempotencyClaim.test.ts @@ -0,0 +1,268 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +import { + claimOrAwait, + publishClaim, + releaseClaim, +} from "~/v3/mollifier/idempotencyClaim.server"; +import type { + IdempotencyClaimResult, + MollifierBuffer, +} from "@trigger.dev/redis-worker"; + +type ClaimState = { + value: string | null; + // Scripted return sequence for claimIdempotency calls. When set, + // overrides the default behaviour of returning based on `value`. 
+ scriptedClaims?: IdempotencyClaimResult[]; +}; + +function makeBuffer(initial: ClaimState = { value: null }): { + buffer: MollifierBuffer; + state: ClaimState; +} { + const state = { ...initial }; + const buffer = { + claimIdempotency: vi.fn(async (): Promise => { + if (state.scriptedClaims && state.scriptedClaims.length > 0) { + return state.scriptedClaims.shift()!; + } + if (state.value === null) { + state.value = "pending"; + return { kind: "claimed" }; + } + if (state.value === "pending") return { kind: "pending" }; + return { kind: "resolved", runId: state.value }; + }), + readClaim: vi.fn(async (): Promise => { + if (state.value === null) return null; + if (state.value === "pending") return { kind: "pending" }; + return { kind: "resolved", runId: state.value }; + }), + publishClaim: vi.fn(async ({ runId }: { runId: string }) => { + state.value = runId; + }), + releaseClaim: vi.fn(async () => { + state.value = null; + }), + } as unknown as MollifierBuffer; + return { buffer, state }; +} + +const baseInput = { + envId: "env_a", + taskIdentifier: "my-task", + idempotencyKey: "k-1", +}; + +describe("claimOrAwait", () => { + it("returns 'claimed' for the first caller — empty key wins SETNX", async () => { + const { buffer } = makeBuffer({ value: null }); + const outcome = await claimOrAwait({ + ...baseInput, + buffer, + generateToken: () => "token-1", + }); + expect(outcome).toEqual({ kind: "claimed", token: "token-1" }); + }); + + it("returns 'resolved' immediately when the key already holds a runId", async () => { + const { buffer } = makeBuffer({ value: "run_X" }); + const outcome = await claimOrAwait({ ...baseInput, buffer }); + expect(outcome).toEqual({ kind: "resolved", runId: "run_X" }); + }); + + it("polls a pending key, then resolves when the runId is published", async () => { + const { buffer, state } = makeBuffer({ value: "pending" }); + let nowValue = 0; + let pollCount = 0; + const outcome = await claimOrAwait({ + ...baseInput, + buffer, + now: () 
=> nowValue, + sleep: async (ms) => { + nowValue += ms; + pollCount += 1; + if (pollCount === 3) state.value = "run_X"; + }, + safetyNetMs: 1000, + pollStepMs: 25, + }); + expect(outcome).toEqual({ kind: "resolved", runId: "run_X" }); + }); + + it("returns 'timed_out' when the key stays pending past safetyNetMs", async () => { + const { buffer } = makeBuffer({ value: "pending" }); + let nowValue = 0; + const outcome = await claimOrAwait({ + ...baseInput, + buffer, + now: () => nowValue, + sleep: async (ms) => { + nowValue += ms; + }, + safetyNetMs: 50, + pollStepMs: 25, + }); + expect(outcome).toEqual({ kind: "timed_out" }); + }); + + it("retries the claim when a polled key vanishes (claimant released)", async () => { + const { buffer, state } = makeBuffer({ value: "pending" }); + let nowValue = 0; + let pollCount = 0; + // Scripted retry: on the second `claimIdempotency` call we win. + state.scriptedClaims = [ + { kind: "pending" }, // first call (initial) + { kind: "claimed" }, // second call (retry after release) + ]; + const outcome = await claimOrAwait({ + ...baseInput, + buffer, + generateToken: () => "token-retry", + now: () => nowValue, + sleep: async (ms) => { + nowValue += ms; + pollCount += 1; + // First poll cycle: key vanishes (release). 
+ if (pollCount === 1) state.value = null; + }, + safetyNetMs: 1000, + pollStepMs: 25, + }); + expect(outcome).toEqual({ kind: "claimed", token: "token-retry" }); + }); + + it("fails open with 'claimed' when buffer is null (mollifier disabled)", async () => { + const outcome = await claimOrAwait({ + ...baseInput, + buffer: null, + generateToken: () => "token-fallopen-null", + }); + expect(outcome).toEqual({ kind: "claimed", token: "token-fallopen-null" }); + }); + + it("fails open with 'claimed' if buffer.claimIdempotency throws (Redis down)", async () => { + const buffer = { + claimIdempotency: vi.fn(async () => { + throw new Error("ECONNREFUSED"); + }), + } as unknown as MollifierBuffer; + const outcome = await claimOrAwait({ + ...baseInput, + buffer, + generateToken: () => "token-fallopen-throw", + }); + expect(outcome).toEqual({ kind: "claimed", token: "token-fallopen-throw" }); + }); + + it("respects an aborted signal during the wait loop", async () => { + const { buffer } = makeBuffer({ value: "pending" }); + const controller = new AbortController(); + let nowValue = 0; + let pollCount = 0; + const outcome = await claimOrAwait({ + ...baseInput, + buffer, + now: () => nowValue, + sleep: async (ms) => { + nowValue += ms; + pollCount += 1; + if (pollCount === 1) controller.abort(); + }, + abortSignal: controller.signal, + safetyNetMs: 5000, + pollStepMs: 25, + }); + expect(outcome).toEqual({ kind: "timed_out" }); + }); +}); + +describe("publishClaim", () => { + it("writes the runId to the claim key", async () => { + const { buffer, state } = makeBuffer({ value: "pending" }); + await publishClaim({ ...baseInput, token: "owner-token", runId: "run_X", buffer }); + expect(state.value).toBe("run_X"); + expect(buffer.publishClaim).toHaveBeenCalledOnce(); + }); + + it("no-op when buffer is null", async () => { + await expect( + publishClaim({ ...baseInput, token: "owner-token", runId: "run_X", buffer: null }), + ).resolves.toBeUndefined(); + }); + + it("swallows errors 
so trigger pipeline isn't broken by Redis hiccups", async () => { + const buffer = { + publishClaim: vi.fn(async () => { + throw new Error("ECONNREFUSED"); + }), + } as unknown as MollifierBuffer; + await expect( + publishClaim({ ...baseInput, token: "owner-token", runId: "run_X", buffer }), + ).resolves.toBeUndefined(); + }); +}); + +describe("releaseClaim", () => { + it("DELs the claim so waiters can re-acquire", async () => { + const { buffer, state } = makeBuffer({ value: "pending" }); + await releaseClaim({ ...baseInput, token: "owner-token", buffer }); + expect(state.value).toBeNull(); + }); + + it("no-op when buffer is null", async () => { + await expect(releaseClaim({ ...baseInput, token: "owner-token", buffer: null })).resolves.toBeUndefined(); + }); +}); + +// End-to-end: the token from `claimOrAwait`'s `claimed` outcome must +// reach `buffer.claimIdempotency` and round-trip through publishClaim / +// releaseClaim. Without this the compare-and-act ownership protection +// in the buffer is bypassed and the stale-claimant hazard returns. 
+describe("claim ownership token wiring", () => { + it("threads the token from claimOrAwait into buffer.claimIdempotency", async () => { + const { buffer } = makeBuffer({ value: null }); + const outcome = await claimOrAwait({ + ...baseInput, + buffer, + generateToken: () => "owner-token-xyz", + }); + expect(outcome).toEqual({ kind: "claimed", token: "owner-token-xyz" }); + expect(buffer.claimIdempotency).toHaveBeenCalledWith({ + ...baseInput, + token: "owner-token-xyz", + ttlSeconds: 30, + }); + }); + + it("threads the token from publishClaim into buffer.publishClaim", async () => { + const { buffer } = makeBuffer({ value: "pending" }); + await publishClaim({ + ...baseInput, + token: "owner-token-xyz", + runId: "run_X", + buffer, + }); + expect(buffer.publishClaim).toHaveBeenCalledWith( + expect.objectContaining({ + token: "owner-token-xyz", + runId: "run_X", + }), + ); + }); + + it("threads the token from releaseClaim into buffer.releaseClaim", async () => { + const { buffer } = makeBuffer({ value: "pending" }); + await releaseClaim({ + ...baseInput, + token: "owner-token-xyz", + buffer, + }); + expect(buffer.releaseClaim).toHaveBeenCalledWith( + expect.objectContaining({ token: "owner-token-xyz" }), + ); + }); +}); diff --git a/apps/webapp/test/mollifierMollify.test.ts b/apps/webapp/test/mollifierMollify.test.ts new file mode 100644 index 00000000000..ec7a30b49c2 --- /dev/null +++ b/apps/webapp/test/mollifierMollify.test.ts @@ -0,0 +1,133 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ + prisma: {}, + $replica: {}, +})); + +import { mollifyTrigger } from "~/v3/mollifier/mollifierMollify.server"; +import { RunId } from "@trigger.dev/core/v3/isomorphic"; +import type { MollifierBuffer } from "@trigger.dev/redis-worker"; + +function fakeBuffer( + acceptResult: Awaited<ReturnType<MollifierBuffer["accept"]>> = { kind: "accepted" }, +): { buffer: MollifierBuffer; accept: ReturnType<typeof vi.fn> } { + const accept = vi.fn(async () => acceptResult); + return { + buffer: { accept 
} as unknown as MollifierBuffer, + accept, + }; +} + +describe("mollifyTrigger", () => { + it("writes the snapshot to buffer and returns synthesised result", async () => { + const { buffer, accept } = fakeBuffer(); + const result = await mollifyTrigger({ + runFriendlyId: "run_abc123def456", + environmentId: "env_a", + organizationId: "org_1", + engineTriggerInput: { taskIdentifier: "my-task", payload: '{"x":1}' }, + decision: { + divert: true, + reason: "per_env_rate", + count: 150, + threshold: 100, + }, + buffer, + }); + + expect(accept).toHaveBeenCalledOnce(); + expect(accept).toHaveBeenCalledWith({ + runId: "run_abc123def456", + envId: "env_a", + orgId: "org_1", + payload: expect.any(String), + idempotencyKey: undefined, + taskIdentifier: undefined, + }); + expect(result.run.friendlyId).toBe("run_abc123def456"); + expect(result.error).toBeUndefined(); + expect(result.isCached).toBe(false); + expect(result.notice).toEqual({ + code: "mollifier.queued", + message: expect.stringContaining("burst buffer"), + docs: expect.stringContaining("trigger.dev/docs"), + }); + }); + + it("echoes the winner's runId with isCached=true on duplicate_idempotency", async () => { + const { buffer } = fakeBuffer({ + kind: "duplicate_idempotency", + existingRunId: "run_winner12345", + }); + const result = await mollifyTrigger({ + runFriendlyId: "run_loser56789a", + environmentId: "env_a", + organizationId: "org_1", + engineTriggerInput: { taskIdentifier: "t", payload: "{}" }, + decision: { divert: true, reason: "per_env_rate", count: 1, threshold: 1 }, + buffer, + idempotencyKey: "key", + taskIdentifier: "t", + }); + expect(result.run.friendlyId).toBe("run_winner12345"); + expect(result.isCached).toBe(true); + expect(result.notice).toBeUndefined(); + }); + + // Regression: the synthetic result MUST carry a populated `run.id` + // derived from the friendlyId. 
Without it, the route handler's + // `saveRequestIdempotency(…, result.run.id)` stores `undefined` as + // the cached entity id, and on SDK retry Prisma's + // `findFirst({ where: { id: undefined } })` silently drops the + // predicate and returns an arbitrary TaskRun — a cross-tenant leak + // path. (See Devin review on PR #3753.) + it("populates run.id from friendlyId on the happy-accept path", async () => { + const { buffer } = fakeBuffer(); + const result = await mollifyTrigger({ + runFriendlyId: "run_pri456789ab", + environmentId: "env_a", + organizationId: "org_1", + engineTriggerInput: { taskIdentifier: "t", payload: "{}" }, + decision: { divert: true, reason: "per_env_rate", count: 1, threshold: 1 }, + buffer, + }); + expect(result.run.id).toBe(RunId.fromFriendlyId("run_pri456789ab")); + expect(result.run.id).toMatch(/^[a-z0-9]+$/); // non-undefined, non-empty + }); + + it("populates run.id from the WINNER's friendlyId on duplicate_idempotency", async () => { + const { buffer } = fakeBuffer({ + kind: "duplicate_idempotency", + existingRunId: "run_winnerdup12", + }); + const result = await mollifyTrigger({ + runFriendlyId: "run_loser56789a", + environmentId: "env_a", + organizationId: "org_1", + engineTriggerInput: { taskIdentifier: "t", payload: "{}" }, + decision: { divert: true, reason: "per_env_rate", count: 1, threshold: 1 }, + buffer, + idempotencyKey: "key", + taskIdentifier: "t", + }); + expect(result.run.id).toBe(RunId.fromFriendlyId("run_winnerdup12")); + expect(result.run.id).not.toBe(RunId.fromFriendlyId("run_loser56789a")); + }); + + it("snapshot is round-trippable: payload field is parseable JSON of engineTriggerInput", async () => { + const { buffer, accept } = fakeBuffer(); + const engineInput = { taskIdentifier: "t", payload: "{}", tags: ["a", "b"] }; + await mollifyTrigger({ + runFriendlyId: "run_xabcde12345", + environmentId: "env_a", + organizationId: "org_1", + engineTriggerInput: engineInput, + decision: { divert: true, reason: 
"per_env_rate", count: 1, threshold: 1 }, + buffer, + }); + + const callArg = accept.mock.calls[0][0] as { payload: string }; + expect(JSON.parse(callArg.payload)).toEqual(engineInput); + }); +}); diff --git a/apps/webapp/test/mollifierMutateWithFallback.test.ts b/apps/webapp/test/mollifierMutateWithFallback.test.ts new file mode 100644 index 00000000000..1102229f568 --- /dev/null +++ b/apps/webapp/test/mollifierMutateWithFallback.test.ts @@ -0,0 +1,481 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ + prisma: { taskRun: { findFirst: vi.fn(async () => null) } }, + $replica: { taskRun: { findFirst: vi.fn(async () => null) } }, +})); + +import { mutateWithFallback } from "~/v3/mollifier/mutateWithFallback.server"; +import type { + BufferEntry, + MollifierBuffer, + MutateSnapshotResult, +} from "@trigger.dev/redis-worker"; +import type { TaskRun } from "@trigger.dev/database"; + +type FindFirst = ReturnType<typeof vi.fn>; +type PrismaStub = { taskRun: { findFirst: FindFirst } }; + +function fakePrisma(rows: Array<TaskRun | null>): PrismaStub { + const fn = vi.fn(); + for (const r of rows) fn.mockResolvedValueOnce(r); + fn.mockResolvedValue(null); + return { taskRun: { findFirst: fn } }; +} + +// Env-matching entry returned by the env-pre-check getEntry call that +// mutateWithFallback now does before any buffer write (cross-env auth +// gate). Same envId/orgId as `baseInput` so the check passes and the +// flow under test proceeds to mutateSnapshot. +const preCheckEntry = (): BufferEntry => + ({ + envId: "env_a", + orgId: "org_1", + status: "QUEUED", + materialised: false, + }) as unknown as BufferEntry; + +function bufferReturning(result: MutateSnapshotResult): MollifierBuffer { + const getEntry = vi.fn(async () => preCheckEntry()); + return { + mutateSnapshot: vi.fn(async () => result), + getEntry, + } as unknown as MollifierBuffer; +} + +// Buffer whose mutateSnapshot returns "busy" and whose getEntry walks a +// scripted sequence of entry states. 
The pre-check getEntry call (one +// extra read before the busy-wait loop, used for env authorization) +// is served a separate env-matching entry first; the busy-wait loop then +// pops the scripted entries, repeating the last one once they run out. +function bufferBusy(entries: Array<BufferEntry | null>): MollifierBuffer { + const getEntry = vi.fn(); + // Pre-check consumes one entry. Use a QUEUED env-matching entry so + // the env-check passes and the flow reaches mutateSnapshot (which + // returns "busy") and enters the wait-loop. + getEntry.mockResolvedValueOnce(preCheckEntry()); + for (const e of entries) getEntry.mockResolvedValueOnce(e); + getEntry.mockResolvedValue(entries.length ? entries[entries.length - 1] : null); + return { + mutateSnapshot: vi.fn(async () => "busy" as const), + getEntry, + } as unknown as MollifierBuffer; +} + +const entryDraining = (): BufferEntry => + ({ + envId: "env_a", + orgId: "org_1", + status: "DRAINING", + materialised: false, + }) as unknown as BufferEntry; +const entryQueued = (): BufferEntry => + ({ + envId: "env_a", + orgId: "org_1", + status: "QUEUED", + materialised: false, + }) as unknown as BufferEntry; +const entryMaterialised = (): BufferEntry => + ({ + envId: "env_a", + orgId: "org_1", + status: "DRAINING", + materialised: true, + }) as unknown as BufferEntry; + +const fakeRun = (overrides: Partial<TaskRun> = {}): TaskRun => + ({ + id: "pg_id", + friendlyId: "run_1", + runtimeEnvironmentId: "env_a", + ...overrides, + }) as TaskRun; + +const baseInput = { + runId: "run_1", + environmentId: "env_a", + organizationId: "org_1", + bufferPatch: { type: "append_tags" as const, tags: ["x"] }, +}; + +describe("mutateWithFallback", () => { + it("hits replica → calls pgMutation, returns pg outcome", async () => { + const row = fakeRun(); + const pgMutation = vi.fn(async () => "pg-response"); + const synthesisedResponse = vi.fn(() => "snapshot-response"); + + const result = await mutateWithFallback({ + ...baseInput, + pgMutation, + synthesisedResponse, + 
prismaReplica: fakePrisma([row]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => bufferReturning("applied_to_snapshot"), + }); + + expect(result).toEqual({ kind: "pg", response: "pg-response" }); + expect(pgMutation).toHaveBeenCalledWith(row); + expect(synthesisedResponse).not.toHaveBeenCalled(); + }); + + it("replica miss + buffer applied_to_snapshot → synthesisedResponse", async () => { + const pgMutation = vi.fn(async () => "pg"); + const result = await mutateWithFallback({ + ...baseInput, + pgMutation, + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => bufferReturning("applied_to_snapshot"), + }); + expect(result).toEqual({ kind: "snapshot", response: "snap" }); + expect(pgMutation).not.toHaveBeenCalled(); + }); + + it("applied_to_snapshot forwards the pre-mutation entry to synthesisedResponse (lets callers dedup)", async () => { + // The tags route uses this to compute the same post-dedup count + // the PG path reports, without an extra Redis round-trip. + const synthesised = vi.fn(({ bufferEntry }: { bufferEntry: BufferEntry | null }) => { + // Caller can inspect bufferEntry.payload (or other fields) to + // produce a response that depends on the prior snapshot state. + return bufferEntry ? 
"snap-with-entry" : "snap-without-entry"; + }); + const result = await mutateWithFallback({ + ...baseInput, + pgMutation: async () => "pg", + synthesisedResponse: synthesised, + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => bufferReturning("applied_to_snapshot"), + }); + expect(result).toEqual({ kind: "snapshot", response: "snap-with-entry" }); + expect(synthesised).toHaveBeenCalledTimes(1); + const ctx = synthesised.mock.calls[0]?.[0]; + expect(ctx?.bufferEntry).not.toBeNull(); + // The pre-check entry has the env-matching shape set up by + // bufferReturning() / preCheckEntry(). + expect(ctx?.bufferEntry?.envId).toBe("env_a"); + expect(ctx?.bufferEntry?.orgId).toBe("org_1"); + }); + + // Symmetric writer-fallback in the `!buffer` short-circuit. Without + // this, mollifier-disabled deployments (or boot-time buffer init + // failures) would regress the pre-PR mutation routes — those read + // from the writer directly, so a fresh PG row was always visible. + // The replica offload introduced here moves the read to the lagging + // follower; if the buffer isn't available to disambiguate, we still + // probe the writer before returning 404. 
+ it("replica miss + !buffer + writer hit → pgMutation (mollifier-disabled mode recovery)", async () => { + const row = fakeRun({ friendlyId: "run_1" }); + const pgMutation = vi.fn(async () => "pg-recovered-no-buffer"); + const result = await mutateWithFallback({ + ...baseInput, + pgMutation, + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([row]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => null, + }); + expect(result).toEqual({ kind: "pg", response: "pg-recovered-no-buffer" }); + expect(pgMutation).toHaveBeenCalledWith(row); + }); + + it("replica miss + !buffer + writer miss → not_found (genuine 404 in mollifier-disabled mode)", async () => { + const result = await mutateWithFallback({ + ...baseInput, + pgMutation: async () => "pg", + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([null]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => null, + }); + expect(result).toEqual({ kind: "not_found" }); + }); + + it("replica miss + buffer not_found + writer miss → not_found", async () => { + const result = await mutateWithFallback({ + ...baseInput, + pgMutation: async () => "pg", + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([null]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => bufferReturning("not_found"), + }); + expect(result).toEqual({ kind: "not_found" }); + }); + + it("replica miss + buffer not_found + writer hit → pgMutation (replica-lag recovery)", async () => { + const row = fakeRun({ friendlyId: "run_1" }); + const pgMutation = vi.fn(async () => "pg-recovered"); + const result = await mutateWithFallback({ + ...baseInput, + pgMutation, + synthesisedResponse: () => "snap", + 
prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([row]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => bufferReturning("not_found"), + }); + expect(result).toEqual({ kind: "pg", response: "pg-recovered" }); + expect(pgMutation).toHaveBeenCalledWith(row); + }); + + it("busy → watches buffer through DRAINING, materialises, hits primary exactly once", async () => { + const row = fakeRun(); + const pgMutation = vi.fn(async () => "pg-after-wait"); + // Writer is read ONCE, only after the buffer reports materialised. + const writer = fakePrisma([row]); + const buffer = bufferBusy([entryDraining(), entryDraining(), entryMaterialised()]); + let nowValue = 0; + const result = await mutateWithFallback({ + ...baseInput, + pgMutation, + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: writer as unknown as typeof import("~/db.server").prisma, + getBuffer: () => buffer, + sleep: async (ms) => { + nowValue += ms; + }, + now: () => nowValue, + safetyNetMs: 2000, + pollStepMs: 20, + random: () => 0, + }); + expect(result).toEqual({ kind: "pg", response: "pg-after-wait" }); + expect(pgMutation).toHaveBeenCalledWith(row); + // One env-pre-check call + 3 busy-wait polls = 4 getEntry reads; + // primary read exactly once. 
+ expect(buffer.getEntry).toHaveBeenCalledTimes(4); + expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(1); + }); + + it("busy → entry deleted by terminal fail, writer finds SYSTEM_FAILURE row → pgMutation", async () => { + const row = fakeRun(); + const pgMutation = vi.fn(async () => "pg-failed-row"); + const writer = fakePrisma([row]); + const buffer = bufferBusy([entryDraining(), null]); + let nowValue = 0; + const result = await mutateWithFallback({ + ...baseInput, + pgMutation, + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: writer as unknown as typeof import("~/db.server").prisma, + getBuffer: () => buffer, + sleep: async (ms) => { + nowValue += ms; + }, + now: () => nowValue, + safetyNetMs: 2000, + pollStepMs: 20, + random: () => 0, + }); + expect(result).toEqual({ kind: "pg", response: "pg-failed-row" }); + expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(1); + }); + + it("busy → entry deleted but no PG row (terminal write failed) → not_found", async () => { + const buffer = bufferBusy([null]); + const writer = fakePrisma([null]); + let nowValue = 0; + const result = await mutateWithFallback({ + ...baseInput, + pgMutation: async () => "pg", + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: writer as unknown as typeof import("~/db.server").prisma, + getBuffer: () => buffer, + sleep: async (ms) => { + nowValue += ms; + }, + now: () => nowValue, + safetyNetMs: 2000, + pollStepMs: 20, + random: () => 0, + }); + expect(result).toEqual({ kind: "not_found" }); + expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(1); + }); + + it("busy → requeued (back to QUEUED) then materialises; doesn't resolve early", async () => { + const row = fakeRun(); + const pgMutation = vi.fn(async () => "pg-after-requeue"); + const writer = fakePrisma([row]); + // QUEUED (requeued after a 
retryable drain error) must NOT be treated + // as "done" — the run hasn't reached PG. Only the later materialise does. + const buffer = bufferBusy([entryQueued(), entryDraining(), entryMaterialised()]); + let nowValue = 0; + const result = await mutateWithFallback({ + ...baseInput, + pgMutation, + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: writer as unknown as typeof import("~/db.server").prisma, + getBuffer: () => buffer, + sleep: async (ms) => { + nowValue += ms; + }, + now: () => nowValue, + safetyNetMs: 2000, + pollStepMs: 20, + random: () => 0, + }); + expect(result).toEqual({ kind: "pg", response: "pg-after-requeue" }); + // One env-pre-check + 3 busy-wait polls. + expect(buffer.getEntry).toHaveBeenCalledTimes(4); + expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(1); + }); + + it("busy → drainer never resolves (stays DRAINING) → timed_out, primary never touched", async () => { + const writer = fakePrisma([]); + const buffer = bufferBusy([entryDraining()]); + let nowValue = 0; + const result = await mutateWithFallback({ + ...baseInput, + pgMutation: async () => "pg", + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: writer as unknown as typeof import("~/db.server").prisma, + getBuffer: () => buffer, + sleep: async (ms) => { + nowValue += ms; + }, + now: () => nowValue, + safetyNetMs: 100, + pollStepMs: 20, + random: () => 0, + }); + expect(result).toEqual({ kind: "timed_out" }); + // The whole point: while the run is still draining we never read the primary. 
+ expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(0); + }); + + it("abort signal during wait → timed_out without further polls", async () => { + const writer = fakePrisma([]); + const buffer = bufferBusy([entryDraining(), entryDraining()]); + const controller = new AbortController(); + let nowValue = 0; + const result = await mutateWithFallback({ + ...baseInput, + pgMutation: async () => "pg", + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: writer as unknown as typeof import("~/db.server").prisma, + getBuffer: () => buffer, + sleep: async (ms) => { + nowValue += ms; + controller.abort(); + }, + now: () => nowValue, + safetyNetMs: 2000, + pollStepMs: 20, + random: () => 0, + abortSignal: controller.signal, + }); + expect(result).toEqual({ kind: "timed_out" }); + // One env-pre-check + one busy-wait poll before sleep+abort; primary untouched. + expect(buffer.getEntry).toHaveBeenCalledTimes(2); + expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(0); + }); + + it("replica miss + buffer limit_exceeded → rejected via rejectedResponse builder", async () => { + const pgMutation = vi.fn(async () => "pg"); + const synthesisedResponse = vi.fn(() => "snap"); + const result = await mutateWithFallback({ + ...baseInput, + pgMutation, + synthesisedResponse, + rejectedResponse: () => "too-many-tags", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => bufferReturning("limit_exceeded"), + }); + expect(result).toEqual({ kind: "rejected", response: "too-many-tags" }); + expect(pgMutation).not.toHaveBeenCalled(); + expect(synthesisedResponse).not.toHaveBeenCalled(); + }); + + it("buffer limit_exceeded without a rejectedResponse builder → throws (programmer error)", async () => { + await expect( + mutateWithFallback({ + ...baseInput, + pgMutation: 
async () => "pg", + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => bufferReturning("limit_exceeded"), + }) + ).rejects.toThrow(/limit_exceeded/); + }); + + it("replica miss + buffer entry belongs to a different env → not_found (cross-env auth gate)", async () => { + // Same flow as the applied_to_snapshot test, except the entry's + // envId doesn't match input.environmentId. mutateWithFallback must + // refuse the write and return not_found (without leaking that the + // runId exists in another env), and must NOT call mutateSnapshot. + const crossEnvEntry: BufferEntry = { + envId: "env_OTHER", + orgId: "org_1", + status: "QUEUED", + materialised: false, + } as unknown as BufferEntry; + const mutateSnapshot = vi.fn(async () => "applied_to_snapshot" as const); + const buffer = { + mutateSnapshot, + getEntry: vi.fn(async () => crossEnvEntry), + } as unknown as MollifierBuffer; + + const pgMutation = vi.fn(async () => "pg"); + const synthesisedResponse = vi.fn(() => "snap"); + const result = await mutateWithFallback({ + ...baseInput, + pgMutation, + synthesisedResponse, + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => buffer, + }); + expect(result).toEqual({ kind: "not_found" }); + expect(mutateSnapshot).not.toHaveBeenCalled(); + expect(pgMutation).not.toHaveBeenCalled(); + expect(synthesisedResponse).not.toHaveBeenCalled(); + }); + + it("replica miss + buffer entry belongs to a different org → not_found (cross-org auth gate)", async () => { + const crossOrgEntry: BufferEntry = { + envId: "env_a", + orgId: "org_OTHER", + status: "QUEUED", + materialised: false, + } as unknown as BufferEntry; + const mutateSnapshot = vi.fn(async () => "applied_to_snapshot" 
as const); + const buffer = { + mutateSnapshot, + getEntry: vi.fn(async () => crossOrgEntry), + } as unknown as MollifierBuffer; + + const result = await mutateWithFallback({ + ...baseInput, + pgMutation: async () => "pg", + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => buffer, + }); + expect(result).toEqual({ kind: "not_found" }); + expect(mutateSnapshot).not.toHaveBeenCalled(); + }); + + it("buffer is null (mollifier disabled) → not_found after replica miss", async () => { + const result = await mutateWithFallback({ + ...baseInput, + pgMutation: async () => "pg", + synthesisedResponse: () => "snap", + prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica, + prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma, + getBuffer: () => null, + }); + expect(result).toEqual({ kind: "not_found" }); + }); +}); diff --git a/apps/webapp/test/mollifierReadFallback.test.ts b/apps/webapp/test/mollifierReadFallback.test.ts new file mode 100644 index 00000000000..feef6a420ad --- /dev/null +++ b/apps/webapp/test/mollifierReadFallback.test.ts @@ -0,0 +1,535 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ + prisma: {}, + $replica: {}, +})); + +import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server"; +import type { MollifierBuffer, BufferEntry } from "@trigger.dev/redis-worker"; +import { RunId } from "@trigger.dev/core/v3/isomorphic"; + +function fakeBuffer(entry: BufferEntry | null): MollifierBuffer { + return { + getEntry: vi.fn(async () => entry), + } as unknown as MollifierBuffer; +} + +const NOW = new Date("2026-05-11T12:00:00Z"); + +describe("findRunByIdWithMollifierFallback", () => { + it("returns null when buffer is unavailable (mollifier disabled)", async () => { + const 
result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => null }, + ); + expect(result).toBeNull(); + }); + + it("returns null when no buffer entry exists", async () => { + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(null) }, + ); + expect(result).toBeNull(); + }); + + it("returns null when buffer entry envId does not match caller (auth mismatch)", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_OTHER", + orgId: "org_1", + payload: JSON.stringify({ taskIdentifier: "t" }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result).toBeNull(); + }); + + it("returns null when buffer entry orgId does not match caller (auth mismatch)", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_OTHER", + payload: JSON.stringify({ taskIdentifier: "t" }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result).toBeNull(); + }); + + it("returns synthesised QUEUED run when entry exists with matching auth", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ taskIdentifier: "my-task" }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result).not.toBeNull(); + 
expect(result!.friendlyId).toBe("run_1"); + expect(result!.status).toBe("QUEUED"); + expect(result!.taskIdentifier).toBe("my-task"); + expect(result!.createdAt).toEqual(NOW); + }); + + it("returns synthesised QUEUED for DRAINING (internal state same externally)", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ taskIdentifier: "t" }), + status: "DRAINING", + attempts: 1, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result!.status).toBe("QUEUED"); + }); + + it("returns FAILED state with structured error for FAILED entries", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ taskIdentifier: "t" }), + status: "FAILED", + attempts: 3, + createdAt: NOW, + lastError: { code: "VALIDATION", message: "task not found" }, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result!.status).toBe("FAILED"); + expect(result!.error).toEqual({ code: "VALIDATION", message: "task not found" }); + }); + + it("extracts snapshot-derived fields from the buffered payload", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ + taskIdentifier: "my-task", + payload: '{"foo":"bar"}', + payloadType: "application/json", + metadata: '{"customer":"acme"}', + metadataType: "application/json", + idempotencyKey: "client-abc", + idempotencyKeyOptions: { key: "client-abc", scope: "run" }, + isTest: true, + depth: 2, + ttl: "1h", + tags: ["tag-a", "tag-b"], + // The engine.trigger snapshot stores the locked version string under + // `taskVersion` (see 
triggerTask.server.ts#buildEngineTriggerInput). + taskVersion: "20260511.1", + resumeParentOnCompletion: false, + parentTaskRunId: "run_parent", + }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result).not.toBeNull(); + expect(result!.payloadType).toBe("application/json"); + expect(result!.metadata).toBe('{"customer":"acme"}'); + expect(result!.metadataType).toBe("application/json"); + expect(result!.idempotencyKey).toBe("client-abc"); + expect(result!.idempotencyKeyOptions).toEqual({ key: "client-abc", scope: "run" }); + expect(result!.isTest).toBe(true); + expect(result!.depth).toBe(2); + expect(result!.ttl).toBe("1h"); + expect(result!.tags).toEqual(["tag-a", "tag-b"]); + expect(result!.lockedToVersion).toBe("20260511.1"); + expect(result!.resumeParentOnCompletion).toBe(false); + expect(result!.parentTaskRunId).toBe("run_parent"); + }); + + it("extracts gate-allocated trace context from the snapshot", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ + taskIdentifier: "t", + traceId: "trace_abc", + spanId: "span_xyz", + parentSpanId: "span_parent", + }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result!.traceId).toBe("trace_abc"); + expect(result!.spanId).toBe("span_xyz"); + expect(result!.parentSpanId).toBe("span_parent"); + }); + + it("parses idempotencyKeyOptions in the canonical { key, scope } object shape (regression for the buffered-vs-PG API contract divergence)", async () => { + // Regression for the bug where `readFallback` parsed + // `idempotencyKeyOptions` via Array.isArray and 
rejected the + // canonical object shape. The SDK and Prisma both serialise this + // as `{ key, scope }`; the legacy array check would reject it, + // returning `undefined` here, which downstream demoted the API's + // `idempotencyKey` field to surface the *hash* (server-side + // generated) instead of the user-supplied key — diverging from + // how materialised runs render the same field, and creating a + // silent contract flip at the drainer-materialisation boundary. + // Pin the schema-parse path so the buffered response matches + // PG-resident behaviour from the moment the run is buffered. + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ + taskIdentifier: "t", + idempotencyKey: "", + idempotencyKeyOptions: { key: "user-supplied-key", scope: "global" }, + }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result).not.toBeNull(); + expect(result!.idempotencyKeyOptions).toEqual({ + key: "user-supplied-key", + scope: "global", + }); + }); + + it("returns undefined for idempotencyKeyOptions when the snapshot carries a legacy/invalid shape", async () => { + // The Zod schema parse rejects: + // - array shape (the legacy bug we just fixed) + // - object without required fields + // - missing field entirely + // In all these cases the field is left `undefined`. Downstream + // `getUserProvidedIdempotencyKey` then falls back to the + // `idempotencyKey` field, matching how PG-resident runs handle + // malformed/missing options. + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ + taskIdentifier: "t", + idempotencyKey: "", + // Legacy array shape — must NOT be accepted. 
+ idempotencyKeyOptions: ["payload"], + }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result).not.toBeNull(); + expect(result!.idempotencyKeyOptions).toBeUndefined(); + }); + + it("defaults snapshot-derived fields to safe values when absent", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ taskIdentifier: "t" }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result!.payloadType).toBeUndefined(); + expect(result!.metadata).toBeUndefined(); + expect(result!.idempotencyKey).toBeUndefined(); + expect(result!.isTest).toBe(false); + expect(result!.depth).toBe(0); + expect(result!.tags).toEqual([]); + expect(result!.resumeParentOnCompletion).toBe(false); + expect(result!.traceId).toBeUndefined(); + expect(result!.spanId).toBeUndefined(); + }); + + it("populates replay-relevant fields from the snapshot", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ + taskIdentifier: "my-task", + environment: { id: "env_a" }, + workerQueue: "default", + queue: "task/my-task", + concurrencyKey: "tenant-42", + machine: "medium-1x", + realtimeStreamsVersion: "v2", + seedMetadata: '{"k":"v"}', + seedMetadataType: "application/json", + tags: ["t1", "t2"], + }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result).not.toBeNull(); + 
expect(result!.id).toBeTypeOf("string"); + expect(result!.id.length).toBeGreaterThan(0); + expect(result!.engine).toBe("V2"); + expect(result!.runtimeEnvironmentId).toBe("env_a"); + expect(result!.workerQueue).toBe("default"); + expect(result!.queue).toBe("task/my-task"); + expect(result!.concurrencyKey).toBe("tenant-42"); + expect(result!.machinePreset).toBe("medium-1x"); + expect(result!.realtimeStreamsVersion).toBe("v2"); + expect(result!.seedMetadata).toBe('{"k":"v"}'); + expect(result!.seedMetadataType).toBe("application/json"); + expect(result!.runTags).toEqual(["t1", "t2"]); + }); + + it("extracts batchId from the snapshot's nested batch object (engine.trigger shape)", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ + taskIdentifier: "t", + // The engine.trigger input nests the batch as `{ id, index }`, + // where `id` is the batch's internal cuid (not a flat `batchId`). + batch: { id: "batch_internal_cuid", index: 3 }, + }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result!.batchId).toBe("batch_internal_cuid"); + }); + + it("leaves batchId undefined when the snapshot has no batch (non-batched run)", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ taskIdentifier: "t" }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result!.batchId).toBeUndefined(); + }); + + it("treats invalid date strings as undefined and does not mis-classify status as CANCELED", async () => { + const entry: BufferEntry = { + runId: "run_1", + 
envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ + taskIdentifier: "t", + cancelledAt: "not-a-date", + cancelReason: "user requested", + delayUntil: "also-not-a-date", + }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result).not.toBeNull(); + expect(result!.status).toBe("QUEUED"); + expect(result!.cancelledAt).toBeUndefined(); + expect(result!.delayUntil).toBeUndefined(); + }); + + it("parses valid ISO date strings on cancelledAt and delayUntil", async () => { + const cancelledAtIso = "2026-05-11T13:00:00.000Z"; + const delayUntilIso = "2026-05-11T14:00:00.000Z"; + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ + taskIdentifier: "t", + cancelledAt: cancelledAtIso, + cancelReason: "user requested", + delayUntil: delayUntilIso, + }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result!.status).toBe("CANCELED"); + expect(result!.cancelledAt).toEqual(new Date(cancelledAtIso)); + expect(result!.cancelReason).toBe("user requested"); + expect(result!.delayUntil).toEqual(new Date(delayUntilIso)); + }); + + it("falls back to entry.envId for runtimeEnvironmentId when snapshot lacks environment.id", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ taskIdentifier: "t" }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + 
expect(result!.runtimeEnvironmentId).toBe("env_a"); + expect(result!.workerQueue).toBeUndefined(); + expect(result!.queue).toBeUndefined(); + }); + + it("extracts batchId from the nested snapshot.batch object (not the flat key)", async () => { + // Regression for the field-name mismatch Devin flagged: + // #buildEngineTriggerInput writes batch info as + // `batch: { id, index }`, never as a flat `batchId`. readFallback + // must read the nested key, otherwise SyntheticRun.batchId is always + // undefined for buffered runs. + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ + taskIdentifier: "t", + batch: { id: "batch_internal_xyz", index: 3 }, + }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result!.batchId).toBe("batch_internal_xyz"); + }); + + it("does NOT read a flat `batchId` key — only the nested batch.id", async () => { + // Belt-and-braces: a payload with the wrong-shaped flat key should + // resolve to undefined, not silently pick up the bogus value. + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ + taskIdentifier: "t", + batchId: "should-be-ignored", + }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result!.batchId).toBeUndefined(); + }); + + it("converts internal parent/root IDs in the snapshot to friendlyIds", async () => { + // Regression for Devin's structural-unfillable finding: the snapshot + // only carries INTERNAL parent/root ids (engine.trigger consumes the + // internal shape), while SyntheticRun exposes friendlyIds. 
Convert + // here so consumers don't have to special-case the buffered path. + // The conversion is deterministic via RunId.toFriendlyId — we drive + // it through `RunId.generate()` to get a matching internal+friendly + // pair and assert the round-trip. + const parent = RunId.generate(); + const root = RunId.generate(); + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ + taskIdentifier: "t", + parentTaskRunId: parent.id, + rootTaskRunId: root.id, + }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result!.parentTaskRunFriendlyId).toBe(parent.friendlyId); + expect(result!.rootTaskRunFriendlyId).toBe(root.friendlyId); + }); + + it("leaves parent/root friendlyIds undefined when the snapshot carries no parent context", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ taskIdentifier: "t" }), + status: "QUEUED", + attempts: 0, + createdAt: NOW, + }; + const result = await findRunByIdWithMollifierFallback( + { runId: "run_1", environmentId: "env_a", organizationId: "org_1" }, + { getBuffer: () => fakeBuffer(entry) }, + ); + expect(result!.parentTaskRunFriendlyId).toBeUndefined(); + expect(result!.rootTaskRunFriendlyId).toBeUndefined(); + }); +}); diff --git a/apps/webapp/test/mollifierReplayPayloadShape.test.ts b/apps/webapp/test/mollifierReplayPayloadShape.test.ts new file mode 100644 index 00000000000..d2f098d7086 --- /dev/null +++ b/apps/webapp/test/mollifierReplayPayloadShape.test.ts @@ -0,0 +1,99 @@ +import { describe, expect, it } from "vitest"; +import { + serialiseMollifierSnapshot, + deserialiseMollifierSnapshot, +} from "~/v3/mollifier/mollifierSnapshot.server"; +import { prettyPrintPacket } from "@trigger.dev/core/v3"; + +// 
Regression test for the Devin "Buffered replay loader passes +// non-string payload to prettyPrintPacket" finding on PR #3757. +// +// Devin's claim is that the snapshot codec double-unwraps the +// payload: `engine.trigger` carries it pre-serialised, then the +// snapshot serialise/deserialise round-trip would JSON.parse it a +// second time, leaving `buffered.payload` as a *parsed* object — +// which `prettyPrintPacket` then mis-handles, producing malformed +// payload display in the Replay dialog. +// +// This test pins the actual contract: the snapshot codec is a single +// JSON.stringify / JSON.parse layer. The payload field stored on the +// engine trigger input is a string (the SDK-serialised payload from +// `payloadPacket.data`). A string round-trips through +// JSON.stringify/JSON.parse unchanged — it does NOT get a second +// unwrap. Therefore `buffered.payload` reaches the replay loader as +// a string, exactly the shape `prettyPrintPacket` expects. +describe("mollifier replay payload shape", () => { + it("serialise/deserialise preserves the payload as a string", () => { + // Shape mirrors what `triggerTask.server.ts:#buildEngineTriggerInput` + // produces — `payload` is `args.payloadPacket.data`, already a JSON + // string from the SDK's packet serialisation. 
+ const triggerInput = { + friendlyId: "run_x", + taskIdentifier: "hello-world", + payload: JSON.stringify({ hello: "world", n: 42 }), + payloadType: "application/json", + traceId: "trace_x", + spanId: "span_x", + }; + + const serialised = serialiseMollifierSnapshot(triggerInput); + const roundTripped = deserialiseMollifierSnapshot(serialised); + + expect(typeof roundTripped.payload).toBe("string"); + expect(roundTripped.payload).toBe(triggerInput.payload); + expect(roundTripped.payloadType).toBe("application/json"); + }); + + it("prettyPrintPacket on the round-tripped payload produces the expected pretty JSON", async () => { + const original = { hello: "world", nested: { count: 3 } }; + const triggerInput = { + payload: JSON.stringify(original), + payloadType: "application/json", + }; + + const roundTripped = deserialiseMollifierSnapshot( + serialiseMollifierSnapshot(triggerInput), + ); + + // This is exactly the call the replay loader makes: + // prettyPrintPacket(run.payload, run.payloadType) + // If Devin were right, the payload here would be a parsed object + // and prettyPrintPacket would either double-encode or skip + // formatting. In reality it's a string, so we get correct pretty + // JSON. + const pretty = await prettyPrintPacket( + roundTripped.payload, + roundTripped.payloadType as string, + ); + + expect(pretty).toBe(JSON.stringify(original, null, 2)); + }); + + it("string payload survives the buffer-codec round-trip even with snapshot fields around it", () => { + // Replicate the realistic snapshot shape (the engine.trigger input + // has many sibling fields). Confirms there's no field-shape + // interaction that would mutate payload. + const triggerInput = { + friendlyId: "run_x", + environment: { + id: "env", + type: "DEVELOPMENT", + project: { id: "p" }, + organization: { id: "o" }, + }, + taskIdentifier: "t", + payload: '{"a":1}', + payloadType: "application/json", + context: { run: { id: "x" } }, + traceContext: { traceparent: "00-...-..." 
}, + traceId: "abc", + spanId: "def", + tags: ["one", "two"], + depth: 2, + isTest: false, + }; + const out = deserialiseMollifierSnapshot(serialiseMollifierSnapshot(triggerInput)); + expect(typeof out.payload).toBe("string"); + expect(out.payload).toBe('{"a":1}'); + }); +}); diff --git a/apps/webapp/test/mollifierResetIdempotencyKey.test.ts b/apps/webapp/test/mollifierResetIdempotencyKey.test.ts new file mode 100644 index 00000000000..4909087d70c --- /dev/null +++ b/apps/webapp/test/mollifierResetIdempotencyKey.test.ts @@ -0,0 +1,158 @@ +import { describe, expect, it, vi } from "vitest"; + +// Mock the db module so the BaseService default prisma doesn't try to +// open a real connection at module load. Each test wires its own +// prisma stub. +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); +// Prevent the runEngine singleton from instantiating and spinning up +// PG/Redis workers at module load — without this CI fails with +// unhandled `PrismaClientInitializationError`s even though the +// assertions all pass (see `mollifierDrainerWorker.test.ts`). +vi.mock("~/v3/runEngine.server", () => ({ engine: {} })); + +// Hoisted mock state so we can swap the buffer per test without +// re-importing modules. 
+const bufferMock: { current: unknown } = { current: null }; +vi.mock("~/v3/mollifier/mollifierBuffer.server", () => ({ + getMollifierBuffer: () => bufferMock.current, +})); + +import { ResetIdempotencyKeyService } from "~/v3/services/resetIdempotencyKey.server"; +import { ServiceValidationError } from "~/v3/services/baseService.server"; + +type FakePrisma = { + taskRun: { updateMany: (...args: unknown[]) => Promise<{ count: number }> }; +}; + +function makePrisma(pgCount: number): FakePrisma { + return { + taskRun: { + updateMany: vi.fn(async () => ({ count: pgCount })), + }, + }; +} + +const env = { + id: "env_a", + organizationId: "org_1", +} as unknown as Parameters<ResetIdempotencyKeyService["call"]>[2]; + +describe("ResetIdempotencyKeyService — buffer-outage handling", () => { + it("returns success when PG cleared >=1 run, even if the buffer reset throws", async () => { + bufferMock.current = { + resetIdempotency: vi.fn(async () => { + throw new Error("ECONNREFUSED"); + }), + }; + const prisma = makePrisma(1); + const service = new ResetIdempotencyKeyService(prisma as never); + + const result = await service.call("ikey", "task", env); + expect(result).toEqual({ id: "ikey" }); + }); + + it("returns success when PG cleared nothing but the buffer cleared a run", async () => { + bufferMock.current = { + resetIdempotency: vi.fn(async () => ({ clearedRunId: "run_x" })), + }; + const prisma = makePrisma(0); + const service = new ResetIdempotencyKeyService(prisma as never); + + const result = await service.call("ikey", "task", env); + expect(result).toEqual({ id: "ikey" }); + }); + + it("404s when PG and buffer both legitimately report 'nothing to clear'", async () => { + bufferMock.current = { + resetIdempotency: vi.fn(async () => ({ clearedRunId: null })), + }; + const prisma = makePrisma(0); + const service = new ResetIdempotencyKeyService(prisma as never); + + await expect(service.call("ikey", "task", env)).rejects.toMatchObject({ + status: 404, + }); + }); + + // Regression for the
silent-not-found hazard CodeRabbit flagged: if PG + // sees nothing AND we can't read the buffer (Redis outage), the + // previous behaviour was to 404 — masking a partial outage and + // leaving a buffered key effectively un-reset while the caller was + // told "doesn't exist." We now surface 503 so the caller retries. + it("503s when PG cleared nothing AND the buffer reset failed (partial outage)", async () => { + bufferMock.current = { + resetIdempotency: vi.fn(async () => { + throw new Error("ECONNREFUSED"); + }), + }; + const prisma = makePrisma(0); + const service = new ResetIdempotencyKeyService(prisma as never); + + const error = await service.call("ikey", "task", env).then( + () => null, + (err) => err, + ); + expect(error).toBeInstanceOf(ServiceValidationError); + expect(error.status).toBe(503); + expect(error.message).toMatch(/retry/i); + }); + + it("404s normally when buffer is null (mollifier disabled) and PG cleared nothing", async () => { + bufferMock.current = null; + const prisma = makePrisma(0); + const service = new ResetIdempotencyKeyService(prisma as never); + + await expect(service.call("ikey", "task", env)).rejects.toMatchObject({ + status: 404, + }); + }); + + // Regression for the PG↔buffer handoff race CodeRabbit flagged on PR #3756. + // + // Sequence the test models (deterministic, by setup): + // 1. ResetIdempotencyKeyService.call begins while the run is still + // buffered. The initial pg.updateMany sees no PG row → count=0. + // 2. Between that update and the buffer reset, the drainer materialises + // the buffered run into PG (engine.trigger writes the row with the + // original idempotencyKey intact) AND `buffer.ack` clears the + // associated Redis idempotency lookup — that's part of ack's + // atomic contract (see `buffer.ts:493` comment). + // 3. buffer.resetIdempotency runs after ack → returns + // `{ clearedRunId: null }` because the lookup is gone. 
+ // + // Without the handoff re-check, totalCount = 0 + 0 = 0 → the service + // throws 404 for a key that genuinely still exists on the now- + // materialised PG row. The customer's reset is silently lost. + // + // Correct behaviour: the service must discover the materialised row + // and clear its key, returning success. This test pins that contract. + it("succeeds when a buffered run materialises into PG between the initial pgUpdate and the buffer reset (handoff race)", async () => { + let updateManyCalls = 0; + const prisma: FakePrisma = { + taskRun: { + // First call: pre-materialisation, no PG row yet → 0. + // Second call (the fix's re-check after both surfaces report + // nothing): post-materialisation, drainer wrote the row → 1. + updateMany: vi.fn(async () => { + updateManyCalls += 1; + return updateManyCalls === 1 ? { count: 0 } : { count: 1 }; + }), + }, + }; + const resetIdempotency = vi.fn(async () => ({ clearedRunId: null as string | null })); + bufferMock.current = { resetIdempotency }; + + const service = new ResetIdempotencyKeyService(prisma as never); + + const result = await service.call("ikey", "task", env); + expect(result).toEqual({ id: "ikey" }); + + // Load-bearing pieces of the fix: + // - The buffer path was consulted (we didn't bypass the normal + // handoff window check), and + // - A second pg.updateMany fired AFTER the buffer's null result, + // catching the now-materialised row. + expect(resetIdempotency).toHaveBeenCalledOnce(); + expect(updateManyCalls).toBe(2); + }); +}); diff --git a/apps/webapp/test/mollifierResolveRunForMutation.test.ts b/apps/webapp/test/mollifierResolveRunForMutation.test.ts new file mode 100644 index 00000000000..b50d8ad9400 --- /dev/null +++ b/apps/webapp/test/mollifierResolveRunForMutation.test.ts @@ -0,0 +1,229 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ + // Both default clients return null. 
Individual tests inject their + // own fakes via `deps` when they want non-default behaviour. + prisma: { taskRun: { findFirst: vi.fn(async () => null) } }, + $replica: { taskRun: { findFirst: vi.fn(async () => null) } }, +})); + +import { resolveRunForMutation } from "~/v3/mollifier/resolveRunForMutation.server"; +import type { BufferEntry, MollifierBuffer } from "@trigger.dev/redis-worker"; + +// Regression coverage for the cancel-route 404 bug (commit b490afe23). +// Before the fix the route had `findResource: async () => null`, which +// caused the route builder to 404 every cancel — including for valid +// PG-row runs — BEFORE the action handler could run. The helper +// resolveRunForMutation has to return a non-null discriminated value +// whenever the run exists in either store. + +const NOW = new Date("2026-05-21T10:00:00Z"); + +function fakeReplica(row: { friendlyId: string } | null) { + return { taskRun: { findFirst: vi.fn(async () => row) } }; +} +function fakeWriter(row: { friendlyId: string } | null) { + return { taskRun: { findFirst: vi.fn(async () => row) } }; +} + +function fakeBuffer(entry: BufferEntry | null): MollifierBuffer { + return { + getEntry: vi.fn(async () => entry), + } as unknown as MollifierBuffer; +} + +const baseInput = { + runParam: "run_1", + environmentId: "env_a", + organizationId: "org_1", +}; + +describe("resolveRunForMutation", () => { + it("returns { source: 'pg' } when the PG row exists", async () => { + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica({ friendlyId: "run_1" }), + getBuffer: () => null, + }, + }); + expect(result).toEqual({ source: "pg", friendlyId: "run_1" }); + }); + + it("returns { source: 'buffer' } when PG misses and the buffer entry matches env+org", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_1", + payload: "{}", + status: "QUEUED", + attempts: 0, + createdAt: NOW, + createdAtMicros: 1747044000000000, 
+ materialised: false, + idempotencyLookupKey: "", + metadataVersion: 0, + }; + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica(null), + getBuffer: () => fakeBuffer(entry), + }, + }); + expect(result).toEqual({ source: "buffer", friendlyId: "run_1" }); + }); + + it("returns null when PG misses and the buffer entry env doesn't match", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_OTHER", + orgId: "org_1", + payload: "{}", + status: "QUEUED", + attempts: 0, + createdAt: NOW, + createdAtMicros: 1747044000000000, + materialised: false, + idempotencyLookupKey: "", + metadataVersion: 0, + }; + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica(null), + getBuffer: () => fakeBuffer(entry), + }, + }); + expect(result).toBeNull(); + }); + + it("returns null when PG misses and the buffer entry org doesn't match", async () => { + const entry: BufferEntry = { + runId: "run_1", + envId: "env_a", + orgId: "org_OTHER", + payload: "{}", + status: "QUEUED", + attempts: 0, + createdAt: NOW, + createdAtMicros: 1747044000000000, + materialised: false, + idempotencyLookupKey: "", + metadataVersion: 0, + }; + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica(null), + getBuffer: () => fakeBuffer(entry), + }, + }); + expect(result).toBeNull(); + }); + + it("returns null when both PG and buffer miss", async () => { + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica(null), + getBuffer: () => fakeBuffer(null), + }, + }); + expect(result).toBeNull(); + }); + + it("returns null when buffer is unavailable (mollifier disabled) and PG misses", async () => { + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica(null), + getBuffer: () => null, + }, + }); + expect(result).toBeNull(); + }); + + it("PG-hit short-circuits 
before consulting the buffer", async () => { + const buffer = fakeBuffer(null); + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica({ friendlyId: "run_1" }), + getBuffer: () => buffer, + }, + }); + expect(result?.source).toBe("pg"); + expect(buffer.getEntry).not.toHaveBeenCalled(); + }); + + // Regressions for the degraded-mode false-404 CodeRabbit flagged. + // + // Pre-PR the mutation routes read from the writer directly, so any + // PG row was visible regardless of replication lag. This helper + // moved the read to the replica for offload purposes. The route + // builder treats a null return as a hard 404 BEFORE the action + // handler runs, so any path where replica misses and the writer has + // the row needs to be reachable here — otherwise mutateWithFallback's + // own writer recovery never gets a chance to fire. + it("falls back to the writer when both replica and buffer miss, returning the writer row as 'pg' source", async () => { + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica(null), + prismaWriter: fakeWriter({ friendlyId: "run_1" }), + getBuffer: () => fakeBuffer(null), + }, + }); + expect(result?.source).toBe("pg"); + expect(result?.friendlyId).toBe("run_1"); + }); + + it("falls back to the writer when the buffer is unavailable (mollifier disabled) and replica misses", async () => { + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica(null), + prismaWriter: fakeWriter({ friendlyId: "run_1" }), + getBuffer: () => null, + }, + }); + expect(result?.source).toBe("pg"); + expect(result?.friendlyId).toBe("run_1"); + }); + + it("still returns null when replica, buffer, AND writer all miss (legitimate not-found)", async () => { + const writer = fakeWriter(null); + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica(null), + prismaWriter: writer, + getBuffer: () 
=> fakeBuffer(null), + }, + }); + expect(result).toBeNull(); + // Writer probe ran — the fallback fires exactly once on the miss + // path; doesn't pile retries. + expect(writer.taskRun.findFirst).toHaveBeenCalledOnce(); + }); + + it("PG-hit short-circuits before consulting either the buffer OR the writer", async () => { + const buffer = fakeBuffer(null); + const writer = fakeWriter({ friendlyId: "should-not-be-read" }); + const result = await resolveRunForMutation({ + ...baseInput, + deps: { + prismaReplica: fakeReplica({ friendlyId: "run_1" }), + prismaWriter: writer, + getBuffer: () => buffer, + }, + }); + expect(result?.source).toBe("pg"); + expect(result?.friendlyId).toBe("run_1"); + expect(buffer.getEntry).not.toHaveBeenCalled(); + // Writer must NOT fire when the replica already had the row — + // otherwise we'd negate the whole replica-offload purpose. + expect(writer.taskRun.findFirst).not.toHaveBeenCalled(); + }); +}); diff --git a/apps/webapp/test/mollifierStaleSweep.test.ts b/apps/webapp/test/mollifierStaleSweep.test.ts new file mode 100644 index 00000000000..94928611119 --- /dev/null +++ b/apps/webapp/test/mollifierStaleSweep.test.ts @@ -0,0 +1,976 @@ +import { describe, expect, it, vi } from "vitest"; +import { redisTest } from "@internal/testcontainers"; +import { MollifierBuffer } from "@trigger.dev/redis-worker"; + +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +import { + runStaleSweepOnce, + startStaleSweepInterval, +} from "~/v3/mollifier/mollifierStaleSweep.server"; +import { MollifierStaleSweepState } from "~/v3/mollifier/mollifierStaleSweepState.server"; + +const SNAPSHOT = { + taskIdentifier: "hello-world", + payload: '{"x":1}', + payloadType: "application/json", + traceContext: {}, +}; + +// In-memory fake state for unit tests that don't have a Redis container. +// The testcontainer tests use a real MollifierStaleSweepState against +// the test Redis instead. 
+function makeFakeState() { + let cursor = 0; + let orgList: string[] = []; + const counts = new Map<string, number>(); + let visited = new Set<string>(); + return { + readCursor: async () => cursor, + writeCursor: async (v: number) => { + cursor = v; + }, + rebuildOrgList: async (orgs: string[]) => { + orgList = [...orgs]; + }, + readOrgListSlice: async (start: number, count: number) => ({ + orgs: orgList.slice(start, start + count), + total: orgList.length, + }), + setEnvStaleCount: async (envId: string, count: number) => { + if (count > 0) counts.set(envId, count); + else counts.delete(envId); + }, + readAllEnvStaleCounts: async () => new Map(counts), + markEnvVisited: async (envId: string) => { + visited.add(envId); + }, + reconcileVisited: async () => { + for (const envId of [...counts.keys()]) { + if (!visited.has(envId)) counts.delete(envId); + } + visited = new Set(); + }, + clearAll: async () => { + cursor = 0; + orgList = []; + counts.clear(); + visited = new Set(); + }, + close: async () => {}, + }; +} + +function spyDeps() { + // Counter ticks — metric carries no `envId` label (high-cardinality) + // so the spy is a simple call count. Per-env detail lives on the + // structured warn log and the snapshot map. + let staleEntryCount = 0; + const snapshots: Array<Map<string, number>> = []; + const warnings: Array<{ message: string; fields: Record<string, unknown> }> = []; + return { + get staleEntryCount() { + return staleEntryCount; + }, + snapshots, + warnings, + deps: { + recordStaleEntry: () => { + staleEntryCount += 1; + }, + reportStaleEntrySnapshot: (snapshot: Map<string, number>) => { + // Clone so post-sweep assertions see what was reported *at that + // call site*, not whatever subsequent passes mutate the source + // map into.
+ snapshots.push(new Map(snapshot)); + }, + logger: { + warn: (message: string, fields: Record<string, unknown>) => { + warnings.push({ message, fields }); + }, + }, + }, + }; +} + +describe("runStaleSweepOnce — unit", () => { + it("returns zeros when the buffer is null", async () => { + // Mirrors the prod gate: if TRIGGER_MOLLIFIER_ENABLED=0 the buffer + // singleton is null and the sweep is a no-op. We don't want it to + // emit a metric (or throw) just because mollifier is disabled. + const spies = spyDeps(); + const result = await runStaleSweepOnce( + { staleThresholdMs: 1000 }, + { ...spies.deps, getBuffer: () => null, state: makeFakeState() }, + ); + expect(result).toEqual({ + orgsScanned: 0, + envsScanned: 0, + entriesScanned: 0, + staleCount: 0, + }); + expect(spies.staleEntryCount).toBe(0); + expect(spies.warnings).toEqual([]); + const snapshots = spies.snapshots; + // An empty snapshot is still reported so any previously-paging env + // (from a prior sweep before mollifier was disabled) clears. + expect(snapshots).toHaveLength(1); + expect(snapshots[0].size).toBe(0); + }); + + it("surfaces readOrgListSlice failures and leaves durable state untouched", async () => { + // Regression: previously a Redis read failure inside + // `readOrgListSlice` returned `{ orgs: [], total: 0 }` and the + // sweep treated that as a clean empty cycle — writing cursor=0, + // reconciling visited envs against the empty result, and CLEARING + // the stale-entry gauge. That silenced the very alerts the sweep + // exists to raise. The fix re-throws; the caller (this function + // and the interval wrapper above it) must NOT mutate cursor or + // counts when readOrgListSlice fails. + const state = makeFakeState(); + // Seed durable state so we can assert it isn't touched on failure. + await state.writeCursor(42); + await state.setEnvStaleCount("env_seed", 7); + await state.rebuildOrgList(["org_pre"]); + // Inject a failure on the very next slice read.
+ const readErr = new Error("Redis read failed"); + let readAttempts = 0; + const failingState = { + ...state, + readOrgListSlice: async (start: number, count: number) => { + readAttempts += 1; + throw readErr; + }, + }; + const spies = spyDeps(); + const buffer = { + listOrgs: async () => ["org_pre"], + listEnvsForOrg: async () => [], + listEntriesForEnv: async () => [], + } as unknown as MollifierBuffer; + + await expect( + runStaleSweepOnce( + { staleThresholdMs: 60_000, maxOrgsPerPass: 10 }, + { + ...spies.deps, + state: failingState, + getBuffer: () => buffer, + now: () => Date.now(), + }, + ), + ).rejects.toThrow("Redis read failed"); + + expect(readAttempts).toBe(1); + // Cursor untouched (still the seeded 42, not reset to 0). + expect(await state.readCursor()).toBe(42); + // Counts hash untouched — the seeded env's count survives the + // failed cycle so the gauge keeps reporting its last-known value. + const counts = await state.readAllEnvStaleCounts(); + expect(counts.get("env_seed")).toBe(7); + // No snapshot was reported because the function threw before + // reaching reportStaleEntrySnapshot. + expect(spies.snapshots).toHaveLength(0); + expect(spies.staleEntryCount).toBe(0); + }); +}); + +describe("runStaleSweepOnce — testcontainers", () => { + redisTest( + "flags every entry whose dwell exceeds the stale threshold", + { timeout: 20_000 }, + async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + // Three entries across two envs in the same org. The sweep below + // runs against a `now` advanced by 5 minutes, so all three have + // dwell ~5min and ALL THREE are stale against a 1-minute + // threshold — there is no "fresh" entry in this scenario. The + // assertions below pin the all-three-stale shape. 
+ await buffer.accept({ + runId: "run_stale_a", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + await buffer.accept({ + runId: "run_stale_b", + envId: "env_b", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + await buffer.accept({ + runId: "run_stale_c", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + // Yank the system clock forward 5 minutes for the sweep — way + // past the threshold below. The `now` deps seam lets us drive + // the threshold without actually waiting in real time. + const futureNow = Date.now() + 5 * 60 * 1000; + + const spies = spyDeps(); + const state = new MollifierStaleSweepState({ redisOptions }); + try { + const result = await runStaleSweepOnce( + { staleThresholdMs: 60 * 1000 }, + { + ...spies.deps, + getBuffer: () => buffer, + state, + now: () => futureNow, + }, + ); + + expect(result.envsScanned).toBe(2); + expect(result.entriesScanned).toBe(3); + expect(result.staleCount).toBe(3); + // All three entries exceed the threshold; each emits one + // counter tick + one warning. + expect(spies.staleEntryCount).toBe(3); + expect(spies.warnings).toHaveLength(3); + for (const w of spies.warnings) { + expect(w.message).toBe("mollifier.stale_entry"); + expect(w.fields.staleThresholdMs).toBe(60 * 1000); + expect(w.fields.dwellMs).toBeGreaterThan(60 * 1000); + } + // Snapshot drives the alertable gauge — env_a has 2 stale + // entries, env_b has 1. Per-env detail is still passed to + // `reportStaleEntrySnapshot` for forensic value even though the + // gauge itself aggregates the total. 
+ expect(spies.snapshots).toHaveLength(1); + expect(Object.fromEntries(spies.snapshots[0])).toEqual({ + env_a: 2, + env_b: 1, + }); + } finally { + await state.close(); + } + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "snapshot omits envs that have entries but none stale (durable hash HDEL's zeros)", + { timeout: 20_000 }, + async ({ redisOptions }) => { + // Critical for alert behaviour: a previous sweep flagged env_a + // stale, alert fired, drainer caught up. The next sweep must + // remove env_a from the durable counts hash so the gauge drops + // below the alert threshold instead of staying latched at the + // last stale value. With the sharded design the snapshot is + // sourced from the HASH directly — visiting an env with zero + // stale entries HDEL's it, so it's simply absent from the + // snapshot (telemetry sums values, so absence is equivalent to + // zero for the gauge). + const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await buffer.accept({ + runId: "run_just_arrived", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + const spies = spyDeps(); + await runStaleSweepOnce( + { staleThresholdMs: 60 * 1000 }, + { ...spies.deps, getBuffer: () => buffer, state }, + ); + expect(spies.snapshots).toHaveLength(1); + // env_a has entries but none stale → not in the snapshot. + expect(spies.snapshots[0].has("env_a")).toBe(false); + } finally { + await state.close(); + await buffer.close(); + } + }, + ); + + redisTest( + "leaves fresh entries alone (dwell below threshold)", + { timeout: 20_000 }, + async ({ redisOptions }) => { + // Regression guard for the inequality direction. A bug that inverted + // `dwellMs > threshold` to `dwellMs < threshold` would flag every + // just-accepted entry (dwell ~ms vs the 60s threshold) — the dashboard + // would page on every burst. NOTE(review): a `>=` flip is not caught here. 
+ const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await buffer.accept({ + runId: "run_fresh_only", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + const spies = spyDeps(); + const result = await runStaleSweepOnce( + { staleThresholdMs: 60 * 1000 }, + { ...spies.deps, getBuffer: () => buffer, state }, + ); + expect(result.staleCount).toBe(0); + expect(spies.staleEntryCount).toBe(0); + expect(spies.warnings).toEqual([]); + } finally { + await state.close(); + await buffer.close(); + } + }, + ); + + redisTest( + "shards work across ticks: cursor advances by maxOrgsPerPass and wraps after a full cycle", + { timeout: 30_000 }, + async ({ redisOptions }) => { + // Without sharding the sweep walks every org/env every tick — at + // any meaningful backlog that runs longer than the tick interval + // and the next tick gets dropped by the inFlight guard. Sharding + // splits the work: each tick visits at most `maxOrgsPerPass` orgs, + // advances a durable cursor, and resumes from there next tick. + // Over `ceil(N / cap)` ticks the cycle covers every org. + const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); + try { + for (let i = 0; i < 5; i++) { + await buffer.accept({ + runId: `run_shard_${i}`, + envId: `env_shard_${i}`, + orgId: `org_shard_${i}`, + payload: JSON.stringify(SNAPSHOT), + }); + } + const futureNow = Date.now() + 5 * 60 * 1000; + const spies = spyDeps(); + const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 2 }; + const baseDeps = { + ...spies.deps, + getBuffer: () => buffer, + state, + now: () => futureNow, + }; + + // Tick 1: cursor starts at 0, scans 2 orgs. + const r1 = await runStaleSweepOnce(cfg, baseDeps); + expect(r1.orgsScanned).toBe(2); + expect(spies.snapshots[0].size).toBe(2); + + // Tick 2: cursor was 2, scans 2 more orgs. 
+ const r2 = await runStaleSweepOnce(cfg, baseDeps); + expect(r2.orgsScanned).toBe(2); + // Snapshot is the durable HASH — accumulates across ticks. + expect(spies.snapshots[1].size).toBe(4); + + // Tick 3: cursor was 4, scans the last 1 org and wraps to 0. + const r3 = await runStaleSweepOnce(cfg, baseDeps); + expect(r3.orgsScanned).toBe(1); + expect(spies.snapshots[2].size).toBe(5); + + // Tick 4: cycle complete, cursor is back at 0 — starts over. + const r4 = await runStaleSweepOnce(cfg, baseDeps); + expect(r4.orgsScanned).toBe(2); + } finally { + await state.close(); + await buffer.close(); + } + }, + ); + + redisTest( + "clears an env from the durable snapshot on revisit when it has entries but none currently stale", + { timeout: 30_000 }, + async ({ redisOptions }) => { + // Stale state in the durable hash must be HDEL'd, not just left + // stale, when a previously-flagged env no longer has any entries + // whose dwell exceeds the threshold (drainer caught up, alert + // condition cleared). The same `entry` flips from stale to + // not-stale between two sweep ticks by varying the sweep's `now` + // — tick 1 uses a future clock so the entry is flagged stale; + // tick 2 uses real time so the same entry has near-zero dwell and + // is no longer stale. The env stays in the active set throughout + // (queue still has an entry), so the cursor revisits it and the + // hash field is cleared. + const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await buffer.accept({ + runId: "run_drain", + envId: "env_drain", + orgId: "org_drain", + payload: JSON.stringify(SNAPSHOT), + }); + const futureNow = Date.now() + 5 * 60 * 1000; + const spies = spyDeps(); + const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 10 }; + + // Tick 1 with future clock: entry's dwell is 5min vs 1min + // threshold → flagged stale. 
+ await runStaleSweepOnce(cfg, { + ...spies.deps, + getBuffer: () => buffer, + state, + now: () => futureNow, + }); + expect(spies.snapshots[0].get("env_drain")).toBe(1); + + // Tick 2 with real time: same entry, but its dwell is now ~ms + // vs the same 1min threshold → not stale. The env is revisited + // (cursor wrapped to 0 after tick 1, only 1 org in the list), + // setEnvStaleCount called with 0 → HDEL. + await runStaleSweepOnce(cfg, { + ...spies.deps, + getBuffer: () => buffer, + state, + }); + expect(spies.snapshots[1].has("env_drain")).toBe(false); + } finally { + await state.close(); + await buffer.close(); + } + }, + ); + + redisTest( + "evicts fully-drained envs from the counts hash at cycle wrap (no permanent alert)", + { timeout: 30_000 }, + async ({ redisOptions }) => { + // Devin's BUG report on PR #3754: an env that drains completely + // between sweep ticks disappears from `mollifier:org-envs:${orgId}` + // entirely, so the inner loop at runStaleSweepOnce never visits it + // and `setEnvStaleCount(envId, 0)` (which HDELs the field) is + // never called. The counts hash retains the env's last-known + // stale count forever, the gauge stays elevated, and the + // recommended alert `> 0 for 5m` fires indefinitely. + // + // Fix: at cycle wrap (cursor returned to 0) HDEL any env in the + // counts hash that wasn't visited during the just-completed cycle. + // Verified here by: + // 1. Flagging env_will_drain stale, confirming it's in the hash + // 2. Draining its only entry — now invisible to listEnvsForOrg + // 3. Running a sweep tick that triggers cycle wrap + // 4. 
Asserting the env is no longer in the snapshot + const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await buffer.accept({ + runId: "run_will_drain", + envId: "env_will_drain", + orgId: "org_will_drain", + payload: JSON.stringify(SNAPSHOT), + }); + const futureNow = Date.now() + 5 * 60 * 1000; + const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 10 }; + const spies = spyDeps(); + + // Tick 1: env_will_drain is flagged stale → enters counts hash. + // Cursor wraps to 0 (only 1 org in the list). + await runStaleSweepOnce(cfg, { + ...spies.deps, + getBuffer: () => buffer, + state, + now: () => futureNow, + }); + expect(spies.snapshots[0].get("env_will_drain")).toBe(1); + + // Drain the only entry. mollifier:queue:env_will_drain is now + // empty, and the buffer's atomic Lua removes env_will_drain + // from `mollifier:org-envs:org_will_drain` (and removes the org + // from `mollifier:orgs` since it has no other envs). The env is + // now invisible to listEnvsForOrg. + const popped = await buffer.pop("env_will_drain"); + expect(popped?.runId).toBe("run_will_drain"); + + // Tick 2: cursor was 0 after tick 1's wrap, so this rebuilds + // the org list (now empty) and immediately wraps again. The + // wrap-handler must HDEL env_will_drain from the counts hash + // because it wasn't in the visited set for this cycle. + await runStaleSweepOnce(cfg, { + ...spies.deps, + getBuffer: () => buffer, + state, + now: () => futureNow, + }); + expect(spies.snapshots[1].has("env_will_drain")).toBe(false); + // And the durable hash is genuinely empty, not just absent from + // this snapshot. 
+ expect((await state.readAllEnvStaleCounts()).size).toBe(0); + } finally { + await state.close(); + await buffer.close(); + } + }, + ); + + redisTest( + "scans across multiple orgs", + { timeout: 20_000 }, + async ({ redisOptions }) => { + // The drainer pops with org-level fairness, so the sweep must + // walk every org/env to surface stale entries across all of them + // — not just stop at the first env it finds. If a future refactor + // collapsed listOrgs/listEnvsForOrg into a single env-flat list, + // this test catches a regression there. + const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await buffer.accept({ + runId: "run_x", + envId: "env_x", + orgId: "org_x", + payload: JSON.stringify(SNAPSHOT), + }); + await buffer.accept({ + runId: "run_y", + envId: "env_y", + orgId: "org_y", + payload: JSON.stringify(SNAPSHOT), + }); + const futureNow = Date.now() + 5 * 60 * 1000; + const spies = spyDeps(); + const result = await runStaleSweepOnce( + { staleThresholdMs: 60 * 1000 }, + { ...spies.deps, getBuffer: () => buffer, state, now: () => futureNow }, + ); + expect(result.orgsScanned).toBe(2); + expect(result.envsScanned).toBe(2); + expect(result.staleCount).toBe(2); + } finally { + await state.close(); + await buffer.close(); + } + }, + ); + + redisTest( + "state survives process restart: a second state instance picks up the cursor and counts", + { timeout: 30_000 }, + async ({ redisOptions }) => { + // This is the headline reason the sweep state is durable in Redis + // instead of process-local — a webapp restart mid-cycle must not + // re-emit the gauge as fresh-zero for previously-flagged envs nor + // restart the cursor walk from scratch. Simulated here by closing + // state1 (its Redis client quits cleanly) and constructing state2 + // against the same Redis. The cursor + counts that state1 wrote + // are visible to state2 on its first tick. 
+ const buffer = new MollifierBuffer({ redisOptions }); + const state1 = new MollifierStaleSweepState({ redisOptions }); + try { + await buffer.accept({ + runId: "run_a", + envId: "env_a", + orgId: "org_a", + payload: JSON.stringify(SNAPSHOT), + }); + await buffer.accept({ + runId: "run_b", + envId: "env_b", + orgId: "org_b", + payload: JSON.stringify(SNAPSHOT), + }); + const futureNow = Date.now() + 5 * 60 * 1000; + const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 1 }; + const spies1 = spyDeps(); + + // Tick 1 with state1: visits 1 of 2 orgs. + await runStaleSweepOnce(cfg, { + ...spies1.deps, + getBuffer: () => buffer, + state: state1, + now: () => futureNow, + }); + expect(spies1.snapshots[0].size).toBe(1); + } finally { + // Simulate webapp restart: state1's Redis client closes cleanly. + await state1.close(); + } + + // New process boots, constructs a fresh state pointing at the + // same Redis. The cycle's frozen org_list, the cursor, and the + // counts hash are all preserved — state2 picks up at the second + // org of the cycle. + const state2 = new MollifierStaleSweepState({ redisOptions }); + try { + const futureNow = Date.now() + 5 * 60 * 1000; + const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 1 }; + const spies2 = spyDeps(); + + await runStaleSweepOnce(cfg, { + ...spies2.deps, + getBuffer: () => buffer, + state: state2, + now: () => futureNow, + }); + // Snapshot now has BOTH envs: the one tick 1 flagged (still in + // the counts hash from state1) plus the one tick 2 just flagged. + // A non-durable design would show only the second. + expect(spies2.snapshots[0].size).toBe(2); + } finally { + await state2.close(); + await buffer.close(); + } + }, + ); + + redisTest( + "cycle wrap rebuilds the org list, so orgs that joined mid-cycle get visited on the next cycle", + { timeout: 30_000 }, + async ({ redisOptions }) => { + // The docstring promises "orgs joining mid-cycle wait until the + // next cycle to be visited." 
The mechanism is rebuildOrgList at + // cursor=0: a fresh snapshot of buffer.listOrgs() replaces the + // previous frozen LIST. Verified here by adding a third org + // between cycles and asserting it shows up only in the next + // cycle's snapshot. + const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await buffer.accept({ + runId: "run_init_a", + envId: "env_init_a", + orgId: "org_init_a", + payload: JSON.stringify(SNAPSHOT), + }); + await buffer.accept({ + runId: "run_init_b", + envId: "env_init_b", + orgId: "org_init_b", + payload: JSON.stringify(SNAPSHOT), + }); + const futureNow = Date.now() + 5 * 60 * 1000; + const spies = spyDeps(); + const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 10 }; + const baseDeps = { + ...spies.deps, + getBuffer: () => buffer, + state, + now: () => futureNow, + }; + + // Tick 1: cycle 1. Visits both initial orgs; cursor wraps to 0. + await runStaleSweepOnce(cfg, baseDeps); + expect(spies.snapshots[0].size).toBe(2); + + // Mid-flight: a third org joins the buffer. It must NOT have + // been part of cycle 1's frozen LIST. + await buffer.accept({ + runId: "run_mid", + envId: "env_mid", + orgId: "org_mid", + payload: JSON.stringify(SNAPSHOT), + }); + + // Tick 2: cycle 2 begins (cursor was 0 after tick 1's wrap). + // rebuildOrgList captures all 3 orgs; this tick visits all 3. + const r2 = await runStaleSweepOnce(cfg, baseDeps); + expect(r2.orgsScanned).toBe(3); + expect(spies.snapshots[1].size).toBe(3); + expect(spies.snapshots[1].has("env_mid")).toBe(true); + } finally { + await state.close(); + await buffer.close(); + } + }, + ); + + redisTest( + "empty buffer (no orgs) advances cleanly with zero work and an empty snapshot", + { timeout: 30_000 }, + async ({ redisOptions }) => { + // `mollifier:orgs` is empty (no entries ever accepted, or every + // entry has been drained). 
The sweep must handle the boundary: + // rebuildOrgList with [], readOrgListSlice returns total=0, + // the org loop is skipped, and the cursor stays at 0 instead of + // tripping the wrap math. + const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); + try { + const spies = spyDeps(); + const result = await runStaleSweepOnce( + { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 10 }, + { ...spies.deps, getBuffer: () => buffer, state }, + ); + expect(result).toEqual({ + orgsScanned: 0, + envsScanned: 0, + entriesScanned: 0, + staleCount: 0, + }); + expect(spies.snapshots).toHaveLength(1); + expect(spies.snapshots[0].size).toBe(0); + // Cursor stayed at 0 — nothing to advance through. + expect(await state.readCursor()).toBe(0); + } finally { + await state.close(); + await buffer.close(); + } + }, + ); + + redisTest( + "buffer-null branch wipes the durable state so a re-enable starts fresh", + { timeout: 30_000 }, + async ({ redisOptions }) => { + // The unit test above asserts the snapshot is empty when the + // buffer is null, but doesn't verify the durable state was + // actually cleared. Without clearAll the next re-enable would + // resume on a stale cursor + carry over a stale counts hash. + const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await buffer.accept({ + runId: "run_seed", + envId: "env_seed", + orgId: "org_seed", + payload: JSON.stringify(SNAPSHOT), + }); + const futureNow = Date.now() + 5 * 60 * 1000; + const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 10 }; + const spies = spyDeps(); + + // Tick 1: populate state. + await runStaleSweepOnce(cfg, { + ...spies.deps, + getBuffer: () => buffer, + state, + now: () => futureNow, + }); + expect(spies.snapshots[0].size).toBe(1); + expect((await state.readAllEnvStaleCounts()).size).toBe(1); + + // Tick 2: mollifier flips OFF — getBuffer returns null. 
The + // sweep must clear the durable state. + await runStaleSweepOnce(cfg, { + ...spies.deps, + getBuffer: () => null, + state, + }); + expect(spies.snapshots[1].size).toBe(0); + expect((await state.readAllEnvStaleCounts()).size).toBe(0); + expect(await state.readCursor()).toBe(0); + } finally { + await state.close(); + await buffer.close(); + } + }, + ); +}); + +describe("MollifierStaleSweepState — direct unit tests", () => { + redisTest("readCursor returns 0 when the key is absent", { timeout: 20_000 }, async ({ redisOptions }) => { + const state = new MollifierStaleSweepState({ redisOptions }); + try { + expect(await state.readCursor()).toBe(0); + } finally { + await state.close(); + } + }); + + redisTest( + "writeCursor + readCursor round-trip; readCursor parses a non-numeric value as 0", + { timeout: 20_000 }, + async ({ redisOptions }) => { + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await state.writeCursor(42); + expect(await state.readCursor()).toBe(42); + + // Defensive: a corrupted/garbage value must not throw or + // propagate NaN into the sweep's cursor arithmetic. + await state["redis"].set("mollifier:stale_sweep:cursor", "not-a-number"); + expect(await state.readCursor()).toBe(0); + } finally { + await state.close(); + } + }, + ); + + redisTest( + "rebuildOrgList replaces the previous list (DEL + RPUSH, in order)", + { timeout: 20_000 }, + async ({ redisOptions }) => { + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await state.rebuildOrgList(["org_a", "org_b", "org_c"]); + let slice = await state.readOrgListSlice(0, 10); + expect(slice.total).toBe(3); + expect(slice.orgs).toEqual(["org_a", "org_b", "org_c"]); + + // Replacement, not append. + await state.rebuildOrgList(["org_x"]); + slice = await state.readOrgListSlice(0, 10); + expect(slice.total).toBe(1); + expect(slice.orgs).toEqual(["org_x"]); + + // Empty rebuild leaves the list empty (DEL fires, no RPUSH). 
+ await state.rebuildOrgList([]); + slice = await state.readOrgListSlice(0, 10); + expect(slice.total).toBe(0); + expect(slice.orgs).toEqual([]); + } finally { + await state.close(); + } + }, + ); + + redisTest( + "setEnvStaleCount HSETs when count > 0 and HDELs when count === 0", + { timeout: 20_000 }, + async ({ redisOptions }) => { + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await state.setEnvStaleCount("env_a", 3); + await state.setEnvStaleCount("env_b", 1); + let counts = await state.readAllEnvStaleCounts(); + expect(Object.fromEntries(counts)).toEqual({ env_a: 3, env_b: 1 }); + + // Zero clears the field (HDEL), not stores 0. + await state.setEnvStaleCount("env_a", 0); + counts = await state.readAllEnvStaleCounts(); + expect(Object.fromEntries(counts)).toEqual({ env_b: 1 }); + expect(counts.has("env_a")).toBe(false); + } finally { + await state.close(); + } + }, + ); + + redisTest( + "clearAll DELs cursor, org_list, and counts in one call", + { timeout: 20_000 }, + async ({ redisOptions }) => { + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await state.writeCursor(7); + await state.rebuildOrgList(["org_a", "org_b"]); + await state.setEnvStaleCount("env_a", 5); + + await state.clearAll(); + + expect(await state.readCursor()).toBe(0); + expect((await state.readOrgListSlice(0, 10)).total).toBe(0); + expect((await state.readAllEnvStaleCounts()).size).toBe(0); + } finally { + await state.close(); + } + }, + ); +}); + +describe("startStaleSweepInterval — lifecycle", () => { + it("stop() waits for an in-flight tick to finish before closing the state", async () => { + // Devin's BUG report on PR #3754: `stop()` previously called + // `deps.state.close()` immediately after `clearInterval`, but the + // `tick` function only checks `stopped` at entry. 
A tick that was + // already past that check would keep making `state.*` Redis calls + // against a now-closed ioredis client, throw, get caught by tick's + // own try/catch, and log a `mollifier.stale_sweep.failed` warning + // for every graceful shutdown. + // + // The fix tracks the current tick promise so `stop()` can await it + // before closing. This test pins that order by gating one of the + // tick's state calls on a Deferred — until we resolve it, the tick + // can't progress, and `stop()` must hang in the meantime. + let resolveGate: () => void = () => {}; + const gate = new Promise((r) => { + resolveGate = r; + }); + + const callOrder: string[] = []; + let closeCalled = false; + const state = { + readCursor: async () => { + callOrder.push("readCursor:start"); + await gate; + callOrder.push("readCursor:end"); + return 0; + }, + writeCursor: async () => { + callOrder.push("writeCursor"); + }, + rebuildOrgList: async () => { + callOrder.push("rebuildOrgList"); + }, + readOrgListSlice: async () => { + callOrder.push("readOrgListSlice"); + // Return zero orgs so the org loop is a no-op — we only care + // about ordering of state calls vs close, not the work. 
+ return { orgs: [] as string[], total: 0 }; + }, + setEnvStaleCount: async () => { + callOrder.push("setEnvStaleCount"); + }, + readAllEnvStaleCounts: async () => { + callOrder.push("readAllEnvStaleCounts"); + return new Map(); + }, + markEnvVisited: async () => { + callOrder.push("markEnvVisited"); + }, + reconcileVisited: async () => { + callOrder.push("reconcileVisited"); + }, + clearAll: async () => { + callOrder.push("clearAll"); + }, + close: async () => { + callOrder.push("close"); + closeCalled = true; + }, + }; + + const fakeBuffer = { + listOrgs: async () => [], + listEnvsForOrg: async () => [], + listEntriesForEnv: async () => [], + } as any; + + const handle = startStaleSweepInterval( + { + intervalMs: 20, + staleThresholdMs: 60_000, + maxOrgsPerPass: 10, + }, + { + state, + getBuffer: () => fakeBuffer, + recordStaleEntry: () => {}, + reportStaleEntrySnapshot: () => {}, + logger: { warn: () => {} }, + now: () => Date.now(), + }, + ); + + // Wait for the interval to fire one tick. The tick will start, call + // readCursor, and then block on `gate`. + await new Promise((r) => setTimeout(r, 80)); + expect(callOrder).toContain("readCursor:start"); + expect(closeCalled).toBe(false); + + // Call stop() concurrently — its promise MUST NOT resolve while the + // tick is still mid-flight. + let stopResolved = false; + const stopPromise = handle.stop().then(() => { + stopResolved = true; + }); + await new Promise((r) => setTimeout(r, 50)); + expect(stopResolved).toBe(false); + expect(closeCalled).toBe(false); + + // Release the gate. The tick can now finish, and only then should + // stop() resolve and close the state. + resolveGate(); + await stopPromise; + expect(stopResolved).toBe(true); + expect(closeCalled).toBe(true); + + // The tick's readCursor:end MUST appear before the close — otherwise + // we closed the Redis client out from under an in-flight tick. 
+ expect(callOrder.indexOf("readCursor:end")).toBeGreaterThan(-1); + expect(callOrder.indexOf("close")).toBeGreaterThan( + callOrder.indexOf("readCursor:end"), + ); + }); +}); diff --git a/apps/webapp/test/mollifierSynthesiseFoundRun.test.ts b/apps/webapp/test/mollifierSynthesiseFoundRun.test.ts new file mode 100644 index 00000000000..4e2d6a61632 --- /dev/null +++ b/apps/webapp/test/mollifierSynthesiseFoundRun.test.ts @@ -0,0 +1,216 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +import { + synthesiseFoundRunFromBuffer, + type FoundRun, +} from "~/presenters/v3/ApiRetrieveRunPresenter.server"; +import type { SyntheticRun } from "~/v3/mollifier/readFallback.server"; + +const NOW = new Date("2026-05-24T10:00:00Z"); + +function makeSyntheticRun(overrides: Partial = {}): SyntheticRun { + return { + id: "run_internal_1", + friendlyId: "run_friendly_1", + status: "QUEUED", + cancelledAt: undefined, + cancelReason: undefined, + delayUntil: undefined, + taskIdentifier: "hello-world", + createdAt: NOW, + payload: '{"hello":"world"}', + payloadType: "application/json", + metadata: undefined, + metadataType: undefined, + seedMetadata: undefined, + seedMetadataType: undefined, + idempotencyKey: undefined, + idempotencyKeyOptions: undefined, + isTest: false, + depth: 0, + ttl: undefined, + tags: ["alpha", "beta"], + runTags: ["alpha", "beta"], + lockedToVersion: undefined, + resumeParentOnCompletion: false, + parentTaskRunId: undefined, + traceId: "trace_1", + spanId: "span_1", + parentSpanId: undefined, + runtimeEnvironmentId: "env_a", + engine: "V2", + workerQueue: undefined, + queue: undefined, + concurrencyKey: undefined, + machinePreset: undefined, + realtimeStreamsVersion: undefined, + maxAttempts: undefined, + maxDurationInSeconds: undefined, + replayedFromTaskRunFriendlyId: undefined, + annotations: undefined, + traceContext: undefined, + scheduleId: undefined, + batchId: undefined, + 
parentTaskRunFriendlyId: undefined, + rootTaskRunFriendlyId: undefined, + ...overrides, + }; +} + +describe("synthesiseFoundRunFromBuffer", () => { + it("populates internal id and friendlyId so downstream logging keys off the cuid", () => { + const found: FoundRun = synthesiseFoundRunFromBuffer(makeSyntheticRun()); + expect(found.id).toBe("run_internal_1"); + expect(found.friendlyId).toBe("run_friendly_1"); + }); + + it("marks the synth as isBuffered=true so callers like the events route can short-circuit ClickHouse lookups", () => { + // The PG path of `findRun` sets `isBuffered: false`; the buffered + // path goes through `synthesiseFoundRunFromBuffer` and must set + // `isBuffered: true` so consumers (e.g. the events endpoint) can + // skip queries that are guaranteed to return empty for buffered + // runs without rewriting them around surrogate signals like + // `traceId === ""`. + const found: FoundRun = synthesiseFoundRunFromBuffer(makeSyntheticRun()); + expect(found.isBuffered).toBe(true); + }); + + it("forwards scheduleId from the snapshot so resolveSchedule can hydrate the schedule field", () => { + // Regression: scheduleId was previously hardcoded to null, dropping the + // schedule metadata for buffered scheduled runs even though the snapshot + // carries it (readFallback.server.ts extracts snapshot.scheduleId). 
+ const found = synthesiseFoundRunFromBuffer( + makeSyntheticRun({ scheduleId: "schedule_internal_42" }) + ); + expect(found.scheduleId).toBe("schedule_internal_42"); + }); + + it("leaves scheduleId null when the snapshot has no scheduleId (non-scheduled trigger)", () => { + const found = synthesiseFoundRunFromBuffer(makeSyntheticRun()); + expect(found.scheduleId).toBeNull(); + }); + + it("reconstructs batch.friendlyId from snapshot.batchId so batch-scoped JWTs authorise", () => { + // Regression: batch was previously hardcoded to null, so the + // route-authorization callbacks (which read run.batch?.friendlyId) + // skipped pushing the batch resource — a batch-scoped JWT 403'd on + // buffered batched runs. + const found = synthesiseFoundRunFromBuffer( + // BatchId.toFriendlyId encodes the internal id with a "batch_" prefix. + makeSyntheticRun({ batchId: "abcdefghijklmnopqrstuvwx" }) + ); + expect(found.batch).not.toBeNull(); + expect(found.batch!.id).toBe("abcdefghijklmnopqrstuvwx"); + expect(found.batch!.friendlyId).toMatch(/^batch_/); + }); + + it("leaves batch null when the snapshot has no batchId (non-batched run)", () => { + const found = synthesiseFoundRunFromBuffer(makeSyntheticRun()); + expect(found.batch).toBeNull(); + }); + + it("defaults workerQueue to '' so createCommonRunStructure coerces region to undefined", () => { + // Regression: workerQueue previously defaulted to "main", which fed + // through `run.workerQueue || undefined` as the API response's + // `region` — advertising a not-yet-assigned region. 
+ const found = synthesiseFoundRunFromBuffer(makeSyntheticRun({ workerQueue: undefined })); + expect(found.workerQueue).toBe(""); + }); + + it("passes through an explicit workerQueue from the snapshot unchanged", () => { + const found = synthesiseFoundRunFromBuffer( + makeSyntheticRun({ workerQueue: "us-east-1" }) + ); + expect(found.workerQueue).toBe("us-east-1"); + }); + + it("maps buffered FAILED to SYSTEM_FAILURE so the API surfaces the failure", () => { + const found = synthesiseFoundRunFromBuffer( + makeSyntheticRun({ + status: "FAILED", + error: { code: "GATE_REJECTED", message: "buffer rejected the run" }, + }) + ); + expect(found.status).toBe("SYSTEM_FAILURE"); + expect(found.error).toEqual({ + type: "STRING_ERROR", + raw: "GATE_REJECTED: buffer rejected the run", + }); + }); + + it("maps buffered CANCELED to CANCELED with completedAt populated from cancelledAt", () => { + const cancelledAt = new Date("2026-05-24T10:05:00Z"); + const found = synthesiseFoundRunFromBuffer( + makeSyntheticRun({ status: "CANCELED", cancelledAt }) + ); + expect(found.status).toBe("CANCELED"); + expect(found.completedAt).toEqual(cancelledAt); + }); + + it("maps buffered QUEUED to PENDING with no error and no completedAt", () => { + const found = synthesiseFoundRunFromBuffer(makeSyntheticRun({ status: "QUEUED" })); + expect(found.status).toBe("PENDING"); + expect(found.error).toBeNull(); + expect(found.completedAt).toBeNull(); + }); + + it("passes through a string snapshot.metadata unchanged", () => { + const found = synthesiseFoundRunFromBuffer( + makeSyntheticRun({ metadata: '{"customer":"acme"}' }) + ); + expect(found.metadata).toBe('{"customer":"acme"}'); + }); + + it("defensively coerces a non-string snapshot.metadata to a JSON string instead of dropping it silently", () => { + // Production never writes non-string metadata, but if the snapshot + // shape drifts we'd rather see the value (with a warn log) than have + // it disappear. 
+ const found = synthesiseFoundRunFromBuffer( + makeSyntheticRun({ metadata: { customer: "acme" } }) + ); + expect(found.metadata).toBe('{"customer":"acme"}'); + }); + + it("defaults idempotencyKey / idempotencyKeyOptions to null when absent", () => { + const found = synthesiseFoundRunFromBuffer(makeSyntheticRun()); + expect(found.idempotencyKey).toBeNull(); + expect(found.idempotencyKeyOptions).toBeNull(); + }); + + it("zeroes execution-state fields that aren't meaningful for a buffered run", () => { + const found = synthesiseFoundRunFromBuffer(makeSyntheticRun()); + expect(found.startedAt).toBeNull(); + expect(found.attempts).toEqual([]); + expect(found.attemptNumber).toBeNull(); + expect(found.parentTaskRun).toBeNull(); + expect(found.rootTaskRun).toBeNull(); + expect(found.childRuns).toEqual([]); + expect(found.output).toBeNull(); + expect(found.costInCents).toBe(0); + expect(found.baseCostInCents).toBe(0); + expect(found.usageDurationMs).toBe(0); + }); + + it("forwards runTags from the snapshot tags array", () => { + // Use distinct values for `tags` and `runTags` so the assertion + // actually pins the mapping. With the fixture's previous + // `runTags` default matching the same `["alpha", "beta"]` input, + // this test would have passed even if synthesiseFoundRunFromBuffer + // accidentally read `runTags` instead of `tags`. 
+ const found = synthesiseFoundRunFromBuffer( + makeSyntheticRun({ + tags: ["from-tags"], + runTags: ["stale-run-tags"], + }) + ); + expect(found.runTags).toEqual(["from-tags"]); + }); + + it("pins engine to V2 and taskEventStore to taskEvent (only valid values for a buffered run)", () => { + const found = synthesiseFoundRunFromBuffer(makeSyntheticRun()); + expect(found.engine).toBe("V2"); + expect(found.taskEventStore).toBe("taskEvent"); + }); +}); diff --git a/apps/webapp/test/mollifierSyntheticApiResponses.test.ts b/apps/webapp/test/mollifierSyntheticApiResponses.test.ts new file mode 100644 index 00000000000..94ee67c8584 --- /dev/null +++ b/apps/webapp/test/mollifierSyntheticApiResponses.test.ts @@ -0,0 +1,164 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +import { + buildSyntheticSpanDetailBody, + buildSyntheticTraceBody, +} from "~/v3/mollifier/syntheticApiResponses.server"; +import type { SyntheticRun } from "~/v3/mollifier/readFallback.server"; + +const NOW = new Date("2026-05-23T10:00:00Z"); + +function makeSyntheticRun(overrides: Partial<SyntheticRun> = {}): SyntheticRun { + return { + id: "run_internal_1", + friendlyId: "run_friendly_1", + status: "QUEUED", + cancelledAt: undefined, + cancelReason: undefined, + delayUntil: undefined, + taskIdentifier: "hello-world", + createdAt: NOW, + payload: undefined, + payloadType: undefined, + metadata: undefined, + metadataType: undefined, + seedMetadata: undefined, + seedMetadataType: undefined, + idempotencyKey: undefined, + idempotencyKeyOptions: undefined, + isTest: false, + depth: 0, + ttl: undefined, + tags: [], + runTags: [], + lockedToVersion: undefined, + resumeParentOnCompletion: false, + parentTaskRunId: undefined, + traceId: "trace_1", + spanId: "span_1", + parentSpanId: "span_parent", + runtimeEnvironmentId: "env_a", + engine: "V2", + workerQueue: undefined, + queue: "task/hello-world", + concurrencyKey: undefined, + machinePreset: 
"small-1x", + realtimeStreamsVersion: undefined, + maxAttempts: undefined, + maxDurationInSeconds: undefined, + replayedFromTaskRunFriendlyId: undefined, + annotations: undefined, + traceContext: undefined, + scheduleId: undefined, + batchId: undefined, + parentTaskRunFriendlyId: undefined, + rootTaskRunFriendlyId: undefined, + ...overrides, + }; +} + +describe("buildSyntheticSpanDetailBody", () => { + it("populates identity fields from the buffered run", () => { + const body = buildSyntheticSpanDetailBody(makeSyntheticRun()); + expect(body.spanId).toBe("span_1"); + expect(body.parentId).toBe("span_parent"); + expect(body.runId).toBe("run_friendly_1"); + expect(body.message).toBe("hello-world"); + expect(body.level).toBe("TRACE"); + expect(body.startTime).toEqual(NOW); + expect(body.durationMs).toBe(0); + }); + + it("defaults parentId to null when the buffered run has no parentSpanId", () => { + const body = buildSyntheticSpanDetailBody(makeSyntheticRun({ parentSpanId: undefined })); + expect(body.parentId).toBeNull(); + }); + + it("defaults message to '' when the buffered run has no taskIdentifier", () => { + const body = buildSyntheticSpanDetailBody( + makeSyntheticRun({ taskIdentifier: undefined }) + ); + expect(body.message).toBe(""); + }); + + it("renders a QUEUED buffered run as a still-partial, non-error, non-cancelled span", () => { + const body = buildSyntheticSpanDetailBody(makeSyntheticRun({ status: "QUEUED" })); + expect(body.isPartial).toBe(true); + expect(body.isError).toBe(false); + expect(body.isCancelled).toBe(false); + }); + + it("renders a CANCELED buffered run as a non-partial, non-error, cancelled span", () => { + const body = buildSyntheticSpanDetailBody(makeSyntheticRun({ status: "CANCELED" })); + expect(body.isPartial).toBe(false); + expect(body.isError).toBe(false); + expect(body.isCancelled).toBe(true); + }); + + it("renders a FAILED buffered run as a non-partial, errored, non-cancelled span", () => { + // Regression: a FAILED buffered run 
used to slip through as + // `isPartial: true, isError: false`, telling SDK pollers it was still + // executing. + const body = buildSyntheticSpanDetailBody(makeSyntheticRun({ status: "FAILED" })); + expect(body.isPartial).toBe(false); + expect(body.isError).toBe(true); + expect(body.isCancelled).toBe(false); + }); +}); + +describe("buildSyntheticTraceBody", () => { + it("envelopes the synthesised root span under `trace.rootSpan` with the buffered traceId", () => { + const body = buildSyntheticTraceBody(makeSyntheticRun()); + expect(body.trace.traceId).toBe("trace_1"); + expect(body.trace.rootSpan.id).toBe("span_1"); + expect(body.trace.rootSpan.runId).toBe("run_friendly_1"); + expect(body.trace.rootSpan.children).toEqual([]); + expect(body.trace.rootSpan.data.events).toEqual([]); + }); + + it("falls back to empty strings when traceId / spanId are absent from the snapshot", () => { + const body = buildSyntheticTraceBody( + makeSyntheticRun({ traceId: undefined, spanId: undefined }) + ); + expect(body.trace.traceId).toBe(""); + expect(body.trace.rootSpan.id).toBe(""); + }); + + it("passes through queueName and machinePreset from the snapshot", () => { + const body = buildSyntheticTraceBody(makeSyntheticRun()); + expect(body.trace.rootSpan.data.queueName).toBe("task/hello-world"); + expect(body.trace.rootSpan.data.machinePreset).toBe("small-1x"); + }); + + it("defaults taskSlug to undefined when the buffered run has no taskIdentifier", () => { + const body = buildSyntheticTraceBody(makeSyntheticRun({ taskIdentifier: undefined })); + expect(body.trace.rootSpan.data.taskSlug).toBeUndefined(); + expect(body.trace.rootSpan.data.message).toBe(""); + }); + + it("renders a QUEUED buffered run as a partial, non-error, non-cancelled root span", () => { + const body = buildSyntheticTraceBody(makeSyntheticRun({ status: "QUEUED" })); + expect(body.trace.rootSpan.data.isPartial).toBe(true); + expect(body.trace.rootSpan.data.isError).toBe(false); + 
expect(body.trace.rootSpan.data.isCancelled).toBe(false); + }); + + it("renders a CANCELED buffered run as a non-partial, non-error, cancelled root span", () => { + const body = buildSyntheticTraceBody(makeSyntheticRun({ status: "CANCELED" })); + expect(body.trace.rootSpan.data.isPartial).toBe(false); + expect(body.trace.rootSpan.data.isError).toBe(false); + expect(body.trace.rootSpan.data.isCancelled).toBe(true); + }); + + it("renders a FAILED buffered run as a non-partial, errored, non-cancelled root span", () => { + // Regression: a FAILED buffered run used to render with + // `isPartial: true, isError: false`, masking the failure from SDK + // consumers. + const body = buildSyntheticTraceBody(makeSyntheticRun({ status: "FAILED" })); + expect(body.trace.rootSpan.data.isPartial).toBe(false); + expect(body.trace.rootSpan.data.isError).toBe(true); + expect(body.trace.rootSpan.data.isCancelled).toBe(false); + }); +}); diff --git a/apps/webapp/test/mollifierSyntheticRedirectInfo.test.ts b/apps/webapp/test/mollifierSyntheticRedirectInfo.test.ts new file mode 100644 index 00000000000..a996b9de693 --- /dev/null +++ b/apps/webapp/test/mollifierSyntheticRedirectInfo.test.ts @@ -0,0 +1,197 @@ +import { describe, expect, vi } from "vitest"; +import { redisTest } from "@internal/testcontainers"; +import { MollifierBuffer } from "@trigger.dev/redis-worker"; + +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +import { findBufferedRunRedirectInfo } from "~/v3/mollifier/syntheticRedirectInfo.server"; + +const SNAPSHOT = { + spanId: "span_1", + environment: { + slug: "dev", + project: { slug: "hello-world-bN7m" }, + organization: { slug: "references-6120" }, + }, +}; + +function fakePrisma(member: { id: string } | null) { + return { + orgMember: { findFirst: vi.fn(async () => member) }, + } as unknown as Parameters<typeof findBufferedRunRedirectInfo>[1]["prismaClient"]; +} + +describe("findBufferedRunRedirectInfo (testcontainers)", () => { + redisTest("returns slugs + spanId for a real buffer entry 
when user is a member", async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_real_1", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + const info = await findBufferedRunRedirectInfo( + { runFriendlyId: "run_real_1", userId: "user_1" }, + { getBuffer: () => buffer, prismaClient: fakePrisma({ id: "member_1" }) }, + ); + expect(info).toEqual({ + organizationSlug: "references-6120", + projectSlug: "hello-world-bN7m", + environmentSlug: "dev", + spanId: "span_1", + }); + } finally { + await buffer.close(); + } + }); + + redisTest("returns null when no buffer entry exists for the runId", async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + const info = await findBufferedRunRedirectInfo( + { runFriendlyId: "run_missing", userId: "user_1" }, + { getBuffer: () => buffer, prismaClient: fakePrisma({ id: "member_1" }) }, + ); + expect(info).toBeNull(); + } finally { + await buffer.close(); + } + }); + + redisTest("returns null when the user is not an org member (default check enforced)", async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_real_2", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + const info = await findBufferedRunRedirectInfo( + { runFriendlyId: "run_real_2", userId: "user_other" }, + { getBuffer: () => buffer, prismaClient: fakePrisma(null) }, + ); + expect(info).toBeNull(); + } finally { + await buffer.close(); + } + }); + + redisTest("skips the org-membership check when skipOrgMembershipCheck is set (admin path)", async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_real_3", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + const findFirst = vi.fn(); + const info = await 
findBufferedRunRedirectInfo( + { runFriendlyId: "run_real_3", userId: "user_admin", skipOrgMembershipCheck: true }, + { + getBuffer: () => buffer, + prismaClient: { orgMember: { findFirst } } as unknown as Parameters<typeof findBufferedRunRedirectInfo>[1]["prismaClient"], + }, + ); + expect(info?.organizationSlug).toBe("references-6120"); + expect(findFirst).not.toHaveBeenCalled(); + } finally { + await buffer.close(); + } + }); + + redisTest("returns null when snapshot is malformed JSON", async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_real_4", + envId: "env_a", + orgId: "org_1", + payload: "{not-json", + }); + const info = await findBufferedRunRedirectInfo( + { runFriendlyId: "run_real_4", userId: "user_1" }, + { getBuffer: () => buffer, prismaClient: fakePrisma({ id: "member_1" }) }, + ); + expect(info).toBeNull(); + } finally { + await buffer.close(); + } + }); + + redisTest("returns null when snapshot lacks org/project slugs", async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_real_5", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ spanId: "s", environment: { slug: "dev" } }), + }); + const info = await findBufferedRunRedirectInfo( + { runFriendlyId: "run_real_5", userId: "user_1" }, + { getBuffer: () => buffer, prismaClient: fakePrisma({ id: "member_1" }) }, + ); + expect(info).toBeNull(); + } finally { + await buffer.close(); + } + }); + + redisTest("returns info with undefined spanId when snapshot has no spanId", async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_real_6", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ environment: SNAPSHOT.environment }), + }); + const info = await findBufferedRunRedirectInfo( + { runFriendlyId: "run_real_6", userId: "user_1" }, + { getBuffer: () => buffer, prismaClient: fakePrisma({ 
id: "member_1" }) }, + ); + expect(info?.spanId).toBeUndefined(); + expect(info?.environmentSlug).toBe("dev"); + } finally { + await buffer.close(); + } + }); + + redisTest( + "rejects snapshots where a slug is the wrong type (Zod guard, not just typeof)", + async ({ redisOptions }) => { + // Regression for the pre-Zod implementation: the slug check was + // `typeof slug !== "string"` so any string passed, including ones + // that should've been rejected on shape grounds. The Zod schema + // gives us full structural validation — a `slug: 42` (number) + // collapses into the parse-fail branch like any other shape + // mismatch and we return null instead of leaking a half-built + // redirect URL. + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_real_7", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify({ + environment: { + slug: 42, + project: { slug: "p" }, + organization: { slug: "o" }, + }, + }), + }); + const info = await findBufferedRunRedirectInfo( + { runFriendlyId: "run_real_7", userId: "user_1" }, + { getBuffer: () => buffer, prismaClient: fakePrisma({ id: "member_1" }) }, + ); + expect(info).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); +}); diff --git a/apps/webapp/test/mollifierSyntheticReplayTaskRun.test.ts b/apps/webapp/test/mollifierSyntheticReplayTaskRun.test.ts new file mode 100644 index 00000000000..6df2d92dde4 --- /dev/null +++ b/apps/webapp/test/mollifierSyntheticReplayTaskRun.test.ts @@ -0,0 +1,106 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +import { buildSyntheticReplayTaskRun } from "~/v3/mollifier/syntheticReplayTaskRun.server"; +import type { SyntheticRun } from "~/v3/mollifier/readFallback.server"; + +const NOW = new Date("2026-05-21T10:00:00Z"); + +function makeSyntheticRun(overrides: Partial<SyntheticRun> = {}): SyntheticRun { + return { + id: "run_internal_1", + friendlyId: "run_friendly_1", + 
status: "QUEUED", + cancelledAt: undefined, + cancelReason: undefined, + delayUntil: undefined, + taskIdentifier: "hello-world", + createdAt: NOW, + payload: { message: "hi" }, + payloadType: "application/json", + metadata: undefined, + metadataType: undefined, + seedMetadata: undefined, + seedMetadataType: undefined, + idempotencyKey: undefined, + idempotencyKeyOptions: undefined, + isTest: false, + depth: 0, + ttl: "10m", + tags: [], + runTags: [], + lockedToVersion: undefined, + resumeParentOnCompletion: false, + parentTaskRunId: undefined, + traceId: "trace_1", + spanId: "span_1", + parentSpanId: undefined, + runtimeEnvironmentId: "env_a", + engine: "V2", + workerQueue: "worker-queue-1", + queue: "task/hello-world", + concurrencyKey: undefined, + machinePreset: "small-1x", + realtimeStreamsVersion: "v1", + maxAttempts: 3, + maxDurationInSeconds: 3600, + replayedFromTaskRunFriendlyId: undefined, + annotations: undefined, + traceContext: undefined, + scheduleId: undefined, + batchId: undefined, + parentTaskRunFriendlyId: undefined, + rootTaskRunFriendlyId: undefined, + ...overrides, + }; +} + +const ENV_ROW = { + slug: "dev", + project: { slug: "hello-world", organization: { slug: "references" } }, +}; + +describe("buildSyntheticReplayTaskRun", () => { + it("returns the adapted TaskRun shape when traceId and spanId are present", () => { + const taskRun = buildSyntheticReplayTaskRun({ + synthetic: makeSyntheticRun(), + envRow: ENV_ROW, + }); + expect(taskRun).not.toBeNull(); + expect(taskRun!.traceId).toBe("trace_1"); + expect(taskRun!.spanId).toBe("span_1"); + expect(taskRun!.project.slug).toBe("hello-world"); + expect(taskRun!.project.organization.slug).toBe("references"); + expect(taskRun!.runtimeEnvironment.slug).toBe("dev"); + }); + + it("returns null when the snapshot has no traceId", () => { + // ReplayTaskRunService builds `00-${traceId}-${spanId}-01` without + // guarding for undefined. 
Falling through with a missing traceId + // would emit `00-undefined-...-01`, an invalid W3C traceparent that + // OTel silently drops, breaking the replayed run's trace linkage to + // the original. The helper must refuse rather than degrade silently. + const taskRun = buildSyntheticReplayTaskRun({ + synthetic: makeSyntheticRun({ traceId: undefined }), + envRow: ENV_ROW, + }); + expect(taskRun).toBeNull(); + }); + + it("returns null when the snapshot has no spanId", () => { + const taskRun = buildSyntheticReplayTaskRun({ + synthetic: makeSyntheticRun({ spanId: undefined }), + envRow: ENV_ROW, + }); + expect(taskRun).toBeNull(); + }); + + it("returns null when both traceId and spanId are missing", () => { + const taskRun = buildSyntheticReplayTaskRun({ + synthetic: makeSyntheticRun({ traceId: undefined, spanId: undefined }), + envRow: ENV_ROW, + }); + expect(taskRun).toBeNull(); + }); +}); diff --git a/apps/webapp/test/mollifierSyntheticRunHeader.test.ts b/apps/webapp/test/mollifierSyntheticRunHeader.test.ts new file mode 100644 index 00000000000..0d9f7c7e13f --- /dev/null +++ b/apps/webapp/test/mollifierSyntheticRunHeader.test.ts @@ -0,0 +1,130 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +import { buildSyntheticRunHeader } from "~/v3/mollifier/syntheticRunHeader.server"; +import type { SyntheticRun } from "~/v3/mollifier/readFallback.server"; + +const NOW = new Date("2026-05-21T10:00:00Z"); +const CANCELLED_AT = new Date("2026-05-21T10:00:30Z"); + +function makeSyntheticRun(overrides: Partial<SyntheticRun> = {}): SyntheticRun { + return { + id: "run_internal_1", + friendlyId: "run_friendly_1", + status: "QUEUED", + cancelledAt: undefined, + cancelReason: undefined, + delayUntil: undefined, + taskIdentifier: "hello-world", + createdAt: NOW, + payload: { message: "hi" }, + payloadType: "application/json", + metadata: undefined, + metadataType: undefined, + seedMetadata: undefined, + seedMetadataType: 
undefined, + idempotencyKey: undefined, + idempotencyKeyOptions: undefined, + isTest: false, + depth: 0, + ttl: "10m", + tags: [], + runTags: [], + lockedToVersion: undefined, + resumeParentOnCompletion: false, + parentTaskRunId: undefined, + traceId: "trace_1", + spanId: "span_1", + parentSpanId: undefined, + runtimeEnvironmentId: "env_a", + engine: "V2", + workerQueue: "worker-queue-1", + queue: "task/hello-world", + concurrencyKey: undefined, + machinePreset: "small-1x", + realtimeStreamsVersion: "v1", + maxAttempts: 3, + maxDurationInSeconds: 3600, + replayedFromTaskRunFriendlyId: undefined, + annotations: undefined, + traceContext: undefined, + scheduleId: undefined, + batchId: undefined, + parentTaskRunFriendlyId: undefined, + rootTaskRunFriendlyId: undefined, + ...overrides, + }; +} + +const ENV = { + id: "env_a", + organizationId: "org_a", + type: "DEVELOPMENT" as const, + slug: "dev", +}; + +describe("buildSyntheticRunHeader", () => { + it("returns PENDING / non-final state for a queued buffered run", () => { + const header = buildSyntheticRunHeader({ run: makeSyntheticRun(), environment: ENV }); + expect(header.status).toBe("PENDING"); + expect(header.isFinished).toBe(false); + expect(header.completedAt).toBeNull(); + }); + + it("reflects CANCELED state from the snapshot so the NavBar and Cancel-button gate update before the drainer materialises", () => { + const header = buildSyntheticRunHeader({ + run: makeSyntheticRun({ status: "CANCELED", cancelledAt: CANCELLED_AT }), + environment: ENV, + }); + // The Cancel button in route.tsx is gated on `!run.isFinished` and the + // status badge reads `run.status`. Both must flip on buffered-cancel + // or the user sees a "Pending" badge with a Cancel button on a run + // that's already cancelled in the snapshot. 
+ expect(header.status).toBe("CANCELED"); + expect(header.isFinished).toBe(true); + expect(header.completedAt).toEqual(CANCELLED_AT); + }); + + it("populates completedAt for FAILED runs so the route stops live-reloading and renders as completed", () => { + // The run-detail route derives `isCompleted` from + // `run.completedAt !== null` and gates SSE live-reloading on it + // (`route.tsx:459`, `:551`). Leaving completedAt null for FAILED + // buffered runs would keep a terminal run live-reloading forever + // while the badge already says SYSTEM_FAILURE. Symmetric with + // buildSyntheticSpanRun + ApiRetrieveRunPresenter. + const header = buildSyntheticRunHeader({ + run: makeSyntheticRun({ status: "FAILED" }), + environment: ENV, + }); + expect(header.status).toBe("SYSTEM_FAILURE"); + expect(header.isFinished).toBe(true); + expect(header.completedAt).toEqual(NOW); + }); + + it("forwards identity and environment fields from the snapshot", () => { + const header = buildSyntheticRunHeader({ run: makeSyntheticRun(), environment: ENV }); + expect(header.friendlyId).toBe("run_friendly_1"); + // `id` mirrors RunPresenter.getRun (the PG path) which puts the + // internal cuid in this field. SyntheticRun.id is the cuid; the + // header must surface it (not the friendlyId). 
+ expect(header.id).toBe("run_internal_1"); + expect(header.traceId).toBe("trace_1"); + expect(header.spanId).toBe("span_1"); + expect(header.environment).toMatchObject({ + id: "env_a", + organizationId: "org_a", + type: "DEVELOPMENT", + slug: "dev", + }); + }); + + it("falls back to empty strings when the snapshot has no trace/span ids", () => { + const header = buildSyntheticRunHeader({ + run: makeSyntheticRun({ traceId: undefined, spanId: undefined }), + environment: ENV, + }); + expect(header.traceId).toBe(""); + expect(header.spanId).toBe(""); + }); +}); diff --git a/apps/webapp/test/mollifierSyntheticSpanRun.test.ts b/apps/webapp/test/mollifierSyntheticSpanRun.test.ts new file mode 100644 index 00000000000..3a89046e8cb --- /dev/null +++ b/apps/webapp/test/mollifierSyntheticSpanRun.test.ts @@ -0,0 +1,197 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +import { buildSyntheticSpanRun } from "~/v3/mollifier/syntheticSpanRun.server"; +import type { SyntheticRun } from "~/v3/mollifier/readFallback.server"; + +const NOW = new Date("2026-05-21T10:00:00Z"); + +function makeSyntheticRun(overrides: Partial<SyntheticRun> = {}): SyntheticRun { + return { + id: "run_internal_1", + friendlyId: "run_friendly_1", + status: "QUEUED", + taskIdentifier: "hello-world", + createdAt: NOW, + payload: { message: "hi" }, + payloadType: "application/json", + metadata: undefined, + metadataType: undefined, + seedMetadata: undefined, + seedMetadataType: undefined, + idempotencyKey: undefined, + idempotencyKeyOptions: undefined, + isTest: false, + depth: 0, + ttl: "10m", + tags: ["a", "b"], + runTags: ["a", "b"], + lockedToVersion: undefined, + resumeParentOnCompletion: false, + parentTaskRunId: undefined, + traceId: "trace_1", + spanId: "span_1", + parentSpanId: undefined, + runtimeEnvironmentId: "env_a", + engine: "V2", + workerQueue: "worker-queue-1", + queue: "task/hello-world", + concurrencyKey: undefined, + machinePreset: 
"small-1x", + realtimeStreamsVersion: "v1", + maxAttempts: 3, + maxDurationInSeconds: 3600, + replayedFromTaskRunFriendlyId: undefined, + annotations: undefined, + traceContext: undefined, + scheduleId: undefined, + batchId: undefined, + parentTaskRunFriendlyId: undefined, + rootTaskRunFriendlyId: undefined, + ...overrides, + }; +} + +const ENV = { + id: "env_a", + slug: "dev", + type: "DEVELOPMENT" as const, +}; + +describe("buildSyntheticSpanRun", () => { + it("populates the core identity fields from the snapshot", async () => { + const synth = await buildSyntheticSpanRun({ run: makeSyntheticRun(), environment: ENV }); + expect(synth.id).toBe("run_internal_1"); + expect(synth.friendlyId).toBe("run_friendly_1"); + expect(synth.taskIdentifier).toBe("hello-world"); + expect(synth.traceId).toBe("trace_1"); + expect(synth.spanId).toBe("span_1"); + expect(synth.environmentId).toBe("env_a"); + expect(synth.engine).toBe("V2"); + expect(synth.workerQueue).toBe("worker-queue-1"); + }); + + it("reports PENDING status and the non-final flags", async () => { + const synth = await buildSyntheticSpanRun({ run: makeSyntheticRun(), environment: ENV }); + expect(synth.status).toBe("PENDING"); + expect(synth.isFinished).toBe(false); + expect(synth.isRunning).toBe(false); + expect(synth.isError).toBe(false); + expect(synth.startedAt).toBeNull(); + expect(synth.completedAt).toBeNull(); + }); + + it("pretty-prints the JSON payload from the snapshot", async () => { + const synth = await buildSyntheticSpanRun({ + run: makeSyntheticRun({ payload: { message: "hi" }, payloadType: "application/json" }), + environment: ENV, + }); + // prettyPrintPacket round-trips JSON with 2-space indent. 
+ expect(synth.payload).toContain('"message": "hi"'); + expect(synth.payloadType).toBe("application/json"); + }); + + it("forwards runTags onto `tags` exactly", async () => { + const synth = await buildSyntheticSpanRun({ + run: makeSyntheticRun({ runTags: ["alpha", "beta"] }), + environment: ENV, + }); + expect(synth.tags).toEqual(["alpha", "beta"]); + }); + + it("classifies the queue name as custom when it does not start with 'task/'", async () => { + const taskQueue = await buildSyntheticSpanRun({ + run: makeSyntheticRun({ queue: "task/hello-world" }), + environment: ENV, + }); + expect(taskQueue.queue.isCustomQueue).toBe(false); + + const customQueue = await buildSyntheticSpanRun({ + run: makeSyntheticRun({ queue: "my-custom" }), + environment: ENV, + }); + expect(customQueue.queue.isCustomQueue).toBe(true); + }); + + it("derives idempotency status from the snapshot key/options", async () => { + const withKey = await buildSyntheticSpanRun({ + run: makeSyntheticRun({ idempotencyKey: "abc", idempotencyKeyOptions: ["scope"] }), + environment: ENV, + }); + expect(withKey.idempotencyKey).toBe("abc"); + expect(withKey.idempotencyKeyStatus).toBe("active"); + + const noKey = await buildSyntheticSpanRun({ + run: makeSyntheticRun({ idempotencyKey: undefined, idempotencyKeyOptions: undefined }), + environment: ENV, + }); + expect(noKey.idempotencyKeyStatus).toBeUndefined(); + }); + + it("omits relationships even when parent/root friendlyIds are present, since the snapshot lacks their spanId/taskIdentifier", async () => { + const synth = await buildSyntheticSpanRun({ + run: makeSyntheticRun({ + parentTaskRunFriendlyId: "run_parent", + rootTaskRunFriendlyId: "run_root", + }), + environment: ENV, + }); + expect(synth.relationships.parent).toBeUndefined(); + expect(synth.relationships.root).toBeUndefined(); + }); + + it("returns no relationship objects when the snapshot has no parent/root", async () => { + const synth = await buildSyntheticSpanRun({ + run: makeSyntheticRun(), 
+ environment: ENV, + }); + expect(synth.relationships.parent).toBeUndefined(); + expect(synth.relationships.root).toBeUndefined(); + }); + + it("reflects a buffered CANCELED run as a finished, cancelled terminal state", async () => { + const synth = await buildSyntheticSpanRun({ + run: makeSyntheticRun({ + status: "CANCELED", + cancelledAt: NOW, + cancelReason: "cancelled by user", + }), + environment: ENV, + }); + expect(synth.status).toBe("CANCELED"); + expect(synth.statusReason).toBe("cancelled by user"); + expect(synth.isFinished).toBe(true); + expect(synth.isError).toBe(false); + expect(synth.completedAt).toEqual(NOW); + }); + + it("reflects a buffered FAILED run as a finished, errored SYSTEM_FAILURE", async () => { + const synth = await buildSyntheticSpanRun({ + run: makeSyntheticRun({ + status: "FAILED", + error: { code: "GATE_REJECTED", message: "buffer rejected the run" }, + }), + environment: ENV, + }); + expect(synth.status).toBe("SYSTEM_FAILURE"); + expect(synth.isFinished).toBe(true); + expect(synth.isError).toBe(true); + expect(synth.statusReason).toBe("buffer rejected the run"); + expect(synth.error).toEqual({ + type: "STRING_ERROR", + raw: "GATE_REJECTED: buffer rejected the run", + }); + // PG-resident SYSTEM_FAILURE rows always have completedAt set; + // mirror that on the synth path so callers checking + // `isFinished && completedAt` don't render a finished run with + // no completion timestamp. The buffer entry has no separate + // failedAt, so createdAt is the best-available proxy. 
+ expect(synth.completedAt).toEqual(NOW); + }); + + it("flags the synthetic run as 'not cached' since cache lookup did not match it", async () => { + const synth = await buildSyntheticSpanRun({ run: makeSyntheticRun(), environment: ENV }); + expect(synth.isCached).toBe(false); + }); +}); diff --git a/apps/webapp/test/mollifierSyntheticTrace.test.ts b/apps/webapp/test/mollifierSyntheticTrace.test.ts new file mode 100644 index 00000000000..ac7425a8fe9 --- /dev/null +++ b/apps/webapp/test/mollifierSyntheticTrace.test.ts @@ -0,0 +1,149 @@ +import { describe, expect, it, vi } from "vitest"; + +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +import { buildSyntheticTraceForBufferedRun } from "~/v3/mollifier/syntheticTrace.server"; +import type { SyntheticRun } from "~/v3/mollifier/readFallback.server"; + +const NOW = new Date("2026-05-22T10:00:00Z"); +const ONE_MS_IN_NS = 1_000_000; + +function makeSyntheticRun(overrides: Partial<SyntheticRun> = {}): SyntheticRun { + return { + id: "run_internal_1", + friendlyId: "run_friendly_1", + status: "QUEUED", + cancelledAt: undefined, + cancelReason: undefined, + delayUntil: undefined, + taskIdentifier: "hello-world", + createdAt: NOW, + payload: undefined, + payloadType: undefined, + metadata: undefined, + metadataType: undefined, + seedMetadata: undefined, + seedMetadataType: undefined, + idempotencyKey: undefined, + idempotencyKeyOptions: undefined, + isTest: false, + depth: 0, + ttl: undefined, + tags: [], + runTags: [], + lockedToVersion: undefined, + resumeParentOnCompletion: false, + parentTaskRunId: undefined, + traceId: "trace_1", + spanId: "span_1", + parentSpanId: undefined, + runtimeEnvironmentId: "env_a", + engine: "V2", + workerQueue: undefined, + queue: undefined, + concurrencyKey: undefined, + machinePreset: undefined, + realtimeStreamsVersion: undefined, + maxAttempts: undefined, + maxDurationInSeconds: undefined, + replayedFromTaskRunFriendlyId: undefined, + annotations: undefined, + traceContext: undefined, + 
scheduleId: undefined, + batchId: undefined, + parentTaskRunFriendlyId: undefined, + rootTaskRunFriendlyId: undefined, + ...overrides, + }; +} + +describe("buildSyntheticTraceForBufferedRun", () => { + it("populates the synthesised root span from snapshot identity fields", () => { + const trace = buildSyntheticTraceForBufferedRun(makeSyntheticRun()); + expect(trace.events).toHaveLength(1); + const root = trace.events[0]; + expect(root.id).toBe("span_1"); + expect(root.data.message).toBe("hello-world"); + expect(root.data.startTime).toEqual(NOW); + expect(root.data.isRoot).toBe(true); + expect(root.data.offset).toBe(0); + expect(root.data.level).toBe("TRACE"); + }); + + it("defaults the span message to 'Task' when the snapshot has no taskIdentifier", () => { + const trace = buildSyntheticTraceForBufferedRun( + makeSyntheticRun({ taskIdentifier: undefined }) + ); + expect(trace.events[0].data.message).toBe("Task"); + }); + + it("falls back to an empty-string span id when the snapshot has no spanId", () => { + const trace = buildSyntheticTraceForBufferedRun( + makeSyntheticRun({ spanId: undefined }) + ); + expect(trace.events[0].id).toBe(""); + // Empty id still marks as root (it matches the rootId fallback). + expect(trace.events[0].data.isRoot).toBe(true); + }); + + it("renders a QUEUED buffered run as an executing, partial root span", () => { + const trace = buildSyntheticTraceForBufferedRun(makeSyntheticRun({ status: "QUEUED" })); + expect(trace.rootSpanStatus).toBe("executing"); + expect(trace.events[0].data.isPartial).toBe(true); + expect(trace.events[0].data.isError).toBe(false); + expect(trace.events[0].data.isCancelled).toBe(false); + // A partial span exposes duration as null (the timeline reads it as + // "still running"); see syntheticTrace.server.ts duration mapping. 
+ expect(trace.events[0].data.duration).toBeNull(); + }); + + it("renders a CANCELED buffered run as a completed, non-partial cancelled span", () => { + const trace = buildSyntheticTraceForBufferedRun( + makeSyntheticRun({ status: "CANCELED", cancelledAt: NOW }) + ); + expect(trace.rootSpanStatus).toBe("completed"); + expect(trace.events[0].data.isPartial).toBe(false); + expect(trace.events[0].data.isCancelled).toBe(true); + expect(trace.events[0].data.isError).toBe(false); + // Non-partial: duration is the span's numeric value (0 here), not null. + expect(trace.events[0].data.duration).toBe(0); + }); + + it("renders a FAILED buffered run as a failed, non-partial errored span", () => { + const trace = buildSyntheticTraceForBufferedRun( + makeSyntheticRun({ + status: "FAILED", + error: { code: "GATE_REJECTED", message: "buffer rejected the run" }, + }) + ); + expect(trace.rootSpanStatus).toBe("failed"); + expect(trace.events[0].data.isPartial).toBe(false); + expect(trace.events[0].data.isError).toBe(true); + expect(trace.events[0].data.isCancelled).toBe(false); + expect(trace.events[0].data.duration).toBe(0); + }); + + it("floors the trace duration to a minimum of 1ms (in nanoseconds) so the timeline has a positive extent", () => { + const trace = buildSyntheticTraceForBufferedRun(makeSyntheticRun()); + expect(trace.duration).toBe(ONE_MS_IN_NS); + }); + + it("reports the buffered createdAt as the trace's rootStartedAt and leaves startedAt null", () => { + const trace = buildSyntheticTraceForBufferedRun(makeSyntheticRun()); + expect(trace.rootStartedAt).toEqual(NOW); + expect(trace.startedAt).toBeNull(); + }); + + it("returns no link or override metadata (buffered traces are single-span)", () => { + const trace = buildSyntheticTraceForBufferedRun(makeSyntheticRun()); + expect(trace.linkedRunIdBySpanId).toEqual({}); + expect(trace.overridesBySpanId).toBeUndefined(); + expect(trace.queuedDuration).toBeUndefined(); + }); + + it("synthesises an empty events list (no 
timeline events from the buffer)", () => { + const trace = buildSyntheticTraceForBufferedRun(makeSyntheticRun()); + expect(trace.events[0].data.events).toEqual([]); + expect(trace.events[0].data.timelineEvents).toEqual([]); + }); +}); diff --git a/apps/webapp/test/mollifierTripEvaluator.test.ts b/apps/webapp/test/mollifierTripEvaluator.test.ts index b9a9bf8c94a..14ac0cc55bc 100644 --- a/apps/webapp/test/mollifierTripEvaluator.test.ts +++ b/apps/webapp/test/mollifierTripEvaluator.test.ts @@ -14,7 +14,7 @@ describe("createRealTripEvaluator", () => { redisTest( "returns divert=false when the sliding window stays under threshold", async ({ redisOptions }) => { - const buffer = new MollifierBuffer({ redisOptions, entryTtlSeconds: 600 }); + const buffer = new MollifierBuffer({ redisOptions }); try { const evaluator = createRealTripEvaluator({ getBuffer: () => buffer, @@ -32,7 +32,7 @@ describe("createRealTripEvaluator", () => { redisTest( "returns divert=true with reason per_env_rate once the window trips", async ({ redisOptions }) => { - const buffer = new MollifierBuffer({ redisOptions, entryTtlSeconds: 600 }); + const buffer = new MollifierBuffer({ redisOptions }); try { // threshold=2 → the 3rd call within windowMs is the first that trips. const options = { windowMs: 5000, threshold: 2, holdMs: 5000 } as const; @@ -73,7 +73,7 @@ describe("createRealTripEvaluator", () => { redisTest( "returns divert=false when buffer throws (fail-open)", async ({ redisOptions }) => { - const buffer = new MollifierBuffer({ redisOptions, entryTtlSeconds: 600 }); + const buffer = new MollifierBuffer({ redisOptions }); // Closing the client up front means evaluateTrip will throw on the first // Redis command — a real failure mode, not a stub. 
await buffer.close(); diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts index da42247111a..835ff90cc48 100644 --- a/internal-packages/run-engine/src/engine/index.ts +++ b/internal-packages/run-engine/src/engine/index.ts @@ -450,6 +450,199 @@ export class RunEngine { //MARK: - Run functions + /** + * Writes a TaskRun row in CANCELED state directly, bypassing the trigger + * pipeline. Used by the mollifier drainer when a cancel API call lands on + * a buffered run before it materialises. + * + * Skips: queue insertion (no execution), waitpoint creation (the + * mollifier gate refuses to buffer triggerAndWait children, so a + * cancelled buffered run never has a waiting parent to unblock), + * concurrency reservation. Emits `runCancelled` by default — callers + * working on buffered-only runs (no primary trace event exists) can + * opt out via `emitRunCancelledEvent: false` to avoid the systematic + * "Failed to cancel run event" noise the handler would log when its + * `cancelRunEvent` call can't find a span. + * + * Idempotent: if a row with the same friendlyId already exists (double + * drainer pop after requeue), Prisma's P2002 unique-constraint violation + * is caught and the existing row is returned. The duplicate runCancelled + * emission is skipped — the original drain's emit already wrote the + * TaskEvent (when applicable). + */ + async createCancelledRun( + { + snapshot, + cancelledAt, + cancelReason, + emitRunCancelledEvent = true, + }: { + snapshot: TriggerParams; + cancelledAt: Date; + cancelReason: string; + /** + * Whether to emit the `runCancelled` engine-bus event. Defaults to + * true. + * + * Set to `false` for buffered-only runs that never had a primary + * trace event written (the mollifier gate never called + * `repository.traceEvent` for them). 
The `runCancelled` handler in + * `runEngineHandlers.server.ts` calls `cancelRunEvent`, which + * looks up the run's primary span in the event store — for + * buffered-only runs that span doesn't exist, so the lookup fails, + * the handler's `tryCatch` swallows it, and a "[runCancelled] + * Failed to cancel run event" error is logged for every cancelled + * buffered run. Suppressing the emit avoids that systematic noise. + * The CANCELED PG row is still written; only the trace-event + * mirror is skipped. + */ + emitRunCancelledEvent?: boolean; + }, + tx?: PrismaClientOrTransaction, + ): Promise { + const prisma = tx ?? this.prisma; + return startSpan(this.tracer, "createCancelledRun", async (span) => { + span.setAttribute("friendlyId", snapshot.friendlyId); + span.setAttribute("taskIdentifier", snapshot.taskIdentifier); + const id = RunId.fromFriendlyId(snapshot.friendlyId); + const error: TaskRunError = { type: "STRING_ERROR", raw: cancelReason }; + + try { + const taskRun = await prisma.taskRun.create({ + data: { + id, + engine: "V2", + status: "CANCELED", + friendlyId: snapshot.friendlyId, + runtimeEnvironmentId: snapshot.environment.id, + environmentType: snapshot.environment.type, + organizationId: snapshot.environment.organization.id, + projectId: snapshot.environment.project.id, + idempotencyKey: snapshot.idempotencyKey, + idempotencyKeyExpiresAt: snapshot.idempotencyKeyExpiresAt, + idempotencyKeyOptions: snapshot.idempotencyKeyOptions, + taskIdentifier: snapshot.taskIdentifier, + payload: snapshot.payload, + payloadType: snapshot.payloadType, + context: snapshot.context, + traceContext: snapshot.traceContext, + traceId: snapshot.traceId, + spanId: snapshot.spanId, + parentSpanId: snapshot.parentSpanId, + lockedToVersionId: snapshot.lockedToVersionId, + taskVersion: snapshot.taskVersion, + sdkVersion: snapshot.sdkVersion, + cliVersion: snapshot.cliVersion, + concurrencyKey: snapshot.concurrencyKey, + queue: snapshot.queue, + lockedQueueId: 
snapshot.lockedQueueId, + workerQueue: snapshot.workerQueue, + isTest: snapshot.isTest, + taskEventStore: snapshot.taskEventStore, + // Defensive: the snapshot comes from a cjson-encoded buffer + // payload, where empty Lua tables encode as `{}` not `[]`. If + // the drainer pops a buffered run with no tags, snapshot.tags + // will be an empty object, which Prisma misreads as a relation + // update op. Normalise to a real array (or undefined for the + // empty case). + runTags: Array.isArray(snapshot.tags) && snapshot.tags.length > 0 + ? snapshot.tags + : undefined, + oneTimeUseToken: snapshot.oneTimeUseToken, + parentTaskRunId: snapshot.parentTaskRunId, + rootTaskRunId: snapshot.rootTaskRunId, + replayedFromTaskRunFriendlyId: snapshot.replayedFromTaskRunFriendlyId, + batchId: snapshot.batch?.id, + resumeParentOnCompletion: snapshot.resumeParentOnCompletion, + depth: snapshot.depth, + seedMetadata: snapshot.seedMetadata, + seedMetadataType: snapshot.seedMetadataType, + metadata: snapshot.metadata, + metadataType: snapshot.metadataType, + machinePreset: snapshot.machine, + scheduleId: snapshot.scheduleId, + scheduleInstanceId: snapshot.scheduleInstanceId, + createdAt: snapshot.createdAt, + bulkActionGroupIds: snapshot.bulkActionId ? 
[snapshot.bulkActionId] : undefined, + planType: snapshot.planType, + realtimeStreamsVersion: snapshot.realtimeStreamsVersion, + streamBasinName: snapshot.streamBasinName, + annotations: snapshot.annotations, + completedAt: cancelledAt, + updatedAt: cancelledAt, + error: error as unknown as Prisma.InputJsonValue, + attemptNumber: 0, + executionSnapshots: { + create: { + engine: "V2", + executionStatus: "FINISHED", + description: "Run cancelled before materialisation", + runStatus: "CANCELED", + environmentId: snapshot.environment.id, + environmentType: snapshot.environment.type, + projectId: snapshot.environment.project.id, + organizationId: snapshot.environment.organization.id, + }, + }, + }, + }); + + if (emitRunCancelledEvent) { + this.eventBus.emit("runCancelled", { + time: cancelledAt, + run: { + id: taskRun.id, + status: taskRun.status, + friendlyId: taskRun.friendlyId, + spanId: taskRun.spanId, + taskEventStore: taskRun.taskEventStore, + createdAt: taskRun.createdAt, + completedAt: taskRun.completedAt, + error, + updatedAt: taskRun.updatedAt, + attemptNumber: taskRun.attemptNumber ?? 0, + }, + organization: { id: snapshot.environment.organization.id }, + project: { id: snapshot.environment.project.id }, + environment: { id: snapshot.environment.id }, + }); + } + + return taskRun; + } catch (err) { + // P2002 = unique constraint violation. Double-pop after a drainer + // requeue can reach this. Idempotent: return the existing row + // without re-emitting. + if ( + err instanceof Prisma.PrismaClientKnownRequestError && + err.code === "P2002" + ) { + this.logger.info( + "createCancelledRun: row already exists, returning existing (idempotent)", + { friendlyId: snapshot.friendlyId }, + ); + const existing = await prisma.taskRun.findFirst({ where: { id } }); + if (existing) { + // Only treat the conflict as idempotent when the existing + // row is ALREADY canceled. If a non-canceled row landed + // first (e.g. 
the drainer's normal `engine.trigger` replay + // path raced ahead of the cancel) we surface a conflict + // rather than silently reporting "cancelled" — the run is + // genuinely live and the caller must decide between + // engine.cancelRun() and skipping. + if (existing.status === "CANCELED") { + return existing; + } + throw new Error( + `createCancelledRun conflict: existing run ${snapshot.friendlyId} has status ${existing.status}`, + ); + } + } + throw err; + } + }); + } + /** "Triggers" one run. */ async trigger( { @@ -648,7 +841,16 @@ export class RunEngine { priorityMs, queueTimestamp: queueTimestamp ?? delayUntil ?? new Date(), ttl: resolvedTtl, - runTags: tags.length === 0 ? undefined : tags, + // Defensive: when the mollifier drainer replays a buffered + // snapshot whose payload was rewritten by a buffer-side Lua + // mutate (e.g. append_tags clears an empty list), cjson + // encodes an empty Lua table as `{}` rather than `[]`. JS + // parses that back as an empty object, and `{}.length` is + // undefined — the original `tags.length === 0` check would + // pass `{}` straight to Prisma's `String[]` column. Mirror + // the same Array.isArray guard that `createCancelledRun` + // uses for symmetry with the trigger replay path. + runTags: Array.isArray(tags) && tags.length > 0 ? tags : undefined, oneTimeUseToken, parentTaskRunId, rootTaskRunId, @@ -881,6 +1083,7 @@ export class RunEngine { taskEventStore, queue: queueOverride, lockedQueueId: lockedQueueIdOverride, + emitRunFailedEvent = true, }: { friendlyId: string; environment: { @@ -908,6 +1111,19 @@ export class RunEngine { queue?: string; /** Resolved TaskQueue.id when the task is locked to a specific queue. */ lockedQueueId?: string; + /** + * Whether to emit the `runFailed` engine-bus event. Defaults to true. + * + * Set to `false` when the caller is ALREADY managing the trace-event + * lifecycle for this run via `repository.traceEvent({ incomplete: false, + * isError: true, ... })`. 
In that path the outer trace event handles + * span completion itself; emitting `runFailed` from here causes the + * `runFailed` → `completeFailedRunEvent` handler to write a second + * completion row for the same (traceId, spanId), racing with the + * outer trace event's own write. The alert side of `runFailed` is + * preserved by emitting from the caller after `traceEvent` returns. + */ + emitRunFailedEvent?: boolean; }): Promise { return startSpan( this.tracer, @@ -983,6 +1199,57 @@ export class RunEngine { }); } + // Emit `runFailed` so the alert pipeline picks up the + // SYSTEM_FAILURE row and the event-store handler writes the + // completion event into the trace. Without this the mollifier + // drainer's terminal failures (and batch-trigger's + // exceed-limit failures) land in PG silently — visible in the + // dashboard list but never reaching customers' configured + // ERROR alert channels. + // + // Gated by `emitRunFailedEvent` so call sites that already wrap + // this inside `repository.traceEvent({ incomplete: false, + // isError: true })` can opt out — the outer trace event writes + // the completion row itself, and a second write via + // `completeFailedRunEvent` would race against it. Callers that + // disable the emit are responsible for triggering the alerts + // side themselves (e.g. by calling + // `PerformTaskRunAlertsService.enqueue` directly after the + // trace event closes). + if (!emitRunFailedEvent) { + return taskRun; + } + this.eventBus.emit("runFailed", { + time: taskRun.completedAt ?? new Date(), + run: { + id: taskRun.id, + status: taskRun.status, + spanId: taskRun.spanId, + error, + taskEventStore: taskRun.taskEventStore, + createdAt: taskRun.createdAt, + completedAt: taskRun.completedAt, + updatedAt: taskRun.updatedAt, + // This row never attempted execution — it's a synthesised + // terminal failure. 
The alert payload's `attemptNumber=0` + // is the signal downstream consumers can use to + // distinguish a never-ran failure from a run that + // exhausted its retries. + attemptNumber: 0, + usageDurationMs: 0, + costInCents: 0, + }, + organization: { + id: environment.organization.id, + }, + project: { + id: environment.project.id, + }, + environment: { + id: environment.id, + }, + }); + return taskRun; }, { diff --git a/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts b/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts new file mode 100644 index 00000000000..68662074ea2 --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts @@ -0,0 +1,345 @@ +import { containerTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { RunId } from "@trigger.dev/core/v3/isomorphic"; + +function freshRunId() { + return RunId.generate().friendlyId; +} +import { expect } from "vitest"; +import { RunEngine } from "../index.js"; +import type { EventBusEventArgs } from "../eventBus.js"; +import { setupAuthenticatedEnvironment } from "./setup.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +function baseEngineOptions(redisOptions: Parameters[0]["queue"]["redis"]) { + return { + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +// engine.createCancelledRun writes a CANCELED +// TaskRun row directly from a buffer snapshot. Verifies the bypass- +// queue / bypass-waitpoint / emit-runCancelled contract. 
+describe("RunEngine.createCancelledRun", () => { + containerTest( + "writes CANCELED PG row with snapshot fields, completedAt, error", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + try { + const friendlyId = freshRunId(); + const cancelledAt = new Date("2026-05-20T12:00:00.000Z"); + const cancelReason = "Canceled by user"; + + const result = await engine.createCancelledRun({ + snapshot: { + friendlyId, + environment: env, + taskIdentifier: "test-task", + payload: '{"hello":"world"}', + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "0000000000000000aaaa000000000000", + spanId: "bbbb000000000000", + queue: "task/test-task", + isTest: false, + tags: ["test-tag"], + }, + cancelledAt, + cancelReason, + }); + + expect(result.status).toBe("CANCELED"); + expect(result.friendlyId).toBe(friendlyId); + expect(result.id).toBe(RunId.fromFriendlyId(friendlyId)); + expect(result.completedAt?.toISOString()).toBe(cancelledAt.toISOString()); + expect(result.taskIdentifier).toBe("test-task"); + expect(result.runTags).toEqual(["test-tag"]); + expect(result.payload).toBe('{"hello":"world"}'); + const err = result.error as { type?: string; raw?: string }; + expect(err.type).toBe("STRING_ERROR"); + expect(err.raw).toBe(cancelReason); + + // Verify the PG row is canonical (findFirst returns the row). 
+ const stored = await prisma.taskRun.findFirst({ + where: { friendlyId }, + }); + expect(stored).not.toBeNull(); + expect(stored!.status).toBe("CANCELED"); + } finally { + await engine.quit(); + } + }, + ); + + containerTest( + "emits runCancelled with correct payload", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + const captured: EventBusEventArgs<"runCancelled">[0][] = []; + engine.eventBus.on("runCancelled", (event) => { + captured.push(event); + }); + + try { + const cancelledAt = new Date(); + const cancelReason = "Test cancel"; + const friendlyId = freshRunId(); + await engine.createCancelledRun({ + snapshot: { + friendlyId, + environment: env, + taskIdentifier: "test-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "0000000000000000cccc000000000000", + spanId: "dddd000000000000", + queue: "task/test-task", + isTest: false, + tags: [], + }, + cancelledAt, + cancelReason, + }); + + expect(captured).toHaveLength(1); + expect(captured[0]!.run.status).toBe("CANCELED"); + expect(captured[0]!.run.friendlyId).toBe(friendlyId); + expect(captured[0]!.run.error).toEqual({ type: "STRING_ERROR", raw: cancelReason }); + expect(captured[0]!.organization.id).toBe(env.organization.id); + } finally { + await engine.quit(); + } + }, + ); + + containerTest( + "emitRunCancelledEvent: false suppresses the bus emit but still writes the CANCELED PG row", + async ({ prisma, redisOptions }) => { + // The mollifier drainer passes `emitRunCancelledEvent: false` for + // buffered-only runs because the runCancelled handler's + // `cancelRunEvent` lookup fails for them (no primary trace event + // span exists — the mollifier gate never called + // `repository.traceEvent` for this run). 
Without the gate, every + // cancelled buffered run produces a `[runCancelled] Failed to + // cancel run event` error log. This pins the gate's contract: PG + // row still lands, bus emit suppressed. + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + const captured: EventBusEventArgs<"runCancelled">[0][] = []; + engine.eventBus.on("runCancelled", (event) => { + captured.push(event); + }); + + try { + const friendlyId = freshRunId(); + const result = await engine.createCancelledRun({ + snapshot: { + friendlyId, + environment: env, + taskIdentifier: "test-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "0000000000000000eeee000000000000", + spanId: "ffff000000000000", + queue: "task/test-task", + isTest: false, + tags: [], + }, + cancelledAt: new Date(), + cancelReason: "Test cancel (silent emit)", + emitRunCancelledEvent: false, + }); + + // PG row still lands. + expect(result.status).toBe("CANCELED"); + expect(result.friendlyId).toBe(friendlyId); + // Bus emit suppressed. 
+ expect(captured).toHaveLength(0); + } finally { + await engine.quit(); + } + }, + ); + + containerTest( + "idempotent on double-pop: second call returns existing row without re-emitting", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + const captured: EventBusEventArgs<"runCancelled">[0][] = []; + engine.eventBus.on("runCancelled", (event) => { + captured.push(event); + }); + + try { + const snapshot = { + friendlyId: freshRunId(), + environment: env, + taskIdentifier: "test-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "0000000000000000eeee000000000000", + spanId: "ffff000000000000", + queue: "task/test-task", + isTest: false, + tags: [], + }; + const cancelledAt = new Date(); + const cancelReason = "Test idempotent"; + + const first = await engine.createCancelledRun({ snapshot, cancelledAt, cancelReason }); + const second = await engine.createCancelledRun({ snapshot, cancelledAt, cancelReason }); + + expect(second.id).toBe(first.id); + // Only the first call's emit fired; the P2002 path skips re-emission. + expect(captured).toHaveLength(1); + } finally { + await engine.quit(); + } + }, + ); + + // Regression: cjson encodes empty Lua tables as `{}`, not `[]`. When + // the drainer pops a buffered run that never had a tag set, the + // deserialised snapshot's `tags` field is an empty object. The old + // implementation passed it straight into Prisma's `runTags:` field; + // Prisma misread the object as a relation update op and threw + // `Argument 'set' is missing`. The drainer caught the error and + // marked the buffer entry FAILED — so the CANCELED PG row never + // landed. 
+ containerTest( + "tolerates snapshot.tags being an empty object (cjson edge case)", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + try { + const friendlyId = freshRunId(); + // Cast through unknown to simulate the cjson-decode output shape + // for an empty Lua table — TypeScript's snapshot type says + // string[], but the buffer Lua delivers {} for the empty case. + const result = await engine.createCancelledRun({ + snapshot: { + friendlyId, + environment: env, + taskIdentifier: "test-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "0000000000000000abcd000000000000", + spanId: "1234000000000000", + queue: "task/test-task", + isTest: false, + tags: {} as unknown as string[], + }, + cancelledAt: new Date(), + cancelReason: "Cancelled — empty tags", + }); + expect(result.status).toBe("CANCELED"); + expect(result.friendlyId).toBe(friendlyId); + // Prisma normalises the absent-tags case to either [] or null + // depending on the column default; assert it's an empty array. + expect(result.runTags).toEqual([]); + } finally { + await engine.quit(); + } + }, + ); + + // Regression: the P2002-on-id idempotency path used to return ANY + // existing row, which would silently report success even if a live + // (non-CANCELED) row landed first. The guard now requires the + // existing row's status to be CANCELED; anything else surfaces a + // conflict so the caller can route to engine.cancelRun() or skip. 
+ containerTest( + "P2002 conflict with non-CANCELED existing row throws (does not silently succeed)", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + try { + const friendlyId = freshRunId(); + const id = RunId.fromFriendlyId(friendlyId); + + // Plant a live (non-CANCELED) row with the same id so the + // cancelled-run INSERT hits P2002 and the guard finds a row + // that ISN'T CANCELED. + await prisma.taskRun.create({ + data: { + id, + friendlyId, + taskIdentifier: "test-task", + payload: "{}", + payloadType: "application/json", + status: "PENDING", + runtimeEnvironmentId: env.id, + projectId: env.project.id, + organizationId: env.organizationId, + queue: "task/test-task", + traceId: "0000000000000000aaaa000000000000", + spanId: "bbbb000000000000", + engine: "V2", + }, + }); + + await expect( + engine.createCancelledRun({ + snapshot: { + friendlyId, + environment: env, + taskIdentifier: "test-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "0000000000000000aaaa000000000000", + spanId: "bbbb000000000000", + queue: "task/test-task", + isTest: false, + tags: [], + }, + cancelledAt: new Date(), + cancelReason: "Should not silently overwrite a live row", + }), + ).rejects.toThrow(/createCancelledRun conflict.*PENDING/); + } finally { + await engine.quit(); + } + }, + ); +}); diff --git a/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts b/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts new file mode 100644 index 00000000000..84d33baa87d --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts @@ -0,0 +1,176 @@ +import { containerTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { generateFriendlyId } from "@trigger.dev/core/v3/isomorphic"; 
+import { expect } from "vitest"; +import { RunEngine } from "../index.js"; +import { EventBusEventArgs } from "../eventBus.js"; +import { setupAuthenticatedEnvironment } from "./setup.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +describe("RunEngine.createFailedTaskRun", () => { + containerTest("emits runFailed so the alert pipeline wakes up", async ({ prisma, redisOptions }) => { + // The mollifier drainer (and batch-trigger over-limit path) call + // createFailedTaskRun to write a terminal SYSTEM_FAILURE PG row + // for runs that never actually executed. Without an explicit + // runFailed emit, the row lands silently — the + // runEngineHandlers' `runFailed` listener (which enqueues + // PerformTaskRunAlertsService) never fires, so customers' + // configured TASK_RUN alert channels miss the failure entirely. + // + // Regression intent: if the emit is removed or moved out of + // createFailedTaskRun's success path, this test fails. The + // shape assertions pin the fields the alert delivery service + // reads from the event payload (run.id, run.status, error, + // attemptNumber=0 as the never-ran-marker). 
+ const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const failedEvents: EventBusEventArgs<"runFailed">[0][] = []; + engine.eventBus.on("runFailed", (event) => { + failedEvents.push(event); + }); + + const friendlyId = generateFriendlyId("run"); + const taskIdentifier = "drainer-terminal-test"; + + const failed = await engine.createFailedTaskRun({ + friendlyId, + environment: { + id: authenticatedEnvironment.id, + type: authenticatedEnvironment.type, + project: { id: authenticatedEnvironment.project.id }, + organization: { id: authenticatedEnvironment.organization.id }, + }, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + error: { + type: "STRING_ERROR", + raw: "Mollifier drainer terminal failure: synthetic engine.trigger panic", + }, + traceId: "0123456789abcdef0123456789abcdef", + spanId: "fedcba9876543210", + }); + + expect(failed.status).toBe("SYSTEM_FAILURE"); + + expect(failedEvents).toHaveLength(1); + const event = failedEvents[0]; + expect(event.run.id).toBe(failed.id); + expect(event.run.status).toBe("SYSTEM_FAILURE"); + expect(event.run.spanId).toBe("fedcba9876543210"); + // attemptNumber=0 is the marker that the run never executed — + // it's a synthesised terminal failure, not an exhausted-retries + // failure. Downstream consumers can use this to distinguish. 
+ expect(event.run.attemptNumber).toBe(0); + expect(event.run.usageDurationMs).toBe(0); + expect(event.run.costInCents).toBe(0); + expect(event.run.error).toEqual({ + type: "STRING_ERROR", + raw: "Mollifier drainer terminal failure: synthetic engine.trigger panic", + }); + expect(event.organization.id).toBe(authenticatedEnvironment.organization.id); + expect(event.project.id).toBe(authenticatedEnvironment.project.id); + expect(event.environment.id).toBe(authenticatedEnvironment.id); + } finally { + await engine.quit(); + } + }); + + // The TriggerFailedTaskService.call() path wraps createFailedTaskRun + // inside `repository.traceEvent({ incomplete: false, isError: true })` + // which already writes the completion row for the (traceId, spanId). + // Emitting `runFailed` from here would cause the + // `completeFailedRunEvent` handler to race a second write against + // the same span — the `emitRunFailedEvent: false` opt-out is what + // suppresses the emit. The PG row + alert side stay correct because + // the caller enqueues `PerformTaskRunAlertsService.enqueue(run.id)` + // directly after the trace event closes. 
+ containerTest( + "emitRunFailedEvent: false suppresses the bus emit but still creates the PG row", + async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 }, + queue: { redis: redisOptions, masterQueueConsumersDisabled: true, processWorkerQueueDebounceMs: 50 }, + runLock: { redis: redisOptions }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { name: "small-1x" as const, cpu: 0.5, memory: 0.5, centsPerMs: 0.0001 }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const failedEvents: EventBusEventArgs<"runFailed">[0][] = []; + engine.eventBus.on("runFailed", (event) => { + failedEvents.push(event); + }); + + const friendlyId = generateFriendlyId("run"); + const failed = await engine.createFailedTaskRun({ + friendlyId, + environment: { + id: authenticatedEnvironment.id, + type: authenticatedEnvironment.type, + project: { id: authenticatedEnvironment.project.id }, + organization: { id: authenticatedEnvironment.organization.id }, + }, + taskIdentifier: "outer-trace-event-test", + payload: "{}", + payloadType: "application/json", + error: { type: "STRING_ERROR", raw: "outer trace event manages span" }, + traceId: "0123456789abcdef0123456789abcdef", + spanId: "fedcba9876543210", + emitRunFailedEvent: false, + }); + + // PG row landed (caller still gets a usable TaskRun). + expect(failed.status).toBe("SYSTEM_FAILURE"); + expect(failed.friendlyId).toBe(friendlyId); + + // Bus emit was suppressed. 
+ expect(failedEvents).toHaveLength(0); + } finally { + await engine.quit(); + } + }, + ); +}); diff --git a/packages/core/src/v3/schemas/api.ts b/packages/core/src/v3/schemas/api.ts index e86e503de47..570f03aca2c 100644 --- a/packages/core/src/v3/schemas/api.ts +++ b/packages/core/src/v3/schemas/api.ts @@ -157,6 +157,14 @@ export const IdempotencyKeyOptionsSchema = z.object({ export type IdempotencyKeyOptionsSchema = z.infer; +// Coerces user-supplied concurrencyKey values to string. The downstream Prisma +// column is String?, so passing a number (a common foot-gun when callers do +// `concurrencyKey: payload.userId`) used to fail at `prisma.taskRun.create` +// with PrismaClientValidationError. Accept the intent and stringify here. +const ConcurrencyKeySchema = z + .union([z.string(), z.number()]) + .transform((value) => String(value)); + export const TriggerTaskRequestBody = z.object({ payload: z.any(), context: z.any(), @@ -195,7 +203,7 @@ export const TriggerTaskRequestBody = z.object({ concurrencyLimit: z.number().int().optional(), }) .optional(), - concurrencyKey: z.string().optional(), + concurrencyKey: ConcurrencyKeySchema.optional(), delay: z.string().or(z.coerce.date()).optional(), idempotencyKey: z .string() @@ -253,7 +261,7 @@ export const BatchTriggerTaskItem = z.object({ context: z.any(), options: z .object({ - concurrencyKey: z.string().optional(), + concurrencyKey: ConcurrencyKeySchema.optional(), delay: z.string().or(z.coerce.date()).optional(), idempotencyKey: z .string() @@ -401,7 +409,12 @@ export type CreateBatchResponse = z.infer; /** * Phase 2: Individual item in the NDJSON stream - * Each line in the NDJSON body should match this schema + * Each line in the NDJSON body should match this schema. 
+ * + * `options` reuses the strict shape from BatchTriggerTaskItem so that the + * Phase-2 streaming path validates option fields identically to the V2/V3 + * batch trigger endpoints — historically this used z.record(z.unknown()) and + * let invalid values (e.g. numeric concurrencyKey) reach Prisma. */ export const BatchItemNDJSON = z.object({ /** Zero-based index of this item (used for idempotency and ordering) */ @@ -411,7 +424,7 @@ export const BatchItemNDJSON = z.object({ /** The payload for this task run */ payload: z.unknown().optional(), /** Options for this specific item */ - options: z.record(z.unknown()).optional(), + options: BatchTriggerTaskItem.shape.options, }); export type BatchItemNDJSON = z.infer; diff --git a/packages/core/src/v3/schemas/batchItemNDJSON.test.ts b/packages/core/src/v3/schemas/batchItemNDJSON.test.ts new file mode 100644 index 00000000000..f130bba4450 --- /dev/null +++ b/packages/core/src/v3/schemas/batchItemNDJSON.test.ts @@ -0,0 +1,88 @@ +import { describe, it, expect } from "vitest"; +import { BatchItemNDJSON, BatchTriggerTaskItem, TriggerTaskRequestBody } from "./api.js"; + +describe("concurrencyKey coercion", () => { + // Phase-2 NDJSON used to accept arbitrary shapes for `options`, so a numeric + // concurrencyKey (a common foot-gun when callers pass + // `concurrencyKey: payload.userId`) reached Prisma untouched and failed + // there with PrismaClientValidationError. The schema now coerces + // number → string at the API boundary across every trigger path. 
+ describe("BatchItemNDJSON", () => { + it("coerces a numeric concurrencyKey to a string", () => { + const result = BatchItemNDJSON.safeParse({ + index: 0, + task: "user-workflow-tick", + payload: { json: { userId: 51262 } }, + options: { concurrencyKey: 51262 }, + }); + + expect(result.success).toBe(true); + if (result.success) { + expect(result.data.options?.concurrencyKey).toBe("51262"); + } + }); + + it("accepts a string concurrencyKey unchanged", () => { + const result = BatchItemNDJSON.safeParse({ + index: 0, + task: "user-workflow-tick", + payload: { json: { userId: 51262 } }, + options: { concurrencyKey: "user-51262" }, + }); + + expect(result.success).toBe(true); + if (result.success) { + expect(result.data.options?.concurrencyKey).toBe("user-51262"); + } + }); + + it("accepts an item with no options", () => { + const result = BatchItemNDJSON.safeParse({ + index: 0, + task: "user-workflow-tick", + payload: { json: { userId: 51262 } }, + }); + + expect(result.success).toBe(true); + }); + + it("rejects a non-numeric, non-string concurrencyKey", () => { + const result = BatchItemNDJSON.safeParse({ + index: 0, + task: "user-workflow-tick", + options: { concurrencyKey: { nested: "object" } }, + }); + + expect(result.success).toBe(false); + }); + }); + + describe("BatchTriggerTaskItem", () => { + it("coerces a numeric concurrencyKey to a string", () => { + const result = BatchTriggerTaskItem.safeParse({ + task: "user-workflow-tick", + payload: { userId: 51262 }, + options: { concurrencyKey: 51262 }, + }); + + expect(result.success).toBe(true); + if (result.success) { + expect(result.data.options?.concurrencyKey).toBe("51262"); + } + }); + }); + + describe("TriggerTaskRequestBody", () => { + it("coerces a numeric concurrencyKey to a string", () => { + const result = TriggerTaskRequestBody.safeParse({ + payload: { userId: 51262 }, + options: { concurrencyKey: 51262 }, + }); + + expect(result.success).toBe(true); + if (result.success) { + 
expect(result.data.options?.concurrencyKey).toBe("51262"); + } + }); + }); +}); diff --git a/packages/redis-worker/src/mollifier/buffer.test.ts b/packages/redis-worker/src/mollifier/buffer.test.ts index c8f7b95c97a..b47e41589e3 100644 --- a/packages/redis-worker/src/mollifier/buffer.test.ts +++ b/packages/redis-worker/src/mollifier/buffer.test.ts @@ -2,7 +2,52 @@ import { describe, expect, it } from "vitest"; import { BufferEntrySchema, serialiseSnapshot, deserialiseSnapshot } from "./schemas.js"; import { redisTest } from "@internal/testcontainers"; import { Logger } from "@trigger.dev/core/logger"; -import { MollifierBuffer } from "./buffer.js"; +import { + MollifierBuffer, + idempotencyLookupKeyFor, + makeIdempotencyClaimKey, + mollifierReconnectDelayMs, +} from "./buffer.js"; + +describe("mollifierReconnectDelayMs", () => { + it("grows linearly with the attempt count and caps the base at 1s", () => { + // random=()=>1 yields the top of the equal-jitter band (== base). + const top = (times: number) => mollifierReconnectDelayMs(times, () => 1); + expect(top(1)).toBe(50); + expect(top(4)).toBe(200); + expect(top(20)).toBe(1000); + // Past the cap the base stays at 1000. + expect(top(100)).toBe(1000); + }); + + it("applies equal jitter: result is uniform in [base/2, base]", () => { + // base for times=10 is 500, so the band is [250, 500]. 
+ expect(mollifierReconnectDelayMs(10, () => 0)).toBe(250); // floor of band + expect(mollifierReconnectDelayMs(10, () => 0.999999)).toBe(500); // top of band + const mid = mollifierReconnectDelayMs(10, () => 0.5); + expect(mid).toBeGreaterThanOrEqual(250); + expect(mid).toBeLessThanOrEqual(500); + }); + + it("never exceeds the original fixed-schedule envelope (strictly an improvement)", () => { + for (const times of [1, 2, 5, 10, 20, 50]) { + const cap = Math.min(times * 50, 1000); + for (const r of [0, 0.25, 0.5, 0.75, 0.999999]) { + const delay = mollifierReconnectDelayMs(times, () => r); + expect(delay).toBeLessThanOrEqual(cap); + expect(delay).toBeGreaterThanOrEqual(Math.floor(cap / 2)); + } + } + }); + + it("decorrelates concurrent reconnects (distinct values across random draws)", () => { + const draws = [0.05, 0.3, 0.55, 0.8, 0.95].map((r) => + mollifierReconnectDelayMs(20, () => r), + ); + // Lockstep would collapse to a single value; jitter spreads them. + expect(new Set(draws).size).toBeGreaterThan(1); + }); +}); describe("schemas", () => { it("serialiseSnapshot then deserialiseSnapshot is identity for plain objects", () => { @@ -20,12 +65,32 @@ describe("schemas", () => { status: "QUEUED", attempts: "0", createdAt: "2026-05-11T10:00:00.000Z", + createdAtMicros: "1747044000000000", }; const parsed = BufferEntrySchema.parse(raw); expect(parsed.runId).toBe("run_abc"); expect(parsed.status).toBe("QUEUED"); expect(parsed.attempts).toBe(0); expect(parsed.createdAt).toBeInstanceOf(Date); + expect(parsed.createdAtMicros).toBe(1747044000000000); + }); + + it("BufferEntrySchema defaults createdAtMicros for entries written before the field existed", () => { + // Backward compat: an entry written by an accept Lua predating + // createdAtMicros (only the original 7 fields) must still parse on + // pop rather than being silently dropped. 
+ const raw = { + runId: "run_old", + envId: "env_1", + orgId: "org_1", + payload: serialiseSnapshot({}), + status: "QUEUED", + attempts: "0", + createdAt: "2026-05-11T10:00:00.000Z", + // no createdAtMicros + }; + const parsed = BufferEntrySchema.parse(raw); + expect(parsed.createdAtMicros).toBe(0); }); it("BufferEntrySchema parses a FAILED entry with lastError", () => { @@ -37,6 +102,7 @@ describe("schemas", () => { status: "FAILED", attempts: "3", createdAt: "2026-05-11T10:00:00.000Z", + createdAtMicros: "1747044000000000", lastError: JSON.stringify({ code: "P2024", message: "connection lost" }), }; const parsed = BufferEntrySchema.parse(raw); @@ -52,7 +118,6 @@ describe("MollifierBuffer construction", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -68,7 +133,6 @@ describe("MollifierBuffer.accept", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -105,7 +169,6 @@ describe("MollifierBuffer.pop", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -132,7 +195,6 @@ describe("MollifierBuffer.pop", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -151,7 +213,6 @@ describe("MollifierBuffer.pop", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -169,24 +230,56 @@ describe("MollifierBuffer.pop", () => { }); describe("MollifierBuffer.ack", () => { - redisTest("ack deletes the entry", { timeout: 20_000 }, async ({ redisContainer }) => { + redisTest( + "ack marks entry materialised and applies the grace TTL — entry persists as a read-fallback safety net", + { timeout: 20_000 }, + 
async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + await buffer.accept({ runId: "run_x", envId: "env_a", orgId: "org_1", payload: "{}" }); + await buffer.pop("env_a"); + await buffer.ack("run_x"); + + const after = await buffer.getEntry("run_x"); + expect(after).not.toBeNull(); + expect(after!.materialised).toBe(true); + + // ack grace TTL is the only context where an entry hash gets + // an EXPIRE — accept no longer sets one. Should be at most 30s. + const ttl = await buffer.getEntryTtlSeconds("run_x"); + expect(ttl).toBeGreaterThan(0); + expect(ttl).toBeLessThanOrEqual(30); + } finally { + await buffer.close(); + } + }, + ); + + redisTest("ack on missing entry is a no-op", { timeout: 20_000 }, async ({ redisContainer }) => { const buffer = new MollifierBuffer({ redisOptions: { host: redisContainer.getHost(), port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); try { - await buffer.accept({ runId: "run_x", envId: "env_a", orgId: "org_1", payload: "{}" }); - await buffer.pop("env_a"); - await buffer.ack("run_x"); - - const after = await buffer.getEntry("run_x"); - expect(after).toBeNull(); + await buffer.ack("run_ghost"); + const stored = await buffer.getEntry("run_ghost"); + expect(stored).toBeNull(); + // Critical: no partial hash created. 
+ const raw = await buffer["redis"].hgetall("mollifier:entries:run_ghost"); + expect(Object.keys(raw)).toHaveLength(0); } finally { await buffer.close(); } @@ -204,13 +297,12 @@ describe("MollifierBuffer.pop orphan handling", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); try { - // Simulate a TTL-expired orphan: queue ref exists, entry hash does not. - await buffer["redis"].lpush("mollifier:queue:env_a", "run_orphan"); + // Simulate an evicted orphan: queue ref exists, entry hash does not. + await buffer["redis"].rpush("mollifier:queue:env_a", "run_orphan"); const popped = await buffer.pop("env_a"); expect(popped).toBeNull(); @@ -238,17 +330,17 @@ describe("MollifierBuffer.pop orphan handling", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); try { - // Layout (oldest-first, since RPOP takes from tail): orphan, valid, orphan. - // LPUSH puts items at the head, so to get RPOP order [orphan_a, valid, orphan_b] - // we LPUSH in reverse: orphan_b first, then valid, then orphan_a. - await buffer["redis"].lpush("mollifier:queue:env_a", "orphan_b"); + // Build the queue so RPOP (tail-first) yields: orphan_a, valid, + // orphan_b. accept LPUSHes "valid"; RPUSH puts orphan_a at the + // tail (popped first), LPUSH puts orphan_b at the head (popped + // last). First pop skips orphan_a, returns valid; orphan_b remains. 
await buffer.accept({ runId: "valid", envId: "env_a", orgId: "org_1", payload: "{}" }); - await buffer["redis"].lpush("mollifier:queue:env_a", "orphan_a"); + await buffer["redis"].rpush("mollifier:queue:env_a", "orphan_a"); + await buffer["redis"].lpush("mollifier:queue:env_a", "orphan_b"); const popped = await buffer.pop("env_a"); expect(popped).not.toBeNull(); @@ -283,7 +375,6 @@ describe("MollifierBuffer.requeue", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -305,30 +396,43 @@ describe("MollifierBuffer.requeue", () => { }); describe("MollifierBuffer.fail", () => { - redisTest("fail transitions to FAILED and stores lastError", { timeout: 20_000 }, async ({ redisContainer }) => { - const buffer = new MollifierBuffer({ - redisOptions: { - host: redisContainer.getHost(), - port: redisContainer.getPort(), - password: redisContainer.getPassword(), - }, - entryTtlSeconds: 600, - logger: new Logger("test", "log"), - }); + redisTest( + "fail returns true and tears the entry down (drainer-terminal cleanup)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // Post-TTL-drop design: the drainer's createFailedTaskRun has + // already written a SYSTEM_FAILURE PG row by the time we call + // fail(), so the entry hash is no longer load-bearing. fail + // returns true and removes the entry; without this teardown + // failed entries would accrete forever now that there's no + // accept-time TTL. The Lua also DELs the idempotency lookup so + // future retries with the same key go through to PG instead of + // hitting an orphan dedup record. 
+ const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); - try { - await buffer.accept({ runId: "run_f", envId: "env_a", orgId: "org_1", payload: "{}" }); - await buffer.pop("env_a"); - const failed = await buffer.fail("run_f", { code: "VALIDATION", message: "boom" }); - expect(failed).toBe(true); + try { + await buffer.accept({ runId: "run_f", envId: "env_a", orgId: "org_1", payload: "{}" }); + await buffer.pop("env_a"); + const failed = await buffer.fail("run_f", { code: "VALIDATION", message: "boom" }); + expect(failed).toBe(true); - const entry = await buffer.getEntry("run_f"); - expect(entry!.status).toBe("FAILED"); - expect(entry!.lastError).toEqual({ code: "VALIDATION", message: "boom" }); - } finally { - await buffer.close(); - } - }); + // Entry hash is gone post-fail. + const entry = await buffer.getEntry("run_f"); + expect(entry).toBeNull(); + const raw = await buffer["redis"].hgetall("mollifier:entries:run_f"); + expect(Object.keys(raw)).toHaveLength(0); + } finally { + await buffer.close(); + } + }, + ); redisTest( "fail on missing entry is a no-op (returns false; no partial hash created)", @@ -340,7 +444,6 @@ describe("MollifierBuffer.fail", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -358,30 +461,94 @@ describe("MollifierBuffer.fail", () => { } }, ); + + redisTest( + "fail DELs the idempotency lookup so a same-key retry goes through to PG", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // Symmetric with the ack path: the failMollifierEntry Lua reads the + // idempotencyLookupKey off the hash and DELs it. Without this, a + // post-fail retry with the same idempotency key would hit the + // orphaned dedup record and resolve to a run that no longer exists. 
+ const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + await buffer.accept({ + runId: "run_fk", + envId: "env_a", + orgId: "org_1", + payload: "{}", + idempotencyKey: "kf", + taskIdentifier: "t", + }); + const lookupKey = idempotencyLookupKeyFor({ + envId: "env_a", + taskIdentifier: "t", + idempotencyKey: "kf", + }); + // Lookup exists before fail. + expect(await buffer["redis"].get(lookupKey)).toBe("run_fk"); + + await buffer.pop("env_a"); + const failed = await buffer.fail("run_fk", { code: "VALIDATION", message: "boom" }); + expect(failed).toBe(true); + + // Lookup is cleared, so the slot is reclaimable: a fresh accept + // with the same tuple succeeds rather than deduping. + expect(await buffer["redis"].get(lookupKey)).toBeNull(); + const reaccept = await buffer.accept({ + runId: "run_fk2", + envId: "env_a", + orgId: "org_1", + payload: "{}", + idempotencyKey: "kf", + taskIdentifier: "t", + }); + expect(reaccept).toEqual({ kind: "accepted" }); + } finally { + await buffer.close(); + } + }, + ); }); describe("MollifierBuffer TTL", () => { - redisTest("entry has TTL applied on accept", { timeout: 20_000 }, async ({ redisContainer }) => { - const buffer = new MollifierBuffer({ - redisOptions: { - host: redisContainer.getHost(), - port: redisContainer.getPort(), - password: redisContainer.getPassword(), - }, - entryTtlSeconds: 600, - logger: new Logger("test", "log"), - }); + redisTest( + "entry has NO TTL applied on accept — drainer is the only cleanup path", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // Regression guard for the design change: buffer entries must + // persist until the drainer ACKs or FAILs them. 
An accept-time + // EXPIRE would re-introduce the silent-loss-when-drainer-offline + // failure mode that the stale-entry alerting pipeline depends on + // *not* happening. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); - try { - await buffer.accept({ runId: "run_t", envId: "env_a", orgId: "org_1", payload: "{}" }); + try { + await buffer.accept({ runId: "run_t", envId: "env_a", orgId: "org_1", payload: "{}" }); - const ttl = await buffer.getEntryTtlSeconds("run_t"); - expect(ttl).toBeGreaterThan(0); - expect(ttl).toBeLessThanOrEqual(600); - } finally { - await buffer.close(); - } - }); + // Redis returns -1 when the key exists but has no TTL set. + const ttl = await buffer.getEntryTtlSeconds("run_t"); + expect(ttl).toBe(-1); + } finally { + await buffer.close(); + } + }, + ); }); describe("MollifierBuffer payload encoding", () => { @@ -395,7 +562,6 @@ describe("MollifierBuffer payload encoding", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -437,7 +603,6 @@ describe("MollifierBuffer.requeue on missing entry", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -458,16 +623,22 @@ describe("MollifierBuffer.requeue on missing entry", () => { describe("MollifierBuffer.requeue ordering", () => { redisTest( - "requeued entry is popped AFTER other queued entries on the same env (FIFO retry)", + "requeued entry gets retry priority (RPUSH to the RPOP/tail end), popping ahead of newer items", { timeout: 20_000 }, async ({ redisContainer }) => { + // LIST: accept LPUSHes at the head, pop RPOPs from the tail, so the + // first-accepted entry pops first. 
requeue RPUSHes back to the tail, + // giving a transiently failed entry *retry priority* — it pops next, + // ahead of newer queued items, rather than going to the back. (This + // is deliberately not FIFO relative to the rest of the queue.) + // `maxAttempts` in the drainer bounds the retry loop for a + // persistently failing entry (after which it goes to `fail`, not requeue). const buffer = new MollifierBuffer({ redisOptions: { host: redisContainer.getHost(), port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -481,12 +652,13 @@ describe("MollifierBuffer.requeue ordering", () => { await buffer.requeue("a"); + // a was RPUSHed back to the tail → pops next, ahead of b and c. const next = await buffer.pop("env_a"); - expect(next!.runId).toBe("b"); + expect(next!.runId).toBe("a"); const after = await buffer.pop("env_a"); - expect(after!.runId).toBe("c"); + expect(after!.runId).toBe("b"); const last = await buffer.pop("env_a"); - expect(last!.runId).toBe("a"); + expect(last!.runId).toBe("c"); } finally { await buffer.close(); } @@ -508,7 +680,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -530,7 +701,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -557,7 +727,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -585,7 +754,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -610,7 +778,6 @@ 
describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -638,7 +805,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -671,7 +837,6 @@ describe("MollifierBuffer.evaluateTrip", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -707,22 +872,21 @@ describe("MollifierBuffer entry lifecycle invariants", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); try { await buffer.accept({ runId: "run_ttl", envId: "env_a", orgId: "org_1", payload: "{}" }); const beforeTtl = await buffer.getEntryTtlSeconds("run_ttl"); - expect(beforeTtl).toBeGreaterThan(0); + expect(beforeTtl).toBe(-1); await buffer.pop("env_a"); const afterTtl = await buffer.getEntryTtlSeconds("run_ttl"); - // TTL must still be present (>0). Redis returns -1 if the key has no - // TTL — that's the leak shape we're guarding against. - expect(afterTtl).toBeGreaterThan(0); - expect(afterTtl).toBeLessThanOrEqual(beforeTtl); + // No TTL applied at any point during accept/pop — the entry + // persists until the drainer ACKs or FAILs. Returning -1 from + // Redis here is the expected steady state, not a leak. 
+ expect(afterTtl).toBe(-1); } finally { await buffer.close(); } @@ -739,7 +903,6 @@ describe("MollifierBuffer entry lifecycle invariants", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -795,7 +958,6 @@ describe("MollifierBuffer.accept idempotency", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -813,8 +975,8 @@ describe("MollifierBuffer.accept idempotency", () => { payload: serialiseSnapshot({ first: false }), }); - expect(first).toBe(true); - expect(second).toBe(false); + expect(first).toEqual({ kind: "accepted" }); + expect(second).toEqual({ kind: "duplicate_run_id" }); // First payload preserved; second was a no-op. const stored = await buffer.getEntry("run_dup"); @@ -844,7 +1006,6 @@ describe("MollifierBuffer.accept idempotency", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -855,7 +1016,7 @@ describe("MollifierBuffer.accept idempotency", () => { expect(stored!.status).toBe("DRAINING"); const dup = await buffer.accept({ runId: "run_dr", envId: "env_a", orgId: "org_1", payload: "{}" }); - expect(dup).toBe(false); + expect(dup).toEqual({ kind: "duplicate_run_id" }); const afterDup = await buffer.getEntry("run_dr"); expect(afterDup!.status).toBe("DRAINING"); // unchanged @@ -866,16 +1027,21 @@ describe("MollifierBuffer.accept idempotency", () => { ); redisTest( - "accept refused while existing entry is FAILED", + "runId slot is reclaimable after fail tears the entry down", { timeout: 20_000 }, async ({ redisContainer }) => { + // Post-TTL-drop design: fail() deletes the entry hash because + // the SYSTEM_FAILURE PG row is the canonical record of the + // failure. 
The runId slot is therefore free for a fresh accept + // afterwards — runIds are server-generated CUIDs and don't + // collide in practice, but the contract pinning here documents + // that a re-acceptance does NOT see a phantom "FAILED" entry. const buffer = new MollifierBuffer({ redisOptions: { host: redisContainer.getHost(), port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -883,15 +1049,20 @@ describe("MollifierBuffer.accept idempotency", () => { await buffer.accept({ runId: "run_fl", envId: "env_a", orgId: "org_1", payload: "{}" }); await buffer.pop("env_a"); await buffer.fail("run_fl", { code: "VALIDATION", message: "boom" }); - const stored = await buffer.getEntry("run_fl"); - expect(stored!.status).toBe("FAILED"); - const dup = await buffer.accept({ runId: "run_fl", envId: "env_a", orgId: "org_1", payload: "{}" }); - expect(dup).toBe(false); + // Entry hash gone after fail (see "fail returns true and tears + // the entry down" — this test pins the accept-side effect). 
+ expect(await buffer.getEntry("run_fl")).toBeNull(); - const afterDup = await buffer.getEntry("run_fl"); - expect(afterDup!.status).toBe("FAILED"); // unchanged - expect(afterDup!.lastError).toEqual({ code: "VALIDATION", message: "boom" }); + const fresh = await buffer.accept({ + runId: "run_fl", + envId: "env_a", + orgId: "org_1", + payload: '{"fresh":true}', + }); + expect(fresh).toEqual({ kind: "accepted" }); + const after = await buffer.getEntry("run_fl"); + expect(after?.status).toBe("QUEUED"); } finally { await buffer.close(); } @@ -899,16 +1070,21 @@ describe("MollifierBuffer.accept idempotency", () => { ); redisTest( - "re-accept after ack works (terminal entry can be re-accepted)", + "accept refused while a previously-acked (materialised) entry is still inside its grace TTL", { timeout: 20_000 }, async ({ redisContainer }) => { + // After ack, the entry hash persists for the grace window as a + // read-fallback safety net. RunIds are server-generated and + // never collide in practice, but defense-in-depth: accept refuses + // while *any* entry exists for the runId, including materialised + // ones. The entry hash's TTL is now ~30s instead of the original + // entryTtlSeconds. const buffer = new MollifierBuffer({ redisOptions: { host: redisContainer.getHost(), port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -922,7 +1098,6 @@ describe("MollifierBuffer.accept idempotency", () => { await buffer.pop("env_a"); await buffer.ack("run_x"); - // Entry is gone — re-accept should succeed. 
const reAccept = await buffer.accept({ runId: "run_x", envId: "env_a", @@ -930,8 +1105,11 @@ describe("MollifierBuffer.accept idempotency", () => { payload: "{}", }); - expect(first).toBe(true); - expect(reAccept).toBe(true); + expect(first).toEqual({ kind: "accepted" }); + expect(reAccept).toEqual({ kind: "duplicate_run_id" }); + + const stored = await buffer.getEntry("run_x"); + expect(stored!.materialised).toBe(true); } finally { await buffer.close(); } @@ -950,7 +1128,6 @@ describe("MollifierBuffer envs set lifecycle", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -976,7 +1153,6 @@ describe("MollifierBuffer envs set lifecycle", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -1006,7 +1182,6 @@ describe("MollifierBuffer envs set lifecycle", () => { port: redisContainer.getPort(), password: redisContainer.getPassword(), }, - entryTtlSeconds: 600, logger: new Logger("test", "log"), }); @@ -1025,3 +1200,1527 @@ describe("MollifierBuffer envs set lifecycle", () => { }, ); }); + +describe("MollifierBuffer idempotency lookup", () => { + redisTest( + "accept with idempotencyKey + taskIdentifier writes the lookup with no TTL", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // Post-TTL-drop design: the idempotency lookup has no TTL, so it + // can never expire ahead of the entry hash (which used to cause + // a dedup-drift bug — once the lookup expired but the entry + // didn't, a retry with the same key would create a *new* + // buffered run for the same key). The drainer's ack and fail + // both DEL the lookup as part of teardown. 
+ const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const result = await buffer.accept({ + runId: "ri1", + envId: "env_i", + orgId: "org_1", + payload: "{}", + idempotencyKey: "ikey-1", + taskIdentifier: "my-task", + }); + expect(result).toEqual({ kind: "accepted" }); + + const lookupKey = idempotencyLookupKeyFor({ + envId: "env_i", + taskIdentifier: "my-task", + idempotencyKey: "ikey-1", + }); + const stored = await buffer["redis"].get(lookupKey); + expect(stored).toBe("ri1"); + // -1 = key exists with no TTL set. + expect(await buffer["redis"].ttl(lookupKey)).toBe(-1); + + const entry = await buffer.getEntry("ri1"); + expect(entry!.idempotencyLookupKey).toBe(lookupKey); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "second accept with same (env, task, idempotencyKey) returns duplicate_idempotency with the winner's runId", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const first = await buffer.accept({ + runId: "ri-a", + envId: "env_i", + orgId: "org_1", + payload: "{}", + idempotencyKey: "ikey-2", + taskIdentifier: "my-task", + }); + const second = await buffer.accept({ + runId: "ri-b", + envId: "env_i", + orgId: "org_1", + payload: "{}", + idempotencyKey: "ikey-2", + taskIdentifier: "my-task", + }); + + expect(first).toEqual({ kind: "accepted" }); + expect(second).toEqual({ + kind: "duplicate_idempotency", + existingRunId: "ri-a", + }); + + // The loser's runId entry was never created. 
+ const loserEntry = await buffer.getEntry("ri-b"); + expect(loserEntry).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "lookupIdempotency hits when the run is buffered", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rl1", + envId: "env_i", + orgId: "org_1", + payload: "{}", + idempotencyKey: "k1", + taskIdentifier: "t", + }); + const found = await buffer.lookupIdempotency({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "k1", + }); + expect(found).toBe("rl1"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "lookupIdempotency returns null when no lookup is bound", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const found = await buffer.lookupIdempotency({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "absent", + }); + expect(found).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "lookupIdempotency self-heals when the lookup points at an expired entry", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + // Plant a stale lookup pointing at a non-existent entry. 
+ const lookupKey = idempotencyLookupKeyFor({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "stale", + }); + await buffer["redis"].set(lookupKey, "rl-stale", "EX", 600); + expect(await buffer["redis"].get(lookupKey)).toBe("rl-stale"); + + const found = await buffer.lookupIdempotency({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "stale", + }); + expect(found).toBeNull(); + // Self-healed. + expect(await buffer["redis"].get(lookupKey)).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "ack DELs the idempotency lookup along with marking materialised", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "ra1", + envId: "env_i", + orgId: "org_1", + payload: "{}", + idempotencyKey: "ka", + taskIdentifier: "t", + }); + await buffer.pop("env_i"); + await buffer.ack("ra1"); + + const lookupKey = idempotencyLookupKeyFor({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "ka", + }); + expect(await buffer["redis"].get(lookupKey)).toBeNull(); + const entry = await buffer.getEntry("ra1"); + expect(entry!.materialised).toBe(true); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "resetIdempotency clears snapshot fields + lookup; returns the runId", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rr1", + envId: "env_i", + orgId: "org_1", + payload: serialiseSnapshot({ + idempotencyKey: "kr", + idempotencyKeyExpiresAt: "2026-12-01T00:00:00Z", + other: "field", + }), + 
idempotencyKey: "kr", + taskIdentifier: "t", + }); + + const result = await buffer.resetIdempotency({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "kr", + }); + expect(result.clearedRunId).toBe("rr1"); + + // Lookup is gone. + const lookupKey = idempotencyLookupKeyFor({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "kr", + }); + expect(await buffer["redis"].get(lookupKey)).toBeNull(); + + // Snapshot's idempotency fields are nulled, other fields kept. + const entry = await buffer.getEntry("rr1"); + const payload = JSON.parse(entry!.payload) as { + idempotencyKey: unknown; + idempotencyKeyExpiresAt: unknown; + other: string; + }; + expect(payload.idempotencyKey).toBeNull(); + expect(payload.idempotencyKeyExpiresAt).toBeNull(); + expect(payload.other).toBe("field"); + expect(entry!.idempotencyLookupKey).toBe(""); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "resetIdempotency returns null when nothing is bound", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const result = await buffer.resetIdempotency({ + envId: "env_i", + taskIdentifier: "t", + idempotencyKey: "absent", + }); + expect(result.clearedRunId).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "resetIdempotency also clears the pre-gate claim slot", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // The lookup and the cross-store claim are two pointers for the same + // key. Reset must reopen both — otherwise a resolved/pending claim + // keeps deduping new triggers for the rest of its TTL even though + // the binding was reset. 
+ const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + const tuple = { envId: "env_rc", taskIdentifier: "t", idempotencyKey: "krc" }; + try { + // A resolved claim is in place... + await buffer.claimIdempotency({ ...tuple, token: "owner", ttlSeconds: 600 }); + await buffer.publishClaim({ ...tuple, token: "owner", runId: "rc1", ttlSeconds: 600 }); + expect(await buffer.readClaim(tuple)).toEqual({ kind: "resolved", runId: "rc1" }); + // ...alongside a buffered run holding the lookup. + await buffer.accept({ + runId: "rc1", + envId: "env_rc", + orgId: "org_1", + payload: serialiseSnapshot({}), + idempotencyKey: "krc", + taskIdentifier: "t", + }); + + await buffer.resetIdempotency(tuple); + + // Both the lookup and the claim are gone. + expect(await buffer.lookupIdempotency(tuple)).toBeNull(); + expect(await buffer.readClaim(tuple)).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "accept self-heals a stale lookup: a new run rebinds when the bound entry was evicted", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // If an entry hash is evicted (maxmemory) but its idempotency lookup + // survives, a fresh accept with the same key must NOT return the dead + // runId (which would block the key forever) — it should rebind to the + // new run and accept it. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + const idem = { idempotencyKey: "kheal", taskIdentifier: "t" }; + try { + await buffer.accept({ runId: "heal_old", envId: "env_h", orgId: "org_1", payload: "{}", ...idem }); + // Simulate eviction of the entry hash while the lookup survives. 
+ await buffer["redis"].del("mollifier:entries:heal_old"); + const lookupKey = idempotencyLookupKeyFor({ envId: "env_h", ...idem }); + expect(await buffer["redis"].get(lookupKey)).toBe("heal_old"); + + // A fresh accept with the same key rebinds rather than deduping + // onto the dead run. + const result = await buffer.accept({ + runId: "heal_new", + envId: "env_h", + orgId: "org_1", + payload: "{}", + ...idem, + }); + expect(result).toEqual({ kind: "accepted" }); + expect(await buffer["redis"].get(lookupKey)).toBe("heal_new"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "accept still dedups when the bound entry is live", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // The self-heal must not weaken normal dedup: a live bound entry + // still wins, and the loser gets its runId back. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + const idem = { idempotencyKey: "klive", taskIdentifier: "t" }; + try { + await buffer.accept({ runId: "live_win", envId: "env_h", orgId: "org_1", payload: "{}", ...idem }); + const result = await buffer.accept({ + runId: "live_lose", + envId: "env_h", + orgId: "org_1", + payload: "{}", + ...idem, + }); + expect(result).toEqual({ kind: "duplicate_idempotency", existingRunId: "live_win" }); + } finally { + await buffer.close(); + } + }, + ); +}); + +describe("MollifierBuffer.casSetMetadata", () => { + redisTest( + "applies when expectedVersion matches; increments version; updates payload", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "cas1", + envId: "env_c", + orgId: 
"org_1", + payload: serialiseSnapshot({ metadata: '{"v":1}', metadataType: "application/json" }), + }); + const result = await buffer.casSetMetadata({ + runId: "cas1", + expectedVersion: 0, + newMetadata: '{"v":2}', + newMetadataType: "application/json", + }); + expect(result).toEqual({ kind: "applied", newVersion: 1 }); + + const entry = await buffer.getEntry("cas1"); + expect(entry!.metadataVersion).toBe(1); + const payload = JSON.parse(entry!.payload) as { metadata: string }; + expect(payload.metadata).toBe('{"v":2}'); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "returns version_conflict when expectedVersion is stale", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "cas2", + envId: "env_c", + orgId: "org_1", + payload: serialiseSnapshot({}), + }); + await buffer.casSetMetadata({ + runId: "cas2", + expectedVersion: 0, + newMetadata: '{"a":1}', + newMetadataType: "application/json", + }); + + // Second write with stale expectedVersion = 0 must conflict. 
+ const result = await buffer.casSetMetadata({ + runId: "cas2", + expectedVersion: 0, + newMetadata: '{"a":2}', + newMetadataType: "application/json", + }); + expect(result).toEqual({ kind: "version_conflict", currentVersion: 1 }); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "returns not_found / busy on missing or terminal entries", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const nf = await buffer.casSetMetadata({ + runId: "absent", + expectedVersion: 0, + newMetadata: "{}", + newMetadataType: "application/json", + }); + expect(nf).toEqual({ kind: "not_found" }); + + await buffer.accept({ + runId: "cas3", + envId: "env_c", + orgId: "org_1", + payload: serialiseSnapshot({}), + }); + await buffer.pop("env_c"); + const busy = await buffer.casSetMetadata({ + runId: "cas3", + expectedVersion: 0, + newMetadata: "{}", + newMetadataType: "application/json", + }); + expect(busy).toEqual({ kind: "busy" }); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "returns busy on a materialised entry (post-ack grace window)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // The guard rejects `materialised == 'true'` as well as non-QUEUED + // status. After ack the entry lingers QUEUED-but-materialised for + // the grace TTL; a CAS in that window must not mutate it. 
+ const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "cas_mat", + envId: "env_c", + orgId: "org_1", + payload: serialiseSnapshot({}), + }); + await buffer.pop("env_c"); + await buffer.ack("cas_mat"); + + const result = await buffer.casSetMetadata({ + runId: "cas_mat", + expectedVersion: 0, + newMetadata: "{}", + newMetadataType: "application/json", + }); + expect(result).toEqual({ kind: "busy" }); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "a mutateSnapshot set_metadata bumps metadataVersion so an in-flight CAS conflicts", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // CAS isolation: a reader fetches version N, then a concurrent + // mutateSnapshot(set_metadata) overwrites the metadata. The reader's + // CAS at expectedVersion=N must NOT silently win — both paths write + // payload.metadata, so set_metadata bumps the same counter the CAS + // is gated on. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "cas_int", + envId: "env_c", + orgId: "org_1", + payload: serialiseSnapshot({ metadata: '{"v":0}', metadataType: "application/json" }), + }); + // Reader observes version 0. + const before = await buffer.getEntry("cas_int"); + expect(before!.metadataVersion).toBe(0); + + // Concurrent snapshot mutation writes metadata + bumps version. 
+ const mutated = await buffer.mutateSnapshot("cas_int", { + type: "set_metadata", + metadata: '{"v":1}', + metadataType: "application/json", + }); + expect(mutated).toBe("applied_to_snapshot"); + const mid = await buffer.getEntry("cas_int"); + expect(mid!.metadataVersion).toBe(1); + + // The reader's stale CAS conflicts instead of clobbering. + const result = await buffer.casSetMetadata({ + runId: "cas_int", + expectedVersion: 0, + newMetadata: '{"v":2}', + newMetadataType: "application/json", + }); + expect(result).toEqual({ kind: "version_conflict", currentVersion: 1 }); + } finally { + await buffer.close(); + } + }, + ); +}); + +describe("MollifierBuffer.mutateSnapshot", () => { + redisTest( + "returns not_found when no entry exists for the runId", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const result = await buffer.mutateSnapshot("nope", { + type: "append_tags", + tags: ["x"], + }); + expect(result).toBe("not_found"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "append_tags on QUEUED entry appends and dedupes", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "r1", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ tags: ["existing"] }), + }); + const first = await buffer.mutateSnapshot("r1", { + type: "append_tags", + tags: ["existing", "new"], + }); + expect(first).toBe("applied_to_snapshot"); + + const entry = await buffer.getEntry("r1"); + const payload = JSON.parse(entry!.payload) as { tags: string[] }; + 
expect(payload.tags).toEqual(["existing", "new"]); + + // Second mutation appends without duplicating + const second = await buffer.mutateSnapshot("r1", { + type: "append_tags", + tags: ["new", "third"], + }); + expect(second).toBe("applied_to_snapshot"); + const e2 = await buffer.getEntry("r1"); + const p2 = JSON.parse(e2!.payload) as { tags: string[] }; + expect(p2.tags).toEqual(["existing", "new", "third"]); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "append_tags creates payload.tags when absent", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "r2", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ taskId: "t" }), + }); + const result = await buffer.mutateSnapshot("r2", { + type: "append_tags", + tags: ["a", "b"], + }); + expect(result).toBe("applied_to_snapshot"); + const entry = await buffer.getEntry("r2"); + const payload = JSON.parse(entry!.payload) as { tags: string[] }; + expect(payload.tags).toEqual(["a", "b"]); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "append_tags rejects with limit_exceeded when maxTags would be exceeded, writing nothing", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "r_cap", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ tags: ["a", "b"] }), + }); + + // 2 existing + 2 new = 4 deduped > cap of 3 → rejected, nothing written. 
+ const rejected = await buffer.mutateSnapshot("r_cap", { + type: "append_tags", + tags: ["c", "d"], + maxTags: 3, + }); + expect(rejected).toBe("limit_exceeded"); + const afterReject = await buffer.getEntry("r_cap"); + const rejPayload = JSON.parse(afterReject!.payload) as { tags: string[] }; + expect(rejPayload.tags).toEqual(["a", "b"]); + + // Dedup keeps the count under the cap → applied. + const applied = await buffer.mutateSnapshot("r_cap", { + type: "append_tags", + tags: ["a", "c"], + maxTags: 3, + }); + expect(applied).toBe("applied_to_snapshot"); + const afterApply = await buffer.getEntry("r_cap"); + const appPayload = JSON.parse(afterApply!.payload) as { tags: string[] }; + expect(appPayload.tags).toEqual(["a", "b", "c"]); + + // Landing exactly on the cap is allowed. + const exact = await buffer.mutateSnapshot("r_cap", { + type: "append_tags", + tags: ["a", "b", "c"], + maxTags: 3, + }); + expect(exact).toBe("applied_to_snapshot"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "set_metadata replaces metadata + metadataType (last-write-wins)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "r3", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ metadata: '{"v":1}', metadataType: "application/json" }), + }); + const result = await buffer.mutateSnapshot("r3", { + type: "set_metadata", + metadata: '{"v":2}', + metadataType: "application/json", + }); + expect(result).toBe("applied_to_snapshot"); + const entry = await buffer.getEntry("r3"); + const payload = JSON.parse(entry!.payload) as { + metadata: string; + metadataType: string; + }; + expect(payload.metadata).toBe('{"v":2}'); + expect(payload.metadataType).toBe("application/json"); + } finally { + await 
buffer.close(); + } + }, + ); + + redisTest( + "set_delay sets payload.delayUntil", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "r4", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ taskId: "t" }), + }); + const result = await buffer.mutateSnapshot("r4", { + type: "set_delay", + delayUntil: "2026-06-01T00:00:00.000Z", + }); + expect(result).toBe("applied_to_snapshot"); + const entry = await buffer.getEntry("r4"); + const payload = JSON.parse(entry!.payload) as { delayUntil: string }; + expect(payload.delayUntil).toBe("2026-06-01T00:00:00.000Z"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "mark_cancelled stamps cancelledAt + cancelReason", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "r5", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ taskId: "t" }), + }); + const result = await buffer.mutateSnapshot("r5", { + type: "mark_cancelled", + cancelledAt: "2026-05-19T12:00:00.000Z", + cancelReason: "user-initiated", + }); + expect(result).toBe("applied_to_snapshot"); + const entry = await buffer.getEntry("r5"); + const payload = JSON.parse(entry!.payload) as { + cancelledAt: string; + cancelReason: string; + }; + expect(payload.cancelledAt).toBe("2026-05-19T12:00:00.000Z"); + expect(payload.cancelReason).toBe("user-initiated"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "returns busy when entry is DRAINING", + { timeout: 20_000 }, + async ({ redisContainer 
}) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rd", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ tags: [] }), + }); + await buffer.pop("env_m"); + const result = await buffer.mutateSnapshot("rd", { + type: "append_tags", + tags: ["x"], + }); + expect(result).toBe("busy"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "returns not_found when entry was FAILED (drainer-terminal teardown)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // Post-TTL-drop design: fail() DELs the entry hash because the + // drainer has already written the canonical SYSTEM_FAILURE PG + // row, and without an accept-time TTL we'd otherwise accrete + // failed entries in Redis forever. Late mutations against a + // failed run therefore see `not_found`, matching the same shape + // they'd get for any other already-cleaned-up runId. 
+ const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rf", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ tags: [] }), + }); + await buffer.pop("env_m"); + await buffer.fail("rf", { code: "X", message: "boom" }); + const result = await buffer.mutateSnapshot("rf", { + type: "append_tags", + tags: ["x"], + }); + expect(result).toBe("not_found"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "returns busy when entry is materialised (post-ack grace window)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rm", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ tags: [] }), + }); + await buffer.pop("env_m"); + await buffer.ack("rm"); + const result = await buffer.mutateSnapshot("rm", { + type: "append_tags", + tags: ["x"], + }); + expect(result).toBe("busy"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "Lua atomicity serialises concurrent mutations per-runId", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.accept({ + runId: "rcc", + envId: "env_m", + orgId: "org_1", + payload: serialiseSnapshot({ tags: [] }), + }); + + const tagsToAdd = Array.from({ length: 50 }, (_, i) => `t${i}`); + await Promise.all( + tagsToAdd.map((t) => buffer.mutateSnapshot("rcc", { type: "append_tags", tags: 
[t] })), + ); + + const entry = await buffer.getEntry("rcc"); + const payload = JSON.parse(entry!.payload) as { tags: string[] }; + expect(payload.tags.sort()).toEqual(tagsToAdd.sort()); + } finally { + await buffer.close(); + } + }, + ); +}); + +describe("MollifierBuffer LIST storage", () => { + redisTest( + "queue key is a LIST; createdAtMicros is a hash field, not a sort key", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + await buffer.accept({ runId: "z1", envId: "env_z", orgId: "org_1", payload: "{}" }); + + // LIST-only commands must succeed against the queue key. + const len = await buffer["redis"].llen("mollifier:queue:env_z"); + expect(len).toBe(1); + const members = await buffer["redis"].lrange("mollifier:queue:env_z", 0, -1); + expect(members).toEqual(["z1"]); + + // The queue holds no score — it's not a ZSET. + await expect(buffer["redis"].zscore("mollifier:queue:env_z", "z1")).rejects.toThrow(); + + // createdAtMicros lives on the entry hash (for dwell metrics) and + // is plausibly recent (within the last minute, as microseconds). 
+ const micros = Number(await buffer["redis"].hget("mollifier:entries:z1", "createdAtMicros")); + const nowMicros = Date.now() * 1000; + expect(micros).toBeGreaterThan(nowMicros - 60_000_000); + expect(micros).toBeLessThanOrEqual(nowMicros + 1_000_000); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "pop returns entries in FIFO insertion order (independent of member lex order)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + // Accept in reverse-lex order to prove ordering is by insertion + // (LPUSH head / RPOP tail), not by member value. + await buffer.accept({ runId: "zzz", envId: "env_o", orgId: "org_1", payload: "{}" }); + await buffer.accept({ runId: "mmm", envId: "env_o", orgId: "org_1", payload: "{}" }); + await buffer.accept({ runId: "aaa", envId: "env_o", orgId: "org_1", payload: "{}" }); + + const first = await buffer.pop("env_o"); + expect(first!.runId).toBe("zzz"); + const second = await buffer.pop("env_o"); + expect(second!.runId).toBe("mmm"); + const third = await buffer.pop("env_o"); + expect(third!.runId).toBe("aaa"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "requeue re-enqueues to the LIST; createdAt is immutable across retries", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + await buffer.accept({ runId: "rq", envId: "env_rq", orgId: "org_1", payload: "{}" }); + const originalMicros = await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros"); + + await buffer.pop("env_rq"); + // Queue is empty after the pop. 
+ expect(await buffer["redis"].llen("mollifier:queue:env_rq")).toBe(0); + + await buffer.requeue("rq"); + + // Back on the LIST, and createdAtMicros is unchanged. + expect(await buffer["redis"].lrange("mollifier:queue:env_rq", 0, -1)).toEqual(["rq"]); + const newMicros = await buffer["redis"].hget("mollifier:entries:rq", "createdAtMicros"); + expect(newMicros).toBe(originalMicros); + } finally { + await buffer.close(); + } + }, + ); +}); + +describe("MollifierBuffer.listEntriesForEnv", () => { + redisTest( + "returns up to maxCount entries from the queue without consuming them", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + await buffer.accept({ runId: "r1", envId: "env_a", orgId: "org_1", payload: "{}" }); + await buffer.accept({ runId: "r2", envId: "env_a", orgId: "org_1", payload: "{}" }); + await buffer.accept({ runId: "r3", envId: "env_a", orgId: "org_1", payload: "{}" }); + + const entries = await buffer.listEntriesForEnv("env_a", 2); + expect(entries).toHaveLength(2); + const runIds = entries.map((e) => e.runId); + expect(new Set(runIds).size).toBe(2); + for (const id of runIds) expect(["r1", "r2", "r3"]).toContain(id); + + // Non-destructive: the drainer can still pop all three. 
+ const popped: string[] = []; + for (let i = 0; i < 3; i++) { + const entry = await buffer.pop("env_a"); + if (entry) popped.push(entry.runId); + } + expect(new Set(popped)).toEqual(new Set(["r1", "r2", "r3"])); + } finally { + await buffer.close(); + } + }, + ); + + redisTest("returns empty array when env queue is empty", { timeout: 20_000 }, async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + expect(await buffer.listEntriesForEnv("env_empty", 10)).toEqual([]); + } finally { + await buffer.close(); + } + }); + + redisTest( + "skips entries whose hash was torn down between LRANGE and HGETALL (concurrent drainer ack/fail race)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // The drainer can RPOP + ack/fail an entry between our LRANGE and + // the per-runId HGETALL — its DEL of the entry hash races our read. + // listEntriesForEnv must tolerate this: skip the runId, return + // every other entry. This is exercised here by simulating the race: + // accept two entries, then DEL one entry hash while its runId stays queued. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + await buffer.accept({ runId: "r_a", envId: "env_race", orgId: "org_1", payload: "{}" }); + await buffer.accept({ runId: "r_b", envId: "env_race", orgId: "org_1", payload: "{}" }); + + // Tear down r_a's hash to simulate the drainer winning the race. + // The runId stays on the queue LIST but its entry hash is gone — + // listEntriesForEnv must tolerate the missing HGETALL result. 
+ await buffer["redis"].del("mollifier:entries:r_a"); + + const entries = await buffer.listEntriesForEnv("env_race", 10); + expect(entries.map((e) => e.runId).sort()).toEqual(["r_b"]); + } finally { + await buffer.close(); + } + }, + ); + + redisTest("maxCount <= 0 returns empty without hitting redis", { timeout: 20_000 }, async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + expect(await buffer.listEntriesForEnv("env_a", 0)).toEqual([]); + expect(await buffer.listEntriesForEnv("env_a", -5)).toEqual([]); + } finally { + await buffer.close(); + } + }); +}); + +// Composite-key safety. The Redis-key builders concatenate +// `(envId, taskIdentifier, idempotencyKey)` with `:` separators; without +// per-segment encoding, `taskIdentifier="a:b"` and `idempotencyKey="x"` +// would map to the same key as `taskIdentifier="a"` and +// `idempotencyKey="b:x"`. base64url encoding has no `:` in its alphabet, +// so the encoded keys are unique per tuple. 
+describe("MollifierBuffer composite-key encoding (collision resistance)", () => { + redisTest( + "two accepts whose unencoded keys would alias don't collide on the idempotency lookup", + { timeout: 30_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + // Aliased tuples under raw `:` concatenation: + // env_x : "a:b" : "x" → "mollifier:idempotency:env_x:a:b:x" + // env_x : "a" : "b:x" → "mollifier:idempotency:env_x:a:b:x" + const r1 = await buffer.accept({ + runId: "ck_run_1", + envId: "env_x", + orgId: "org_1", + payload: "{}", + taskIdentifier: "a:b", + idempotencyKey: "x", + }); + const r2 = await buffer.accept({ + runId: "ck_run_2", + envId: "env_x", + orgId: "org_1", + payload: "{}", + taskIdentifier: "a", + idempotencyKey: "b:x", + }); + // Both accepted — no false-positive collision. + expect(r1).toEqual({ kind: "accepted" }); + expect(r2).toEqual({ kind: "accepted" }); + + // Each tuple resolves to its own runId. + const hit1 = await buffer.lookupIdempotency({ + envId: "env_x", + taskIdentifier: "a:b", + idempotencyKey: "x", + }); + const hit2 = await buffer.lookupIdempotency({ + envId: "env_x", + taskIdentifier: "a", + idempotencyKey: "b:x", + }); + expect(hit1).toBe("ck_run_1"); + expect(hit2).toBe("ck_run_2"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "encoded lookup key contains no ':' separator beyond the namespace", + { timeout: 20_000 }, + async () => { + // Pure-function test — verifies the encoding bijection without + // needing a live buffer. Re-uses the redisTest fixture for + // parallelism with other describe blocks but doesn't touch redis. 
+ const key = idempotencyLookupKeyFor({ + envId: "env_x", + taskIdentifier: "a:b", + idempotencyKey: "x:y:z", + }); + // namespace prefix is exactly `mollifier:idempotency:` (two `:`), + // then three base64url segments separated by two more `:` — + // never the customer-supplied colons. + const colonCount = key.split(":").length - 1; + expect(colonCount).toBe(4); + // base64url alphabet has no `:`, `+`, `/`, or `=`. + const afterNamespace = key.slice("mollifier:idempotency:".length); + expect(afterNamespace).toMatch(/^[A-Za-z0-9_\-]+:[A-Za-z0-9_\-]+:[A-Za-z0-9_\-]+$/); + }, + ); +}); + +// Pre-gate claim ownership protection. The claim slot stores +// `"pending:<token>"` so publish and release compare-and-act on the +// caller's token — a late release from a previous claimant whose TTL +// expired cannot erase a new owner's claim. +describe("MollifierBuffer pre-gate claim — ownership token safety", () => { + const claimInput = { + envId: "env_c", + taskIdentifier: "task_c", + idempotencyKey: "key_c", + }; + + redisTest( + "claimIdempotency: first caller gets 'claimed', second concurrent caller gets 'pending'", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + const first = await buffer.claimIdempotency({ + ...claimInput, + token: "token-A", + ttlSeconds: 30, + }); + expect(first.kind).toBe("claimed"); + + // Second concurrent caller with a different token sees pending. + const second = await buffer.claimIdempotency({ + ...claimInput, + token: "token-B", + ttlSeconds: 30, + }); + expect(second.kind).toBe("pending"); + + // readClaim distinguishes pending from resolved without leaking + // the token to the loser. 
+ const read = await buffer.readClaim(claimInput); + expect(read?.kind).toBe("pending"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "releaseClaim with the wrong token is a no-op (compare-and-delete)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.claimIdempotency({ ...claimInput, token: "owner", ttlSeconds: 30 }); + + // Pretend a stale claimant fires a release with their old token. + await buffer.releaseClaim({ ...claimInput, token: "stale-impostor" }); + + // The owner's claim survives. + const stillThere = await buffer.readClaim(claimInput); + expect(stillThere?.kind).toBe("pending"); + + // The owner can still release. + await buffer.releaseClaim({ ...claimInput, token: "owner" }); + expect(await buffer.readClaim(claimInput)).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "publishClaim with the wrong token is a no-op and returns false", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + await buffer.claimIdempotency({ ...claimInput, token: "owner", ttlSeconds: 30 }); + + const wrongTokenPublish = await buffer.publishClaim({ + ...claimInput, + token: "stale-impostor", + runId: "imposter-run", + ttlSeconds: 60, + }); + expect(wrongTokenPublish).toBe(false); + + // Claim slot unchanged. 
+ const stillPending = await buffer.readClaim(claimInput); + expect(stillPending?.kind).toBe("pending"); + + const goodPublish = await buffer.publishClaim({ + ...claimInput, + token: "owner", + runId: "real-run", + ttlSeconds: 60, + }); + expect(goodPublish).toBe(true); + + const resolved = await buffer.readClaim(claimInput); + expect(resolved).toEqual({ kind: "resolved", runId: "real-run" }); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "regression: stale release after TTL expiry does NOT erase a fresh claim", + { timeout: 20_000 }, + async ({ redisContainer }) => { + // Hazard from CodeRabbit r3290070707: + // 1. Claimant A SETNXs the slot with their token, then stalls. + // 2. TTL expires, slot vanishes. + // 3. Claimant B SETNXs the slot with a DIFFERENT token. + // 4. Claimant A finally finishes (or errors) and calls + // releaseClaim with their original token. + // Without compare-and-delete, A's release would wipe B's slot and + // any concurrent customer of B's idempotency key would see "no + // claim" and re-issue, breaking same-key dedup. + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + try { + // Step 1: A claims with token "A". + const a = await buffer.claimIdempotency({ + ...claimInput, + token: "A", + ttlSeconds: 1, // short TTL to simulate expiry quickly + }); + expect(a.kind).toBe("claimed"); + + // Step 2: simulate TTL expiry — DEL the slot directly so the + // test doesn't rely on wall-clock sleeping. Targets the same key + // the buffer writes via the exported builder, so a key-format + // change can't silently make this DEL miss. + await buffer["redis"].del(makeIdempotencyClaimKey(claimInput)); + + // Step 3: B claims with token "B". 
+ const b = await buffer.claimIdempotency({ + ...claimInput, + token: "B", + ttlSeconds: 30, + }); + expect(b.kind).toBe("claimed"); + + // Step 4: A's late release. MUST be a no-op. + await buffer.releaseClaim({ ...claimInput, token: "A" }); + + // B's claim survives intact. + const after = await buffer.readClaim(claimInput); + expect(after?.kind).toBe("pending"); + + // B can still publish. + const published = await buffer.publishClaim({ + ...claimInput, + token: "B", + runId: "B-run", + ttlSeconds: 60, + }); + expect(published).toBe(true); + } finally { + await buffer.close(); + } + }, + ); +}); diff --git a/packages/redis-worker/src/mollifier/buffer.ts b/packages/redis-worker/src/mollifier/buffer.ts index f739e3ff362..71920bb4ff4 100644 --- a/packages/redis-worker/src/mollifier/buffer.ts +++ b/packages/redis-worker/src/mollifier/buffer.ts @@ -10,25 +10,119 @@ import { BufferEntry, BufferEntrySchema } from "./schemas.js"; export type MollifierBufferOptions = { redisOptions: RedisOptions; - entryTtlSeconds: number; logger?: Logger; }; +// Grace TTL applied to the entry hash on drainer ack. The entry survives +// this long after materialisation so direct reads (retrieve, trace, etc.) +// have a safety net while PG replica lag settles. +const ACK_GRACE_TTL_SECONDS = 30; + +// ioredis reconnect backoff for the mollifier buffer client. The base +// grows linearly with the attempt count and is capped at 1s (the same +// envelope as the previous fixed `Math.min(times * 50, 1000)` schedule). +// We then apply equal jitter — a uniform pick in `[base/2, base]` — so a +// fleet of webapp instances reconnecting after the same Redis blip don't +// retry in lockstep and stampede Redis on recovery (thundering herd). +// Because the jittered value never exceeds the original cap, this is never +// slower than before — just decorrelated. Mirrors the jittered-backoff +// approach the mutate-fallback wait loop adopted for the same reason. 
/**
 * Reconnect delay (ms) for the buffer's Redis client: a linear ramp of 50 ms
 * per attempt, capped at 1000 ms, with equal jitter applied so the result is
 * drawn from `[cap/2, cap]`.
 *
 * @param times - reconnect attempt counter supplied by the retry strategy.
 * @param random - source of uniform randomness; injectable so tests can pin it.
 * @returns the jittered delay in milliseconds, never above the un-jittered cap.
 */
export function mollifierReconnectDelayMs(
  times: number,
  random: () => number = Math.random,
): number {
  const ceiling = Math.min(times * 50, 1000);
  const floor = Math.floor(ceiling / 2);
  // Equal jitter: keep the lower half fixed, randomise the upper half.
  const jitterSpan = ceiling - floor;
  return floor + Math.round(random() * jitterSpan);
}

/** A single atomic edit applied to a buffered run's snapshot. */
export type SnapshotPatch =
  // When `maxTags` is provided the Lua enforces it atomically on the deduped
  // tag count: a patch that would exceed the cap is rejected with
  // "limit_exceeded" and nothing is written. This mirrors the PG-path
  // MAX_TAGS_PER_RUN check, so a buffered run can't collect more tags than
  // the trigger validator would have permitted at creation.
  | { type: "append_tags"; tags: string[]; maxTags?: number }
  | { type: "set_metadata"; metadata: string; metadataType: string }
  | { type: "set_delay"; delayUntil: string }
  | { type: "mark_cancelled"; cancelledAt: string; cancelReason?: string };

/** Outcome strings returned by the snapshot-mutation script. */
export type MutateSnapshotResult =
  | "applied_to_snapshot"
  | "not_found"
  | "busy"
  | "limit_exceeded";

/** Outcome of an optimistic compare-and-swap on snapshot metadata. */
export type CasSetMetadataResult =
  | { kind: "applied"; newVersion: number }
  | { kind: "version_conflict"; currentVersion: number }
  | { kind: "not_found" }
  | { kind: "busy" };

/** Outcome of `accept`: newly written, duplicate runId, or duplicate idempotency tuple. */
export type AcceptResult =
  | { kind: "accepted" }
  | { kind: "duplicate_run_id" }
  | { kind: "duplicate_idempotency"; existingRunId: string };

/** The (env, task, idempotencyKey) tuple that identifies an idempotency binding. */
export type IdempotencyLookupInput = {
  envId: string;
  taskIdentifier: string;
  idempotencyKey: string;
};

// Reversible encoding for Redis-key segments. The composite-key builders
// concatenate `envId`, `taskIdentifier`, and `idempotencyKey` with `:`
// separators; if any segment contains a literal `:` (envId is internal
// and `:`-free, but taskIdentifier and idempotencyKey are
// customer-supplied) different tuples would map to the same Redis key
// and dedupe the wrong run. base64url has no `:` in its alphabet and is
// bijective on the input string, so the encoded keys are
// collision-free.
+function encodeKeyPart(value: string): string { + return Buffer.from(value, "utf8").toString("base64url"); +} + +// Exported so tests can compute the same Redis key the buffer writes +// without hard-coding the encoding (which is a buffer-internal detail). +export function idempotencyLookupKeyFor(input: IdempotencyLookupInput): string { + return `mollifier:idempotency:${encodeKeyPart(input.envId)}:${encodeKeyPart(input.taskIdentifier)}:${encodeKeyPart(input.idempotencyKey)}`; +} + +// Pre-gate claim key namespace, distinct from `mollifier:idempotency` so the +// existing buffer-side dedup stays isolated. The claim is the +// authoritative cross-store "this idempotency key is in flight or +// resolved" pointer used by the trigger hot path. Values: +// "pending:<token>" → claimed by a trigger pipeline; `<token>` is the +// caller-supplied ownership token. Release and +// publish compare-and-act on this token so a +// late release from a previous claimant whose TTL +// expired cannot erase a new owner's claim. +// "<runId>" → the winning trigger's resolved runId. +const PENDING_PREFIX = "pending:"; + +// Exported (like `idempotencyLookupKeyFor`) so tests can target the same +// claim key the buffer writes without hard-coding the encoding. +export function makeIdempotencyClaimKey(input: IdempotencyLookupInput): string { + return `mollifier:claim:${encodeKeyPart(input.envId)}:${encodeKeyPart(input.taskIdentifier)}:${encodeKeyPart(input.idempotencyKey)}`; +} + +export type IdempotencyClaimResult = + | { kind: "claimed" } + | { kind: "pending" } + | { kind: "resolved"; runId: string }; + export class MollifierBuffer { private readonly redis: Redis; - private readonly entryTtlSeconds: number; private readonly logger: Logger; constructor(options: MollifierBufferOptions) { - this.entryTtlSeconds = options.entryTtlSeconds; this.logger = options.logger ?? 
new Logger("MollifierBuffer", "debug"); this.redis = createRedisClient( { ...options.redisOptions, retryStrategy(times) { - const delay = Math.min(times * 50, 1000); - return delay; + return mollifierReconnectDelayMs(times); }, maxRetriesPerRequest: 20, }, @@ -41,19 +135,45 @@ export class MollifierBuffer { this.#registerCommands(); } - // Returns true if the entry was newly written; false if a duplicate runId - // was already buffered (idempotent no-op). Callers can use the boolean to - // record a duplicate-accept metric without affecting buffer state. + // Three outcomes: + // - { kind: "accepted" } — entry was newly written. + // - { kind: "duplicate_run_id" } — runId was already buffered (idempotent + // no-op, same semantic as the previous boolean-false return). + // - { kind: "duplicate_idempotency", existingRunId } — the (env, task, + // idempotencyKey) tuple was already bound to another buffered run. + // The Lua's atomic SETNX is the race-winner; the second caller gets + // the winner's runId so it can return that as the trigger response. async accept(input: { runId: string; envId: string; orgId: string; payload: string; - }): Promise { + // Optional idempotency-key triple. When all three are present we + // SETNX a Redis lookup at `mollifier:idempotency:{env}:{task}:{key}` + // pointing at the runId so trigger-time dedup during the buffered + // window resolves the same way PG's unique constraint resolves it + // post-materialisation. + idempotencyKey?: string; + taskIdentifier?: string; + }): Promise { const entryKey = `mollifier:entries:${input.runId}`; const queueKey = `mollifier:queue:${input.envId}`; const orgsKey = "mollifier:orgs"; - const createdAt = new Date().toISOString(); + const nowMs = Date.now(); + const createdAt = new Date(nowMs).toISOString(); + // Microsecond epoch, stored as a hash field for dwell-time metrics + // (stale sweep, drainer dwell span). 
FIFO ordering comes from the + // LIST itself (LPUSH head / RPOP tail), not from this value — it is + // no longer a queue sort key. + const createdAtMicros = nowMs * 1000; + const idempotencyLookupKey = + input.idempotencyKey && input.taskIdentifier + ? idempotencyLookupKeyFor({ + envId: input.envId, + taskIdentifier: input.taskIdentifier, + idempotencyKey: input.idempotencyKey, + }) + : ""; const result = await this.redis.acceptMollifierEntry( entryKey, queueKey, @@ -63,10 +183,18 @@ export class MollifierBuffer { input.orgId, input.payload, createdAt, - String(this.entryTtlSeconds), + String(createdAtMicros), "mollifier:org-envs:", + idempotencyLookupKey, + "mollifier:entries:", ); - return result === 1; + // Lua returns 1 (accepted), 0 (duplicate runId), or a string runId + // (duplicate idempotency — value is the existing winner's runId). + if (typeof result === "string" && result.length > 0) { + return { kind: "duplicate_idempotency", existingRunId: result }; + } + if (result === 1) return { kind: "accepted" }; + return { kind: "duplicate_run_id" }; } async pop(envId: string): Promise { @@ -128,8 +256,245 @@ export class MollifierBuffer { return this.redis.smembers(`mollifier:org-envs:${orgId}`); } + // Read-only enumeration of currently-queued entries for a single env. + // Used by the stale-sweep to compute per-entry dwell time, so order is + // immaterial — LRANGE returns them newest-first (LPUSH head) but the + // caller scans the whole window. Non-destructive: the drainer still + // RPOPs these entries in FIFO order. + // + // The entry HGETALLs are issued in a single pipelined batch (one + // network round-trip instead of N) — at the stale-sweep's default + // maxCount=1000 the serial implementation cost ~1000 RTTs per env, + // which dominated sweep wall-time at any meaningful backlog. 
+ // + // A missing entry (empty hash) is skipped: the drainer's RPOP+DEL of + // the entry hash can race our LRANGE→HGETALL window, so a runId on + // the queue with no backing hash is an expected concurrency outcome, + // not an error. + async listEntriesForEnv(envId: string, maxCount: number): Promise { + if (maxCount <= 0) return []; + const runIds = await this.redis.lrange( + `mollifier:queue:${envId}`, + 0, + maxCount - 1, + ); + if (runIds.length === 0) return []; + + const pipeline = this.redis.pipeline(); + for (const runId of runIds) { + pipeline.hgetall(`mollifier:entries:${runId}`); + } + const results = await pipeline.exec(); + if (!results) return []; + + const entries: BufferEntry[] = []; + for (let i = 0; i < results.length; i++) { + const [err, raw] = results[i] as [Error | null, Record | null]; + if (err) { + this.logger.error("MollifierBuffer.listEntriesForEnv: hgetall failed", { + runId: runIds[i], + err: err.message, + }); + continue; + } + if (!raw || Object.keys(raw).length === 0) continue; + const parsed = BufferEntrySchema.safeParse(raw); + if (!parsed.success) { + this.logger.error("MollifierBuffer.listEntriesForEnv: invalid entry shape", { + runId: runIds[i], + errors: parsed.error.flatten(), + }); + continue; + } + entries.push(parsed.data); + } + return entries; + } + + // Atomic snapshot mutation. Used by customer-mutation API endpoints + // (tags, metadata-put, reschedule, cancel) when the run is still in + // the buffer. Three outcomes: + // - "applied_to_snapshot": entry was QUEUED + not materialised; the + // drainer will read the patched payload on its next pop. + // - "not_found": no entry hash exists for this runId — including a + // FAILED entry, whose hash the drainer-terminal `fail` path DELs. + // - "busy": entry is DRAINING or materialised. The API + // wait-and-bounces through PG. + // - "limit_exceeded": an `append_tags` patch carrying `maxTags` would + // push the deduped tag count over the cap; nothing is written. 
+ async mutateSnapshot(runId: string, patch: SnapshotPatch): Promise { + const result = (await this.redis.mutateMollifierSnapshot( + `mollifier:entries:${runId}`, + JSON.stringify(patch), + )) as string; + if ( + result === "applied_to_snapshot" || + result === "not_found" || + result === "busy" || + result === "limit_exceeded" + ) { + return result; + } + throw new Error(`MollifierBuffer.mutateSnapshot: unexpected Lua return value: ${result}`); + } + + // Optimistic compare-and-swap on the snapshot's metadata. Caller reads + // the current metadataVersion via getEntry, applies operations in JS via + // `applyMetadataOperations`, then calls this with the new metadata + the + // expected version. Lua refuses if the version has moved (caller retries + // up to N times). Mirrors the PG-side `UpdateMetadataService` retry + // loop so concurrent increment/append operations don't lose deltas. + async casSetMetadata(input: { + runId: string; + expectedVersion: number; + newMetadata: string; + newMetadataType: string; + }): Promise { + const entryKey = `mollifier:entries:${input.runId}`; + const raw = (await this.redis.casSetMollifierMetadata( + entryKey, + String(input.expectedVersion), + input.newMetadata, + input.newMetadataType, + )) as string; + if (raw === "not_found") return { kind: "not_found" }; + if (raw === "busy") return { kind: "busy" }; + if (raw.startsWith("conflict:")) { + return { kind: "version_conflict", currentVersion: Number(raw.slice("conflict:".length)) }; + } + if (raw.startsWith("applied:")) { + return { kind: "applied", newVersion: Number(raw.slice("applied:".length)) }; + } + throw new Error(`MollifierBuffer.casSetMetadata: unexpected Lua return: ${raw}`); + } + + // Atomic pre-gate claim on a (env, task, idempotencyKey) tuple. One + // call across both PG and buffer paths serialises through this claim; + // closes the race the buffer-side SETNX leaves open during the + // gate-transition burst window. 
+ // + // The caller supplies an opaque `token` (UUID) on claim. The same token + // MUST be passed to `publishClaim` / `releaseClaim`, which compare-and- + // act so a late release from a previous claimant whose TTL expired + // cannot erase a new owner's claim. + // + // - "claimed": we now own the claim, the caller proceeds with the + // trigger pipeline and must `publishClaim` on success or + // `releaseClaim` on failure. + // - "pending": another trigger owns the claim and hasn't published + // yet; the caller should poll. + // - "resolved": the claim already holds a runId; the caller can + // return that runId as a cached hit. + async claimIdempotency( + input: IdempotencyLookupInput & { token: string; ttlSeconds: number }, + ): Promise { + const claimKey = makeIdempotencyClaimKey(input); + const raw = (await this.redis.claimMollifierIdempotency( + claimKey, + `${PENDING_PREFIX}${input.token}`, + PENDING_PREFIX, + String(input.ttlSeconds), + )) as string; + if (raw === "claimed") return { kind: "claimed" }; + if (raw === "pending") return { kind: "pending" }; + if (raw.startsWith("resolved:")) { + return { kind: "resolved", runId: raw.slice("resolved:".length) }; + } + throw new Error(`MollifierBuffer.claimIdempotency: unexpected return: ${raw}`); + } + + // Publish the winning runId to the claim so subsequent claimants / + // waiters see "resolved". TTL bounded by the customer's + // `idempotencyKeyExpiresAt` minus now; caller computes. + // + // Compare-and-set on the caller's token: if the current value isn't + // our pending marker (TTL expired and another claimant moved in, or + // someone else already published), the publish is a no-op. The caller + // can treat any such case as "we lost the claim" and re-read. + // Returns true if we published; false if the claim slot was no longer + // ours. 
+ async publishClaim( + input: IdempotencyLookupInput & { token: string; runId: string; ttlSeconds: number }, + ): Promise { + const claimKey = makeIdempotencyClaimKey(input); + const result = (await this.redis.publishMollifierClaim( + claimKey, + `${PENDING_PREFIX}${input.token}`, + input.runId, + String(input.ttlSeconds), + )) as number; + return result === 1; + } + + // Release the claim on pipeline error so waiters can re-claim and + // retry. Idempotent. + // + // Compare-and-delete on the caller's token: only deletes if the + // current value is exactly our pending marker. A late release from a + // claimant whose TTL expired is a no-op, so a new owner's claim is + // never wiped by a slow predecessor. + async releaseClaim(input: IdempotencyLookupInput & { token: string }): Promise { + const claimKey = makeIdempotencyClaimKey(input); + await this.redis.releaseMollifierClaim( + claimKey, + `${PENDING_PREFIX}${input.token}`, + ); + } + + // Read the current claim value, used by the wait/poll loop on losers + // to detect "pending" → "resolved" transitions and timeouts. + async readClaim(input: IdempotencyLookupInput): Promise { + const claimKey = makeIdempotencyClaimKey(input); + const value = await this.redis.get(claimKey); + if (value === null) return null; + if (value.startsWith(PENDING_PREFIX)) return { kind: "pending" }; + return { kind: "resolved", runId: value }; + } + + // Resolve a buffered run by (env, task, idempotencyKey) tuple. Used by + // `IdempotencyKeyConcern.handleTriggerRequest` after the PG check + // misses — same key may belong to a buffered run waiting to drain. The + // lookup self-heals: if the lookup points at an entry hash that's gone, + // we clear the lookup and report a miss. The clear is a compare-and- + // delete (only if the key still holds the stale runId we observed) so a + // fresh accept that rebinds the key between our GET and DEL isn't wiped. 
+ async lookupIdempotency(input: IdempotencyLookupInput): Promise { + const lookupKey = idempotencyLookupKeyFor(input); + const runId = await this.redis.get(lookupKey); + if (!runId) return null; + const entry = await this.getEntry(runId); + if (!entry) { + await this.redis.delMollifierKeyIfEquals(lookupKey, runId); + return null; + } + return runId; + } + + // Clear the idempotency binding from a buffered run. Used by + // `ResetIdempotencyKeyService` alongside the existing PG-side + // `updateMany`. Returns the runId that was cleared, or null if no + // buffered run held this key. + async resetIdempotency(input: IdempotencyLookupInput): Promise<{ clearedRunId: string | null }> { + const lookupKey = idempotencyLookupKeyFor(input); + const claimKey = makeIdempotencyClaimKey(input); + const clearedRunId = (await this.redis.resetMollifierIdempotency( + lookupKey, + "mollifier:entries:", + claimKey, + )) as string; + return { clearedRunId: clearedRunId.length > 0 ? clearedRunId : null }; + } + + // Marks the entry as materialised (PG row written) and resets its TTL to + // the grace window. Entry hash persists past ack as a read-fallback + // safety net for the brief PG replica-lag window between drainer-side + // write and reader-side visibility. Also clears the associated + // idempotency lookup if one was set on accept. async ack(runId: string): Promise { - await this.redis.del(`mollifier:entries:${runId}`); + await this.redis.ackMollifierEntry( + `mollifier:entries:${runId}`, + String(ACK_GRACE_TTL_SECONDS), + ); } async requeue(runId: string): Promise { @@ -142,9 +507,12 @@ export class MollifierBuffer { ); } - // Returns true if the entry transitioned to FAILED; false if the entry no - // longer exists (TTL expired between pop and fail). Caller can use the - // boolean to skip downstream FAILED handling for ghost entries. 
+ // Returns true if a live entry was torn down; false if the entry no + // longer existed (a concurrent ack or manual cleanup removed it between + // pop and fail — there is no accept-time TTL). Note FAILED is not an + // observable state: the Lua marks the hash FAILED then DELs it in the + // same atomic script, so a subsequent getEntry returns null. Caller can + // use the boolean to skip downstream FAILED handling for ghost entries. async fail(runId: string, error: { code: string; message: string }): Promise { const result = await this.redis.failMollifierEntry( `mollifier:entries:${runId}`, @@ -153,10 +521,16 @@ export class MollifierBuffer { return result === 1; } + // Returns Redis-side TTL on the entry hash. Returns -1 for entries + // with no TTL — the steady state under the current design, where + // entries persist until drainer ack/fail. The ack grace TTL (30s + // post-materialise) is the only context where this returns a + // positive value; tests around the grace TTL still rely on it. async getEntryTtlSeconds(runId: string): Promise { return this.redis.ttl(`mollifier:entries:${runId}`); } + async evaluateTrip( envId: string, options: { windowMs: number; threshold: number; holdMs: number }, @@ -190,8 +564,10 @@ export class MollifierBuffer { local orgId = ARGV[3] local payload = ARGV[4] local createdAt = ARGV[5] - local ttlSeconds = tonumber(ARGV[6]) + local createdAtMicros = ARGV[6] local orgEnvsPrefix = ARGV[7] + local idempotencyLookupKey = ARGV[8] or '' + local entryPrefix = ARGV[9] -- Idempotent: refuse if an entry for this runId already exists in any -- state. Caller-side dedup is also enforced via API idempotency keys, @@ -200,6 +576,27 @@ export class MollifierBuffer { return 0 end + -- Idempotency-key dedup. If the caller passed a lookup key + -- and it's already bound to another buffered run, return the + -- winner's runId so the loser's API response can echo it as a + -- cached hit. 
Otherwise SET the lookup (no TTL — lifecycle is + -- paired with the entry hash; drainer ack/fail clear it + -- explicitly). + if idempotencyLookupKey ~= '' then + local existing = redis.call('GET', idempotencyLookupKey) + if existing then + -- Self-heal: only honour the binding if its entry hash still + -- exists. If the entry was evicted (maxmemory) but the lookup + -- survived, the binding is stale — fall through and rebind to + -- this run rather than returning a dead runId that would block + -- the key indefinitely. Mirrors lookupIdempotency's self-heal. + if redis.call('EXISTS', entryPrefix .. existing) == 1 then + return existing + end + end + redis.call('SET', idempotencyLookupKey, runId) + end + redis.call('HSET', entryKey, 'runId', runId, 'envId', envId, @@ -207,8 +604,20 @@ export class MollifierBuffer { 'payload', payload, 'status', 'QUEUED', 'attempts', '0', - 'createdAt', createdAt) - redis.call('EXPIRE', entryKey, ttlSeconds) + 'createdAt', createdAt, + 'createdAtMicros', createdAtMicros, + 'idempotencyLookupKey', idempotencyLookupKey, + 'metadataVersion', '0') + -- No EXPIRE on the entry hash. Buffer entries persist until the + -- drainer ACKs (post-materialise grace) or FAILs them — the + -- drainer is the only recovery mechanism, so silent TTL-based + -- eviction would lose runs with no customer-visible signal. + -- Memory pressure from an offline drainer is the alertable + -- failure mode instead; see _ops/mollifier-ops.md. + -- LIST queue: LPUSH at the head, drainer RPOPs from the tail, so + -- insertion order == drain order (FIFO). createdAtMicros is kept + -- as a hash field for dwell metrics only — it is no longer a sort + -- key now that the buffer has no list/pagination surface. 
redis.call('LPUSH', queueKey, runId) -- Org-level membership: maintained atomically with the per-env -- queue so the drainer can walk orgs → envs-for-org and @@ -239,7 +648,12 @@ export class MollifierBuffer { local nextAttempts = tonumber(currentAttempts or '0') + 1 redis.call('HSET', entryKey, 'status', 'QUEUED', 'attempts', tostring(nextAttempts)) - redis.call('LPUSH', queuePrefix .. envId, runId) + -- Requeue RPUSHes to the tail (the RPOP end) so a transiently + -- failed entry pops next rather than going to the back of the + -- line behind a fresh backlog. createdAt is immutable across + -- retries; the drainer's maxAttempts caps the + -- retry loop so a poisoned entry doesn't head-of-line forever. + redis.call('RPUSH', queuePrefix .. envId, runId) -- Re-track the org/env: pop may have SREM'd them when the queue -- last emptied. SADDs are idempotent if the values are still -- present. @@ -274,11 +688,13 @@ export class MollifierBuffer { end end - -- Loop to skip orphan queue references — runIds whose entry hash has - -- expired (TTL hit). HSET on a missing key would CREATE a partial - -- hash without a TTL, leaking memory. The loop is bounded by queue - -- length; entire Lua script remains atomic. + -- Loop to skip orphan queue references — runIds whose entry hash is + -- gone (e.g. Redis maxmemory eviction, since QUEUED entries carry + -- no TTL of their own). HSET on a missing key would CREATE a + -- partial hash without a TTL, leaking memory. The loop is bounded + -- by queue length; entire Lua script remains atomic. while true do + -- RPOP returns the tail member (oldest, FIFO), or false when empty. local runId = redis.call('RPOP', queueKey) if not runId then -- Queue is empty AND we have no entry to read orgId from, so @@ -296,16 +712,274 @@ export class MollifierBuffer { result[raw[i]] = raw[i + 1] end -- Prune org-level membership if this pop drained the queue. 
- -- Atomic with the RPOP above — a concurrent accept AFTER this - -- script will SADD both back along with its LPUSH. + -- Atomic with the RPOP above — a concurrent accept AFTER + -- this script will SADD both back along with its LPUSH. if redis.call('LLEN', queueKey) == 0 then pruneOrgMembership(result['orgId']) end return cjson.encode(result) end - -- Orphan queue reference: entry TTL expired while runId was queued. - -- Discard the reference and loop to the next. + -- Orphan queue reference: entry hash gone (evicted) while runId + -- was queued. Discard the reference and loop to the next. + end + `, + }); + + this.redis.defineCommand("casSetMollifierMetadata", { + numberOfKeys: 1, + lua: ` + local entryKey = KEYS[1] + local expectedVersion = tonumber(ARGV[1]) + local newMetadata = ARGV[2] + local newMetadataType = ARGV[3] + + if redis.call('EXISTS', entryKey) == 0 then + return 'not_found' + end + + local status = redis.call('HGET', entryKey, 'status') + local materialised = redis.call('HGET', entryKey, 'materialised') + if status ~= 'QUEUED' or materialised == 'true' then + return 'busy' + end + + local currentVersionStr = redis.call('HGET', entryKey, 'metadataVersion') or '0' + local currentVersion = tonumber(currentVersionStr) or 0 + if currentVersion ~= expectedVersion then + return 'conflict:' .. tostring(currentVersion) + end + + -- Write the new metadata onto the snapshot's payload JSON. We + -- keep the rest of the payload intact — only metadata/metadataType + -- change. metadataVersion is denormalised on the hash for cheap + -- CAS reads; it's intentionally NOT stored inside the payload + -- itself (PG-side metadataVersion is a column, not a JSON field). 
+ local payloadJson = redis.call('HGET', entryKey, 'payload') + local ok, payload = pcall(cjson.decode, payloadJson) + if not ok then return 'busy' end + payload.metadata = newMetadata + payload.metadataType = newMetadataType + + local newVersion = currentVersion + 1 + redis.call('HSET', entryKey, + 'payload', cjson.encode(payload), + 'metadataVersion', tostring(newVersion)) + return 'applied:' .. tostring(newVersion) + `, + }); + + this.redis.defineCommand("claimMollifierIdempotency", { + numberOfKeys: 1, + lua: ` + local claimKey = KEYS[1] + local pendingMarker = ARGV[1] -- "pending:" + local pendingPrefix = ARGV[2] -- "pending:" + local ttl = tonumber(ARGV[3]) + + -- SETNX-with-TTL: atomic; only one caller can win. + local won = redis.call('SET', claimKey, pendingMarker, 'NX', 'EX', ttl) + if won then + return 'claimed' + end + + local existing = redis.call('GET', claimKey) + if not existing then + -- The slot expired in the race window between the SET NX + -- failing and this GET. It's free now — claim it so we don't + -- string.sub a nil and error out. + redis.call('SET', claimKey, pendingMarker, 'EX', ttl) + return 'claimed' + end + -- Any "pending:*" value is a live claim — the caller-supplied + -- token differentiates ownership but is opaque to losers. + if string.sub(existing, 1, string.len(pendingPrefix)) == pendingPrefix then + return 'pending' + end + return 'resolved:' .. existing + `, + }); + + // Publish a winning runId to a claim slot we own. Compare-and-set on + // the caller's pending marker: if the slot is no longer ours (TTL + // expired and another claimant moved in, or already resolved by + // someone else), we no-op. Returns 1 on publish, 0 on no-op. 
+ this.redis.defineCommand("publishMollifierClaim", { + numberOfKeys: 1, + lua: ` + local claimKey = KEYS[1] + local ownerMarker = ARGV[1] -- "pending:" + local runId = ARGV[2] + local ttl = tonumber(ARGV[3]) + + local existing = redis.call('GET', claimKey) + if existing == ownerMarker then + redis.call('SET', claimKey, runId, 'EX', ttl) + return 1 + end + return 0 + `, + }); + + // Release a claim slot we own. Compare-and-delete on the caller's + // pending marker: a late release from a previous claimant whose TTL + // expired is a no-op, so a new owner's claim is never wiped. + this.redis.defineCommand("releaseMollifierClaim", { + numberOfKeys: 1, + lua: ` + local claimKey = KEYS[1] + local ownerMarker = ARGV[1] -- "pending:" + + local existing = redis.call('GET', claimKey) + if existing == ownerMarker then + redis.call('DEL', claimKey) + return 1 + end + return 0 + `, + }); + + this.redis.defineCommand("resetMollifierIdempotency", { + numberOfKeys: 1, + lua: ` + local lookupKey = KEYS[1] + local entryPrefix = ARGV[1] + local claimKey = ARGV[2] + + -- Reset reopens the key across BOTH the buffer lookup and the + -- cross-store pre-gate claim pointer. Without clearing the claim, + -- a resolved/pending claim would keep deduping new triggers for + -- the rest of its TTL even though the binding was reset. DEL is + -- unconditional — the claim is gone regardless of whether a + -- buffered run currently holds the lookup. + redis.call('DEL', claimKey) + + local runId = redis.call('GET', lookupKey) + if not runId then + return '' + end + + local entryKey = entryPrefix .. runId + if redis.call('EXISTS', entryKey) == 0 then + -- Stale lookup. Lazy cleanup. + redis.call('DEL', lookupKey) + return '' + end + + -- Clear the idempotency fields on the snapshot payload so the + -- drainer's eventual engine.trigger call inserts a PG row + -- without the key set. 
+ local payloadJson = redis.call('HGET', entryKey, 'payload') + if payloadJson then + local ok, payload = pcall(cjson.decode, payloadJson) + if ok then + payload.idempotencyKey = cjson.null + payload.idempotencyKeyExpiresAt = cjson.null + redis.call('HSET', entryKey, 'payload', cjson.encode(payload)) + end + end + -- Clear the denormalised lookup pointer on the hash so a later + -- ack doesn't try to DEL a key that's already gone. + redis.call('HSET', entryKey, 'idempotencyLookupKey', '') + redis.call('DEL', lookupKey) + return runId + `, + }); + + this.redis.defineCommand("mutateMollifierSnapshot", { + numberOfKeys: 1, + lua: ` + local entryKey = KEYS[1] + local patchJson = ARGV[1] + + if redis.call('EXISTS', entryKey) == 0 then + return 'not_found' + end + + local status = redis.call('HGET', entryKey, 'status') + local materialised = redis.call('HGET', entryKey, 'materialised') + if status ~= 'QUEUED' or materialised == 'true' then + return 'busy' + end + + local payloadJson = redis.call('HGET', entryKey, 'payload') + local ok, payload = pcall(cjson.decode, payloadJson) + if not ok then return 'busy' end + + local patch = cjson.decode(patchJson) + + if patch.type == 'append_tags' then + -- cjson decode of an absent or empty-array field gives nil or + -- an empty table; we rebuild as a dense array. Existing tags + -- are preserved; new tags are appended only if not present. + local existing = payload.tags or {} + local seen = {} + local merged = {} + for _, t in ipairs(existing) do + if not seen[t] then + seen[t] = true + table.insert(merged, t) + end + end + for _, t in ipairs(patch.tags or {}) do + if not seen[t] then + seen[t] = true + table.insert(merged, t) + end + end + -- Cap the deduped count when the caller supplies a limit, so a + -- buffered run can't exceed MAX_TAGS_PER_RUN via the tags API. + -- Reject the whole patch (write nothing) rather than truncating. 
+ if patch.maxTags ~= nil and #merged > patch.maxTags then + return 'limit_exceeded' + end + payload.tags = merged + elseif patch.type == 'set_metadata' then + payload.metadata = patch.metadata + payload.metadataType = patch.metadataType + -- Bump the denormalised metadataVersion so an in-flight + -- casSetMetadata (optimistic CAS keyed on this counter) sees + -- the concurrent write as a version conflict and retries, + -- instead of clobbering it under a now-stale expectedVersion. + local currentVersion = tonumber(redis.call('HGET', entryKey, 'metadataVersion') or '0') or 0 + redis.call('HSET', entryKey, 'metadataVersion', tostring(currentVersion + 1)) + elseif patch.type == 'set_delay' then + payload.delayUntil = patch.delayUntil + elseif patch.type == 'mark_cancelled' then + payload.cancelledAt = patch.cancelledAt + payload.cancelReason = patch.cancelReason + else + return 'busy' end + + redis.call('HSET', entryKey, 'payload', cjson.encode(payload)) + return 'applied_to_snapshot' + `, + }); + + this.redis.defineCommand("ackMollifierEntry", { + numberOfKeys: 1, + lua: ` + local entryKey = KEYS[1] + local graceTtlSeconds = tonumber(ARGV[1]) + + -- Guard: never create a partial entry. If the hash is gone between + -- pop and ack (concurrent fail or eviction — QUEUED entries carry + -- no TTL), the run is gone, nothing to mark materialised. + if redis.call('EXISTS', entryKey) == 0 then + return 0 + end + + -- If the entry was accepted with an idempotency key, the lookup + -- string was stored on the hash at accept time. Clear it now — + -- PG becomes canonical for the key post-materialisation. 
+ local lookupKey = redis.call('HGET', entryKey, 'idempotencyLookupKey') + if lookupKey and lookupKey ~= '' then + redis.call('DEL', lookupKey) + end + + redis.call('HSET', entryKey, 'materialised', 'true') + redis.call('EXPIRE', entryKey, graceTtlSeconds) + return 1 `, }); @@ -315,17 +989,49 @@ export class MollifierBuffer { local entryKey = KEYS[1] local errorPayload = ARGV[1] - -- Guard: never create a partial entry. If the hash expired between - -- pop and fail, the run is gone — nothing to mark FAILED. + -- Guard: nothing to mark FAILED if the hash is gone (concurrent + -- ack/manual cleanup). Returning 0 lets the caller distinguish + -- "marked failed" from "no-op". if redis.call('EXISTS', entryKey) == 0 then return 0 end redis.call('HSET', entryKey, 'status', 'FAILED', 'lastError', errorPayload) + + -- Terminal-failure contract: the drainer's onTerminalFailure + -- callback (see MollifierDrainer.processEntry) has been + -- invoked before this fail() and has either written a + -- SYSTEM_FAILURE PG row (for both non-retryable AND + -- max-attempts-exhausted retryable errors) or chosen to fall + -- through (genuinely bad snapshot the engine can't materialise + -- a row from). Either way the buffer entry is no longer + -- load-bearing here. Clear the idempotency lookup -- PG's + -- unique constraint is the canonical dedup mechanism + -- post-materialise -- and drop the entry hash so failed runs + -- don't accrete forever now that there's no accept-time TTL. + local lookupKey = redis.call('HGET', entryKey, 'idempotencyLookupKey') + if lookupKey and lookupKey ~= '' then + redis.call('DEL', lookupKey) + end + redis.call('DEL', entryKey) return 1 `, }); + // Compare-and-delete: DEL the key only if it still holds the expected + // value. Used by lookupIdempotency's stale-lookup self-heal so a + // concurrent accept that rebinds the key between the reader's GET and + // this DEL isn't clobbered. 
+ this.redis.defineCommand("delMollifierKeyIfEquals", { + numberOfKeys: 1, + lua: ` + if redis.call('GET', KEYS[1]) == ARGV[1] then + return redis.call('DEL', KEYS[1]) + end + return 0 + `, + }); + this.redis.defineCommand("mollifierEvaluateTrip", { numberOfKeys: 2, lua: ` @@ -362,10 +1068,12 @@ declare module "@internal/redis" { orgId: string, payload: string, createdAt: string, - ttlSeconds: string, + createdAtMicros: string, orgEnvsPrefix: string, - callback?: Callback, - ): Result; + idempotencyLookupKey: string, + entryPrefix: string, + callback?: Callback, + ): Result; popAndMarkDraining( queueKey: string, orgsKey: string, @@ -382,11 +1090,58 @@ declare module "@internal/redis" { orgEnvsPrefix: string, callback?: Callback, ): Result; + mutateMollifierSnapshot( + entryKey: string, + patchJson: string, + callback?: Callback, + ): Result; + casSetMollifierMetadata( + entryKey: string, + expectedVersion: string, + newMetadata: string, + newMetadataType: string, + callback?: Callback, + ): Result; + resetMollifierIdempotency( + lookupKey: string, + entryPrefix: string, + claimKey: string, + callback?: Callback, + ): Result; + claimMollifierIdempotency( + claimKey: string, + pendingMarker: string, + pendingPrefix: string, + ttlSeconds: string, + callback?: Callback, + ): Result; + publishMollifierClaim( + claimKey: string, + ownerMarker: string, + runId: string, + ttlSeconds: string, + callback?: Callback, + ): Result; + releaseMollifierClaim( + claimKey: string, + ownerMarker: string, + callback?: Callback, + ): Result; + ackMollifierEntry( + entryKey: string, + graceTtlSeconds: string, + callback?: Callback, + ): Result; failMollifierEntry( entryKey: string, errorPayload: string, callback?: Callback, ): Result; + delMollifierKeyIfEquals( + key: string, + expected: string, + callback?: Callback, + ): Result; mollifierEvaluateTrip( rateKey: string, trippedKey: string, diff --git a/packages/redis-worker/src/mollifier/drainer.test.ts 
b/packages/redis-worker/src/mollifier/drainer.test.ts index c8f68977f69..c6832e94c77 100644 --- a/packages/redis-worker/src/mollifier/drainer.test.ts +++ b/packages/redis-worker/src/mollifier/drainer.test.ts @@ -6,7 +6,6 @@ import { MollifierDrainer } from "./drainer.js"; import { serialiseSnapshot } from "./schemas.js"; const noopOptions = { - entryTtlSeconds: 600, logger: new Logger("test", "log"), }; @@ -87,8 +86,11 @@ describe("MollifierDrainer.runOnce", () => { payload: { foo: 1 }, }); + // After ack the entry persists as a read-fallback safety net with + // materialised=true and a fresh grace TTL. const entry = await buffer.getEntry("run_1"); - expect(entry).toBeNull(); + expect(entry).not.toBeNull(); + expect(entry!.materialised).toBe(true); } finally { await buffer.close(); } @@ -167,9 +169,14 @@ describe("MollifierDrainer error handling", () => { expect(after2!.status).toBe("QUEUED"); expect(after2!.attempts).toBe(2); - await drainer.runOnce(); + const result3 = await drainer.runOnce(); + // On attempt 3 the drainer hits maxAttempts and calls fail(), + // which deletes the entry — once the drainer-handler has written + // the SYSTEM_FAILURE PG row the buffer entry is no longer + // load-bearing. The runOnce result is the surviving signal. const after3 = await buffer.getEntry("run_r"); - expect(after3!.status).toBe("FAILED"); + expect(after3).toBeNull(); + expect(result3.failed).toBe(1); expect(calls).toBe(3); } finally { await buffer.close(); @@ -202,11 +209,13 @@ describe("MollifierDrainer error handling", () => { try { await buffer.accept({ runId: "run_nr", envId: "env_a", orgId: "org_1", payload: "{}" }); - await drainer.runOnce(); + const result = await drainer.runOnce(); + // fail() deletes the entry once the drainer-handler has written + // the canonical SYSTEM_FAILURE PG row. 
const entry = await buffer.getEntry("run_nr"); - expect(entry!.status).toBe("FAILED"); - expect(entry!.lastError).toEqual({ code: "Error", message: "validation failure" }); + expect(entry).toBeNull(); + expect(result.failed).toBe(1); } finally { await buffer.close(); } @@ -270,6 +279,296 @@ describe("MollifierDrainer error handling", () => { ); }); +// `onTerminalFailure` is the callback the drainer fires on any terminal +// path (non-retryable OR max-attempts-exhausted retryable) before it +// calls `buffer.fail()`. Webapp wires it to `createFailedTaskRun` so the +// customer's run lands a SYSTEM_FAILURE PG row in both cases. Pre-fix, +// the retryable-exhausted path called `buffer.fail()` with no PG row, +// silently losing the run. These tests pin both terminal causes plus the +// retry-on-retryable-callback-failure escape hatch. +describe("MollifierDrainer.onTerminalFailure", () => { + redisTest( + "fires with cause max-attempts-exhausted after retryable failures exhaust", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + ...noopOptions, + }); + + let handlerCalls = 0; + const handler = async () => { + handlerCalls++; + throw new Error("retryable PG blip"); + }; + + type TerminalCallArgs = { + runId: string; + attempts: number; + cause: "non-retryable" | "max-attempts-exhausted"; + errorMessage: string; + }; + const terminalCalls: TerminalCallArgs[] = []; + + const drainer = new MollifierDrainer({ + buffer, + handler, + onTerminalFailure: async (input) => { + terminalCalls.push({ + runId: input.runId, + attempts: input.attempts, + cause: input.cause, + errorMessage: input.error.message, + }); + }, + concurrency: 1, + maxAttempts: 2, + isRetryable: () => true, + logger: new Logger("test-drainer", "log"), + }); + + try { + await buffer.accept({ runId: "run_exhaust", envId: "env_a", 
orgId: "org_1", payload: "{}" }); + + // Attempt 1: retryable error → requeue, no terminal callback fires. + const r1 = await drainer.runOnce(); + expect(r1.failed).toBe(1); + expect(terminalCalls).toHaveLength(0); + const after1 = await buffer.getEntry("run_exhaust"); + expect(after1!.status).toBe("QUEUED"); + expect(after1!.attempts).toBe(1); + + // Attempt 2: maxAttempts (2) reached → terminal callback fires + // with cause "max-attempts-exhausted", THEN buffer.fail() deletes. + const r2 = await drainer.runOnce(); + expect(r2.failed).toBe(1); + expect(handlerCalls).toBe(2); + expect(terminalCalls).toHaveLength(1); + expect(terminalCalls[0]).toMatchObject({ + runId: "run_exhaust", + attempts: 2, + cause: "max-attempts-exhausted", + errorMessage: "retryable PG blip", + }); + // buffer entry torn down post-callback. + expect(await buffer.getEntry("run_exhaust")).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "fires with cause non-retryable on the first non-retryable error", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + ...noopOptions, + }); + + const handler = async () => { + throw new Error("validation failure"); + }; + + const terminalCalls: Array<{ cause: string; attempts: number }> = []; + const drainer = new MollifierDrainer({ + buffer, + handler, + onTerminalFailure: async (input) => { + terminalCalls.push({ cause: input.cause, attempts: input.attempts }); + }, + concurrency: 1, + // Generous attempts budget — non-retryable should bypass it + // entirely and terminate on the first attempt. 
+ maxAttempts: 5, + isRetryable: () => false, + logger: new Logger("test-drainer", "log"), + }); + + try { + await buffer.accept({ runId: "run_nr", envId: "env_a", orgId: "org_1", payload: "{}" }); + + const r = await drainer.runOnce(); + expect(r.failed).toBe(1); + expect(terminalCalls).toHaveLength(1); + expect(terminalCalls[0]).toEqual({ cause: "non-retryable", attempts: 1 }); + expect(await buffer.getEntry("run_nr")).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "callback throwing a retryable error requeues instead of failing", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + ...noopOptions, + }); + + // Handler always fails. `isRetryable` below is always true, so + // onTerminalFailure fires only once maxAttempts is exhausted (tick 3). + const handler = async () => { + throw new Error("validation failure"); + }; + + let callbackInvocations = 0; + const drainer = new MollifierDrainer({ + buffer, + handler, + onTerminalFailure: async () => { + callbackInvocations++; + // Simulate PG still unreachable when we try to write the + // SYSTEM_FAILURE row — drainer should requeue, not fail. + const err: Error & { code?: string } = new Error("Can't reach database server"); + err.code = "P1001"; + throw err; + }, + concurrency: 1, + maxAttempts: 3, + // Both `validation failure` (handler) AND `P1001` (callback) are + // retryable from the drainer's perspective because the predicate + // below returns true for every error. Real callers like the webapp + // use a narrower predicate keyed on the error's identity. Here we + // use `isRetryable: () => true` because the test only cares about + // the callback-retryable path.
+ isRetryable: () => true, + logger: new Logger("test-drainer", "log"), + }); + + try { + await buffer.accept({ runId: "run_cb_retry", envId: "env_a", orgId: "org_1", payload: "{}" }); + + // Tick 1: handler throws → attempts=1 < maxAttempts=3 → requeue + // (no callback invocation, retryable path). + const r1 = await drainer.runOnce(); + expect(r1.failed).toBe(1); + expect(callbackInvocations).toBe(0); + const after1 = await buffer.getEntry("run_cb_retry"); + expect(after1!.status).toBe("QUEUED"); + expect(after1!.attempts).toBe(1); + + // Tick 2: handler throws → attempts=2 < 3 → requeue again. + const r2 = await drainer.runOnce(); + expect(r2.failed).toBe(1); + expect(callbackInvocations).toBe(0); + + // Tick 3: handler throws → attempts=3 (the nextAttempts check is + // `< maxAttempts`, so 3 < 3 is false) → terminal. Callback throws + // retryable → drainer requeues instead of fail(). Entry survives. + const r3 = await drainer.runOnce(); + expect(r3.failed).toBe(1); + expect(callbackInvocations).toBe(1); + const after3 = await buffer.getEntry("run_cb_retry"); + expect(after3).not.toBeNull(); + expect(after3!.status).toBe("QUEUED"); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "callback throwing a non-retryable error falls through to buffer.fail()", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + ...noopOptions, + }); + + const handler = async () => { + throw new Error("validation failure"); + }; + + const drainer = new MollifierDrainer({ + buffer, + handler, + onTerminalFailure: async () => { + // Genuinely bad write (e.g. snapshot too malformed to insert). + // Drainer must NOT loop on this — falls through to buffer.fail. 
+ throw new Error("malformed snapshot"); + }, + concurrency: 1, + maxAttempts: 3, + isRetryable: () => false, + logger: new Logger("test-drainer", "log"), + }); + + try { + await buffer.accept({ runId: "run_cb_dead", envId: "env_a", orgId: "org_1", payload: "{}" }); + + const r = await drainer.runOnce(); + expect(r.failed).toBe(1); + // Entry was failed despite the callback throwing — the + // non-retryable branch of the callback-error guard sends it to + // buffer.fail so a poisoned run can't loop forever. + expect(await buffer.getEntry("run_cb_dead")).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "no onTerminalFailure provided keeps pre-fix behaviour (buffer.fail with no callback)", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const buffer = new MollifierBuffer({ + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + ...noopOptions, + }); + + const handler = async () => { + throw new Error("validation failure"); + }; + + const drainer = new MollifierDrainer({ + buffer, + handler, + // onTerminalFailure intentionally omitted — verifies the option + // is genuinely optional and backwards-compatible. 
+ concurrency: 1, + maxAttempts: 2, + isRetryable: () => false, + logger: new Logger("test-drainer", "log"), + }); + + try { + await buffer.accept({ runId: "run_no_cb", envId: "env_a", orgId: "org_1", payload: "{}" }); + const r = await drainer.runOnce(); + expect(r.failed).toBe(1); + expect(await buffer.getEntry("run_no_cb")).toBeNull(); + } finally { + await buffer.close(); + } + }, + ); +}); + // Transient Redis errors used to permanently kill the loop because // `processOneFromEnv` didn't catch `buffer.pop()` rejections — the error // bubbled through `Promise.all` → `runOnce` → `loop`'s outer catch and @@ -972,7 +1271,7 @@ describe("MollifierDrainer additional coverage", () => { // ack() lives inside the same try as the handler call, so if the // handler succeeds but ack throws (e.g. transient Redis blip), the // entry is routed through the retry/terminal path even though the - // handler-side work completed. Phase 2's engine-replay handler will + // handler-side work completed. A later engine-replay handler will // need idempotency to absorb the re-execution this implies on retry, // OR ack should be lifted out of the try block. let handlerCalls = 0; diff --git a/packages/redis-worker/src/mollifier/drainer.ts b/packages/redis-worker/src/mollifier/drainer.ts index 407b389e14e..20b5ee3ae1f 100644 --- a/packages/redis-worker/src/mollifier/drainer.ts +++ b/packages/redis-worker/src/mollifier/drainer.ts @@ -12,9 +12,30 @@ export type MollifierDrainerHandler = (input: { createdAt: Date; }) => Promise; +// Invoked once per entry before `buffer.fail()` on any terminal path — +// non-retryable error OR retryable error after maxAttempts. Lets the caller +// land a SYSTEM_FAILURE PG row so the customer sees the run instead of it +// silently disappearing alongside the buffer entry. 
Throwing a retryable +// error from the callback causes the drainer to requeue rather than fail +// (so the PG write itself gets another chance once PG recovers); throwing +// anything else falls through to `buffer.fail()` to avoid an infinite loop +// on a genuinely bad payload. +export type MollifierDrainerTerminalFailureCause = "non-retryable" | "max-attempts-exhausted"; +export type MollifierDrainerTerminalFailureHandler = (input: { + runId: string; + envId: string; + orgId: string; + payload: TPayload; + attempts: number; + createdAt: Date; + error: { code: string; message: string }; + cause: MollifierDrainerTerminalFailureCause; +}) => Promise; + export type MollifierDrainerOptions = { buffer: MollifierBuffer; handler: MollifierDrainerHandler; + onTerminalFailure?: MollifierDrainerTerminalFailureHandler; concurrency: number; maxAttempts: number; isRetryable: (err: unknown) => boolean; @@ -42,6 +63,7 @@ export type DrainResult = { export class MollifierDrainer { private readonly buffer: MollifierBuffer; private readonly handler: MollifierDrainerHandler; + private readonly onTerminalFailure?: MollifierDrainerTerminalFailureHandler; private readonly maxAttempts: number; private readonly isRetryable: (err: unknown) => boolean; private readonly pollIntervalMs: number; @@ -60,6 +82,7 @@ export class MollifierDrainer { constructor(options: MollifierDrainerOptions) { this.buffer = options.buffer; this.handler = options.handler; + this.onTerminalFailure = options.onTerminalFailure; this.maxAttempts = options.maxAttempts; this.isRetryable = options.isRetryable; this.pollIntervalMs = options.pollIntervalMs ?? 100; @@ -275,13 +298,56 @@ export class MollifierDrainer { }); return "failed"; } + const cause: MollifierDrainerTerminalFailureCause = this.isRetryable(err) + ? "max-attempts-exhausted" + : "non-retryable"; const code = err instanceof Error ? err.name : "Unknown"; const message = err instanceof Error ? 
err.message : String(err); + // Run the terminal-failure callback BEFORE buffer.fail() so a + // SYSTEM_FAILURE PG row can land while the entry is still around to + // read from (and so we don't lose the run if the callback's own + // write itself needs a retry). If the callback throws a retryable + // error, requeue the entry instead of fail()ing — PG is still + // unreachable, give it another tick. Any other callback failure + // falls through to buffer.fail() so a genuinely bad snapshot + // doesn't loop forever. + if (this.onTerminalFailure) { + try { + await this.onTerminalFailure({ + runId: entry.runId, + envId: entry.envId, + orgId: entry.orgId, + payload: deserialiseSnapshot(entry.payload), + attempts: nextAttempts, + createdAt: entry.createdAt, + error: { code, message }, + cause, + }); + } catch (writeErr) { + if (this.isRetryable(writeErr)) { + await this.buffer.requeue(entry.runId); + this.logger.warn( + "MollifierDrainer: terminal-failure callback retryable; requeued", + { + runId: entry.runId, + attempts: nextAttempts, + writeErr, + }, + ); + return "failed"; + } + this.logger.error("MollifierDrainer: terminal-failure callback failed", { + runId: entry.runId, + writeErr, + }); + } + } await this.buffer.fail(entry.runId, { code, message }); this.logger.error("MollifierDrainer: terminal failure", { runId: entry.runId, code, message, + cause, }); return "failed"; } diff --git a/packages/redis-worker/src/mollifier/index.ts b/packages/redis-worker/src/mollifier/index.ts index 5e6fe202e3d..c7875a7d55f 100644 --- a/packages/redis-worker/src/mollifier/index.ts +++ b/packages/redis-worker/src/mollifier/index.ts @@ -1,8 +1,21 @@ -export { MollifierBuffer, type MollifierBufferOptions } from "./buffer.js"; +export { + MollifierBuffer, + type MollifierBufferOptions, + type SnapshotPatch, + type AcceptResult, + type MutateSnapshotResult, + type CasSetMetadataResult, + type IdempotencyClaimResult, + type IdempotencyLookupInput, + idempotencyLookupKeyFor, + 
makeIdempotencyClaimKey, +} from "./buffer.js"; export { MollifierDrainer, type MollifierDrainerOptions, type MollifierDrainerHandler, + type MollifierDrainerTerminalFailureHandler, + type MollifierDrainerTerminalFailureCause, type DrainResult, } from "./drainer.js"; export { diff --git a/packages/redis-worker/src/mollifier/schemas.ts b/packages/redis-worker/src/mollifier/schemas.ts index f93b0f0a3c3..5acd0c7c15d 100644 --- a/packages/redis-worker/src/mollifier/schemas.ts +++ b/packages/redis-worker/src/mollifier/schemas.ts @@ -27,6 +27,10 @@ const stringToDate = z.string().transform((v, ctx) => { return d; }); +const stringToBool = z + .union([z.literal("true"), z.literal("false")]) + .transform((v) => v === "true"); + const stringToError = z.string().transform((v, ctx) => { try { return BufferEntryError.parse(JSON.parse(v)); @@ -44,6 +48,27 @@ export const BufferEntrySchema = z.object({ status: BufferEntryStatus, attempts: stringToInt, createdAt: stringToDate, + // Microsecond epoch of accept time, kept as a hash field for dwell + // metrics. Not a queue sort key (the queue is a FIFO LIST). Defaulted + // so an entry written by an accept Lua predating this field — or one + // surviving across the deploy that introduced it — still parses instead + // of being silently dropped on pop. + createdAtMicros: stringToInt.default("0"), + // Drainer-ack flag: `true` once the drainer has materialised this run + // into PG. The hash persists for a short grace TTL after ack so direct + // reads (retrieve, trace, etc.) still resolve while PG replica lag + // settles. Absent on pre-ack entries. + materialised: stringToBool.default("false"), + // Denormalised pointer to the Redis idempotency lookup key (set when + // the run was accepted with an idempotency key, empty otherwise). The + // ack Lua reads this to DEL the lookup atomically with marking the + // entry materialised. 
+ idempotencyLookupKey: z.string().optional().default(""), + // Optimistic-lock counter for the snapshot's `metadata` field. + // Incremented atomically by the CAS metadata Lua. Matches the + // semantic of `TaskRun.metadataVersion` on the PG side (which the + // UpdateMetadataService uses for the same retry-on-conflict pattern). + metadataVersion: stringToInt.default("0"), lastError: stringToError.optional(), }); diff --git a/packages/trigger-sdk/src/v3/ai-shared.ts b/packages/trigger-sdk/src/v3/ai-shared.ts index 35b61910563..a0ea3036cff 100644 --- a/packages/trigger-sdk/src/v3/ai-shared.ts +++ b/packages/trigger-sdk/src/v3/ai-shared.ts @@ -16,7 +16,7 @@ */ import type { Task, AnyTask } from "@trigger.dev/core/v3"; -import type { ModelMessage, UIMessage } from "ai"; +import type { InferUITools, ModelMessage, ToolSet, UIDataTypes, UIMessage } from "ai"; /** * Message-part `type` value for the pending-message data part the agent @@ -199,6 +199,26 @@ export type InferChatUIMessage = TTask extends Task< ? TUIM : UIMessage; +/** + * Derive the chat `UIMessage` type for a given tool set. The tool-part types + * (`tool-${name}` with typed input/output) are inferred from the tools. Use + * this to declare the message type from your tools (e.g. to pass to + * `chat.withUIMessage<...>()` or to type the frontend) without hand-writing + * the `UIMessage>` triple. + * + * @example + * ```ts + * import type { InferChatUIMessageFromTools } from "@trigger.dev/sdk/ai"; + * const tools = { search, readFile }; + * type ChatUiMessage = InferChatUIMessageFromTools; + * ``` + */ +export type InferChatUIMessageFromTools = UIMessage< + unknown, + UIDataTypes, + InferUITools +>; + /** * Upsert an incoming wire message into the customer's DB-backed chain * inside a `hydrateMessages` hook. 
Returns `true` iff the chain was diff --git a/packages/trigger-sdk/src/v3/ai.ts b/packages/trigger-sdk/src/v3/ai.ts index d1a6a226023..62aad9d8c57 100644 --- a/packages/trigger-sdk/src/v3/ai.ts +++ b/packages/trigger-sdk/src/v3/ai.ts @@ -102,7 +102,16 @@ const METADATA_KEY = "tool.execute.options"; * stopped/aborted conversations with partial tool parts. */ function toModelMessages(messages: UIMessage[]): Promise { - return convertToModelMessages(messages, { ignoreIncompleteToolCalls: true }); + // Pass the resolved per-turn `tools` (if any) so the AI SDK can look up each + // tool's `toModelOutput` and re-apply it to prior-turn tool results. Without + // `tools` it falls back to JSON-stringifying the raw output (TRI-10149). The + // conditional spread keeps the options object byte-identical to the no-tools + // path when nothing was declared. + const tools = locals.get(chatResolvedToolsKey); + return convertToModelMessages(messages, { + ignoreIncompleteToolCalls: true, + ...(tools ? { tools } : {}), + }); } export type ToolCallExecutionOptions = { @@ -1425,7 +1434,10 @@ export type ChatTaskSignals = { * The full payload passed to a `chatAgent` run function. * Extends `ChatTaskPayload` (the wire payload) with abort signals. */ -export type ChatTaskRunPayload = ChatTaskPayload & +export type ChatTaskRunPayload< + TClientData = unknown, + TTools extends ToolSet = ToolSet, +> = ChatTaskPayload & ChatTaskSignals & { /** * Task run context — same object as the `ctx` passed to a standard `task({ run })` handler’s second argument. @@ -1436,6 +1448,21 @@ export type ChatTaskRunPayload = ChatTaskPayload + * streamText({ model, messages, tools, abortSignal: signal }) + * ``` + * + * Declaring `tools` on the config is also what lets the SDK re-run each + * tool's `toModelOutput` when it re-converts prior-turn history (see the + * `tools` option on `chat.agent`). Empty object when no `tools` were declared. 
+ */ + tools: TTools; }; // Input streams for bidirectional chat communication @@ -2366,6 +2393,20 @@ const chatPrepareMessagesKey = locals.create<(event: PrepareMessagesEvent) => ModelMessage[] | Promise>( "chat.prepareMessages" ); +/** + * @internal The raw `tools` option from `chat.agent({ tools })`, either a + * static `ToolSet` or a per-turn function. Set once at boot. + */ +const chatToolsOptionKey = locals.create< + ToolSet | ((event: ResolveToolsEvent) => ToolSet | Promise) +>("chat.toolsOption"); +/** + * @internal The concrete `ToolSet` resolved for the current turn. Read by + * `toModelMessages` so `convertToModelMessages` can re-run `toModelOutput` on + * prior-turn tool results. Unset when no `tools` were declared (preserves the + * exact pre-feature conversion behavior). + */ +const chatResolvedToolsKey = locals.create("chat.resolvedTools"); /** @internal Flag set by `chat.requestUpgrade()` to exit the loop after the current turn. */ const chatUpgradeRequestedKey = locals.create("chat.upgradeRequested"); @@ -2626,6 +2667,25 @@ export type PrepareMessagesEvent = { clientData?: TClientData; }; +/** + * Event passed to the per-turn `tools` function form on `chat.agent`. + * + * Use this when the active tool set depends on per-turn context (the user, a + * feature flag, etc.). Return the `ToolSet` to use for converting this turn's + * history. Only `inputSchema` and `toModelOutput` are read during conversion, + * so a lightweight map (no `execute`) is fine. + */ +export type ResolveToolsEvent = { + /** The chat session ID. */ + chatId: string; + /** The current turn number (0-indexed). */ + turn: number; + /** Whether this run is continuing an existing chat. */ + continuation: boolean; + /** Custom data from the frontend. */ + clientData?: TClientData; +}; + /** * Data shape for `data-compaction` stream chunks emitted during compaction. * Use to type the `data` field when rendering compaction parts in the frontend. 
@@ -2800,6 +2860,41 @@ async function applyPrepareMessages( ); } +/** + * Resolve the `tools` option into a concrete `ToolSet` and cache it in locals so + * `toModelMessages` can pass it to `convertToModelMessages`. For the function + * form, invokes the user function with the given context (or the current turn + * context when no override is passed). Pass an `override` for the boot-time + * history conversion, which runs before the per-turn context exists and uses + * the run/continuation payload's `clientData`. + * + * Fails closed: a throwing resolver propagates rather than carrying a prior + * turn's set forward. The function form can gate capabilities by user or flag, + * so reusing stale tools would leak capabilities. No-op when no `tools` were + * declared. + * @internal + */ +async function resolveTurnTools( + override?: { chatId: string; turn: number; continuation: boolean; clientData: unknown } +): Promise { + const option = locals.get(chatToolsOptionKey); + if (!option) return; + + if (typeof option !== "function") { + locals.set(chatResolvedToolsKey, option); + return; + } + + const ctx = override ?? locals.get(chatTurnContextKey); + const resolved = await option({ + chatId: ctx?.chatId ?? "", + turn: ctx?.turn ?? 0, + continuation: ctx?.continuation ?? false, + clientData: ctx?.clientData, + }); + locals.set(chatResolvedToolsKey, resolved); +} + /** * Read the current compaction state. Returns the summary and base message count * if compaction has occurred in this turn, or `undefined` if not. @@ -4250,6 +4345,7 @@ export type ChatAgentOptions< TClientDataSchema extends TaskSchema | undefined = undefined, TUIMessage extends UIMessage = UIMessage, TActionSchema extends TaskSchema | undefined = undefined, + TTools extends ToolSet = ToolSet, > = Omit< TaskOptions< TIdentifier, @@ -4360,6 +4456,41 @@ export type ChatAgentOptions< > ) => Promise | unknown; + /** + * The tools available to this agent. + * + * `chat.agent` doesn't call the model for you. 
Your tools still go to + * `streamText({ tools })` inside `run()`. Declaring them here additionally + * lets the SDK re-run each tool's + * [`toModelOutput`](https://ai-sdk.dev/docs/ai-sdk-core/tools-and-tool-calling#tomodeloutput) + * when it re-converts persisted history on later turns. Without this, the + * AI SDK has no `tools` to look up `toModelOutput` against, so a tool's + * transformed result (e.g. raw image bytes → an image content part, or a + * sub-agent summary) silently degrades to its raw JSON output from turn 2 + * onward. + * + * Only `inputSchema` and `toModelOutput` are read during conversion (never + * `execute`), so you may pass a lightweight map if you keep heavy execute + * deps out of this module. + * + * Pass either a static `ToolSet` or a function of per-turn context (for + * tools that depend on the user, a feature flag, etc.). The resolved set is + * available on the `run()` payload as `tools`. + * + * @example + * ```ts + * const tools = { read_file, search }; + * chat.agent({ + * tools, + * run: async ({ messages, tools, signal }) => + * streamText({ model, messages, tools, abortSignal: signal }), + * }); + * ``` + */ + tools?: + | TTools + | ((event: ResolveToolsEvent>) => TTools | Promise); + /** * The run function for the chat task. * @@ -4370,7 +4501,9 @@ export type ChatAgentOptions< * **Auto-piping:** If this function returns a value with `.toUIMessageStream()`, * the stream is automatically piped to the frontend. 
*/ - run: (payload: ChatTaskRunPayload>) => Promise; + run: ( + payload: ChatTaskRunPayload, TTools> + ) => Promise; /** * Called once at the start of every run boot — for the initial run, for @@ -4951,8 +5084,9 @@ function chatAgent< TClientDataSchema extends TaskSchema | undefined = undefined, TUIMessage extends UIMessage = UIMessage, TActionSchema extends TaskSchema | undefined = undefined, + TTools extends ToolSet = ToolSet, >( - options: ChatAgentOptions + options: ChatAgentOptions ): Task>, unknown> { const { run: userRun, @@ -4971,6 +5105,7 @@ function chatAgent< compaction, pendingMessages: pendingMessagesConfig, prepareMessages, + tools: toolsOption, onTurnComplete, maxTurns = 100, turnTimeout = "1h", @@ -5049,6 +5184,25 @@ function chatAgent< locals.set(chatPrepareMessagesKey, prepareMessages); } + if (toolsOption) { + // Cast: the option's function form is typed against the parsed + // `clientData` (`ResolveToolsEvent>`), but the + // locals key uses the erased `ResolveToolsEvent`. The runtime + // value is identical; this mirrors how `prepareMessages` is stored. + locals.set( + chatToolsOptionKey, + toolsOption as + | ToolSet + | ((event: ResolveToolsEvent) => ToolSet | Promise) + ); + // Static tools are usable immediately. The function form is resolved + // just before the boot history conversion (with the payload's + // clientData) and again per-turn (see resolveTurnTools). + if (typeof toolsOption !== "function") { + locals.set(chatResolvedToolsKey, toolsOption); + } + } + if (compaction) { locals.set( chatAgentCompactionKey, @@ -5438,6 +5592,29 @@ function chatAgent< } if (accumulatedUIMessages.length > 0) { + // Resolve a function-form `tools` with the run/continuation payload's + // clientData so this conversion of the restored history applies each + // tool's toModelOutput (static tools were already seeded above). 
This + // only re-renders saved history, so it fails open: a resolver hiccup + // logs and converts without tools rather than blocking the resume. + // Per-turn resolveTurnTools still fails closed for live turns. + if (typeof toolsOption === "function") { + try { + await resolveTurnTools({ + chatId: payload.chatId, + turn: 0, + continuation: payload.continuation ?? false, + clientData: parseClientData + ? await parseClientData(payload.metadata) + : payload.metadata, + }); + } catch (error) { + logger.warn( + "chat.agent: tools() resolver threw at boot; restored history converted without toModelOutput", + { error: error instanceof Error ? error.message : String(error) } + ); + } + } try { accumulatedMessages = await toModelMessages(accumulatedUIMessages); } catch (error) { @@ -5958,6 +6135,11 @@ function chatAgent< clientData, }); + // Resolve the per-turn `tools` set now that turn context + // (incl. parsed clientData) exists, so every toModelMessages + // call this turn can re-apply tool `toModelOutput`. + await resolveTurnTools(); + // Per-turn stop controller (reset each turn) const stopController = new AbortController(); currentStopController = stopController; @@ -6613,6 +6795,7 @@ function chatAgent< previousTurnUsage, totalUsage: cumulativeUsage, ctx, + tools: locals.get(chatResolvedToolsKey) ?? {}, signal: combinedSignal, cancelSignal, stopSignal, @@ -7512,11 +7695,11 @@ export interface ChatBuilder< * (backwards compatible). */ agent: [TClientDataSchema] extends [undefined] - ? ( - options: ChatAgentOptions + ? ( + options: ChatAgentOptions ) => Task>, unknown> - : ( - options: Omit, "clientDataSchema"> + : ( + options: Omit, "clientDataSchema"> ) => Task>, unknown>; /** @@ -9145,7 +9328,11 @@ function chatLocal>(options: { id: string }): // the browser graph. Re-exported here so `@trigger.dev/sdk/ai` consumers // still see them. 
import type { InferChatClientData, InferChatUIMessage } from "./ai-shared.js"; -export type { InferChatClientData, InferChatUIMessage } from "./ai-shared.js"; +export type { + InferChatClientData, + InferChatUIMessage, + InferChatUIMessageFromTools, +} from "./ai-shared.js"; /** * Options for {@link createChatStartSessionAction}. diff --git a/packages/trigger-sdk/test/mockChatAgent.test.ts b/packages/trigger-sdk/test/mockChatAgent.test.ts index 3832e64b848..7245622b6e4 100644 --- a/packages/trigger-sdk/test/mockChatAgent.test.ts +++ b/packages/trigger-sdk/test/mockChatAgent.test.ts @@ -2082,3 +2082,139 @@ describe("mockChatAgent", () => { }); }); }); + +describe("mockChatAgent tools / toModelOutput (TRI-10149)", () => { + // A tool whose raw `execute`/output never contains the marker; the marker + // lives ONLY in `toModelOutput`. If the SDK re-converts prior-turn history + // without threading tools, `toModelOutput` is skipped and the marker is lost. + const makeVault = () => + tool({ + description: "Vault.", + inputSchema: z.object({}), + toModelOutput: () => ({ type: "text" as const, value: "MARKER-XYZ" }), + }); + + // Seed a prior assistant turn that already carries a resolved vault tool + // result whose raw output has NO marker. 
+ const seedAssistantWithToolResult = { + id: "a-vault", + role: "assistant" as const, + parts: [ + { + type: "tool-vault" as const, + toolCallId: "tc_vault", + state: "output-available" as const, + input: {}, + output: { bytes: "raw-no-marker" }, + }, + ], + }; + + it("re-applies tool.toModelOutput when re-converting prior-turn history (static tools)", async () => { + const vault = makeVault(); + const model = new MockLanguageModelV3({ + doStream: async () => ({ stream: textStream("ok") }), + }); + + const seenToolResults: string[] = []; + const agent = chat.agent({ + id: "mockChatAgent.toModelOutput-static", + tools: { vault }, + run: async ({ messages, tools, signal }) => { + // REUSE A: `tools` is threaded onto the run payload (typed concretely, + // not the broad `ToolSet`). The static-form type inference is validated + // end-to-end by the references/ai-chat typecheck; here we exercise the + // runtime behavior. (test/ is not part of the package's tsc pass.) + for (const m of messages) { + if (m.role === "tool") seenToolResults.push(JSON.stringify(m.content)); + } + return streamText({ model, messages, tools, abortSignal: signal }); + }, + }); + + const harness = mockChatAgent(agent, { chatId: "test-tmo-static" }); + try { + // Turn 1 seeds the tool result; turn 2 forces a re-conversion of history. + await harness.sendMessage(seedAssistantWithToolResult as any); + await harness.sendMessage(userMessage("recall")); + await new Promise((r) => setTimeout(r, 20)); + + const all = seenToolResults.join("|"); + // toModelOutput ran → transformed value present, raw output gone. 
+ expect(all).toContain("MARKER-XYZ"); + expect(all).not.toContain("raw-no-marker"); + } finally { + await harness.close(); + } + }); + + it("resolves the per-turn function form of tools and re-applies toModelOutput", async () => { + const vault = makeVault(); + const model = new MockLanguageModelV3({ + doStream: async () => ({ stream: textStream("ok") }), + }); + + const seenToolResults: string[] = []; + let resolverCalls = 0; + const agent = chat.agent({ + id: "mockChatAgent.toModelOutput-fn", + tools: () => { + resolverCalls++; + return { vault }; + }, + run: async ({ messages, tools, signal }) => { + for (const m of messages) { + if (m.role === "tool") seenToolResults.push(JSON.stringify(m.content)); + } + return streamText({ model, messages, tools, abortSignal: signal }); + }, + }); + + const harness = mockChatAgent(agent, { chatId: "test-tmo-fn" }); + try { + await harness.sendMessage(seedAssistantWithToolResult as any); + await harness.sendMessage(userMessage("recall")); + await new Promise((r) => setTimeout(r, 20)); + + const all = seenToolResults.join("|"); + expect(all).toContain("MARKER-XYZ"); + expect(all).not.toContain("raw-no-marker"); + // The resolver runs per turn (once each), not per conversion call. 
+ expect(resolverCalls).toBeGreaterThanOrEqual(2); + } finally { + await harness.close(); + } + }); + + it("leaves conversion unchanged when no tools are declared (raw output passes through)", async () => { + const model = new MockLanguageModelV3({ + doStream: async () => ({ stream: textStream("ok") }), + }); + + const seenToolResults: string[] = []; + const agent = chat.agent({ + id: "mockChatAgent.toModelOutput-none", + run: async ({ messages, signal }) => { + for (const m of messages) { + if (m.role === "tool") seenToolResults.push(JSON.stringify(m.content)); + } + return streamText({ model, messages, abortSignal: signal }); + }, + }); + + const harness = mockChatAgent(agent, { chatId: "test-tmo-none" }); + try { + await harness.sendMessage(seedAssistantWithToolResult as any); + await harness.sendMessage(userMessage("recall")); + await new Promise((r) => setTimeout(r, 20)); + + // No tools declared → no toModelOutput lookup → raw output stringified + // (the pre-feature behavior, preserved for backward compatibility). + const all = seenToolResults.join("|"); + expect(all).toContain("raw-no-marker"); + expect(all).not.toContain("MARKER-XYZ"); + } finally { + await harness.close(); + } + }); +}); diff --git a/references/ai-chat/src/components/chat-sidebar.tsx b/references/ai-chat/src/components/chat-sidebar.tsx index 9707b61ac36..f49e20ec2d2 100644 --- a/references/ai-chat/src/components/chat-sidebar.tsx +++ b/references/ai-chat/src/components/chat-sidebar.tsx @@ -119,6 +119,7 @@ export function ChatSidebar({ +