feat(lambda): warn when slowest chunk approaches the 15-min cap

jrusso1020 · claude · jrusso1020 · commit 23fb16813ff8 · 2026-05-21T23:18:16.000Z
Render-time, post-hoc warning: when the slowest RenderChunk Lambda
invocation burned through more than 80% of the 900-second cap, surface
a warning at the end of --wait mode pointing at --max-parallel-chunks.
The cost-analysis sweep hit this twice — inspector-launch at 1080p/60
and 4K@anything blew past the cap with default 16-way fan-out, producing
a Sandbox.Timedout retry storm. The next user to push fps or duration
on a heavy composition will hit the same wall; this turns a cryptic
SFN failure into a one-line hint they can act on before the next
render.

Plumbing:
 - getRenderProgress tracks max billed-duration across RenderChunk
   invocations (the only state whose runtime is gated by the 15-min
   cap; Plan + Assemble are off-path).
 - RenderProgress.maxChunkDurationMs is null before the first chunk
   reports back.
 - LAMBDA_TIMEOUT_MS / CHUNK_RUNTIME_WARN_RATIO / CHUNK_RUNTIME_WARN_MS
   live in chunkRuntime.ts and are exported from the SDK so external
   callers (custom CLIs, monitoring) can match the threshold.
 - CLI's render --wait path prints the warning with a suggested
   --max-parallel-chunks value scaled by the observed headroom ratio.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/packages/aws-lambda/src/sdk/chunkRuntime.ts b/packages/aws-lambda/src/sdk/chunkRuntime.ts
@@ -0,0 +1,21 @@
+/**
+ * Per-chunk runtime constants + warning threshold for `RenderChunk` Lambda
+ * invocations.
+ *
+ * Lambda's hard per-invocation cap is 900 seconds; a chunk that runs to
+ * 720s+ (80% of the cap) is one bad cold-start away from
+ * `Sandbox.Timedout` on a slightly heavier next render (more fps, more
+ * duration, more compositing). The cost-analysis sweep that produced this
+ * module hit the cap twice — both with default 16-way fan-out on heavy
+ * WebGL — and `getRenderProgress` now surfaces the slowest observed
+ * chunk so callers can warn ahead of that failure.
+ */
+
+/** Lambda's hard per-invocation cap, in milliseconds (900 s = 15 min). */
+export const LAMBDA_TIMEOUT_MS = 900_000;
+
+/** Fraction of {@link LAMBDA_TIMEOUT_MS} at or above which the CLI surfaces the warning. */
+export const CHUNK_RUNTIME_WARN_RATIO = 0.8;
+
+/** Warning threshold in ms: `LAMBDA_TIMEOUT_MS × CHUNK_RUNTIME_WARN_RATIO` (720_000 ms = 720 s). */
+export const CHUNK_RUNTIME_WARN_MS = LAMBDA_TIMEOUT_MS * CHUNK_RUNTIME_WARN_RATIO;
diff --git a/packages/aws-lambda/src/sdk/getRenderProgress.test.ts b/packages/aws-lambda/src/sdk/getRenderProgress.test.ts
@@ -357,6 +357,85 @@ describe("getRenderProgress", () => {
       ]);
     });
 
+    it("tracks the slowest RenderChunk billed duration for the CLI warning", async () => {
+      const sfn = new FakeSFN();
+      const renderChunk = (ms: number) => [
+        stateEntered("RenderChunk"), // one successful chunk = enter + schedule + succeed
+        taskScheduled(),
+        taskSucceeded({ Action: "renderChunk", FramesEncoded: 10, DurationMs: ms }),
+      ];
+      sfn.historyPages = [
+        [
+          stateEntered("Plan"),
+          taskScheduled(),
+          taskSucceeded({ Action: "plan", TotalFrames: 30, DurationMs: 1_000 }),
+          ...renderChunk(100_000), // faster chunk reported before the slowest
+          ...renderChunk(820_000), // slowest chunk: 91% of the 900s cap
+          ...renderChunk(50_000), // faster chunk reported after the slowest
+          stateEntered("Assemble"),
+          taskScheduled(),
+          taskSucceeded({
+            Action: "assemble",
+            FileSize: 1_000,
+            OutputS3Uri: "s3://b/k.mp4",
+            DurationMs: 5_000,
+          }),
+        ],
+      ];
+      sfn.describe.status = "SUCCEEDED";
+      const progress = await getRenderProgress({
+        executionArn: "arn",
+        sfn: sfn as unknown as SFNClient,
+      });
+      expect(progress.maxChunkDurationMs).toBe(820_000); // the max, not the last or the sum
+    });
+
+    it("ignores Plan/Assemble billed durations in maxChunkDurationMs", async () => {
+      const sfn = new FakeSFN();
+      sfn.historyPages = [
+        [
+          stateEntered("Plan"),
+          taskScheduled(),
+          // 60s plan — would be the max if Plan counted, but it shouldn't.
+          taskSucceeded({ Action: "plan", TotalFrames: 30, DurationMs: 60_000 }),
+          stateEntered("RenderChunk"),
+          taskScheduled(),
+          taskSucceeded({ Action: "renderChunk", FramesEncoded: 10, DurationMs: 12_000 }),
+          stateEntered("Assemble"),
+          taskScheduled(),
+          // 90s assemble — also would be the max if Assemble counted.
+          taskSucceeded({
+            Action: "assemble",
+            FileSize: 1_000,
+            OutputS3Uri: "s3://b/k.mp4",
+            DurationMs: 90_000,
+          }),
+        ],
+      ];
+      sfn.describe.status = "SUCCEEDED";
+      const progress = await getRenderProgress({
+        executionArn: "arn",
+        sfn: sfn as unknown as SFNClient,
+      });
+      expect(progress.maxChunkDurationMs).toBe(12_000); // the single RenderChunk duration
+    });
+
+    it("maxChunkDurationMs is null before any RenderChunk completes", async () => {
+      const sfn = new FakeSFN();
+      sfn.historyPages = [
+        [
+          stateEntered("Plan"),
+          taskScheduled(),
+          taskSucceeded({ Action: "plan", TotalFrames: 30, DurationMs: 1_000 }), // only Plan so far
+        ],
+      ];
+      const progress = await getRenderProgress({
+        executionArn: "arn",
+        sfn: sfn as unknown as SFNClient,
+      });
+      expect(progress.maxChunkDurationMs).toBeNull(); // no RenderChunk success in the history
+    });
+
     it("billed-seconds sum matches what the cost-analysis script computes", async () => {
       // Real-shape regression: cost script summed `DurationMs` across plan
       // + renderChunk + assemble TaskSucceeded events and pinned the
diff --git a/packages/aws-lambda/src/sdk/getRenderProgress.ts b/packages/aws-lambda/src/sdk/getRenderProgress.ts
@@ -81,6 +81,17 @@ export interface RenderProgress {
    */
   lambdasInvoked: number;
   costs: RenderCost;
+  /**
+   * Per-chunk runtime headroom snapshot. `maxChunkDurationMs` is the
+   * billed duration of the slowest `RenderChunk` Lambda invocation so
+   * far; null until the first chunk reports back. The CLI warns when
+   * this exceeds {@link CHUNK_RUNTIME_WARN_RATIO} of Lambda's 900-second
+   * hard cap so the user knows to bump `--max-parallel-chunks` before
+   * the next render hits a `Sandbox.Timedout` retry storm. The default
+   * threshold was tuned from the cost-analysis sweep where inspector-launch
+   * at 1080p/60fps with `--max-parallel-chunks 16` blew past the cap.
+   */
+  maxChunkDurationMs: number | null;
   /** Final output object if Assemble succeeded; `null` otherwise. */
   outputFile: { s3Uri: string; bytes: number | null } | null;
   errors: RenderError[];
@@ -125,6 +136,7 @@ export async function getRenderProgress(opts: GetRenderProgressOptions): Promise
     totalFrames: summary.totalFrames,
     lambdasInvoked: summary.lambdasInvoked,
     costs,
+    maxChunkDurationMs: summary.maxChunkDurationMs,
     outputFile: summary.outputFile,
     errors: summary.errors,
     fatalErrorEncountered: isTerminalFailure(status),
@@ -159,6 +171,8 @@ interface HistorySummary {
   totalFrames: number | null;
   lambdasInvoked: number;
   assembleComplete: boolean;
+  /** Slowest billed-duration across `RenderChunk` invocations; null if none observed. */
+  maxChunkDurationMs: number | null;
   outputFile: { s3Uri: string; bytes: number | null } | null;
   errors: RenderError[];
 }
@@ -177,6 +191,7 @@ function summarizeHistory(events: HistoryEvent[], memoryMb: number): HistorySumm
   let assembleComplete = false;
   let outputFile: HistorySummary["outputFile"] = null;
   let stateTransitions = 0;
+  let maxChunkDurationMs: number | null = null;
   const errors: RenderError[] = [];
   const lambdaInvocations: BilledLambdaInvocation[] = [];
 
@@ -236,6 +251,11 @@ function summarizeHistory(events: HistoryEvent[], memoryMb: number): HistorySumm
         applyPayloadFrameCounts(payload, currentLambdaState, (delta) => {
           framesRendered += delta;
         });
+        maxChunkDurationMs = bumpMaxChunkDuration(
+          maxChunkDurationMs,
+          currentLambdaState,
+          billedDurationMs,
+        );
         if (payload && typeof payload === "object") {
           const obj = payload as Record<string, unknown>;
           if (typeof obj.TotalFrames === "number") totalFrames = obj.TotalFrames;
@@ -250,6 +270,11 @@ function summarizeHistory(events: HistoryEvent[], memoryMb: number): HistorySumm
           memorySizeMb: memoryMb,
           estimated: billedDurationMs === 0,
         });
+        maxChunkDurationMs = bumpMaxChunkDuration(
+          maxChunkDurationMs,
+          currentLambdaState,
+          billedDurationMs,
+        );
         if (payload && typeof payload === "object") {
           const obj = payload as Record<string, unknown>;
           if (typeof obj.TotalFrames === "number") totalFrames = obj.TotalFrames;
@@ -334,6 +359,7 @@ function summarizeHistory(events: HistoryEvent[], memoryMb: number): HistorySumm
     totalFrames,
     lambdasInvoked,
     assembleComplete,
+    maxChunkDurationMs,
     outputFile,
     errors,
   };
@@ -369,6 +395,25 @@ function unwrapLambdaPayload(payload: unknown): unknown {
   return payload;
 }
 
+/**
+ * Fold one Lambda invocation into the running `maxChunkDurationMs`.
+ * The max only moves when BOTH hold: the enclosing state is
+ * `RenderChunk` and `billedDurationMs` is positive; if either check
+ * fails, `current` is returned unchanged (so it stays `null` until the
+ * first qualifying chunk). Shared between the `TaskSucceeded` and
+ * `LambdaFunctionSucceeded` branches so both integrations feed the
+ * same headroom counter without duplicating the logic.
+ */
+function bumpMaxChunkDuration(
+  current: number | null,
+  currentLambdaState: string | null,
+  billedDurationMs: number,
+): number | null {
+  if (currentLambdaState !== "RenderChunk") return current;
+  if (billedDurationMs <= 0) return current;
+  return Math.max(current ?? 0, billedDurationMs); // a null max is seeded as 0 for the first chunk
+}
+
 /**
  * Apply `FramesEncoded` from a Lambda success payload to the running
  * counter, but only when the enclosing state is the chunk-render step.
diff --git a/packages/aws-lambda/src/sdk/index.ts b/packages/aws-lambda/src/sdk/index.ts
@@ -22,6 +22,11 @@ export {
   computeRenderCost,
   type RenderCost,
 } from "./costAccounting.js";
+export {
+  CHUNK_RUNTIME_WARN_MS,
+  CHUNK_RUNTIME_WARN_RATIO,
+  LAMBDA_TIMEOUT_MS,
+} from "./chunkRuntime.js";
 export {
   InvalidConfigError,
   MAX_STEP_FUNCTIONS_INPUT_BYTES,
diff --git a/packages/cli/src/commands/lambda/render.ts b/packages/cli/src/commands/lambda/render.ts
@@ -223,6 +223,7 @@ async function waitForCompletion(
         console.log(`  ${c.dim("Output:")}        ${progress.outputFile.s3Uri}`);
         console.log(`  ${c.dim("Size:")}          ${progress.outputFile.bytes ?? "?"} bytes`);
         console.log(`  ${c.dim("Total cost:")}    ${progress.costs.displayCost}`);
+        await warnIfChunkRuntimeIsCloseToCap(progress);
       } else {
         console.log();
         console.log(c.error(`Render ended with status ${progress.status}.`));
@@ -237,6 +238,40 @@ async function waitForCompletion(
   }
 }
 
+/**
+ * Print a headroom warning when the slowest chunk used at least
+ * `CHUNK_RUNTIME_WARN_MS` (80% of Lambda's 900-second cap); silent when
+ * no chunk duration was recorded. The suggested fan-out starts from a
+ * tier chosen by how far past the threshold the chunk ran, then doubles
+ * until it exceeds `lambdasInvoked`, so a render that already used 32+
+ * chunks is never told to keep or shrink its fan-out.
+ * NOTE(review): assumes `lambdasInvoked` >= chunk count — confirm.
+ */
+// fallow-ignore-next-line complexity
+async function warnIfChunkRuntimeIsCloseToCap(progress: {
+  maxChunkDurationMs: number | null;
+  lambdasInvoked: number;
+}): Promise<void> {
+  const { CHUNK_RUNTIME_WARN_MS, LAMBDA_TIMEOUT_MS } = await loadSDK();
+  const { maxChunkDurationMs: max, lambdasInvoked: invoked } = progress;
+  if (max === null || max < CHUNK_RUNTIME_WARN_MS) return;
+  const slowestSec = Math.round(max / 1000);
+  const capSec = LAMBDA_TIMEOUT_MS / 1000;
+  const headroomRatio = max / CHUNK_RUNTIME_WARN_MS;
+  let suggestedFanOut = headroomRatio > 1.5 ? 128 : headroomRatio > 1.2 ? 64 : 32;
+  // Stay on powers of two, but always land above the fan-out already in use.
+  while (Number.isFinite(invoked) && suggestedFanOut <= invoked) suggestedFanOut *= 2;
+  console.log();
+  console.log(
+    c.warn(
+      `Heads up: slowest chunk ran ${slowestSec}s of the ${capSec}s Lambda cap. ` +
+        `Adding fps, duration, or complexity to this composition will likely trip ` +
+        `Sandbox.Timedout on the next render.\n` +
+        `  Mitigate with: --max-parallel-chunks ${suggestedFanOut} (shrinks per-chunk work).`,
+    ),
+  );
+}
+
 function sleep(ms: number): Promise<void> {
   return new Promise((res) => setTimeout(res, ms));
 }