Skip to content

Commit 23fb168

Browse files
jrusso1020 and claude committed
feat(lambda): warn when slowest chunk approaches the 15-min cap
Render-time, post-hoc warning: when the slowest RenderChunk Lambda invocation burned through more than 80% of the 900-second cap, surface a warning at the end of --wait mode pointing at --max-parallel-chunks.

The cost-analysis sweep hit this twice — inspector-launch at 1080p/60 and 4K@anything blew past the cap with default 16-way fan-out, producing a Sandbox.Timedout retry storm. The next user to push fps or duration on a heavy composition will hit the same wall; this turns a cryptic SFN failure into a one-line hint they can act on before the next render.

Plumbing:
- getRenderProgress tracks max billed-duration across RenderChunk invocations (the only state whose runtime is gated by the 15-min cap; Plan + Assemble are off-path).
- RenderProgress.maxChunkDurationMs is null before the first chunk reports back.
- LAMBDA_TIMEOUT_MS / CHUNK_RUNTIME_WARN_RATIO / CHUNK_RUNTIME_WARN_MS live in chunkRuntime.ts and are exported from the SDK so external callers (custom CLIs, monitoring) can match the threshold.
- CLI's render --wait path prints the warning with a suggested --max-parallel-chunks value scaled by the observed headroom ratio.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 3e5b059 commit 23fb168

5 files changed

Lines changed: 185 additions & 0 deletions

File tree

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
/**
 * Per-chunk runtime constants + warning threshold for `RenderChunk` Lambda
 * invocations.
 *
 * Lambda's hard per-invocation cap is 900 seconds; a chunk that runs to
 * 720s+ is one bad cold-start away from `Sandbox.Timedout` on a slightly
 * heavier next render (more fps, more duration, more compositing). The
 * cost-analysis sweep that produced this module hit the cap twice — both
 * with default 16-way fan-out on heavy WebGL — and the
 * `getRenderProgress` SDK now surfaces the slowest observed chunk so
 * callers can warn ahead of that failure.
 */

/** Lambda's hard per-invocation cap, in milliseconds (900 seconds). */
export const LAMBDA_TIMEOUT_MS = 900_000;

/** Fraction of {@link LAMBDA_TIMEOUT_MS} above which we surface the warning. */
export const CHUNK_RUNTIME_WARN_RATIO = 0.8;

/**
 * Pre-computed `LAMBDA_TIMEOUT_MS × CHUNK_RUNTIME_WARN_RATIO`, in
 * milliseconds (720 000 with the values above).
 */
export const CHUNK_RUNTIME_WARN_MS = LAMBDA_TIMEOUT_MS * CHUNK_RUNTIME_WARN_RATIO;

packages/aws-lambda/src/sdk/getRenderProgress.test.ts

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,85 @@ describe("getRenderProgress", () => {
357357
]);
358358
});
359359

360+
it("tracks the slowest RenderChunk billed duration for the CLI warning", async () => {
  const sfn = new FakeSFN();
  // One complete RenderChunk invocation reporting `ms` of billed duration.
  const renderChunk = (ms: number) => [
    stateEntered("RenderChunk"),
    taskScheduled(),
    taskSucceeded({ Action: "renderChunk", FramesEncoded: 10, DurationMs: ms }),
  ];
  sfn.historyPages = [
    [
      stateEntered("Plan"),
      taskScheduled(),
      taskSucceeded({ Action: "plan", TotalFrames: 30, DurationMs: 1_000 }),
      ...renderChunk(100_000),
      ...renderChunk(820_000), // 91% of the 900s cap
      ...renderChunk(50_000),
      stateEntered("Assemble"),
      taskScheduled(),
      taskSucceeded({
        Action: "assemble",
        FileSize: 1_000,
        OutputS3Uri: "s3://b/k.mp4",
        DurationMs: 5_000,
      }),
    ],
  ];
  sfn.describe.status = "SUCCEEDED";
  const progress = await getRenderProgress({
    executionArn: "arn",
    sfn: sfn as unknown as SFNClient,
  });
  // The middle chunk is the slowest, so it must be the reported max.
  expect(progress.maxChunkDurationMs).toBe(820_000);
});
392+
393+
it("ignores Plan/Assemble billed durations in maxChunkDurationMs", async () => {
  const sfn = new FakeSFN();
  sfn.historyPages = [
    [
      stateEntered("Plan"),
      taskScheduled(),
      // 60s plan — would be the max if Plan counted, but it shouldn't.
      taskSucceeded({ Action: "plan", TotalFrames: 30, DurationMs: 60_000 }),
      stateEntered("RenderChunk"),
      taskScheduled(),
      taskSucceeded({ Action: "renderChunk", FramesEncoded: 10, DurationMs: 12_000 }),
      stateEntered("Assemble"),
      taskScheduled(),
      // 90s assemble — also would be the max if Assemble counted.
      taskSucceeded({
        Action: "assemble",
        FileSize: 1_000,
        OutputS3Uri: "s3://b/k.mp4",
        DurationMs: 90_000,
      }),
    ],
  ];
  sfn.describe.status = "SUCCEEDED";
  const progress = await getRenderProgress({
    executionArn: "arn",
    sfn: sfn as unknown as SFNClient,
  });
  // Only the single RenderChunk invocation may contribute to the max.
  expect(progress.maxChunkDurationMs).toBe(12_000);
});
422+
423+
it("maxChunkDurationMs is null before any RenderChunk completes", async () => {
  const sfn = new FakeSFN();
  // History contains only the Plan step — no RenderChunk has reported yet.
  sfn.historyPages = [
    [
      stateEntered("Plan"),
      taskScheduled(),
      taskSucceeded({ Action: "plan", TotalFrames: 30, DurationMs: 1_000 }),
    ],
  ];
  const progress = await getRenderProgress({
    executionArn: "arn",
    sfn: sfn as unknown as SFNClient,
  });
  expect(progress.maxChunkDurationMs).toBeNull();
});
438+
360439
it("billed-seconds sum matches what the cost-analysis script computes", async () => {
361440
// Real-shape regression: cost script summed `DurationMs` across plan
362441
// + renderChunk + assemble TaskSucceeded events and pinned the

packages/aws-lambda/src/sdk/getRenderProgress.ts

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,17 @@ export interface RenderProgress {
8181
*/
8282
lambdasInvoked: number;
8383
costs: RenderCost;
84+
/**
85+
* Per-chunk runtime headroom snapshot. `maxChunkDurationMs` is the
86+
* billed duration of the slowest `RenderChunk` Lambda invocation so
87+
* far; null until the first chunk reports back. The CLI warns when
88+
* this exceeds {@link CHUNK_RUNTIME_WARN_RATIO} of Lambda's 900-second
89+
* hard cap so the user knows to bump `--max-parallel-chunks` before
90+
* the next render hits a `Sandbox.Timedout` retry storm. Defaults
91+
* tuned from the cost-analysis sweep where inspector-launch at
92+
* 1080p/60fps with mpc=16 blew past the cap.
93+
*/
94+
maxChunkDurationMs: number | null;
8495
/** Final output object if Assemble succeeded; `null` otherwise. */
8596
outputFile: { s3Uri: string; bytes: number | null } | null;
8697
errors: RenderError[];
@@ -125,6 +136,7 @@ export async function getRenderProgress(opts: GetRenderProgressOptions): Promise
125136
totalFrames: summary.totalFrames,
126137
lambdasInvoked: summary.lambdasInvoked,
127138
costs,
139+
maxChunkDurationMs: summary.maxChunkDurationMs,
128140
outputFile: summary.outputFile,
129141
errors: summary.errors,
130142
fatalErrorEncountered: isTerminalFailure(status),
@@ -159,6 +171,8 @@ interface HistorySummary {
159171
totalFrames: number | null;
160172
lambdasInvoked: number;
161173
assembleComplete: boolean;
174+
/** Slowest billed-duration across `RenderChunk` invocations; null if none observed. */
175+
maxChunkDurationMs: number | null;
162176
outputFile: { s3Uri: string; bytes: number | null } | null;
163177
errors: RenderError[];
164178
}
@@ -177,6 +191,7 @@ function summarizeHistory(events: HistoryEvent[], memoryMb: number): HistorySumm
177191
let assembleComplete = false;
178192
let outputFile: HistorySummary["outputFile"] = null;
179193
let stateTransitions = 0;
194+
let maxChunkDurationMs: number | null = null;
180195
const errors: RenderError[] = [];
181196
const lambdaInvocations: BilledLambdaInvocation[] = [];
182197

@@ -236,6 +251,11 @@ function summarizeHistory(events: HistoryEvent[], memoryMb: number): HistorySumm
236251
applyPayloadFrameCounts(payload, currentLambdaState, (delta) => {
237252
framesRendered += delta;
238253
});
254+
maxChunkDurationMs = bumpMaxChunkDuration(
255+
maxChunkDurationMs,
256+
currentLambdaState,
257+
billedDurationMs,
258+
);
239259
if (payload && typeof payload === "object") {
240260
const obj = payload as Record<string, unknown>;
241261
if (typeof obj.TotalFrames === "number") totalFrames = obj.TotalFrames;
@@ -250,6 +270,11 @@ function summarizeHistory(events: HistoryEvent[], memoryMb: number): HistorySumm
250270
memorySizeMb: memoryMb,
251271
estimated: billedDurationMs === 0,
252272
});
273+
maxChunkDurationMs = bumpMaxChunkDuration(
274+
maxChunkDurationMs,
275+
currentLambdaState,
276+
billedDurationMs,
277+
);
253278
if (payload && typeof payload === "object") {
254279
const obj = payload as Record<string, unknown>;
255280
if (typeof obj.TotalFrames === "number") totalFrames = obj.TotalFrames;
@@ -334,6 +359,7 @@ function summarizeHistory(events: HistoryEvent[], memoryMb: number): HistorySumm
334359
totalFrames,
335360
lambdasInvoked,
336361
assembleComplete,
362+
maxChunkDurationMs,
337363
outputFile,
338364
errors,
339365
};
@@ -369,6 +395,25 @@ function unwrapLambdaPayload(payload: unknown): unknown {
369395
return payload;
370396
}
371397

398+
/**
 * Bump the running `maxChunkDurationMs` if this Lambda invocation
 * belongs to the `RenderChunk` state and reported a usable billed
 * duration. Shared between the `TaskSucceeded` and
 * `LambdaFunctionSucceeded` branches so both integrations contribute
 * to the same headroom counter — and so the duplication-gate stays
 * happy.
 *
 * @param current - Running maximum so far; `null` when no chunk has
 *   been counted yet.
 * @param currentLambdaState - Name of the state the invocation belongs
 *   to; anything other than `"RenderChunk"` is ignored.
 * @param billedDurationMs - Billed duration reported for the invocation.
 * @returns The new maximum, or `current` unchanged when the invocation
 *   is not a `RenderChunk` or its duration is zero, negative, or not a
 *   finite number.
 */
function bumpMaxChunkDuration(
  current: number | null,
  currentLambdaState: string | null,
  billedDurationMs: number,
): number | null {
  if (currentLambdaState !== "RenderChunk") return current;
  // `NaN <= 0` is false, so without the finiteness check a NaN duration
  // would flow into Math.max and poison the running max for good.
  if (!Number.isFinite(billedDurationMs) || billedDurationMs <= 0) return current;
  return Math.max(current ?? 0, billedDurationMs);
}
416+
372417
/**
373418
* Apply `FramesEncoded` from a Lambda success payload to the running
374419
* counter, but only when the enclosing state is the chunk-render step.

packages/aws-lambda/src/sdk/index.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@ export {
2222
computeRenderCost,
2323
type RenderCost,
2424
} from "./costAccounting.js";
25+
export {
26+
CHUNK_RUNTIME_WARN_MS,
27+
CHUNK_RUNTIME_WARN_RATIO,
28+
LAMBDA_TIMEOUT_MS,
29+
} from "./chunkRuntime.js";
2530
export {
2631
InvalidConfigError,
2732
MAX_STEP_FUNCTIONS_INPUT_BYTES,

packages/cli/src/commands/lambda/render.ts

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ async function waitForCompletion(
223223
console.log(` ${c.dim("Output:")} ${progress.outputFile.s3Uri}`);
224224
console.log(` ${c.dim("Size:")} ${progress.outputFile.bytes ?? "?"} bytes`);
225225
console.log(` ${c.dim("Total cost:")} ${progress.costs.displayCost}`);
226+
await warnIfChunkRuntimeIsCloseToCap(progress);
226227
} else {
227228
console.log();
228229
console.log(c.error(`Render ended with status ${progress.status}.`));
@@ -237,6 +238,40 @@ async function waitForCompletion(
237238
}
238239
}
239240

241+
/**
 * Surface a runtime-headroom warning when the slowest chunk reached the
 * SDK's warning threshold (`CHUNK_RUNTIME_WARN_MS`, 80% of Lambda's
 * 900-second cap). A future render that adds fps, duration, or
 * composition complexity to the same project will push past the cap and
 * hit `Sandbox.Timedout` — that's the exact symptom that wedged the
 * cost-analysis sweep (see the sparticuz-executable-guard fix). Prints
 * the actual seconds + a suggested fan-out so the user can act on it
 * before the next render.
 *
 * Does nothing when no chunk duration has been observed yet
 * (`maxChunkDurationMs === null`) or the slowest chunk is below the
 * threshold.
 *
 * NOTE(review): the suggestion is a fixed tier (32 / 64 / 128) and is not
 * compared against the fan-out this render actually used;
 * `lambdasInvoked` is accepted but not read here.
 *
 * @param progress - The slowest-chunk duration (ms, or `null`) and the
 *   Lambda invocation count from the final render progress.
 */
// fallow-ignore-next-line complexity
async function warnIfChunkRuntimeIsCloseToCap(progress: {
  maxChunkDurationMs: number | null;
  lambdasInvoked: number;
}): Promise<void> {
  const { CHUNK_RUNTIME_WARN_MS, LAMBDA_TIMEOUT_MS } = await loadSDK();
  const max = progress.maxChunkDurationMs;
  if (max === null || max < CHUNK_RUNTIME_WARN_MS) return;
  const slowestSec = Math.round(max / 1000);
  const capSec = LAMBDA_TIMEOUT_MS / 1000;
  // Pick the suggestion from how much of the hard cap the slowest chunk
  // consumed. A successful invocation cannot run longer than the cap, so
  // past the gate above this fraction sits between the warn ratio and 1.
  // Measuring against the warn threshold instead tops out at 1.25 with
  // the default 0.8 ratio, which made a `> 1.5` tier unreachable.
  const capFraction = max / LAMBDA_TIMEOUT_MS;
  const suggestedFanOut = capFraction >= 0.95 ? 128 : capFraction >= 0.9 ? 64 : 32;
  console.log();
  console.log(
    c.warn(
      `Heads up: slowest chunk ran ${slowestSec}s of the ${capSec}s Lambda cap. ` +
        `Adding fps, duration, or complexity to this composition will likely trip ` +
        `Sandbox.Timedout on the next render.\n` +
        ` Mitigate with: --max-parallel-chunks ${suggestedFanOut} (shrinks per-chunk work).`,
    ),
  );
}
274+
240275
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param ms - Delay handed to `setTimeout`, in milliseconds.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

0 commit comments

Comments
 (0)