fix(engine): cap volume-automation keyframes so dense fades keep their audio

miguel-heygen · miguel-heygen · commit bff2da58a9b5 · 2026-05-28T22:55:28.000-04:00
GSAP/JS volume automation is folded into an FFmpeg `volume` expression that nests one `if(lt(t,...))` per keyframe. The 60 Hz timeline probe emits 100-300 keyframes for a multi-second fade, nesting the expression deep enough to overflow FFmpeg's expression evaluator (build-dependent, ~95 levels; lower on some Linux ffmpeg builds). When that happens, filter-graph init fails, the whole audio mix fails, and the muxer drops the audio track entirely — so a `data-volume="0"` fade-in rendered with no audio at all (follow-up to #1066, where #1064's own scenario regressed once the fade was dense enough). Simplify the keyframes to a bounded piecewise-linear envelope before building the expression: Ramer-Douglas-Peucker at ~1% tolerance, with a uniform-downsample backstop that caps the envelope at 32 keyframes (`MAX_VOLUME_SEGMENTS`). A linear fade collapses to its two endpoints, an eased fade to a handful, and pathological audio-rate input is capped well under the evaluator's limit. Endpoints are preserved so the envelope still spans the clip. Verified end-to-end: a 297-keyframe fade that previously rendered with no audio stream now renders a full track with the correct envelope. Adds a regression test asserting the nesting depth stays bounded for dense automation (the prior tests mocked ffmpeg, so they could not catch the evaluator overflow).
diff --git a/packages/engine/src/services/audioMixer.test.ts b/packages/engine/src/services/audioMixer.test.ts
@@ -108,6 +108,61 @@ describe("processCompositionAudio", () => {
     expect(filter).toContain("adelay=2000|2000");
   });
 
+  it("bounds expression nesting for dense keyframe automation without dropping the envelope", async () => {
+    const baseDir = mkdtempSync(join(tmpdir(), "hf-audio-base-"));
+    const workDir = mkdtempSync(join(tmpdir(), "hf-audio-work-"));
+    tempDirs.push(baseDir, workDir);
+
+    writeFileSync(join(baseDir, "bgm.wav"), "stub");
+
+    // Mirrors the 60 Hz timeline probe: a 10s eased fade emits hundreds of
+    // keyframes. The nested-if volume expression must not grow one level per
+    // keyframe — past ~95 levels FFmpeg fails filter-graph init and the audio
+    // track is dropped entirely (GH #1066 follow-up).
+    const keyframes = Array.from({ length: 300 }, (_, i) => {
+      const time = (i / 299) * 10;
+      const volume =
+        time < 3 ? 0.8 * (time / 3) ** 2 : time < 7 ? 0.8 : 0.8 * (1 - (time - 7) / 3) ** 2;
+      return { time, volume };
+    });
+
+    const result = await processCompositionAudio(
+      [
+        {
+          id: "bgm",
+          src: "bgm.wav",
+          start: 0,
+          end: 10,
+          mediaStart: 0,
+          layer: 0,
+          volume: 0,
+          volumeKeyframes: keyframes,
+          type: "audio",
+        },
+      ],
+      baseDir,
+      workDir,
+      join(baseDir, "out.m4a"),
+      10,
+    );
+
+    expect(result.success).toBe(true);
+
+    const mixArgs = runFfmpegMock.mock.calls[1]?.[0];
+    const filterIndex = mixArgs.indexOf("-filter_complex");
+    const filter = mixArgs[filterIndex + 1];
+
+    // One nested `if(lt(...))` is emitted per segment; the count must stay below
+    // MAX_VOLUME_SEGMENTS (32), itself well under FFmpeg's evaluator nesting limit.
+    const nestingDepth = (filter.match(/if\(lt\(t/g) ?? []).length;
+    expect(nestingDepth).toBeGreaterThan(1);
+    expect(nestingDepth).toBeLessThan(32);
+
+    // The simplified envelope still spans the clip: silent start, audible peak.
+    expect(filter).toContain(":eval=frame");
+    expect(filter).toMatch(/volume=if\(lt\(t\\,[0-9.]+\)\\,0\+/);
+  });
+
   it("prepares percent-encoded non-Latin audio srcs from decoded filesystem paths", async () => {
     const baseDir = mkdtempSync(join(tmpdir(), "hf-audio-base-"));
     const workDir = mkdtempSync(join(tmpdir(), "hf-audio-work-"));
diff --git a/packages/engine/src/services/audioMixer.ts b/packages/engine/src/services/audioMixer.ts
@@ -30,6 +30,79 @@ function escapeExpressionCommas(expression: string): string {
   return expression.replace(/\\/g, "\\\\").replace(/,/g, "\\,");
 }
 
+/**
+ * Upper bound on volume-automation keyframes folded into the FFmpeg `volume`
+ * expression. The expression nests one `if(lt(...))` per keyframe, and
+ * FFmpeg's expression evaluator has a finite nesting depth: past ~95 levels
+ * (build-dependent — lower on some Linux ffmpeg builds) `volume=...:eval=frame`
+ * fails filter-graph init, which fails the whole mix and drops the audio track
+ * entirely. The 60 Hz timeline probe routinely emits 100–300 keyframes for a
+ * multi-second fade (GH #1066 follow-up: a 171-keyframe GSAP fade rendered with
+ * no audio). 32 segments keeps a wide safety margin and is far more resolution
+ * than a piecewise-linear volume envelope needs.
+ */
+const MAX_VOLUME_SEGMENTS = 32;
+
+/** Volume delta below which a keyframe is collinear enough to drop (≈1%, imperceptible). */
+const VOLUME_SIMPLIFY_EPSILON = 0.01;
+
+/**
+ * Reduce a sorted keyframe list to a perceptually-equivalent piecewise-linear
+ * envelope with a bounded segment count.
+ *
+ * Ramer–Douglas–Peucker drops control points lying within
+ * `VOLUME_SIMPLIFY_EPSILON` of the line through their neighbours (a linear fade
+ * collapses to its two endpoints; an eased fade to a handful). A uniform
+ * downsample backstop then bounds pathological inputs (e.g. audio-rate volume
+ * oscillation) to `MAX_VOLUME_SEGMENTS`. Endpoints are always preserved so the
+ * envelope still spans the full clip.
+ */
+function simplifyVolumeKeyframes(
+  keyframes: { time: number; volume: number }[],
+): { time: number; volume: number }[] {
+  if (keyframes.length < 3) return keyframes;
+
+  const keep = new Array<boolean>(keyframes.length).fill(false);
+  keep[0] = true;
+  keep[keyframes.length - 1] = true;
+  const stack: [number, number][] = [[0, keyframes.length - 1]];
+  while (stack.length > 0) {
+    const [startIndex, endIndex] = stack.pop()!;
+    const start = keyframes[startIndex]!;
+    const end = keyframes[endIndex]!;
+    const span = end.time - start.time;
+    let maxDistance = VOLUME_SIMPLIFY_EPSILON;
+    let splitIndex = -1;
+    for (let i = startIndex + 1; i < endIndex; i += 1) {
+      const point = keyframes[i]!;
+      const interpolated =
+        span === 0
+          ? start.volume
+          : start.volume + ((end.volume - start.volume) * (point.time - start.time)) / span;
+      const distance = Math.abs(point.volume - interpolated);
+      if (distance > maxDistance) {
+        maxDistance = distance;
+        splitIndex = i;
+      }
+    }
+    if (splitIndex !== -1) {
+      keep[splitIndex] = true;
+      stack.push([startIndex, splitIndex], [splitIndex, endIndex]);
+    }
+  }
+
+  const simplified = keyframes.filter((_, i) => keep[i]);
+  if (simplified.length <= MAX_VOLUME_SEGMENTS) return simplified;
+
+  const step = (simplified.length - 1) / (MAX_VOLUME_SEGMENTS - 1);
+  const sampled: { time: number; volume: number }[] = [];
+  for (let i = 0; i < MAX_VOLUME_SEGMENTS; i += 1) {
+    const point = simplified[Math.round(i * step)]!;
+    if (sampled.length === 0 || point.time > sampled.at(-1)!.time) sampled.push(point);
+  }
+  return sampled;
+}
+
 function buildVolumeExpression(track: AudioTrack): string {
   const trimDuration = track.end - track.start;
   const staticVolume = clampVolume(track.volume);
@@ -57,14 +130,19 @@ function buildVolumeExpression(track: AudioTrack): string {
     }
   }
 
-  if (deduped.length === 1) {
-    return `volume=${formatFilterNumber(deduped[0]!.volume)}`;
+  // Collapse the densely-sampled probe output to a bounded piecewise-linear
+  // envelope. Without this, the nested-if expression below grows one level per
+  // keyframe and overflows FFmpeg's expression evaluator (see MAX_VOLUME_SEGMENTS).
+  const simplified = simplifyVolumeKeyframes(deduped);
+
+  if (simplified.length === 1) {
+    return `volume=${formatFilterNumber(simplified[0]!.volume)}`;
   }
 
-  let expression = formatFilterNumber(deduped.at(-1)!.volume);
-  for (let i = deduped.length - 2; i >= 0; i -= 1) {
-    const current = deduped[i]!;
-    const next = deduped[i + 1]!;
+  let expression = formatFilterNumber(simplified.at(-1)!.volume);
+  for (let i = simplified.length - 2; i >= 0; i -= 1) {
+    const current = simplified[i]!;
+    const next = simplified[i + 1]!;
     const currentTime = formatFilterNumber(current.time);
     const nextTime = formatFilterNumber(next.time);
     const currentVolume = formatFilterNumber(current.volume);