Skip to content

Commit bff2da5

Browse files
committed
fix(engine): cap volume-automation keyframes so dense fades keep their audio
GSAP/JS volume automation is folded into an FFmpeg `volume` expression that nests one `if(lt(t,...))` per keyframe. The 60 Hz timeline probe emits 100-300 keyframes for a multi-second fade, nesting the expression deep enough to overflow FFmpeg's expression evaluator (build-dependent, ~95 levels; lower on some Linux ffmpeg builds). When that happens, filter-graph init fails, the whole audio mix fails, and the muxer drops the audio track entirely — so a `data-volume="0"` fade-in rendered with no audio at all (follow-up to #1066, where #1064's own scenario regressed once the fade was dense enough). Simplify the keyframes to a bounded piecewise-linear envelope before building the expression: Ramer-Douglas-Peucker at ~1% tolerance, with a uniform-downsample backstop to 32 segments. A linear fade collapses to its two endpoints, an eased fade to a handful, and pathological audio-rate input is capped well under the evaluator's limit. Endpoints are preserved so the envelope still spans the clip. Verified end-to-end: a 297-keyframe fade that previously rendered with no audio stream now renders a full track with the correct envelope. Adds a regression test asserting the nesting depth stays bounded for dense automation (the prior tests mocked ffmpeg, so they could not catch the evaluator overflow).
1 parent b1f9587 commit bff2da5

2 files changed

Lines changed: 139 additions & 6 deletions

File tree

packages/engine/src/services/audioMixer.test.ts

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,61 @@ describe("processCompositionAudio", () => {
108108
expect(filter).toContain("adelay=2000|2000");
109109
});
110110

111+
it("bounds expression nesting for dense keyframe automation without dropping the envelope", async () => {
  const baseDir = mkdtempSync(join(tmpdir(), "hf-audio-base-"));
  const workDir = mkdtempSync(join(tmpdir(), "hf-audio-work-"));
  tempDirs.push(baseDir, workDir);

  writeFileSync(join(baseDir, "bgm.wav"), "stub");

  // Mirrors the 60 Hz timeline probe: a 10s eased fade emits hundreds of
  // keyframes. The nested-if volume expression must not grow one level per
  // keyframe — past ~95 levels FFmpeg fails filter-graph init and the audio
  // track is dropped entirely (GH #1066 follow-up).
  // Shape: quadratic ease-in over 0–3s, 0.8 plateau over 3–7s, quadratic
  // ease-out over 7–10s.
  const keyframes = Array.from({ length: 300 }, (_, i) => {
    const time = (i / 299) * 10;
    const volume =
      time < 3 ? 0.8 * (time / 3) ** 2 : time < 7 ? 0.8 : 0.8 * (1 - (time - 7) / 3) ** 2;
    return { time, volume };
  });

  const result = await processCompositionAudio(
    [
      {
        id: "bgm",
        src: "bgm.wav",
        start: 0,
        end: 10,
        mediaStart: 0,
        layer: 0,
        volume: 0,
        volumeKeyframes: keyframes,
        type: "audio",
      },
    ],
    baseDir,
    workDir,
    join(baseDir, "out.m4a"),
    10,
  );

  expect(result.success).toBe(true);

  // The second ffmpeg invocation carries the mix filter graph. Assert that it
  // exists (and that it has a -filter_complex argument) before dereferencing,
  // so a missing call fails as a readable assertion instead of a TypeError or
  // a silently wrong argument lookup.
  const mixArgs = runFfmpegMock.mock.calls[1]?.[0];
  expect(mixArgs).toBeDefined();
  const filterIndex = mixArgs.indexOf("-filter_complex");
  expect(filterIndex).toBeGreaterThanOrEqual(0);
  const filter = mixArgs[filterIndex + 1];

  // One nested `if(lt(...))` is emitted per segment; cap it well under the
  // FFmpeg evaluator's nesting limit (MAX_VOLUME_SEGMENTS = 32).
  const nestingDepth = (filter.match(/if\(lt\(t/g) ?? []).length;
  expect(nestingDepth).toBeGreaterThan(1);
  expect(nestingDepth).toBeLessThan(32);

  // The simplified envelope is still evaluated per frame and still begins
  // with a ramp up from silence (first segment starts at volume 0).
  expect(filter).toContain(":eval=frame");
  expect(filter).toMatch(/volume=if\(lt\(t\\,[0-9.]+\)\\,0\+/);
});
165+
111166
it("prepares percent-encoded non-Latin audio srcs from decoded filesystem paths", async () => {
112167
const baseDir = mkdtempSync(join(tmpdir(), "hf-audio-base-"));
113168
const workDir = mkdtempSync(join(tmpdir(), "hf-audio-work-"));

packages/engine/src/services/audioMixer.ts

Lines changed: 84 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,79 @@ function escapeExpressionCommas(expression: string): string {
3030
return expression.replace(/\\/g, "\\\\").replace(/,/g, "\\,");
3131
}
3232

33+
/**
 * Upper bound on volume-automation keyframes folded into the FFmpeg `volume`
 * expression. The expression nests one `if(lt(...))` per keyframe, and
 * FFmpeg's expression evaluator has a finite nesting depth: past ~95 levels
 * (build-dependent — lower on some Linux ffmpeg builds) `volume=...:eval=frame`
 * fails filter-graph init, which fails the whole mix and drops the audio track
 * entirely. The 60 Hz timeline probe routinely emits 100–300 keyframes for a
 * multi-second fade (GH #1066 follow-up: a 171-keyframe GSAP fade rendered with
 * no audio). 32 keeps a wide safety margin and is far more resolution than a
 * piecewise-linear volume envelope needs.
 */
const MAX_VOLUME_SEGMENTS = 32;

/** Volume delta below which a keyframe is collinear enough to drop (≈1%, imperceptible). */
const VOLUME_SIMPLIFY_EPSILON = 0.01;

/**
 * Reduce a sorted keyframe list to a perceptually-equivalent piecewise-linear
 * envelope with a bounded keyframe count.
 *
 * Step 1 — Ramer–Douglas–Peucker (vertical-distance variant): a keyframe is
 * dropped when its volume lies within `VOLUME_SIMPLIFY_EPSILON` of the straight
 * line between the nearest kept neighbours. A linear fade collapses to its two
 * endpoints; an eased fade to a handful of points.
 *
 * Step 2 — backstop for pathological input (e.g. audio-rate volume
 * oscillation) that still exceeds `MAX_VOLUME_SEGMENTS` keyframes after step 1:
 * the interior points are split into contiguous buckets and each bucket is
 * replaced by its mean time / mean volume. Averaging, rather than picking every
 * k-th point, avoids aliasing: a 0/1/0/1… oscillation sampled at a stride of 2
 * would otherwise collapse to an all-zero (silent) envelope.
 *
 * The first and last keyframes are always kept verbatim so the envelope still
 * spans the full clip.
 *
 * @param keyframes Keyframes ordered by ascending `time`.
 * @returns At most `MAX_VOLUME_SEGMENTS` keyframes. Inputs with fewer than
 *   three keyframes are returned as-is (same array instance).
 */
function simplifyVolumeKeyframes(
  keyframes: { time: number; volume: number }[],
): { time: number; volume: number }[] {
  if (keyframes.length < 3) return keyframes;

  const lastIndex = keyframes.length - 1;
  const keep = new Array<boolean>(keyframes.length).fill(false);
  keep[0] = true;
  keep[lastIndex] = true;

  // Iterative RDP: an explicit stack avoids deep recursion on dense input.
  const stack: [number, number][] = [[0, lastIndex]];
  while (stack.length > 0) {
    const [startIndex, endIndex] = stack.pop()!;
    const start = keyframes[startIndex]!;
    const end = keyframes[endIndex]!;
    const span = end.time - start.time;

    // Seeding the running maximum with epsilon means only deviations strictly
    // above the tolerance can trigger a split.
    let maxDistance = VOLUME_SIMPLIFY_EPSILON;
    let splitIndex = -1;
    for (let i = startIndex + 1; i < endIndex; i += 1) {
      const point = keyframes[i]!;
      // A zero-length span has no slope; compare against the start volume.
      const interpolated =
        span === 0
          ? start.volume
          : start.volume + ((end.volume - start.volume) * (point.time - start.time)) / span;
      const distance = Math.abs(point.volume - interpolated);
      if (distance > maxDistance) {
        maxDistance = distance;
        splitIndex = i;
      }
    }

    if (splitIndex !== -1) {
      keep[splitIndex] = true;
      stack.push([startIndex, splitIndex], [splitIndex, endIndex]);
    }
  }

  const simplified = keyframes.filter((_, i) => keep[i]);
  if (simplified.length <= MAX_VOLUME_SEGMENTS) return simplified;

  // Backstop: keep both endpoints and average the interior into
  // MAX_VOLUME_SEGMENTS - 2 contiguous buckets. simplified.length exceeds the
  // cap here, so every bucket holds at least one point.
  const interiorCount = simplified.length - 2;
  const bucketCount = MAX_VOLUME_SEGMENTS - 2;
  const reduced: { time: number; volume: number }[] = [simplified[0]!];
  for (let bucket = 0; bucket < bucketCount; bucket += 1) {
    const from = 1 + Math.floor((bucket * interiorCount) / bucketCount);
    const to = 1 + Math.floor(((bucket + 1) * interiorCount) / bucketCount);
    let timeSum = 0;
    let volumeSum = 0;
    for (let i = from; i < to; i += 1) {
      const point = simplified[i]!;
      timeSum += point.time;
      volumeSum += point.volume;
    }
    const size = to - from;
    const time = timeSum / size;
    // Only emit strictly increasing times so downstream segment widths stay
    // non-zero even if the input contained repeated timestamps.
    if (time > reduced[reduced.length - 1]!.time) {
      reduced.push({ time, volume: volumeSum / size });
    }
  }

  const last = simplified[simplified.length - 1]!;
  if (last.time > reduced[reduced.length - 1]!.time) reduced.push(last);
  return reduced;
}
105+
33106
function buildVolumeExpression(track: AudioTrack): string {
34107
const trimDuration = track.end - track.start;
35108
const staticVolume = clampVolume(track.volume);
@@ -57,14 +130,19 @@ function buildVolumeExpression(track: AudioTrack): string {
57130
}
58131
}
59132

60-
if (deduped.length === 1) {
61-
return `volume=${formatFilterNumber(deduped[0]!.volume)}`;
133+
// Collapse the densely-sampled probe output to a bounded piecewise-linear
134+
// envelope. Without this, the nested-if expression below grows one level per
135+
// keyframe and overflows FFmpeg's expression evaluator (see MAX_VOLUME_SEGMENTS).
136+
const simplified = simplifyVolumeKeyframes(deduped);
137+
138+
if (simplified.length === 1) {
139+
return `volume=${formatFilterNumber(simplified[0]!.volume)}`;
62140
}
63141

64-
let expression = formatFilterNumber(deduped.at(-1)!.volume);
65-
for (let i = deduped.length - 2; i >= 0; i -= 1) {
66-
const current = deduped[i]!;
67-
const next = deduped[i + 1]!;
142+
let expression = formatFilterNumber(simplified.at(-1)!.volume);
143+
for (let i = simplified.length - 2; i >= 0; i -= 1) {
144+
const current = simplified[i]!;
145+
const next = simplified[i + 1]!;
68146
const currentTime = formatFilterNumber(current.time);
69147
const nextTime = formatFilterNumber(next.time);
70148
const currentVolume = formatFilterNumber(current.volume);

0 commit comments

Comments
 (0)