Skip to content

Commit 2ad5fc0

Browse files
committed
feat(engine): bake volume automation into PCM samples (sample-accurate, no ceiling)
Make sample-level gain the primary path for time-varying volume: after a track's WAV is prepared (always pcm_s16le/48k/stereo), multiply its samples by the interpolated envelope in-house, then mix at unity. This is exact at every sample, has no keyframe ceiling, and leaves the downstream ffmpeg amix/AAC encode untouched — so output and golden baselines only change where a fade is applied. It's the MoviePy/MLT model done in-process, with no new dependency. The RDP-bounded ffmpeg `volume` expression remains as a fallback for the rare case where a WAV isn't the expected 16-bit PCM, and the static-volume retry still backstops that. Layered: exact PCM gain -> bounded expression -> base volume, so the audio track is never dropped. Verified: the 297-keyframe fade that rendered with no audio now bakes all 297 keyframes sample-accurately (confirmed baked=true at runtime). Adds unit tests for sample-accurate gain, track-start offset, base/tail holds, thousands of keyframes, and format rejection.
1 parent 0041ad4 commit 2ad5fc0

3 files changed

Lines changed: 319 additions & 2 deletions

File tree

packages/engine/src/services/audioMixer.ts

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import { runFfmpeg } from "../utils/runFfmpeg.js";
1414
import { unwrapTemplate } from "../utils/htmlTemplate.js";
1515
import { resolveProjectRelativeSrc } from "./videoFrameExtractor.js";
1616
import type { AudioElement, AudioTrack, MixResult } from "./audioMixer.types.js";
17+
import { applyVolumeEnvelopeToWav } from "./audioVolumeEnvelope.js";
1718

1819
export type { AudioElement, MixResult } from "./audioMixer.types.js";
1920

@@ -555,15 +556,29 @@ export async function processCompositionAudio(
555556
audioSrcPath = trimmedPath;
556557
}
557558

559+
// Primary volume-automation path: bake the envelope into the PCM samples
560+
// (sample-accurate, no keyframe ceiling). If the WAV isn't the expected
561+
// 16-bit PCM, fall back to the ffmpeg expression path by leaving the
562+
// keyframes on the track for buildVolumeExpression to handle.
563+
let bakedEnvelope = false;
564+
if (element.volumeKeyframes && element.volumeKeyframes.length > 0) {
565+
bakedEnvelope = applyVolumeEnvelopeToWav(
566+
audioSrcPath,
567+
element.volumeKeyframes,
568+
element.start,
569+
element.volume ?? 1.0,
570+
);
571+
}
558572
tracks.push({
559573
id: element.id,
560574
srcPath: audioSrcPath,
561575
start: element.start,
562576
end: element.end,
563577
mediaStart: element.mediaStart,
564578
duration: element.end - element.start,
565-
volume: element.volume ?? 1.0,
566-
volumeKeyframes: element.volumeKeyframes,
579+
// Gain is already in the samples when baked, so mix at unity.
580+
volume: bakedEnvelope ? 1.0 : (element.volume ?? 1.0),
581+
volumeKeyframes: bakedEnvelope ? undefined : element.volumeKeyframes,
567582
});
568583
} catch (err: unknown) {
569584
errors.push(`Error: ${element.id}${err instanceof Error ? err.message : String(err)}`);
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
import { afterEach, describe, expect, it } from "vitest";
2+
import { mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs";
3+
import { join } from "node:path";
4+
import { tmpdir } from "node:os";
5+
import { applyVolumeEnvelopeToWav } from "./audioVolumeEnvelope.js";
6+
7+
const SAMPLE_RATE = 48000;
const CHANNELS = 2;

/**
 * Write a canonical 44-byte-header WAV (PCM, signed 16-bit little-endian,
 * `CHANNELS` channels at `SAMPLE_RATE` Hz) in which every sample of every
 * channel holds the same `value`.
 *
 * @param path   Destination file; overwritten if it exists.
 * @param frames Number of sample frames to emit.
 * @param value  Signed 16-bit value stored in each sample.
 */
function writeConstantWav(path: string, frames: number, value: number): void {
  const sampleBytes = 2;
  const headerBytes = 44;
  const blockAlign = CHANNELS * sampleBytes;
  const payloadBytes = frames * blockAlign;
  const wav = Buffer.alloc(headerBytes + payloadBytes);

  // RIFF container: the size field counts everything after itself.
  wav.write("RIFF", 0, "ascii");
  wav.writeUInt32LE(36 + payloadBytes, 4);
  wav.write("WAVE", 8, "ascii");

  // 16-byte `fmt ` chunk describing plain PCM.
  wav.write("fmt ", 12, "ascii");
  wav.writeUInt32LE(16, 16);
  wav.writeUInt16LE(1, 20); // PCM
  wav.writeUInt16LE(CHANNELS, 22);
  wav.writeUInt32LE(SAMPLE_RATE, 24);
  wav.writeUInt32LE(SAMPLE_RATE * blockAlign, 28); // byte rate
  wav.writeUInt16LE(blockAlign, 32);
  wav.writeUInt16LE(16, 34); // bits per sample

  // `data` chunk header followed by the constant payload.
  wav.write("data", 36, "ascii");
  wav.writeUInt32LE(payloadBytes, 40);
  const sampleTotal = frames * CHANNELS;
  let position = headerBytes;
  for (let written = 0; written < sampleTotal; written += 1) {
    wav.writeInt16LE(value, position);
    position += sampleBytes;
  }

  writeFileSync(path, wav);
}

/**
 * Read one signed 16-bit sample back from a WAV produced by `writeConstantWav`
 * (assumes the fixed 44-byte header and `CHANNELS` interleaved channels).
 *
 * @param path    WAV file to read.
 * @param frame   Zero-based frame index.
 * @param channel Zero-based channel within the frame (defaults to 0).
 */
function sampleAt(path: string, frame: number, channel = 0): number {
  const sampleIndex = frame * CHANNELS + channel;
  const contents = readFileSync(path);
  return contents.readInt16LE(44 + sampleIndex * 2);
}
36+
37+
describe("applyVolumeEnvelopeToWav", () => {
  // Temp directories created by the running test; removed again in afterEach.
  const dirs: string[] = [];
  const tmp = () => {
    const d = mkdtempSync(join(tmpdir(), "hf-env-"));
    dirs.push(d);
    return d;
  };
  afterEach(() => {
    for (const d of dirs.splice(0)) rmSync(d, { recursive: true, force: true });
  });

  it("applies a linear fade sample-accurately", () => {
    const path = join(tmp(), "a.wav");
    const frames = SAMPLE_RATE; // 1 second
    writeConstantWav(path, frames, 10000);

    // Fade 0 -> 1 over the full second.
    const applied = applyVolumeEnvelopeToWav(
      path,
      [
        { time: 0, volume: 0 },
        { time: 1, volume: 1 },
      ],
      0,
      0,
    );
    expect(applied).toBe(true);

    expect(sampleAt(path, 0)).toBe(0); // gain 0
    expect(sampleAt(path, frames / 2)).toBeCloseTo(5000, -2); // gain ~0.5
    expect(sampleAt(path, frames - 1)).toBeGreaterThan(9900); // gain ~1
    // Every channel of a frame receives the same gain.
    expect(sampleAt(path, frames / 2, 1)).toBe(sampleAt(path, frames / 2, 0));
  });

  it("offsets keyframes by the track start (composition time -> track-relative)", () => {
    const path = join(tmp(), "b.wav");
    const frames = SAMPLE_RATE;
    writeConstantWav(path, frames, 10000);

    // Track starts at 5s; the fade runs from comp-time 5s..6s -> wav 0s..1s.
    const applied = applyVolumeEnvelopeToWav(
      path,
      [
        { time: 5, volume: 0 },
        { time: 6, volume: 1 },
      ],
      5,
      0,
    );
    expect(applied).toBe(true);

    expect(sampleAt(path, 0)).toBe(0);
    expect(sampleAt(path, frames / 2)).toBeCloseTo(5000, -2);
  });

  it("holds base volume before the first keyframe and the last value after", () => {
    const path = join(tmp(), "c.wav");
    // 4 seconds: one second longer than the envelope, so the tail hold after
    // the last keyframe is actually exercised.
    const frames = SAMPLE_RATE * 4;
    writeConstantWav(path, frames, 10000);

    // Base 0.8 held until a fade-out begins at 2s and reaches 0 at 3s.
    const applied = applyVolumeEnvelopeToWav(
      path,
      [
        { time: 2, volume: 0.8 },
        { time: 3, volume: 0 },
      ],
      0,
      0.8,
    );
    expect(applied).toBe(true);

    expect(sampleAt(path, SAMPLE_RATE)).toBeCloseTo(8000, -2); // 1s: base 0.8
    expect(sampleAt(path, SAMPLE_RATE * 3 - 1)).toBeLessThan(200); // ~3s: faded to ~0
    expect(sampleAt(path, SAMPLE_RATE * 3.5)).toBe(0); // 3.5s: last value (0) held
    expect(sampleAt(path, frames - 1)).toBe(0); // end of file: still held
  });

  it("handles thousands of keyframes without failing (no expression ceiling)", () => {
    const path = join(tmp(), "d.wav");
    const frames = SAMPLE_RATE * 2;
    writeConstantWav(path, frames, 10000);

    const keyframes = Array.from({ length: 5000 }, (_, i) => ({
      time: (i / 4999) * 2,
      volume: Math.abs(Math.sin(i / 50)),
    }));
    expect(applyVolumeEnvelopeToWav(path, keyframes, 0, 0)).toBe(true);

    // Gains are within [0, 1], so no sample may exceed the source amplitude.
    const last = sampleAt(path, frames - 1);
    expect(last).toBeGreaterThanOrEqual(0);
    expect(last).toBeLessThanOrEqual(10000);
  });

  it("rejects non-16-bit PCM so the caller can fall back", () => {
    const path = join(tmp(), "e.wav");
    // 24-bit PCM header (bitsPerSample = 24); body contents are irrelevant.
    const buffer = Buffer.alloc(44);
    buffer.write("RIFF", 0, "ascii");
    buffer.write("WAVE", 8, "ascii");
    buffer.write("fmt ", 12, "ascii");
    buffer.writeUInt32LE(16, 16);
    buffer.writeUInt16LE(1, 20);
    buffer.writeUInt16LE(CHANNELS, 22);
    buffer.writeUInt32LE(SAMPLE_RATE, 24);
    buffer.writeUInt16LE(24, 34);
    buffer.write("data", 36, "ascii");
    buffer.writeUInt32LE(0, 40);
    writeFileSync(path, buffer);

    expect(
      applyVolumeEnvelopeToWav(
        path,
        [
          { time: 0, volume: 0 },
          { time: 1, volume: 1 },
        ],
        0,
        0,
      ),
    ).toBe(false);

    // A rejected file must be left byte-for-byte untouched for the fallback path.
    expect(readFileSync(path).equals(buffer)).toBe(true);
  });
});
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
/**
2+
* Sample-accurate volume automation.
3+
*
4+
* The audio mixer's primary path for time-varying volume bakes the envelope
5+
* directly into the prepared PCM rather than encoding it as an FFmpeg `volume`
6+
* expression. The expression approach nests one `if(lt(t,...))` per keyframe and
7+
* overflows FFmpeg's expression evaluator past ~95 levels (a dense GSAP fade
8+
* emits hundreds of keyframes), which fails the whole mix and drops the audio
9+
* track. Multiplying the samples in-house has no such ceiling, is exact at every
10+
* sample, and keeps the downstream ffmpeg `amix`/AAC encode untouched — so the
11+
* output (and the golden baselines) only change where a fade is actually applied.
12+
*
13+
* The prepared tracks are always `pcm_s16le`, 48 kHz, stereo (see
14+
* `prepareAudioTrack` / `extractAudioFromVideo`). Anything else is rejected so
15+
* the caller can fall back to the expression path rather than corrupting audio.
16+
*/
17+
18+
import { readFileSync, writeFileSync } from "fs";
19+
import type { AudioVolumeKeyframe } from "./audioMixer.types.js";
20+
21+
const PCM_FORMAT = 1; // WAVE_FORMAT_PCM
const SUPPORTED_BITS = 16;

/** Where the editable sample data lives inside a WAV buffer, plus its framing. */
interface WavLayout {
  /** Interleaved channels per frame (always >= 1). */
  numChannels: number;
  /** Frames per second (always > 0). */
  sampleRate: number;
  /** Byte offset of the first sample inside the buffer. */
  dataOffset: number;
  /** Usable bytes of sample data, clamped to what the buffer actually holds. */
  dataSize: number;
}

/**
 * Locate the `fmt ` and `data` chunks and validate the format we know how to edit.
 *
 * Walks the RIFF chunk list from byte 12, reads the first 16 bytes of the
 * `fmt ` chunk, and stops at the first `data` chunk.
 *
 * @param buffer Entire WAV file contents.
 * @returns The layout, or `null` when the buffer is not a RIFF/WAVE file, is
 *   not plain PCM, is not 16-bit, declares no channels or a zero sample rate,
 *   or lacks a `fmt ` chunk ahead of its `data` chunk.
 */
function parseWavLayout(buffer: Buffer): WavLayout | null {
  if (buffer.length < 12 || buffer.toString("ascii", 0, 4) !== "RIFF") return null;
  if (buffer.toString("ascii", 8, 12) !== "WAVE") return null;

  let offset = 12;
  let fmt: { numChannels: number; sampleRate: number; bitsPerSample: number } | null = null;
  let data: { offset: number; size: number } | null = null;

  while (offset + 8 <= buffer.length) {
    const chunkId = buffer.toString("ascii", offset, offset + 4);
    const chunkSize = buffer.readUInt32LE(offset + 4);
    const body = offset + 8;
    // Only trust a `fmt ` chunk that both declares and physically contains the
    // 16 bytes read below; a shorter one would have us read the next chunk.
    if (chunkId === "fmt " && chunkSize >= 16 && body + 16 <= buffer.length) {
      if (buffer.readUInt16LE(body) !== PCM_FORMAT) return null;
      fmt = {
        numChannels: buffer.readUInt16LE(body + 2),
        sampleRate: buffer.readUInt32LE(body + 4),
        bitsPerSample: buffer.readUInt16LE(body + 14),
      };
    } else if (chunkId === "data") {
      // The declared size can exceed the file (truncated or streamed output),
      // so never report more bytes than the buffer holds.
      data = { offset: body, size: Math.min(chunkSize, buffer.length - body) };
      break; // sample data follows; no need to scan further
    }
    // Chunks are word-aligned: an odd size carries a trailing pad byte.
    offset = body + chunkSize + (chunkSize % 2);
  }

  if (!fmt || !data) return null;
  if (fmt.bitsPerSample !== SUPPORTED_BITS || fmt.numChannels < 1) return null;
  // A zero sample rate would turn every frame time into NaN/Infinity in the
  // caller; reject it so the caller falls back instead of corrupting audio.
  if (fmt.sampleRate === 0) return null;
  return {
    numChannels: fmt.numChannels,
    sampleRate: fmt.sampleRate,
    dataOffset: data.offset,
    dataSize: data.size,
  };
}
68+
69+
/**
 * Normalise keyframes to track-relative seconds, sorted and de-duplicated, with
 * `baseVolume` filling any gap before the first keyframe. Returns the breakpoints
 * the gain envelope is linearly interpolated between.
 *
 * Keyframes are ordered on their own timeline before anything is folded onto
 * the track start, so input order never decides which keyframe wins. When the
 * envelope is already in progress at the track start (keyframes on both sides
 * of it), the breakpoint at time 0 carries the interpolated value at that
 * instant instead of the raw value of an earlier keyframe.
 *
 * @param keyframes  Keyframes in composition time; entries with a non-finite
 *   `time` or `volume` are ignored. Volumes are clamped to [0, 1].
 * @param trackStart Composition time at which the track begins.
 * @param baseVolume Gain used at time 0 when the first keyframe lies after the
 *   track start; clamped to [0, 1].
 * @returns Breakpoints with ascending `time >= 0`; empty when no usable
 *   keyframe exists or `trackStart` is not finite.
 */
function toRelativeEnvelope(
  keyframes: AudioVolumeKeyframe[],
  trackStart: number,
  baseVolume: number,
): { time: number; volume: number }[] {
  // A non-finite start would make every relative time NaN; report "nothing to
  // apply" so the caller takes its fallback path.
  if (!Number.isFinite(trackStart)) return [];

  const clampGain = (value: number): number => Math.max(0, Math.min(1, value));

  // Shift to track-relative time WITHOUT clamping, so keyframes that precede
  // the track keep their true order and spacing while sorting.
  const shifted = keyframes
    .filter((k) => Number.isFinite(k.time) && Number.isFinite(k.volume))
    .map((k) => ({ time: k.time - trackStart, volume: clampGain(k.volume) }))
    .sort((a, b) => a.time - b.time);

  // Collapse keyframes sharing a timestamp; the later entry wins.
  const deduped: { time: number; volume: number }[] = [];
  for (const point of shifted) {
    const previous = deduped[deduped.length - 1];
    if (previous && Math.abs(previous.time - point.time) < 1e-9) previous.volume = point.volume;
    else deduped.push(point);
  }
  if (deduped.length === 0) return deduped;

  const firstInside = deduped.findIndex((point) => point.time >= 0);

  // Every keyframe precedes the track: hold the final value throughout.
  if (firstInside === -1) {
    return [{ time: 0, volume: deduped[deduped.length - 1]!.volume }];
  }

  // No keyframe precedes the track: pad the lead-in with the base volume.
  if (firstInside === 0) {
    if (deduped[0]!.time > 0) deduped.unshift({ time: 0, volume: clampGain(baseVolume) });
    return deduped;
  }

  // The envelope straddles the track start: evaluate it at time 0.
  const before = deduped[firstInside - 1]!;
  const after = deduped[firstInside]!;
  const inside = deduped.slice(firstInside);
  if (after.time > 0) {
    const progress = (0 - before.time) / (after.time - before.time);
    inside.unshift({
      time: 0,
      volume: before.volume + (after.volume - before.volume) * progress,
    });
  }
  return inside;
}
100+
101+
/**
 * Bake a time-varying gain envelope into a prepared WAV file, rewriting the
 * file in place.
 *
 * The keyframes are converted to track-relative breakpoints, the gain is
 * linearly interpolated between them at each frame, and every channel of that
 * frame is scaled by the same gain. The file is only written after all samples
 * were processed, so a failure part-way leaves it unchanged.
 *
 * @param wavPath    Path of the WAV file to modify.
 * @param keyframes  Volume keyframes in composition time.
 * @param trackStart Composition time at which this track begins.
 * @param baseVolume Gain used before the first keyframe.
 * @returns `true` if the envelope was applied; `false` if there is no usable
 *   keyframe, the file isn't the expected 16-bit PCM, or reading/writing it
 *   failed (caller should fall back to the expression path).
 */
export function applyVolumeEnvelopeToWav(
  wavPath: string,
  keyframes: AudioVolumeKeyframe[],
  trackStart: number,
  baseVolume: number,
): boolean {
  const breakpoints = toRelativeEnvelope(keyframes, trackStart, baseVolume);
  if (breakpoints.length === 0) return false;

  try {
    const pcm = readFileSync(wavPath);
    const layout = parseWavLayout(pcm);
    if (layout === null) return false;

    const sampleBytes = SUPPORTED_BITS / 8;
    const strideBytes = layout.numChannels * sampleBytes;
    const totalFrames = Math.floor(layout.dataSize / strideBytes);
    // Highest index that may serve as the left end of an interpolation segment.
    const lastSegment = breakpoints.length - 2;

    // Frame times only increase, so the active segment only ever moves forward.
    let cursor = 0;
    let frameStart = layout.dataOffset;
    for (let index = 0; index < totalFrames; index += 1, frameStart += strideBytes) {
      const seconds = index / layout.sampleRate;
      while (cursor < lastSegment && seconds >= breakpoints[cursor + 1]!.time) cursor += 1;

      // With a single breakpoint the segment degenerates to a constant gain.
      const from = breakpoints[cursor]!;
      const to = breakpoints[cursor + 1] ?? from;
      const width = to.time - from.time;
      const fraction =
        width <= 0 ? 0 : Math.min(1, Math.max(0, (seconds - from.time) / width));
      const gain = from.volume + (to.volume - from.volume) * fraction;

      const frameEnd = frameStart + strideBytes;
      for (let position = frameStart; position < frameEnd; position += sampleBytes) {
        const scaled = Math.round(pcm.readInt16LE(position) * gain);
        // Keep the result inside the signed 16-bit range.
        pcm.writeInt16LE(Math.min(32767, Math.max(-32768, scaled)), position);
      }
    }

    writeFileSync(wavPath, pcm);
    return true;
  } catch {
    // Any read/parse/write failure → leave the file untouched and let the
    // caller fall back to the ffmpeg expression path rather than losing audio.
    return false;
  }
}

0 commit comments

Comments
 (0)