Skip to content

Commit 0ff95fe

Browse files
committed
Improve link_rnnoise to have zero delay
1 parent b493794 commit 0ff95fe

2 files changed

Lines changed: 86 additions & 62 deletions

File tree

src/modules/rnnoise/link_rnnoise.c

Lines changed: 85 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,10 @@
2525
#include <stdlib.h>
2626
#include <string.h>
2727

28-
#define FUTURE_FRAMES 1
2928
#define RNNOISE_RATE 48000
3029
#define MAX_CHANNELS 8
30+
#define MIN_RNNOISE_FRAMES 3
31+
#define RNNOISE_STARTUP_DROP_FRAMES 2
3132
// Buffer sizes: at 24fps/48kHz a frame has ~2002 samples.
3233
// With 1 future frame we can have up to ~4004 input samples at once.
3334
// RNNoise frame = 480. Max chunks = ceil(4004/480) = 9 → max out = 9*480 = 4320.
@@ -56,13 +57,9 @@ typedef struct
5657
mlt_position continuity_frame;
5758
int continuity_sample; // sample offset within continuity_frame's audio
5859

59-
// Dry-signal delay ring buffer for wet/dry mix alignment.
60-
// RNNoise synthesizes from the previous call's FFT, which itself analyzed
61-
// the frame before that ([analysis_mem, in]), so the output at call N
62-
// reconstructs in_{N-2} — exactly 2 * rnn_frame = 960 samples of delay.
63-
// We delay the dry signal by the same amount so both are aligned.
64-
float dry_ring[MAX_CHANNELS][960];
65-
int dry_ring_pos;
60+
// After reset, RNNoise emits two startup frames that are effectively silence.
61+
// Consume those frames internally so this link adds no output delay.
62+
int startup_drop_frames;
6663
} private_data;
6764

6865
static void reset_state(mlt_link self)
@@ -81,8 +78,7 @@ static void reset_state(mlt_link self)
8178
pdata->frequency = 0;
8279
pdata->in_carry_count = 0;
8380
pdata->out_carry_count = 0;
84-
memset(pdata->dry_ring, 0, sizeof(pdata->dry_ring));
85-
pdata->dry_ring_pos = 0;
81+
pdata->startup_drop_frames = RNNOISE_STARTUP_DROP_FRAMES;
8682
pdata->continuity_frame = -1;
8783
pdata->continuity_sample = 0;
8884
pdata->expected_frame = -1;
@@ -111,8 +107,7 @@ static void ensure_states(mlt_link self, int n_channels)
111107
pdata->n_channels = n_channels;
112108
pdata->in_carry_count = 0;
113109
pdata->out_carry_count = 0;
114-
memset(pdata->dry_ring, 0, sizeof(pdata->dry_ring));
115-
pdata->dry_ring_pos = 0;
110+
pdata->startup_drop_frames = RNNOISE_STARTUP_DROP_FRAMES;
116111
}
117112

118113
// Copy samples from src (planar float) channel c, starting at sample_offset,
@@ -143,19 +138,20 @@ static int link_get_audio(mlt_frame frame,
143138
mlt_link self = (mlt_link) mlt_frame_pop_audio(frame);
144139
private_data *pdata = (private_data *) self->child;
145140
int error = 0;
146-
147-
int requested_samples = *samples;
148-
int requested_channels = *channels <= 0 ? 2 : *channels;
141+
double link_fps = mlt_producer_get_fps(MLT_LINK_PRODUCER(self));
142+
if (link_fps <= 0.0)
143+
link_fps = 25.0;
144+
mlt_position frame_pos = mlt_frame_get_position(frame);
149145

150146
// Force 48kHz float for RNNoise
151147
*frequency = RNNOISE_RATE;
152148
*format = mlt_audio_float;
153-
*channels = requested_channels;
149+
*channels = *channels <= 0 ? 2 : *channels;
150+
if (*samples <= 0)
151+
*samples = mlt_audio_calculate_frame_samples(link_fps, RNNOISE_RATE, frame_pos);
154152

155153
mlt_service_lock(MLT_LINK_SERVICE(self));
156154

157-
mlt_position frame_pos = mlt_frame_get_position(frame);
158-
159155
// Detect seek: if not the expected frame, reset everything
160156
if (pdata->expected_frame != frame_pos) {
161157
reset_state(self);
@@ -166,12 +162,7 @@ static int link_get_audio(mlt_frame frame,
166162

167163
// Get current frame's audio (cached after first call)
168164
struct mlt_audio_s cur_audio;
169-
mlt_audio_set_values(&cur_audio,
170-
NULL,
171-
RNNOISE_RATE,
172-
mlt_audio_float,
173-
requested_samples,
174-
requested_channels);
165+
mlt_audio_set_values(&cur_audio, NULL, *frequency, *format, *samples, *channels);
175166
error = mlt_frame_get_audio(frame,
176167
&cur_audio.data,
177168
&cur_audio.format,
@@ -196,7 +187,12 @@ static int link_get_audio(mlt_frame frame,
196187

197188
// Allocate output buffer
198189
struct mlt_audio_s out;
199-
mlt_audio_set_values(&out, NULL, RNNOISE_RATE, mlt_audio_float, requested_samples, *channels);
190+
mlt_audio_set_values(&out,
191+
NULL,
192+
RNNOISE_RATE,
193+
cur_audio.format,
194+
cur_audio.samples,
195+
cur_audio.channels);
200196
mlt_audio_alloc_data(&out);
201197
if (!out.data) {
202198
mlt_service_unlock(MLT_LINK_SERVICE(self));
@@ -208,10 +204,10 @@ static int link_get_audio(mlt_frame frame,
208204
// We fill out from out_carry, then generate more by feeding RNNoise chunks.
209205
int out_delivered = 0;
210206

211-
while (out_delivered < requested_samples) {
207+
while (out_delivered < out.samples) {
212208
// Drain the output carry buffer first
213209
if (pdata->out_carry_count > 0) {
214-
int n_take = requested_samples - out_delivered;
210+
int n_take = out.samples - out_delivered;
215211
if (n_take > pdata->out_carry_count)
216212
n_take = pdata->out_carry_count;
217213
for (int c = 0; c < *channels && c < MAX_CHANNELS; c++) {
@@ -233,12 +229,13 @@ static int link_get_audio(mlt_frame frame,
233229
}
234230

235231
// Need to process more RNNoise frames to fill out_carry.
236-
// First fill in_carry to 480 samples.
237-
int fill_attempts = 0;
238-
while (pdata->in_carry_count < rnn_frame && fill_attempts < 2) {
232+
// First fill in_carry to 480 samples, crossing as many future MLT
233+
// frames as needed to satisfy this RNNoise chunk.
234+
while (pdata->in_carry_count < rnn_frame) {
239235
// Determine source frame and audio
240236
float *src_data = NULL;
241237
int src_total_samples = 0;
238+
int src_channels = *channels;
242239

243240
if (pdata->continuity_frame == frame_pos) {
244241
// Use current frame's audio
@@ -247,7 +244,6 @@ static int link_get_audio(mlt_frame frame,
247244
} else {
248245
// Look up future frame from unique_properties
249246
if (!unique_properties) {
250-
fill_attempts++;
251247
break;
252248
}
253249
char key[19];
@@ -257,17 +253,19 @@ static int link_get_audio(mlt_frame frame,
257253
key,
258254
NULL);
259255
if (!src_frame) {
260-
fill_attempts++;
261256
break;
262257
}
263258

264259
// Get audio from the future frame (may be cached)
265260
struct mlt_audio_s future_audio;
261+
int future_samples = mlt_audio_calculate_frame_samples(link_fps,
262+
RNNOISE_RATE,
263+
pdata->continuity_frame);
266264
mlt_audio_set_values(&future_audio,
267265
NULL,
268266
RNNOISE_RATE,
269267
mlt_audio_float,
270-
requested_samples,
268+
future_samples,
271269
*channels);
272270
int ferr = mlt_frame_get_audio(src_frame,
273271
&future_audio.data,
@@ -276,31 +274,40 @@ static int link_get_audio(mlt_frame frame,
276274
&future_audio.channels,
277275
&future_audio.samples);
278276
if (ferr || !future_audio.data || future_audio.samples <= 0) {
279-
fill_attempts++;
280277
break;
281278
}
282279
src_data = (float *) future_audio.data;
283280
src_total_samples = future_audio.samples;
281+
src_channels = future_audio.channels;
284282
}
285283

286284
// Copy as many samples as possible into in_carry (up to rnn_frame)
287285
int needed = rnn_frame - pdata->in_carry_count;
288286
int copied_any = 0;
289287
for (int c = 0; c < *channels && c < MAX_CHANNELS; c++) {
290-
float *src_plane = src_data + c * src_total_samples;
291-
int n = copy_samples(pdata->in_carry[c],
288+
int n = 0;
289+
if (c < src_channels) {
290+
float *src_plane = src_data + c * src_total_samples;
291+
n = copy_samples(pdata->in_carry[c],
292292
pdata->in_carry_count,
293293
src_plane,
294294
pdata->continuity_sample,
295295
src_total_samples,
296296
needed);
297+
} else {
298+
// Missing source channels are treated as silence.
299+
memset(pdata->in_carry[c] + pdata->in_carry_count, 0, needed * sizeof(float));
300+
n = needed;
301+
}
297302
if (c == 0)
298303
copied_any = n; // use channel 0 to track
299304
}
300305

301306
if (copied_any <= 0) {
302-
fill_attempts++;
303-
break;
307+
// Skip empty source frame and continue to the next one.
308+
pdata->continuity_frame++;
309+
pdata->continuity_sample = 0;
310+
continue;
304311
}
305312

306313
pdata->in_carry_count += copied_any;
@@ -326,7 +333,6 @@ static int link_get_audio(mlt_frame frame,
326333
// Process one 480-sample RNNoise chunk per channel
327334
float rnn_in[480];
328335
float rnn_out[480];
329-
const int ring_size = 2 * rnn_frame;
330336

331337
// Ensure out_carry buffers are allocated
332338
for (int c = 0; c < *channels && c < MAX_CHANNELS; c++) {
@@ -339,34 +345,35 @@ static int link_get_audio(mlt_frame frame,
339345
}
340346
}
341347

348+
int drop_chunk = pdata->startup_drop_frames > 0;
349+
int out_base = pdata->out_carry_count;
350+
if (!drop_chunk && out_base + rnn_frame > BUF_CAPACITY) {
351+
// Buffer overflow safeguard — should not happen with BUF_CAPACITY=8192
352+
error = 1;
353+
goto done;
354+
}
355+
342356
for (int c = 0; c < *channels && c < MAX_CHANNELS; c++) {
343357
// Scale up for RNNoise (expects ±32768)
344358
for (int s = 0; s < rnn_frame; s++)
345359
rnn_in[s] = pdata->in_carry[c][s] * 32768.0f;
346360

347361
rnnoise_process_frame(pdata->states[c], rnn_out, rnn_in);
348362

349-
// Scale back and apply wet/dry mix, then store in out_carry.
350-
// Delay dry by 2*rnn_frame to match RNNoise's two-frame internal delay.
351-
int out_base = pdata->out_carry_count;
352-
if (out_base + rnn_frame > BUF_CAPACITY) {
353-
// Buffer overflow safeguard — should not happen with BUF_CAPACITY=8192
354-
error = 1;
355-
goto done;
356-
}
357-
int ring_start = pdata->dry_ring_pos % ring_size;
363+
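// Scale back and apply wet/dry mix. NOTE(review): dry is read from the current input chunk while wet comes from this call's RNNoise output; confirm the two stay time-aligned when mix < 1.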
364+
// RNNoise's startup delay is compensated by dropping the first RNNOISE_STARTUP_DROP_FRAMES (2) output chunks.
358365
for (int s = 0; s < rnn_frame; s++) {
359366
float wet = rnn_out[s] / 32768.0f;
360367
float dry = pdata->in_carry[c][s];
361-
int ring_idx = (ring_start + s) % ring_size;
362-
float delayed_dry = pdata->dry_ring[c][ring_idx];
363-
pdata->dry_ring[c][ring_idx] = dry;
364-
pdata->out_carry[c][out_base + s] = delayed_dry + mix * (wet - delayed_dry);
368+
if (!drop_chunk)
369+
pdata->out_carry[c][out_base + s] = dry + mix * (wet - dry);
365370
}
366371
}
367-
// Advance ring_pos by one chunk (same for all channels)
368-
pdata->dry_ring_pos = (pdata->dry_ring_pos + rnn_frame) % ring_size;
369-
pdata->out_carry_count += rnn_frame;
372+
373+
if (drop_chunk)
374+
pdata->startup_drop_frames--;
375+
else
376+
pdata->out_carry_count += rnn_frame;
370377
pdata->in_carry_count = 0;
371378
}
372379

@@ -376,7 +383,8 @@ static int link_get_audio(mlt_frame frame,
376383
mlt_audio_silence(&out, out.samples, 0);
377384
}
378385

379-
mlt_frame_set_audio(frame, out.data, out.format, 0, out.release_data);
386+
int out_size = mlt_audio_format_size(out.format, out.samples, out.channels);
387+
mlt_frame_set_audio(frame, out.data, out.format, out_size, out.release_data);
380388
mlt_audio_get_values(&out, buffer, frequency, format, samples, channels);
381389

382390
pdata->expected_frame = frame_pos + 1;
@@ -388,7 +396,26 @@ static int link_get_audio(mlt_frame frame,
388396
static int link_get_frame(mlt_link self, mlt_frame_ptr frame, int index)
389397
{
390398
int error = 0;
399+
private_data *pdata = (private_data *) self->child;
391400
mlt_position frame_pos = mlt_producer_position(MLT_LINK_PRODUCER(self));
401+
double fps = mlt_producer_get_fps(MLT_LINK_PRODUCER(self));
402+
if (fps <= 0.0)
403+
fps = 25.0;
404+
int rnn_frame = rnnoise_get_frame_size();
405+
int needed_samples = MIN_RNNOISE_FRAMES * rnn_frame;
406+
int frame_samples = mlt_audio_calculate_frame_samples(fps, RNNOISE_RATE, frame_pos);
407+
int startup_drop = pdata ? pdata->startup_drop_frames : RNNOISE_STARTUP_DROP_FRAMES;
408+
int output_coverage_samples = frame_samples + (startup_drop * rnn_frame);
409+
if (output_coverage_samples > needed_samples)
410+
needed_samples = output_coverage_samples;
411+
int available_samples = mlt_audio_calculate_frame_samples(fps, RNNOISE_RATE, frame_pos);
412+
int future_frames_needed = 0;
413+
414+
while (available_samples < needed_samples) {
415+
mlt_position future_pos = frame_pos + future_frames_needed + 1;
416+
available_samples += mlt_audio_calculate_frame_samples(fps, RNNOISE_RATE, future_pos);
417+
future_frames_needed++;
418+
}
392419

393420
mlt_producer_seek(self->next, frame_pos);
394421
error = mlt_service_get_frame(MLT_PRODUCER_SERVICE(self->next), frame, index);
@@ -398,8 +425,8 @@ static int link_get_frame(mlt_link self, mlt_frame_ptr frame, int index)
398425

399426
mlt_properties unique_properties = mlt_frame_unique_properties(*frame, MLT_LINK_SERVICE(self));
400427

401-
// Fetch and store future frames
402-
for (int i = 0; i < FUTURE_FRAMES; i++) {
428+
// Fetch and store enough future frames to cover MIN_RNNOISE_FRAMES RNNoise chunks, or this frame plus any pending startup-drop chunks, whichever is larger.
429+
for (int i = 0; i < future_frames_needed; i++) {
403430
mlt_position future_pos = frame_pos + i + 1;
404431
mlt_frame future_frame = NULL;
405432
mlt_producer_seek(self->next, future_pos);

src/modules/rnnoise/link_rnnoise.yml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,7 @@ description: >
1515
notes: >
1616
Operates at 48 kHz; audio is resampled automatically if needed.
1717
18-
RNNoise itself introduces two 480-sample frames of latency, or about 20 ms
19-
at 48 kHz, and both the filter and the link are subject to that delay. This
20-
link processes audio in 480-sample frames at 48 kHz as required by RNNoise,
21-
and correctly buffers samples across MLT frames.
18+
Unlike filter_rnnoise, this link does not add a delay to the audio.
2219
audio_formats:
2320
- float
2421
parameters:

0 commit comments

Comments
 (0)