Improve link_rnnoise to have zero delay

bmatherly · bmatherly · commit 0ff95fe3e6e1 · 2026-05-20T20:44:51.000-05:00
diff --git a/src/modules/rnnoise/link_rnnoise.c b/src/modules/rnnoise/link_rnnoise.c
@@ -25,9 +25,10 @@
 #include <stdlib.h>
 #include <string.h>
 
-#define FUTURE_FRAMES 1
 #define RNNOISE_RATE 48000
 #define MAX_CHANNELS 8
+#define MIN_RNNOISE_FRAMES 3
+#define RNNOISE_STARTUP_DROP_FRAMES 2
 // Buffer sizes: at 24fps/48kHz a frame has ~2002 samples.
 // With 1 future frame we can have up to ~4004 input samples at once.
 // RNNoise frame = 480. Max chunks = ceil(4004/480) = 9 → max out = 9*480 = 4320.
@@ -56,13 +57,9 @@ typedef struct
     mlt_position continuity_frame;
     int continuity_sample; // sample offset within continuity_frame's audio
 
-    // Dry-signal delay ring buffer for wet/dry mix alignment.
-    // RNNoise synthesizes from the previous call's FFT, which itself analyzed
-    // the frame before that ([analysis_mem, in]), so the output at call N
-    // reconstructs in_{N-2} — exactly 2 * rnn_frame = 960 samples of delay.
-    // We delay the dry signal by the same amount so both are aligned.
-    float dry_ring[MAX_CHANNELS][960];
-    int dry_ring_pos;
+    // After reset, RNNoise emits two startup frames that are effectively silence.
+    // Consume those frames internally so this link adds no output delay.
+    int startup_drop_frames;
 } private_data;
 
 static void reset_state(mlt_link self)
@@ -81,8 +78,7 @@ static void reset_state(mlt_link self)
     pdata->frequency = 0;
     pdata->in_carry_count = 0;
     pdata->out_carry_count = 0;
-    memset(pdata->dry_ring, 0, sizeof(pdata->dry_ring));
-    pdata->dry_ring_pos = 0;
+    pdata->startup_drop_frames = RNNOISE_STARTUP_DROP_FRAMES;
     pdata->continuity_frame = -1;
     pdata->continuity_sample = 0;
     pdata->expected_frame = -1;
@@ -111,8 +107,7 @@ static void ensure_states(mlt_link self, int n_channels)
     pdata->n_channels = n_channels;
     pdata->in_carry_count = 0;
     pdata->out_carry_count = 0;
-    memset(pdata->dry_ring, 0, sizeof(pdata->dry_ring));
-    pdata->dry_ring_pos = 0;
+    pdata->startup_drop_frames = RNNOISE_STARTUP_DROP_FRAMES;
 }
 
 // Copy samples from src (planar float) channel c, starting at sample_offset,
@@ -143,19 +138,20 @@ static int link_get_audio(mlt_frame frame,
     mlt_link self = (mlt_link) mlt_frame_pop_audio(frame);
     private_data *pdata = (private_data *) self->child;
     int error = 0;
-
-    int requested_samples = *samples;
-    int requested_channels = *channels <= 0 ? 2 : *channels;
+    double link_fps = mlt_producer_get_fps(MLT_LINK_PRODUCER(self));
+    if (link_fps <= 0.0)
+        link_fps = 25.0;
+    mlt_position frame_pos = mlt_frame_get_position(frame);
 
     // Force 48kHz float for RNNoise
     *frequency = RNNOISE_RATE;
     *format = mlt_audio_float;
-    *channels = requested_channels;
+    *channels = *channels <= 0 ? 2 : *channels;
+    if (*samples <= 0)
+        *samples = mlt_audio_calculate_frame_samples(link_fps, RNNOISE_RATE, frame_pos);
 
     mlt_service_lock(MLT_LINK_SERVICE(self));
 
-    mlt_position frame_pos = mlt_frame_get_position(frame);
-
     // Detect seek: if not the expected frame, reset everything
     if (pdata->expected_frame != frame_pos) {
         reset_state(self);
@@ -166,12 +162,7 @@ static int link_get_audio(mlt_frame frame,
 
     // Get current frame's audio (cached after first call)
     struct mlt_audio_s cur_audio;
-    mlt_audio_set_values(&cur_audio,
-                         NULL,
-                         RNNOISE_RATE,
-                         mlt_audio_float,
-                         requested_samples,
-                         requested_channels);
+    mlt_audio_set_values(&cur_audio, NULL, *frequency, *format, *samples, *channels);
     error = mlt_frame_get_audio(frame,
                                 &cur_audio.data,
                                 &cur_audio.format,
@@ -196,7 +187,12 @@ static int link_get_audio(mlt_frame frame,
 
     // Allocate output buffer
     struct mlt_audio_s out;
-    mlt_audio_set_values(&out, NULL, RNNOISE_RATE, mlt_audio_float, requested_samples, *channels);
+    mlt_audio_set_values(&out,
+                         NULL,
+                         RNNOISE_RATE,
+                         cur_audio.format,
+                         cur_audio.samples,
+                         cur_audio.channels);
     mlt_audio_alloc_data(&out);
     if (!out.data) {
         mlt_service_unlock(MLT_LINK_SERVICE(self));
@@ -208,10 +204,10 @@ static int link_get_audio(mlt_frame frame,
     // We fill out from out_carry, then generate more by feeding RNNoise chunks.
     int out_delivered = 0;
 
-    while (out_delivered < requested_samples) {
+    while (out_delivered < out.samples) {
         // Drain the output carry buffer first
         if (pdata->out_carry_count > 0) {
-            int n_take = requested_samples - out_delivered;
+            int n_take = out.samples - out_delivered;
             if (n_take > pdata->out_carry_count)
                 n_take = pdata->out_carry_count;
             for (int c = 0; c < *channels && c < MAX_CHANNELS; c++) {
@@ -233,12 +229,13 @@ static int link_get_audio(mlt_frame frame,
         }
 
         // Need to process more RNNoise frames to fill out_carry.
-        // First fill in_carry to 480 samples.
-        int fill_attempts = 0;
-        while (pdata->in_carry_count < rnn_frame && fill_attempts < 2) {
+        // First fill in_carry to 480 samples, crossing as many future MLT
+        // frames as needed to satisfy this RNNoise chunk.
+        while (pdata->in_carry_count < rnn_frame) {
             // Determine source frame and audio
             float *src_data = NULL;
             int src_total_samples = 0;
+            int src_channels = *channels;
 
             if (pdata->continuity_frame == frame_pos) {
                 // Use current frame's audio
@@ -247,7 +244,6 @@ static int link_get_audio(mlt_frame frame,
             } else {
                 // Look up future frame from unique_properties
                 if (!unique_properties) {
-                    fill_attempts++;
                     break;
                 }
                 char key[19];
@@ -257,17 +253,19 @@ static int link_get_audio(mlt_frame frame,
                                                                           key,
                                                                           NULL);
                 if (!src_frame) {
-                    fill_attempts++;
                     break;
                 }
 
                 // Get audio from the future frame (may be cached)
                 struct mlt_audio_s future_audio;
+                int future_samples = mlt_audio_calculate_frame_samples(link_fps,
+                                                                       RNNOISE_RATE,
+                                                                       pdata->continuity_frame);
                 mlt_audio_set_values(&future_audio,
                                      NULL,
                                      RNNOISE_RATE,
                                      mlt_audio_float,
-                                     requested_samples,
+                                     future_samples,
                                      *channels);
                 int ferr = mlt_frame_get_audio(src_frame,
                                                &future_audio.data,
@@ -276,31 +274,40 @@ static int link_get_audio(mlt_frame frame,
                                                &future_audio.channels,
                                                &future_audio.samples);
                 if (ferr || !future_audio.data || future_audio.samples <= 0) {
-                    fill_attempts++;
                     break;
                 }
                 src_data = (float *) future_audio.data;
                 src_total_samples = future_audio.samples;
+                src_channels = future_audio.channels;
             }
 
             // Copy as many samples as possible into in_carry (up to rnn_frame)
             int needed = rnn_frame - pdata->in_carry_count;
             int copied_any = 0;
             for (int c = 0; c < *channels && c < MAX_CHANNELS; c++) {
-                float *src_plane = src_data + c * src_total_samples;
-                int n = copy_samples(pdata->in_carry[c],
+                int n = 0;
+                if (c < src_channels) {
+                    float *src_plane = src_data + c * src_total_samples;
+                    n = copy_samples(pdata->in_carry[c],
                                      pdata->in_carry_count,
                                      src_plane,
                                      pdata->continuity_sample,
                                      src_total_samples,
                                      needed);
+                } else {
+                    // Missing source channels are treated as silence.
+                    memset(pdata->in_carry[c] + pdata->in_carry_count, 0, needed * sizeof(float));
+                    n = needed;
+                }
                 if (c == 0)
                     copied_any = n; // use channel 0 to track
             }
 
             if (copied_any <= 0) {
-                fill_attempts++;
-                break;
+                // Skip empty source frame and continue to the next one.
+                pdata->continuity_frame++;
+                pdata->continuity_sample = 0;
+                continue;
             }
 
             pdata->in_carry_count += copied_any;
@@ -326,7 +333,6 @@ static int link_get_audio(mlt_frame frame,
         // Process one 480-sample RNNoise chunk per channel
         float rnn_in[480];
         float rnn_out[480];
-        const int ring_size = 2 * rnn_frame;
 
         // Ensure out_carry buffers are allocated
         for (int c = 0; c < *channels && c < MAX_CHANNELS; c++) {
@@ -339,34 +345,35 @@ static int link_get_audio(mlt_frame frame,
             }
         }
 
+        int drop_chunk = pdata->startup_drop_frames > 0;
+        int out_base = pdata->out_carry_count;
+        if (!drop_chunk && out_base + rnn_frame > BUF_CAPACITY) {
+            // Buffer overflow safeguard — should not happen with BUF_CAPACITY=8192
+            error = 1;
+            goto done;
+        }
+
         for (int c = 0; c < *channels && c < MAX_CHANNELS; c++) {
             // Scale up for RNNoise (expects ±32768)
             for (int s = 0; s < rnn_frame; s++)
                 rnn_in[s] = pdata->in_carry[c][s] * 32768.0f;
 
             rnnoise_process_frame(pdata->states[c], rnn_out, rnn_in);
 
-            // Scale back and apply wet/dry mix, then store in out_carry.
-            // Delay dry by 2*rnn_frame to match RNNoise's two-frame internal delay.
-            int out_base = pdata->out_carry_count;
-            if (out_base + rnn_frame > BUF_CAPACITY) {
-                // Buffer overflow safeguard — should not happen with BUF_CAPACITY=8192
-                error = 1;
-                goto done;
-            }
-            int ring_start = pdata->dry_ring_pos % ring_size;
+            // Scale back and mix wet with dry, where dry is the chunk just fed to RNNoise.
+            // NOTE(review): wet may lag dry by the two dropped chunks; verify alignment for mix < 1.
             for (int s = 0; s < rnn_frame; s++) {
                 float wet = rnn_out[s] / 32768.0f;
                 float dry = pdata->in_carry[c][s];
-                int ring_idx = (ring_start + s) % ring_size;
-                float delayed_dry = pdata->dry_ring[c][ring_idx];
-                pdata->dry_ring[c][ring_idx] = dry;
-                pdata->out_carry[c][out_base + s] = delayed_dry + mix * (wet - delayed_dry);
+                if (!drop_chunk)
+                    pdata->out_carry[c][out_base + s] = dry + mix * (wet - dry);
             }
         }
-        // Advance ring_pos by one chunk (same for all channels)
-        pdata->dry_ring_pos = (pdata->dry_ring_pos + rnn_frame) % ring_size;
-        pdata->out_carry_count += rnn_frame;
+
+        if (drop_chunk)
+            pdata->startup_drop_frames--;
+        else
+            pdata->out_carry_count += rnn_frame;
         pdata->in_carry_count = 0;
     }
 
@@ -376,7 +383,8 @@ static int link_get_audio(mlt_frame frame,
         mlt_audio_silence(&out, out.samples, 0);
     }
 
-    mlt_frame_set_audio(frame, out.data, out.format, 0, out.release_data);
+    int out_size = mlt_audio_format_size(out.format, out.samples, out.channels);
+    mlt_frame_set_audio(frame, out.data, out.format, out_size, out.release_data);
     mlt_audio_get_values(&out, buffer, frequency, format, samples, channels);
 
     pdata->expected_frame = frame_pos + 1;
@@ -388,7 +396,26 @@ static int link_get_audio(mlt_frame frame,
 static int link_get_frame(mlt_link self, mlt_frame_ptr frame, int index)
 {
     int error = 0;
+    private_data *pdata = (private_data *) self->child;
     mlt_position frame_pos = mlt_producer_position(MLT_LINK_PRODUCER(self));
+    double fps = mlt_producer_get_fps(MLT_LINK_PRODUCER(self));
+    if (fps <= 0.0)
+        fps = 25.0;
+    int rnn_frame = rnnoise_get_frame_size();
+    int needed_samples = MIN_RNNOISE_FRAMES * rnn_frame;
+    int frame_samples = mlt_audio_calculate_frame_samples(fps, RNNOISE_RATE, frame_pos);
+    int startup_drop = pdata ? pdata->startup_drop_frames : RNNOISE_STARTUP_DROP_FRAMES;
+    int output_coverage_samples = frame_samples + (startup_drop * rnn_frame);
+    if (output_coverage_samples > needed_samples)
+        needed_samples = output_coverage_samples;
+    int available_samples = mlt_audio_calculate_frame_samples(fps, RNNOISE_RATE, frame_pos);
+    int future_frames_needed = 0;
+
+    while (available_samples < needed_samples) {
+        mlt_position future_pos = frame_pos + future_frames_needed + 1;
+        available_samples += mlt_audio_calculate_frame_samples(fps, RNNOISE_RATE, future_pos);
+        future_frames_needed++;
+    }
 
     mlt_producer_seek(self->next, frame_pos);
     error = mlt_service_get_frame(MLT_PRODUCER_SERVICE(self->next), frame, index);
@@ -398,8 +425,8 @@ static int link_get_frame(mlt_link self, mlt_frame_ptr frame, int index)
 
     mlt_properties unique_properties = mlt_frame_unique_properties(*frame, MLT_LINK_SERVICE(self));
 
-    // Fetch and store future frames
-    for (int i = 0; i < FUTURE_FRAMES; i++) {
+    // Fetch future frames covering this frame plus pending startup drops (at least 3 chunks).
+    for (int i = 0; i < future_frames_needed; i++) {
         mlt_position future_pos = frame_pos + i + 1;
         mlt_frame future_frame = NULL;
         mlt_producer_seek(self->next, future_pos);
diff --git a/src/modules/rnnoise/link_rnnoise.yml b/src/modules/rnnoise/link_rnnoise.yml
@@ -15,10 +15,7 @@ description: >
 notes: >
   Operates at 48 kHz; audio is resampled automatically if needed.
 
-  RNNoise itself introduces two 480-sample frames of latency, or about 20 ms
-  at 48 kHz, and both the filter and the link are subject to that delay. This
-  link processes audio in 480-sample frames at 48 kHz as required by RNNoise,
-  and correctly buffers samples across MLT frames.
+  Unlike filter_rnnoise, this link does not add a delay to the audio.
 audio_formats:
   - float
 parameters: