2525#include <stdlib.h>
2626#include <string.h>
2727
28- #define FUTURE_FRAMES 1
2928#define RNNOISE_RATE 48000
3029#define MAX_CHANNELS 8
30+ #define MIN_RNNOISE_FRAMES 3
31+ #define RNNOISE_STARTUP_DROP_FRAMES 2
3132// Buffer sizes: at 24fps/48kHz a frame has ~2002 samples.
3233// With 1 future frame we can have up to ~4004 input samples at once.
3334// RNNoise frame = 480. Max chunks = ceil(4004/480) = 9 → max out = 9*480 = 4320.
@@ -56,13 +57,9 @@ typedef struct
5657 mlt_position continuity_frame ;
5758 int continuity_sample ; // sample offset within continuity_frame's audio
5859
59- // Dry-signal delay ring buffer for wet/dry mix alignment.
60- // RNNoise synthesizes from the previous call's FFT, which itself analyzed
61- // the frame before that ([analysis_mem, in]), so the output at call N
62- // reconstructs in_{N-2} — exactly 2 * rnn_frame = 960 samples of delay.
63- // We delay the dry signal by the same amount so both are aligned.
64- float dry_ring [MAX_CHANNELS ][960 ];
65- int dry_ring_pos ;
60+ // After reset, RNNoise emits two startup frames that are effectively silence.
61+ // Consume those frames internally so this link adds no output delay.
62+ int startup_drop_frames ;
6663} private_data ;
6764
6865static void reset_state (mlt_link self )
@@ -81,8 +78,7 @@ static void reset_state(mlt_link self)
8178 pdata -> frequency = 0 ;
8279 pdata -> in_carry_count = 0 ;
8380 pdata -> out_carry_count = 0 ;
84- memset (pdata -> dry_ring , 0 , sizeof (pdata -> dry_ring ));
85- pdata -> dry_ring_pos = 0 ;
81+ pdata -> startup_drop_frames = RNNOISE_STARTUP_DROP_FRAMES ;
8682 pdata -> continuity_frame = -1 ;
8783 pdata -> continuity_sample = 0 ;
8884 pdata -> expected_frame = -1 ;
@@ -111,8 +107,7 @@ static void ensure_states(mlt_link self, int n_channels)
111107 pdata -> n_channels = n_channels ;
112108 pdata -> in_carry_count = 0 ;
113109 pdata -> out_carry_count = 0 ;
114- memset (pdata -> dry_ring , 0 , sizeof (pdata -> dry_ring ));
115- pdata -> dry_ring_pos = 0 ;
110+ pdata -> startup_drop_frames = RNNOISE_STARTUP_DROP_FRAMES ;
116111}
117112
118113// Copy samples from src (planar float) channel c, starting at sample_offset,
@@ -143,19 +138,20 @@ static int link_get_audio(mlt_frame frame,
143138 mlt_link self = (mlt_link ) mlt_frame_pop_audio (frame );
144139 private_data * pdata = (private_data * ) self -> child ;
145140 int error = 0 ;
146-
147- int requested_samples = * samples ;
148- int requested_channels = * channels <= 0 ? 2 : * channels ;
141+ double link_fps = mlt_producer_get_fps (MLT_LINK_PRODUCER (self ));
142+ if (link_fps <= 0.0 )
143+ link_fps = 25.0 ;
144+ mlt_position frame_pos = mlt_frame_get_position (frame );
149145
150146 // Force 48kHz float for RNNoise
151147 * frequency = RNNOISE_RATE ;
152148 * format = mlt_audio_float ;
153- * channels = requested_channels ;
149+ * channels = * channels <= 0 ? 2 : * channels ;
150+ if (* samples <= 0 )
151+ * samples = mlt_audio_calculate_frame_samples (link_fps , RNNOISE_RATE , frame_pos );
154152
155153 mlt_service_lock (MLT_LINK_SERVICE (self ));
156154
157- mlt_position frame_pos = mlt_frame_get_position (frame );
158-
159155 // Detect seek: if not the expected frame, reset everything
160156 if (pdata -> expected_frame != frame_pos ) {
161157 reset_state (self );
@@ -166,12 +162,7 @@ static int link_get_audio(mlt_frame frame,
166162
167163 // Get current frame's audio (cached after first call)
168164 struct mlt_audio_s cur_audio ;
169- mlt_audio_set_values (& cur_audio ,
170- NULL ,
171- RNNOISE_RATE ,
172- mlt_audio_float ,
173- requested_samples ,
174- requested_channels );
165+ mlt_audio_set_values (& cur_audio , NULL , * frequency , * format , * samples , * channels );
175166 error = mlt_frame_get_audio (frame ,
176167 & cur_audio .data ,
177168 & cur_audio .format ,
@@ -196,7 +187,12 @@ static int link_get_audio(mlt_frame frame,
196187
197188 // Allocate output buffer
198189 struct mlt_audio_s out ;
199- mlt_audio_set_values (& out , NULL , RNNOISE_RATE , mlt_audio_float , requested_samples , * channels );
190+ mlt_audio_set_values (& out ,
191+ NULL ,
192+ RNNOISE_RATE ,
193+ cur_audio .format ,
194+ cur_audio .samples ,
195+ cur_audio .channels );
200196 mlt_audio_alloc_data (& out );
201197 if (!out .data ) {
202198 mlt_service_unlock (MLT_LINK_SERVICE (self ));
@@ -208,10 +204,10 @@ static int link_get_audio(mlt_frame frame,
208204 // We fill out from out_carry, then generate more by feeding RNNoise chunks.
209205 int out_delivered = 0 ;
210206
211- while (out_delivered < requested_samples ) {
207+ while (out_delivered < out . samples ) {
212208 // Drain the output carry buffer first
213209 if (pdata -> out_carry_count > 0 ) {
214- int n_take = requested_samples - out_delivered ;
210+ int n_take = out . samples - out_delivered ;
215211 if (n_take > pdata -> out_carry_count )
216212 n_take = pdata -> out_carry_count ;
217213 for (int c = 0 ; c < * channels && c < MAX_CHANNELS ; c ++ ) {
@@ -233,12 +229,13 @@ static int link_get_audio(mlt_frame frame,
233229 }
234230
235231 // Need to process more RNNoise frames to fill out_carry.
236- // First fill in_carry to 480 samples.
237- int fill_attempts = 0 ;
238- while (pdata -> in_carry_count < rnn_frame && fill_attempts < 2 ) {
232+ // First fill in_carry to 480 samples, crossing as many future MLT
233+ // frames as needed to satisfy this RNNoise chunk.
234+ while (pdata -> in_carry_count < rnn_frame ) {
239235 // Determine source frame and audio
240236 float * src_data = NULL ;
241237 int src_total_samples = 0 ;
238+ int src_channels = * channels ;
242239
243240 if (pdata -> continuity_frame == frame_pos ) {
244241 // Use current frame's audio
@@ -247,7 +244,6 @@ static int link_get_audio(mlt_frame frame,
247244 } else {
248245 // Look up future frame from unique_properties
249246 if (!unique_properties ) {
250- fill_attempts ++ ;
251247 break ;
252248 }
253249 char key [19 ];
@@ -257,17 +253,19 @@ static int link_get_audio(mlt_frame frame,
257253 key ,
258254 NULL );
259255 if (!src_frame ) {
260- fill_attempts ++ ;
261256 break ;
262257 }
263258
264259 // Get audio from the future frame (may be cached)
265260 struct mlt_audio_s future_audio ;
261+ int future_samples = mlt_audio_calculate_frame_samples (link_fps ,
262+ RNNOISE_RATE ,
263+ pdata -> continuity_frame );
266264 mlt_audio_set_values (& future_audio ,
267265 NULL ,
268266 RNNOISE_RATE ,
269267 mlt_audio_float ,
270- requested_samples ,
268+ future_samples ,
271269 * channels );
272270 int ferr = mlt_frame_get_audio (src_frame ,
273271 & future_audio .data ,
@@ -276,31 +274,40 @@ static int link_get_audio(mlt_frame frame,
276274 & future_audio .channels ,
277275 & future_audio .samples );
278276 if (ferr || !future_audio .data || future_audio .samples <= 0 ) {
279- fill_attempts ++ ;
280277 break ;
281278 }
282279 src_data = (float * ) future_audio .data ;
283280 src_total_samples = future_audio .samples ;
281+ src_channels = future_audio .channels ;
284282 }
285283
286284 // Copy as many samples as possible into in_carry (up to rnn_frame)
287285 int needed = rnn_frame - pdata -> in_carry_count ;
288286 int copied_any = 0 ;
289287 for (int c = 0 ; c < * channels && c < MAX_CHANNELS ; c ++ ) {
290- float * src_plane = src_data + c * src_total_samples ;
291- int n = copy_samples (pdata -> in_carry [c ],
288+ int n = 0 ;
289+ if (c < src_channels ) {
290+ float * src_plane = src_data + c * src_total_samples ;
291+ n = copy_samples (pdata -> in_carry [c ],
292292 pdata -> in_carry_count ,
293293 src_plane ,
294294 pdata -> continuity_sample ,
295295 src_total_samples ,
296296 needed );
297+ } else {
298+ // Missing source channels are treated as silence.
299+ memset (pdata -> in_carry [c ] + pdata -> in_carry_count , 0 , needed * sizeof (float ));
300+ n = needed ;
301+ }
297302 if (c == 0 )
298303 copied_any = n ; // use channel 0 to track
299304 }
300305
301306 if (copied_any <= 0 ) {
302- fill_attempts ++ ;
303- break ;
307+ // Skip empty source frame and continue to the next one.
308+ pdata -> continuity_frame ++ ;
309+ pdata -> continuity_sample = 0 ;
310+ continue ;
304311 }
305312
306313 pdata -> in_carry_count += copied_any ;
@@ -326,7 +333,6 @@ static int link_get_audio(mlt_frame frame,
326333 // Process one 480-sample RNNoise chunk per channel
327334 float rnn_in [480 ];
328335 float rnn_out [480 ];
329- const int ring_size = 2 * rnn_frame ;
330336
331337 // Ensure out_carry buffers are allocated
332338 for (int c = 0 ; c < * channels && c < MAX_CHANNELS ; c ++ ) {
@@ -339,34 +345,35 @@ static int link_get_audio(mlt_frame frame,
339345 }
340346 }
341347
348+ int drop_chunk = pdata -> startup_drop_frames > 0 ;
349+ int out_base = pdata -> out_carry_count ;
350+ if (!drop_chunk && out_base + rnn_frame > BUF_CAPACITY ) {
351+ // Buffer overflow safeguard — should not happen with BUF_CAPACITY=8192
352+ error = 1 ;
353+ goto done ;
354+ }
355+
342356 for (int c = 0 ; c < * channels && c < MAX_CHANNELS ; c ++ ) {
343357 // Scale up for RNNoise (expects ±32768)
344358 for (int s = 0 ; s < rnn_frame ; s ++ )
345359 rnn_in [s ] = pdata -> in_carry [c ][s ] * 32768.0f ;
346360
347361 rnnoise_process_frame (pdata -> states [c ], rnn_out , rnn_in );
348362
349- // Scale back and apply wet/dry mix, then store in out_carry.
350- // Delay dry by 2*rnn_frame to match RNNoise's two-frame internal delay.
351- int out_base = pdata -> out_carry_count ;
352- if (out_base + rnn_frame > BUF_CAPACITY ) {
353- // Buffer overflow safeguard — should not happen with BUF_CAPACITY=8192
354- error = 1 ;
355- goto done ;
356- }
357- int ring_start = pdata -> dry_ring_pos % ring_size ;
363+ // Scale back and apply wet/dry mix with aligned dry signal.
364+ // Startup RNNoise delay is compensated by dropping first two output chunks.
358365 for (int s = 0 ; s < rnn_frame ; s ++ ) {
359366 float wet = rnn_out [s ] / 32768.0f ;
360367 float dry = pdata -> in_carry [c ][s ];
361- int ring_idx = (ring_start + s ) % ring_size ;
362- float delayed_dry = pdata -> dry_ring [c ][ring_idx ];
363- pdata -> dry_ring [c ][ring_idx ] = dry ;
364- pdata -> out_carry [c ][out_base + s ] = delayed_dry + mix * (wet - delayed_dry );
368+ if (!drop_chunk )
369+ pdata -> out_carry [c ][out_base + s ] = dry + mix * (wet - dry );
365370 }
366371 }
367- // Advance ring_pos by one chunk (same for all channels)
368- pdata -> dry_ring_pos = (pdata -> dry_ring_pos + rnn_frame ) % ring_size ;
369- pdata -> out_carry_count += rnn_frame ;
372+
373+ if (drop_chunk )
374+ pdata -> startup_drop_frames -- ;
375+ else
376+ pdata -> out_carry_count += rnn_frame ;
370377 pdata -> in_carry_count = 0 ;
371378 }
372379
@@ -376,7 +383,8 @@ static int link_get_audio(mlt_frame frame,
376383 mlt_audio_silence (& out , out .samples , 0 );
377384 }
378385
379- mlt_frame_set_audio (frame , out .data , out .format , 0 , out .release_data );
386+ int out_size = mlt_audio_format_size (out .format , out .samples , out .channels );
387+ mlt_frame_set_audio (frame , out .data , out .format , out_size , out .release_data );
380388 mlt_audio_get_values (& out , buffer , frequency , format , samples , channels );
381389
382390 pdata -> expected_frame = frame_pos + 1 ;
@@ -388,7 +396,26 @@ static int link_get_audio(mlt_frame frame,
388396static int link_get_frame (mlt_link self , mlt_frame_ptr frame , int index )
389397{
390398 int error = 0 ;
399+ private_data * pdata = (private_data * ) self -> child ;
391400 mlt_position frame_pos = mlt_producer_position (MLT_LINK_PRODUCER (self ));
401+ double fps = mlt_producer_get_fps (MLT_LINK_PRODUCER (self ));
402+ if (fps <= 0.0 )
403+ fps = 25.0 ;
404+ int rnn_frame = rnnoise_get_frame_size ();
405+ int needed_samples = MIN_RNNOISE_FRAMES * rnn_frame ;
406+ int frame_samples = mlt_audio_calculate_frame_samples (fps , RNNOISE_RATE , frame_pos );
407+ int startup_drop = pdata ? pdata -> startup_drop_frames : RNNOISE_STARTUP_DROP_FRAMES ;
408+ int output_coverage_samples = frame_samples + (startup_drop * rnn_frame );
409+ if (output_coverage_samples > needed_samples )
410+ needed_samples = output_coverage_samples ;
411+ int available_samples = mlt_audio_calculate_frame_samples (fps , RNNOISE_RATE , frame_pos );
412+ int future_frames_needed = 0 ;
413+
414+ while (available_samples < needed_samples ) {
415+ mlt_position future_pos = frame_pos + future_frames_needed + 1 ;
416+ available_samples += mlt_audio_calculate_frame_samples (fps , RNNOISE_RATE , future_pos );
417+ future_frames_needed ++ ;
418+ }
392419
393420 mlt_producer_seek (self -> next , frame_pos );
394421 error = mlt_service_get_frame (MLT_PRODUCER_SERVICE (self -> next ), frame , index );
@@ -398,8 +425,8 @@ static int link_get_frame(mlt_link self, mlt_frame_ptr frame, int index)
398425
399426 mlt_properties unique_properties = mlt_frame_unique_properties (* frame , MLT_LINK_SERVICE (self ));
400427
401- // Fetch and store future frames
402- for (int i = 0 ; i < FUTURE_FRAMES ; i ++ ) {
428+ // Fetch and store enough future frames to provide at least 3 RNNoise chunks.
429+ for (int i = 0 ; i < future_frames_needed ; i ++ ) {
403430 mlt_position future_pos = frame_pos + i + 1 ;
404431 mlt_frame future_frame = NULL ;
405432 mlt_producer_seek (self -> next , future_pos );
0 commit comments