parakeet : add parakeet_stream_push API

danbev · danbev · commit ffb123956d2f · 2026-04-08T10:43:06.000+02:00
This commit adds a new API to parakeet to support streaming audio input.

The motivation for this came from trying to use parakeet.cpp with ffmpeg
where the existing API did not work very well.
diff --git a/include/parakeet.h b/include/parakeet.h
@@ -314,6 +314,29 @@ extern "C" {
                             const float * samples,
                                    int    n_samples);
 
+    // Initialize streaming state for a new stream, discarding earlier results held in state. Returns 0 on success, -1 on invalid arguments.
+    PARAKEET_API int parakeet_stream_init(
+                struct parakeet_context * ctx,
+                  struct parakeet_state * state,
+            struct parakeet_full_params   params);
+
+    // Push audio samples in streaming mode. Internally the samples are arranged
+    // in a buffer with a left context, a center chunk, and a right context. The
+    // encoder sees the complete buffer, which gives it boundary context for the
+    // center chunk and avoids hard cut-offs at the chunk boundaries. The joint
+    // network only sees the center chunk. Returns 0 on success, or a negative
+    // value on invalid arguments, an uninitialized stream, or a processing error.
+    PARAKEET_API int parakeet_stream_push(
+                struct parakeet_context * ctx,
+                  struct parakeet_state * state,
+                            const float * samples,
+                                    int   n_samples);
+
+    // Flush the remaining buffered audio at end-of-stream and end the stream; call parakeet_stream_init before pushing again.
+    PARAKEET_API int parakeet_stream_flush(
+                struct parakeet_context * ctx,
+                  struct parakeet_state * state);
+
     // Number of generated text segments
     PARAKEET_API int parakeet_full_n_segments           (struct parakeet_context * ctx);
     PARAKEET_API int parakeet_full_n_segments_from_state(struct parakeet_state * state);
diff --git a/src/parakeet.cpp b/src/parakeet.cpp
@@ -580,6 +580,18 @@ struct tdt_stream_state {
     bool initialized;           // whether prediction LSTM state has been initialized
 };
 
+struct parakeet_stream {                // sliding-window bookkeeping used by the parakeet_stream_* functions
+    std::vector<float> buffer;          // left-context samples followed by audio not yet processed as a center chunk
+    int64_t n_samples_advanced = 0;     // total samples consumed as center chunks; used to offset segment timestamps
+
+    int n_left_ctx  = 0;                // left context size in samples
+    int n_chunk     = 0;                // center chunk size in samples
+    int n_right_ctx = 0;                // right context size in samples
+
+    parakeet_full_params params = {};   // copy of the params passed to parakeet_stream_init
+    bool initialized = false;           // set by parakeet_stream_init, cleared again by parakeet_stream_flush
+};
+
 struct parakeet_state {
     int64_t t_sample_us = 0;
     int64_t t_encode_us = 0;
@@ -638,6 +650,8 @@ struct parakeet_state {
     parakeet_lstm_state lstm_state;
 
     struct tdt_stream_state tdt_stream_state = {0, 0, 0, false};
+
+    parakeet_stream stream;
 };
 
 // FFT cache for mel spectrogram computation
@@ -2367,7 +2381,7 @@ static bool parakeet_decode(
     // Start with the blank token (8192)
     parakeet_token last_token = blank_id;
 
-    PARAKEET_LOG_INFO("parakeet_decode: starting decode with n_frames=%d\n", n_frames);
+    PARAKEET_LOG_DEBUG("parakeet_decode: starting decode with n_frames=%d\n", n_frames);
 
     batch.n_tokens  = 1;
     batch.token[0]  = last_token;
@@ -3609,6 +3623,259 @@ struct parakeet_full_params parakeet_full_default_params(enum parakeet_sampling_
     return result;
 }
 
+static void parakeet_stream_reset_state(struct parakeet_state * state) { // clears decoder, result and stream fields; no-op for a null state
+    if (state == nullptr) {
+        return;
+    }
+
+    if (state->lstm_state.buffer) { // nothing to clear when no LSTM buffer is set
+        ggml_backend_buffer_clear(state->lstm_state.buffer, 0);
+    }
+
+    state->decoded_tokens.clear(); // discard tokens and segments left over from earlier decoding
+    state->decoded_token_data.clear();
+    state->result_all.clear();
+
+    state->tdt_stream_state.initialized    = false; // rewind the TDT stream bookkeeping
+    state->tdt_stream_state.last_token     = 0;
+    state->tdt_stream_state.time_step      = 0;
+    state->tdt_stream_state.decoded_length = 0;
+
+    state->stream.buffer.clear(); // return the stream window to its uninitialized defaults
+    state->stream.n_samples_advanced = 0;
+    state->stream.n_left_ctx         = 0;
+    state->stream.n_chunk            = 0;
+    state->stream.n_right_ctx        = 0;
+    state->stream.params             = {};
+    state->stream.initialized        = false;
+
+    state->enc_out_buffer.clear(); // forget the copied encoder output and the frame counters
+    state->enc_out_frames = 0;
+    state->n_frames       = 0;
+    state->n_audio_ctx    = 0;
+}
+
+static int parakeet_stream_process_window( // runs one window through mel, encoder and decoder; returns 0 on success, negative on failure
+        struct parakeet_context * ctx,
+          struct parakeet_state * state,
+                    const float * samples,   // full window: left context, center chunk, right context
+                            int   n_samples, // total number of samples in the window
+                            int   n_chunk) { // size of the center chunk in samples
+    const parakeet_stream & stream = state->stream;
+    const parakeet_full_params & params = stream.params;
+    const int d_enc = ctx->model.hparams.n_audio_state; // floats per encoder output frame, as used by the copy below
+
+    // Compute the mel features for the whole window, context included.
+    if (parakeet_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
+        return -2;
+    }
+
+    const int left_mel_frames = stream.n_left_ctx / PARAKEET_HOP_LENGTH; // integer division: a partial trailing frame is truncated
+    const int chunk_mel_frames = n_chunk / PARAKEET_HOP_LENGTH;
+
+    state->n_audio_ctx = state->mel.n_len;
+    // Encode the entire log mel spectrogram so the center chunk gets context on both sides.
+    if (!parakeet_encode_internal(*ctx, *state, 0, params.n_threads,
+                                  params.abort_callback, params.abort_callback_user_data)) {
+        return -6;
+    }
+
+    const int left_enc_frames  = left_mel_frames  / ctx->model.hparams.subsampling_factor; // truncating division again
+    const int chunk_enc_frames = chunk_mel_frames / ctx->model.hparams.subsampling_factor;
+
+    if (chunk_enc_frames <= 0) { // chunk too short to yield an encoder frame: nothing to decode, reported as success
+        return 0;
+    }
+
+    // Copy only the center chunk's encoder frames (skipping the left context) so that it is the only part the joint network sees.
+    state->enc_out_buffer.resize(chunk_enc_frames * d_enc);
+    ggml_backend_tensor_get(state->enc_out, state->enc_out_buffer.data(),
+                           left_enc_frames * d_enc * sizeof(float),
+                           chunk_enc_frames * d_enc * sizeof(float));
+
+    state->enc_out_frames = chunk_enc_frames;
+    state->n_frames = chunk_enc_frames;
+
+    const size_t tokens_before = state->decoded_tokens.size(); // tokens appended past this index belong to this window
+
+    // Run the prediction and joint network on the center chunk.
+    if (!parakeet_decode_chunk(*ctx, *state, state->batch, chunk_enc_frames, params.n_threads, &params)) {
+        return -7;
+    }
+
+    const size_t tokens_after = state->decoded_tokens.size();
+    const size_t new_token_count = tokens_after - tokens_before;
+
+    if (new_token_count > 0) {
+        std::string text;
+        std::vector<parakeet_token_data> result_tokens;
+        const int64_t chunk_t0 = 100LL * stream.n_samples_advanced / PARAKEET_SAMPLE_RATE; // chunk start in hundredths of a second
+        const int64_t chunk_t1 = 100LL * (stream.n_samples_advanced + n_chunk) / PARAKEET_SAMPLE_RATE; // chunk end, same unit
+        const int frame_offset = chunk_t0 / ctx->model.hparams.subsampling_factor; // NOTE(review): treats chunk_t0 as a mel-frame index; presumably relies on a 10 ms hop - confirm
+
+        result_tokens.reserve(new_token_count);
+
+        for (size_t i = tokens_before; i < tokens_after; ++i) {
+            const auto token_id = state->decoded_tokens[i];
+            const char * token_str = parakeet_token_to_str(ctx, token_id);
+            if (token_str) {
+                const bool is_first_piece = (tokens_before == 0) && text.empty(); // true only if no token preceded this window and no text was emitted yet
+                text += sentencepiece_piece_to_text(token_str, is_first_piece);
+            }
+
+            auto token_data = state->decoded_token_data[i]; // work on a copy; the stored token data is left untouched
+            token_data.frame_index += frame_offset; // shift by the chunk start (stored values are presumably chunk-relative - confirm)
+            token_data.t0 += chunk_t0;
+            token_data.t1 += chunk_t0;
+            result_tokens.push_back(token_data);
+        }
+
+        refine_timestamps_tdt(ctx->vocab, result_tokens);
+
+        if (!text.empty()) { // a segment is only emitted when the new tokens produced text
+            parakeet_segment segment;
+            segment.t0 = chunk_t0;
+            segment.t1 = chunk_t1;
+            segment.text = std::move(text);
+            segment.tokens = std::move(result_tokens);
+
+            state->result_all.push_back(std::move(segment));
+
+            if (params.new_segment_callback) {
+                params.new_segment_callback(ctx, state, 1, params.new_segment_callback_user_data); // 1 is presumably the number of new segments - confirm
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int ms_to_n_samples(int ms) { // converts milliseconds to a sample count at PARAKEET_SAMPLE_RATE (truncating)
+    return (int) ((int64_t) ms * PARAKEET_SAMPLE_RATE / 1000); // widen before multiplying: ms * rate can overflow int for large ms
+}
+
+int parakeet_stream_init( // validates the window sizes, resets state and arms the stream; returns 0 on success, -1 otherwise
+        struct parakeet_context * ctx,
+          struct parakeet_state * state,
+    struct parakeet_full_params   params) {
+    if (ctx == nullptr || state == nullptr) {
+        return -1;
+    }
+
+    const int n_left_ctx  = ms_to_n_samples(params.left_context_ms); // window sizes converted from milliseconds to samples
+    const int n_chunk     = ms_to_n_samples(params.chunk_length_ms);
+    const int n_right_ctx = ms_to_n_samples(params.right_context_ms);
+
+    if (n_left_ctx < 0 || n_chunk <= 0 || n_right_ctx < 0) { // the center chunk must be non-empty; the contexts may be zero
+        return -1;
+    }
+
+    parakeet_stream_reset_state(state); // also clears the stream fields, which are assigned right below
+
+    state->stream.n_left_ctx  = n_left_ctx;
+    state->stream.n_chunk     = n_chunk;
+    state->stream.n_right_ctx = n_right_ctx;
+    state->stream.params      = params;
+    state->stream.initialized = true;
+
+    if (n_left_ctx > 0) { // the first window has no earlier audio: zero samples stand in as its left context
+        state->stream.buffer.assign(n_left_ctx, 0.0f);
+    }
+
+    return 0;
+}
+
+int parakeet_stream_push( // buffers the samples and processes every complete window; returns 0 or a negative error code
+        struct parakeet_context * ctx,
+          struct parakeet_state * state,
+                    const float * samples,
+                            int   n_samples) {
+    if (ctx == nullptr || state == nullptr || samples == nullptr || n_samples <= 0) {
+        return -1;
+    }
+
+    if (!state->stream.initialized) { // parakeet_stream_init has not been called, or the stream was already flushed
+        return -1;
+    }
+
+    const int n_total_samples = state->stream.n_left_ctx + state->stream.n_chunk + state->stream.n_right_ctx;
+
+    // Append the new samples; they become center-chunk and right-context audio for the upcoming windows.
+    state->stream.buffer.insert(state->stream.buffer.end(), samples, samples + n_samples);
+
+    // As long as we have enough samples to form a complete window we process it.
+    while (state->stream.buffer.size() >= (size_t) n_total_samples) {
+        const int ret = parakeet_stream_process_window(
+                ctx,
+                state,
+                state->stream.buffer.data(),
+                n_total_samples,
+                state->stream.n_chunk);
+        if (ret != 0) { // on failure the window stays buffered and the stream position is not advanced
+            return ret;
+        }
+
+        // TODO: std::vector::erase is O(n) and not optimal. We should probably
+        // use a ring buffer instead.
+        // Shift the center and right context to the start of the buffer. This
+        // allows the next call to have the current center chunk as its left
+        // context, and the right context will become part of the next target
+        // chunk together with the new samples which will make up the rest of
+        // the target chunk and the new right context.
+        state->stream.buffer.erase(state->stream.buffer.begin(), state->stream.buffer.begin() + state->stream.n_chunk);
+
+        state->stream.n_samples_advanced += state->stream.n_chunk;
+    }
+
+    return 0;
+}
+
+int parakeet_stream_flush( // processes the leftover audio with zero padding, then ends the stream; returns 0 or a negative error code
+        struct parakeet_context * ctx,
+          struct parakeet_state * state) {
+    if (ctx == nullptr || state == nullptr) {
+        return -1;
+    }
+
+    if (!state->stream.initialized) {
+        return -1;
+    }
+
+    while (state->stream.buffer.size() > (size_t) state->stream.n_left_ctx) { // everything past the left context is still unprocessed
+        const int n_remaining_samples = (int) state->stream.buffer.size() - state->stream.n_left_ctx;
+        const int n_flush_chunk       = std::min(state->stream.n_chunk, n_remaining_samples); // the last center chunk may be shorter than n_chunk
+        const int n_right_available   = std::min(state->stream.n_right_ctx, n_remaining_samples - n_flush_chunk); // real samples usable as right context
+        const int n_copied            = state->stream.n_left_ctx + n_flush_chunk + n_right_available;
+
+        std::vector<float> flush_window(state->stream.n_left_ctx + n_flush_chunk + state->stream.n_right_ctx, 0.0f); // full-size right context; the missing tail stays zero
+
+        std::copy_n(state->stream.buffer.begin(), n_copied, flush_window.begin());
+
+        const int ret = parakeet_stream_process_window(
+                ctx,
+                state,
+                flush_window.data(),
+                (int) flush_window.size(),
+                n_flush_chunk);
+        if (ret != 0) { // on failure the stream stays initialized and the remaining audio stays buffered
+            return ret;
+        }
+
+        state->stream.buffer.erase(state->stream.buffer.begin(), state->stream.buffer.begin() + n_flush_chunk);
+        state->stream.n_samples_advanced += n_flush_chunk;
+    }
+
+    state->stream.buffer.clear(); // the stream is finished: only the stream fields are reset, result_all is left untouched
+    state->stream.n_samples_advanced = 0;
+    state->stream.n_left_ctx         = 0;
+    state->stream.n_chunk            = 0;
+    state->stream.n_right_ctx        = 0;
+    state->stream.params             = {};
+    state->stream.initialized        = false;
+
+    return 0;
+}
+
 int parakeet_full_with_state(
         struct parakeet_context * ctx,
           struct parakeet_state * state,
@@ -3729,7 +3996,8 @@ int parakeet_full_with_state(
                     const auto token_id = state->decoded_tokens[i];
                     const char * token_str = parakeet_token_to_str(ctx, token_id);
                     if (token_str) {
-                        text += sentencepiece_piece_to_text(token_str, text.empty());
+                        const bool is_first_piece = (tokens_before == 0) && text.empty();
+                        text += sentencepiece_piece_to_text(token_str, is_first_piece);
                     }
 
                     auto token_data = state->decoded_token_data[i];
@@ -3787,6 +4055,11 @@ int parakeet_chunk(
         ggml_backend_buffer_clear(state->lstm_state.buffer, 0);
         state->decoded_tokens.clear();
         state->decoded_token_data.clear();
+
+        state->tdt_stream_state.initialized    = false;
+        state->tdt_stream_state.last_token     = 0;
+        state->tdt_stream_state.time_step      = 0;
+        state->tdt_stream_state.decoded_length = 0;
     }
 
     if (n_samples > 0) {
@@ -3800,7 +4073,7 @@ int parakeet_chunk(
         const int total_len = parakeet_n_len_from_state(state);
         const int model_max_ctx = parakeet_n_audio_ctx(ctx);
         params.audio_ctx = std::min(total_len, model_max_ctx);
-        PARAKEET_LOG_INFO("Processing audio: total_frames=%d, chunk_size=%d\n", total_len, params.audio_ctx);
+        PARAKEET_LOG_DEBUG("Processing audio: total_frames=%d, chunk_size=%d\n", total_len, params.audio_ctx);
     }
     state->n_audio_ctx = params.audio_ctx;
 
@@ -3829,7 +4102,8 @@ int parakeet_chunk(
             const auto token_id = state->decoded_tokens[i];
             const char * token_str = parakeet_token_to_str(ctx, token_id);
             if (token_str) {
-                text += sentencepiece_piece_to_text(token_str, text.empty());
+                const bool is_first_piece = (tokens_before == 0) && text.empty();
+                text += sentencepiece_piece_to_text(token_str, is_first_piece);
             }
 
             // Use the stored token data from parakeet_decode
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -121,6 +121,15 @@ target_compile_definitions(${PARAKEET_TEST} PRIVATE
     SAMPLE_PATH="${PROJECT_SOURCE_DIR}/samples/jfk.wav")
 add_test(NAME ${PARAKEET_TEST} COMMAND ${PARAKEET_TEST})
 
+set(PARAKEET_TEST test-parakeet-stream) # builds test-parakeet-stream.cpp and registers it with CTest; model and sample paths are compiled in
+add_executable(${PARAKEET_TEST} ${PARAKEET_TEST}.cpp)
+target_include_directories(${PARAKEET_TEST} PRIVATE ../include ../ggml/include ../examples)
+target_link_libraries(${PARAKEET_TEST} PRIVATE parakeet common)
+target_compile_definitions(${PARAKEET_TEST} PRIVATE
+    PARAKEET_MODEL_PATH="${PROJECT_SOURCE_DIR}/models/ggml-parakeet-tdt-0.6b-v3.bin"
+    SAMPLE_PATH="${PROJECT_SOURCE_DIR}/samples/gb1.wav")
+add_test(NAME ${PARAKEET_TEST} COMMAND ${PARAKEET_TEST})
+
 set(PARAKEET_TEST test-parakeet-full)
 add_executable(${PARAKEET_TEST} ${PARAKEET_TEST}.cpp)
 target_include_directories(${PARAKEET_TEST} PRIVATE ../include ../ggml/include ../examples)
diff --git a/tests/test-parakeet-stream.cpp b/tests/test-parakeet-stream.cpp