ggml-org · justynleung · Jun 20, 2026 · Jun 20, 2026 · Jun 20, 2026 · Jun 20, 2026
diff --git a/examples/parakeet-cli/README.md b/examples/parakeet-cli/README.md
@@ -28,6 +28,10 @@ options:
   -ng,    --no-gpu            [false  ] disable GPU
   -dev N, --device N          [0      ] GPU device to use
   -ps,    --print-segments    [false  ] print segment information
+  --stream                   process audio in overlapping windows
+  -lc N,   --left-context-ms N   left context per stream window in ms; must be a multiple of 80 ms (default: 10000)
+  -cs N,   --chunk-ms N          emitted audio per stream window in ms; must be a multiple of 80 ms (default: 2000)
+  -rc N,   --right-context-ms N  right context per stream window in ms; must be a multiple of 80 ms (default: 2000)
 ```
 
 ### Example
@@ -39,6 +43,13 @@ parakeet_decode: starting decode with n_frames=138
 And so, my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
 ```
 
+Streaming mode encodes overlapping `[left | chunk | right]` windows and emits only tokens that begin in the chunk. Defaults are `[10000 | 2000 | 2000]` (ms):
+```console
+$ ./build/bin/parakeet-cli -m models/parakeet-tdt-0.6b-v3-f16.bin -f samples/jfk.wav --stream --left-context-ms 10000 --chunk-ms 2000 --right-context-ms 2000
+```
+
+This mode uses the existing encoder attention implementation. It does not reproduce NeMo's configurable limited-right-context attention.
+
 To print segment information:
 ```console
 $ ./build/bin/parakeet-cli -m models/parakeet-tdt-0.6b-v3-f16.bin -f samples/jfk.wav --print-segments

diff --git a/examples/parakeet-cli/parakeet-cli.cpp b/examples/parakeet-cli/parakeet-cli.cpp
@@ -18,6 +18,11 @@ struct parakeet_params {
     bool print_segments = false;
     bool output_txt     = false;
     bool no_prints      = false;
+    bool stream         = false;
+
+    int32_t left_context_ms  = 10000;
+    int32_t chunk_ms         =  2000;
+    int32_t right_context_ms =  2000;
 
     std::string model       = "models/ggml-parakeet-tdt-0.6b-v3.bin";
     std::string output_file = "";
@@ -63,6 +68,10 @@ static bool parakeet_params_parse(int argc, char ** argv, parakeet_params & para
         else if (arg == "-otxt" || arg == "--output-txt")      { params.output_txt        = true; }
         else if (arg == "-of"   || arg == "--output-file")     { params.output_file       = ARGV_NEXT; }
         else if (arg == "-np"   || arg == "--no-prints")       { params.no_prints         = true; }
+        else if (arg == "--stream")                              { params.stream            = true; }
+        else if (arg == "-lc"   || arg == "--left-context-ms")   { params.left_context_ms   = std::stoi(ARGV_NEXT); }
+        else if (arg == "-cs"   || arg == "--chunk-ms")          { params.chunk_ms          = std::stoi(ARGV_NEXT); }
+        else if (arg == "-rc"   || arg == "--right-context-ms")  { params.right_context_ms  = std::stoi(ARGV_NEXT); }
         else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             parakeet_print_usage(argc, argv, params);
@@ -89,6 +98,10 @@ static void parakeet_print_usage(int /*argc*/, char ** argv, const parakeet_para
     fprintf(stderr, "  -otxt,  --output-txt        [%-7s] output result in a text file\n",                params.output_txt ? "true" : "false");
     fprintf(stderr, "  -of,    --output-file FILE  [%-7s] output file path (without file extension)\n",   "");
     fprintf(stderr, "  -np,    --no-prints         [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
+    fprintf(stderr, "          --stream            [%-7s] process audio in overlapping windows\n",          params.stream ? "true" : "false");
+    fprintf(stderr, "  -lc N,   --left-context-ms N   [%-7d] left context per stream window (ms) in multiple of 80ms\n",     params.left_context_ms);
+    fprintf(stderr, "  -cs N,   --chunk-ms N          [%-7d] emitted audio per stream window (ms) in multiple of 80ms\n",     params.chunk_ms);
+    fprintf(stderr, "  -rc N,   --right-context-ms N  [%-7d] right context per stream window (ms) in multiple of 80ms\n",     params.right_context_ms);
     fprintf(stderr, "\n");
 }
 
@@ -129,6 +142,11 @@ int main(int argc, char ** argv) {
     ctx_params.use_gpu     = params.use_gpu;
     ctx_params.gpu_device  = params.gpu_device;
 
+    struct parakeet_stream_params stream_params = parakeet_stream_default_params();
+    stream_params.left_context_ms  = params.left_context_ms;
+    stream_params.chunk_ms         = params.chunk_ms;
+    stream_params.right_context_ms = params.right_context_ms;
+
     if (!params.no_prints) {
         fprintf(stderr, "Loading Parakeet model from: %s\n", params.model.c_str());
     }
@@ -171,7 +189,9 @@ int main(int argc, char ** argv) {
         full_params.new_token_callback_user_data = &is_first;
 
         const int mel_frames = (int)(pcmf32.size() / PARAKEET_HOP_LENGTH);
-        int ret = parakeet_full(pctx, full_params, pcmf32.data(), pcmf32.size());
+        const int ret = params.stream
+            ? parakeet_full_stream(pctx, full_params, stream_params, pcmf32.data(), pcmf32.size())
+            : parakeet_full(pctx, full_params, pcmf32.data(), pcmf32.size());
 
         if (ret != 0) {
             fprintf(stderr, "error: failed to process audio file '%s'\n", fname.c_str());

diff --git a/include/parakeet.h b/include/parakeet.h
@@ -265,12 +265,21 @@ extern "C" {
         void * abort_callback_user_data;
     };
 
+    // Parameters for parakeet_full_stream() and parakeet_full_stream_with_state(). All durations are positive milliseconds.
+    // Values must be multiples of the encoder frame duration (80 ms).
+    struct parakeet_stream_params {
+        int left_context_ms;  // left context per stream window
+        int chunk_ms;         // emitted audio per stream window
+        int right_context_ms; // right context per stream window
+    };
+
     // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see parakeet_free_context_params() & parakeet_free_params()
     PARAKEET_API struct parakeet_context_params * parakeet_context_default_params_by_ref(void);
     PARAKEET_API struct parakeet_context_params   parakeet_context_default_params       (void);
 
     PARAKEET_API struct parakeet_full_params * parakeet_full_default_params_by_ref(enum parakeet_sampling_strategy strategy);
     PARAKEET_API struct parakeet_full_params   parakeet_full_default_params       (enum parakeet_sampling_strategy strategy);
+    PARAKEET_API struct parakeet_stream_params parakeet_stream_default_params     (void); // returned by value, nothing to free
 
     // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
     // Not thread safe for same context
@@ -287,6 +296,24 @@ extern "C" {
                             const float * samples,
                                     int   n_samples);
 
+    // Streaming run: encodes overlapping [left | chunk | right] windows and emits only tokens that begin in the chunk.
+    // Reference (NVIDIA NeMo example of Parakeet streaming): https://github.com/NVIDIA-NeMo/NeMo/blob/main/examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py
+    // Example of a 10-2-3 window: encoder (full 15s) -> decoder (middle 2s) -> text (middle 2s)
+    PARAKEET_API int parakeet_full_stream(
+                struct parakeet_context * ctx,
+            struct parakeet_full_params   params,
+            struct parakeet_stream_params stream_params,
+                            const float * samples,
+                                    int   n_samples);
+
+    PARAKEET_API int parakeet_full_stream_with_state( // NOTE(review): presumably parakeet_full_stream() run against the caller-supplied state - confirm
+                struct parakeet_context * ctx,
+                  struct parakeet_state * state,
+            struct parakeet_full_params   params,
+            struct parakeet_stream_params stream_params,
+                            const float * samples,
+                                    int   n_samples);
+
     // Process a single chunk of audio data that fits within the model's audio context window.
     // This is more efficient than parakeet_full() for short audio clips.
     PARAKEET_API int parakeet_chunk(