Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions examples/parakeet-cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ options:
-ng, --no-gpu [false ] disable GPU
-dev N, --device N [0 ] GPU device to use
-ps, --print-segments [false ] print segment information
--stream process audio in overlapping windows
-lc N, --left-context-ms N left context per stream window (ms) in multiple of 80ms (default: 10000)
-cs N, --chunk-ms N emitted audio per stream window (ms) in multiple of 80ms (default: 2000)
-rc N, --right-context-ms N right context per stream window (ms) in multiple of 80ms (default: 2000)
```

### Example
Expand All @@ -39,6 +43,13 @@ parakeet_decode: starting decode with n_frames=138
And so, my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
```

Streaming mode encodes overlapping `[left | chunk | right]` windows and emits only tokens that begin in the chunk. Defaults are `[10000 | 2000 | 2000]` (ms):
```console
$ ./build/bin/parakeet-cli -m models/parakeet-tdt-0.6b-v3-f16.bin -f samples/jfk.wav --stream --left-context-ms 10000 --chunk-ms 2000 --right-context-ms 2000
```

This mode uses the existing encoder attention implementation. It does not reproduce NeMo's configurable limited-right-context attention.

To print segment information:
```console
$ ./build/bin/parakeet-cli -m models/parakeet-tdt-0.6b-v3-f16.bin -f samples/jfk.wav --print-segments
Expand Down
22 changes: 21 additions & 1 deletion examples/parakeet-cli/parakeet-cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ struct parakeet_params {
bool print_segments = false;
bool output_txt = false;
bool no_prints = false;
bool stream = false;

int32_t left_context_ms = 10000;
int32_t chunk_ms = 2000;
int32_t right_context_ms = 2000;

std::string model = "models/ggml-parakeet-tdt-0.6b-v3.bin";
std::string output_file = "";
Expand Down Expand Up @@ -63,6 +68,10 @@ static bool parakeet_params_parse(int argc, char ** argv, parakeet_params & para
else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; }
else if (arg == "-of" || arg == "--output-file") { params.output_file = ARGV_NEXT; }
else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; }
else if (arg == "--stream") { params.stream = true; }
else if (arg == "-lc" || arg == "--left-context-ms") { params.left_context_ms = std::stoi(ARGV_NEXT); }
else if (arg == "-cs" || arg == "--chunk-ms") { params.chunk_ms = std::stoi(ARGV_NEXT); }
else if (arg == "-rc" || arg == "--right-context-ms") { params.right_context_ms = std::stoi(ARGV_NEXT); }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
parakeet_print_usage(argc, argv, params);
Expand All @@ -89,6 +98,10 @@ static void parakeet_print_usage(int /*argc*/, char ** argv, const parakeet_para
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
fprintf(stderr, " -of, --output-file FILE [%-7s] output file path (without file extension)\n", "");
fprintf(stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
fprintf(stderr, " --stream [%-7s] process audio in overlapping windows\n", params.stream ? "true" : "false");
fprintf(stderr, " -lc N, --left-context-ms N [%-7d] left context per stream window (ms) in multiple of 80ms\n", params.left_context_ms);
fprintf(stderr, " -cs N, --chunk-ms N [%-7d] emitted audio per stream window (ms) in multiple of 80ms\n", params.chunk_ms);
fprintf(stderr, " -rc N, --right-context-ms N [%-7d] right context per stream window (ms) in multiple of 80ms\n", params.right_context_ms);
fprintf(stderr, "\n");
}

Expand Down Expand Up @@ -129,6 +142,11 @@ int main(int argc, char ** argv) {
ctx_params.use_gpu = params.use_gpu;
ctx_params.gpu_device = params.gpu_device;

struct parakeet_stream_params stream_params = parakeet_stream_default_params();
stream_params.left_context_ms = params.left_context_ms;
stream_params.chunk_ms = params.chunk_ms;
stream_params.right_context_ms = params.right_context_ms;

if (!params.no_prints) {
fprintf(stderr, "Loading Parakeet model from: %s\n", params.model.c_str());
}
Expand Down Expand Up @@ -171,7 +189,9 @@ int main(int argc, char ** argv) {
full_params.new_token_callback_user_data = &is_first;

const int mel_frames = (int)(pcmf32.size() / PARAKEET_HOP_LENGTH);
int ret = parakeet_full(pctx, full_params, pcmf32.data(), pcmf32.size());
const int ret = params.stream
? parakeet_full_stream(pctx, full_params, stream_params, pcmf32.data(), pcmf32.size())
: parakeet_full(pctx, full_params, pcmf32.data(), pcmf32.size());

if (ret != 0) {
fprintf(stderr, "error: failed to process audio file '%s'\n", fname.c_str());
Expand Down
27 changes: 27 additions & 0 deletions include/parakeet.h
Original file line number Diff line number Diff line change
Expand Up @@ -265,12 +265,21 @@ extern "C" {
void * abort_callback_user_data;
};

// Parameters for parakeet_full_stream(). All durations are positive milliseconds.
// Values must be multiples of the encoder frame duration (80 ms).
struct parakeet_stream_params {
// Audio before the chunk that is included in each stream window as left context (ms).
int left_context_ms;
// Audio emitted per stream window (ms); streaming emits only tokens that begin in this span.
int chunk_ms;
// Audio after the chunk that is included in each stream window as right context (ms).
int right_context_ms;
};

// NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see parakeet_free_context_params() & parakeet_free_params()
PARAKEET_API struct parakeet_context_params * parakeet_context_default_params_by_ref(void);
PARAKEET_API struct parakeet_context_params parakeet_context_default_params (void);

PARAKEET_API struct parakeet_full_params * parakeet_full_default_params_by_ref(enum parakeet_sampling_strategy strategy);
PARAKEET_API struct parakeet_full_params parakeet_full_default_params (enum parakeet_sampling_strategy strategy);
PARAKEET_API struct parakeet_stream_params parakeet_stream_default_params (void);

// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
// Not thread safe for same context
Expand All @@ -287,6 +296,24 @@ extern "C" {
const float * samples,
int n_samples);

// Process audio in overlapping [left | chunk | right] windows and emit only the tokens
// that begin in the chunk part of each window.
// The durations in stream_params must be positive multiples of the encoder frame duration (80 ms).
// This uses the existing encoder attention implementation; it does not reproduce NeMo's
// configurable limited-right-context attention.
// Returns 0 on success (callers treat any non-zero value as failure).
//
// NVIDIA NeMo example of Parakeet streaming:
// https://github.com/NVIDIA-NeMo/NeMo/blob/main/examples/asr/asr_chunked_inference/rnnt/speech_to_text_streaming_infer_rnnt.py
// Example of a 10-2-3 window: encoder (full 15s) -> decoder (middle 2s) -> text (middle 2s)
PARAKEET_API int parakeet_full_stream(
struct parakeet_context * ctx,
struct parakeet_full_params params,
struct parakeet_stream_params stream_params,
const float * samples,
int n_samples);

// Variant of parakeet_full_stream() that additionally takes a parakeet_state.
// NOTE(review): presumably this runs the same windowed processing using the caller-supplied
// state instead of the context's own state, and returns 0 on success - confirm against
// the implementation.
PARAKEET_API int parakeet_full_stream_with_state(
struct parakeet_context * ctx,
struct parakeet_state * state,
struct parakeet_full_params params,
struct parakeet_stream_params stream_params,
const float * samples,
int n_samples);

// Process a single chunk of audio data that fits within the model's audio context window.
// This is more efficient than parakeet_full() for short audio clips.
PARAKEET_API int parakeet_chunk(
Expand Down
Loading