Skip to content

Commit 2becf74

Browse files
committed
parakeet : add support for NVIDIA Parakeet
This is a work in progress to support the Parakeet model.
1 parent 95ea8f9 commit 2becf74

15 files changed

Lines changed: 5614 additions & 0 deletions

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,10 @@ target_compile_definitions(whisper PRIVATE
185185
WHISPER_VERSION="${PROJECT_VERSION}"
186186
)
187187

188+
target_compile_definitions(parakeet PRIVATE
189+
PARAKEET_VERSION="${PROJECT_VERSION}"
190+
)
191+
188192
configure_package_config_file(
189193
${CMAKE_CURRENT_SOURCE_DIR}/cmake/whisper-config.cmake.in
190194
${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ else()
107107
add_subdirectory(server)
108108
add_subdirectory(quantize)
109109
add_subdirectory(vad-speech-segments)
110+
add_subdirectory(parakeet-cli)
110111
if (WHISPER_SDL2)
111112
add_subdirectory(stream)
112113
add_subdirectory(command)
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Build and install rules for the parakeet-cli example executable.
set(TARGET parakeet-cli)

add_executable(${TARGET}
    parakeet-cli.cpp
)

# NOTE(review): DefaultTargetOptions presumably configures ${TARGET} - confirm.
include(DefaultTargetOptions)

target_link_libraries(${TARGET}
    PRIVATE
        common
        parakeet
        ${FFMPEG_LIBRARIES}
        ${CMAKE_THREAD_LIBS_INIT}
)

install(TARGETS ${TARGET} RUNTIME)

examples/parakeet-cli/README.md

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
# whisper.cpp/examples/parakeet-cli
2+
3+
This is an example of using the [Parakeet] model in whisper.cpp.
4+
5+
### Download converted model
6+
```console
7+
$ hf download danbev/parakeet parakeet-tdt-0.6b-v3.bin --local-dir models
8+
```
9+
10+
### Building
11+
```console
12+
$ cmake -B build -S .
13+
$ cmake --build build --target parakeet-cli -j 12
14+
```
15+
16+
### Usage
17+
```console
18+
$ ./build/bin/parakeet-cli --help
19+
20+
usage: ./build/bin/parakeet-cli [options] file0 file1 ...
21+
supported audio formats: flac, mp3, ogg, wav
22+
23+
options:
24+
-h, --help [default] show this help message and exit
25+
-t N, --threads N [4 ] number of threads to use during computation
26+
-cl N, --chunk-length N [10000 ] chunk length in milliseconds
27+
-lc N, --left-context N [10000 ] left context in milliseconds
28+
-rc N, --right-context N [4960 ] right context in milliseconds
29+
-m, --model FILE [models/ggml-parakeet-tdt-0.6b-v3.bin] model path
30+
-f, --file FILE [ ] input audio file
31+
-ng, --no-gpu [false ] disable GPU
32+
-dev N, --device N [0 ] GPU device to use
33+
-fa, --flash-attn [true ] enable flash attention
34+
-nfa, --no-flash-attn [false ] disable flash attention
35+
-ps, --print-segments [false ] print segment information
36+
```
37+
38+
### Example
39+
```console
40+
$ ./build/bin/parakeet-cli -m models/parakeet-tdt-0.6b-v3.bin -f samples/jfk.wav
41+
Processing audio (176000 samples, 11.00 seconds)
42+
Processing audio: total_frames=1101, chunk_size=1101
43+
parakeet_decode: starting decode with n_frames=138
44+
And so, my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
45+
```
46+
47+
To also print per-segment and per-token details (written to stderr), pass `--print-segments`:
48+
```console
49+
$ ./build/bin/parakeet-cli -m models/parakeet-tdt-0.6b-v3.bin -f samples/jfk.wav --print-segments
50+
Processing audio (176000 samples, 11.00 seconds)
51+
Processing audio: total_frames=1101, chunk_size=1101
52+
parakeet_decode: starting decode with n_frames=138
53+
And so, my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
54+
55+
Segments (1):
56+
Segment 0: [0 -> 1101] "And so, my fellow Americans, ask not what your country can do for you, ask what you can do for your country."
57+
Tokens [38]:
58+
[ 0] id= 1976 frame= 3 dur_idx= 4 dur_val= 4 p=0.9996 plog=-15.6206 t0= 24 t1= 56 word_start=true "▁And"
59+
[ 1] id= 547 frame= 7 dur_idx= 4 dur_val= 4 p=0.9999 plog=-18.7922 t0= 56 t1= 88 word_start=true "▁so"
60+
[ 2] id= 7877 frame= 11 dur_idx= 2 dur_val= 2 p=0.8451 plog=-14.5929 t0= 88 t1= 88 word_start=false ","
61+
[ 3] id= 1103 frame= 13 dur_idx= 3 dur_val= 3 p=0.9996 plog=-15.6127 t0= 104 t1= 128 word_start=true "▁my"
62+
[ 4] id= 309 frame= 16 dur_idx= 1 dur_val= 1 p=0.9912 plog=-11.9635 t0= 128 t1= 136 word_start=true "▁f"
63+
[ 5] id= 530 frame= 17 dur_idx= 2 dur_val= 2 p=1.0000 plog=-13.5239 t0= 136 t1= 152 word_start=false "ell"
64+
[ 6] id= 596 frame= 19 dur_idx= 3 dur_val= 3 p=1.0000 plog=-16.3120 t0= 152 t1= 176 word_start=false "ow"
65+
[ 7] id= 3213 frame= 22 dur_idx= 4 dur_val= 4 p=0.9999 plog=-10.1462 t0= 176 t1= 208 word_start=true "▁Amer"
66+
[ 8] id= 404 frame= 26 dur_idx= 4 dur_val= 4 p=1.0000 plog=-25.0910 t0= 208 t1= 240 word_start=false "ic"
67+
[ 9] id= 667 frame= 30 dur_idx= 4 dur_val= 4 p=1.0000 plog=-27.1707 t0= 240 t1= 272 word_start=false "ans"
68+
[10] id= 7877 frame= 37 dur_idx= 4 dur_val= 4 p=0.9094 plog=-16.3405 t0= 272 t1= 272 word_start=false ","
69+
[11] id= 279 frame= 41 dur_idx= 4 dur_val= 4 p=0.9980 plog=-19.7244 t0= 328 t1= 360 word_start=true "▁a"
70+
[12] id= 583 frame= 45 dur_idx= 4 dur_val= 4 p=1.0000 plog=-24.5312 t0= 360 t1= 392 word_start=false "sk"
71+
[13] id= 1491 frame= 53 dur_idx= 4 dur_val= 4 p=1.0000 plog=-23.2991 t0= 424 t1= 456 word_start=true "▁not"
72+
[14] id= 3470 frame= 65 dur_idx= 4 dur_val= 4 p=0.9995 plog=-16.7306 t0= 520 t1= 552 word_start=true "▁what"
73+
[15] id= 3629 frame= 69 dur_idx= 2 dur_val= 2 p=0.8139 plog=-11.6486 t0= 552 t1= 568 word_start=true "▁your"
74+
[16] id= 867 frame= 75 dur_idx= 1 dur_val= 1 p=0.9980 plog=-12.5265 t0= 600 t1= 608 word_start=true "▁co"
75+
[17] id= 331 frame= 76 dur_idx= 2 dur_val= 2 p=1.0000 plog=-11.6697 t0= 608 t1= 624 word_start=false "un"
76+
[18] id= 958 frame= 78 dur_idx= 2 dur_val= 2 p=1.0000 plog=-11.3621 t0= 624 t1= 640 word_start=false "tr"
77+
[19] id= 7893 frame= 80 dur_idx= 2 dur_val= 2 p=1.0000 plog=-14.3245 t0= 640 t1= 656 word_start=false "y"
78+
[20] id= 2059 frame= 82 dur_idx= 3 dur_val= 3 p=1.0000 plog=-17.7694 t0= 656 t1= 680 word_start=true "▁can"
79+
[21] id= 458 frame= 85 dur_idx= 4 dur_val= 4 p=1.0000 plog=-23.2510 t0= 680 t1= 712 word_start=true "▁do"
80+
[22] id= 509 frame= 89 dur_idx= 4 dur_val= 4 p=1.0000 plog=-23.0688 t0= 712 t1= 744 word_start=true "▁for"
81+
[23] id= 1180 frame= 93 dur_idx= 4 dur_val= 4 p=0.9999 plog=-25.0567 t0= 744 t1= 776 word_start=true "▁you"
82+
[24] id= 7877 frame= 98 dur_idx= 4 dur_val= 4 p=0.8820 plog=-14.2549 t0= 776 t1= 776 word_start=false ","
83+
[25] id= 279 frame=102 dur_idx= 3 dur_val= 3 p=0.9992 plog=-16.8176 t0= 816 t1= 840 word_start=true "▁a"
84+
[26] id= 583 frame=105 dur_idx= 4 dur_val= 4 p=1.0000 plog=-21.0352 t0= 840 t1= 872 word_start=false "sk"
85+
[27] id= 3470 frame=109 dur_idx= 3 dur_val= 3 p=0.9999 plog=-15.4659 t0= 872 t1= 896 word_start=true "▁what"
86+
[28] id= 1180 frame=112 dur_idx= 4 dur_val= 4 p=0.9997 plog=-17.6392 t0= 896 t1= 928 word_start=true "▁you"
87+
[29] id= 2059 frame=116 dur_idx= 3 dur_val= 3 p=0.9999 plog=-15.5484 t0= 928 t1= 952 word_start=true "▁can"
88+
[30] id= 458 frame=119 dur_idx= 2 dur_val= 2 p=1.0000 plog=-15.9953 t0= 952 t1= 968 word_start=true "▁do"
89+
[31] id= 509 frame=121 dur_idx= 3 dur_val= 3 p=1.0000 plog=-15.9605 t0= 968 t1= 992 word_start=true "▁for"
90+
[32] id= 3629 frame=124 dur_idx= 2 dur_val= 2 p=0.9994 plog=-12.2083 t0= 992 t1=1008 word_start=true "▁your"
91+
[33] id= 867 frame=126 dur_idx= 2 dur_val= 2 p=0.9969 plog=-9.1252 t0=1008 t1=1024 word_start=true "▁co"
92+
[34] id= 331 frame=128 dur_idx= 1 dur_val= 1 p=0.9999 plog=-12.6911 t0=1024 t1=1032 word_start=false "un"
93+
[35] id= 958 frame=129 dur_idx= 1 dur_val= 1 p=1.0000 plog=-8.8885 t0=1032 t1=1040 word_start=false "tr"
94+
[36] id= 7893 frame=130 dur_idx= 2 dur_val= 2 p=1.0000 plog=-14.1441 t0=1040 t1=1056 word_start=false "y"
95+
[37] id= 7883 frame=132 dur_idx= 4 dur_val= 4 p=0.9567 plog=-11.5227 t0=1056 t1=1056 word_start=false "."
96+
```
97+
98+
### Model conversion
99+
Clone the original model from Hugging Face:
100+
```console
101+
$ git clone https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3
102+
```
103+
Convert the model:
104+
```console
105+
(venv) $ python models/convert-parakeet-to-ggml.py \
106+
--model <path to cloned model> \
107+
--use-f32 \
108+
--out-dir models \
109+
--out-name ggml-parakeet-tdt-0.6b-v3.bin
110+
```
111+
112+
[Parakeet]: https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3
Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
#include "parakeet.h"
#include "common-whisper.h"

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <string>
#include <thread>
#include <vector>
9+
10+
// command-line parameters
struct parakeet_params {
    // Worker thread count: at most 4, and never less than 1.
    // std::thread::hardware_concurrency() is allowed to return 0 when the value
    // is not computable, which would otherwise yield a default of 0 threads.
    int32_t n_threads        = std::max(1, std::min(4, (int32_t) std::thread::hardware_concurrency()));
    int32_t chunk_length_ms  = 10000; // chunk length in milliseconds
    int32_t left_context_ms  = 10000; // left context in milliseconds
    int32_t right_context_ms = 4960;  // right context in milliseconds

    bool    use_gpu    = true; // cleared by -ng / --no-gpu
    bool    flash_attn = true; // toggled by -fa / -nfa
    int32_t gpu_device = 0;    // GPU device index (-dev / PARAKEET_ARG_DEVICE)

    bool print_segments = false; // print segment/token details after transcription

    std::string model = "models/ggml-parakeet-tdt-0.6b-v3.bin"; // model path
    std::vector<std::string> fname_inp = {};                     // input audio files
};
26+
27+
static void parakeet_print_usage(int argc, char ** argv, const parakeet_params & params);
28+
29+
// Reports on stderr that option `arg` is missing its value, then terminates the
// process with exit status 1. This function never returns; the `char *` return
// type only allows a call to stand in as one branch of a conditional expression
// whose other branch is an argv entry.
static char * requires_value_error(const std::string & arg) {
    const char * option_name = arg.c_str();

    fprintf(stderr, "error: argument %s requires value\n", option_name);
    exit(1);
}
33+
34+
static bool parakeet_params_parse(int argc, char ** argv, parakeet_params & params) {
35+
if (const char * env_device = std::getenv("PARAKEET_ARG_DEVICE")) {
36+
params.gpu_device = std::stoi(env_device);
37+
}
38+
39+
for (int i = 1; i < argc; i++) {
40+
std::string arg = argv[i];
41+
42+
if (arg == "-"){
43+
params.fname_inp.push_back(arg);
44+
continue;
45+
}
46+
47+
if (arg[0] != '-') {
48+
params.fname_inp.push_back(arg);
49+
continue;
50+
}
51+
52+
if (arg == "-h" || arg == "--help") {
53+
parakeet_print_usage(argc, argv, params);
54+
exit(0);
55+
}
56+
#define ARGV_NEXT (((i + 1) < argc) ? argv[++i] : requires_value_error(arg))
57+
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(ARGV_NEXT); }
58+
else if (arg == "-cl" || arg == "--chunk-length") { params.chunk_length_ms = std::stoi(ARGV_NEXT); }
59+
else if (arg == "-lc" || arg == "--left-context") { params.left_context_ms = std::stoi(ARGV_NEXT); }
60+
else if (arg == "-rc" || arg == "--right-context") { params.right_context_ms = std::stoi(ARGV_NEXT); }
61+
else if (arg == "-m" || arg == "--model") { params.model = ARGV_NEXT; }
62+
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(ARGV_NEXT); }
63+
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
64+
else if (arg == "-dev" || arg == "--device") { params.gpu_device = std::stoi(ARGV_NEXT); }
65+
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
66+
else if (arg == "-nfa" || arg == "--no-flash-attn") { params.flash_attn = false; }
67+
else if (arg == "-ps" || arg == "--print-segments") { params.print_segments = true; }
68+
else {
69+
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
70+
parakeet_print_usage(argc, argv, params);
71+
exit(1);
72+
}
73+
}
74+
75+
return true;
76+
}
77+
78+
// Writes the help text to stderr. The bracketed column shows the values
// currently stored in `params`; argv[0] is used as the program name.
static void parakeet_print_usage(int /*argc*/, char ** argv, const parakeet_params & params) {
    FILE * out = stderr;

    // renders a flag the same way throughout the table
    const auto as_text = [](bool flag) { return flag ? "true" : "false"; };

    fprintf(out, "\n");
    fprintf(out, "usage: %s [options] file0 file1 ...\n", argv[0]);
    fprintf(out, "supported audio formats: flac, mp3, ogg, wav\n");
    fprintf(out, "\n");
    fprintf(out, "options:\n");
    fprintf(out, " -h, --help [default] show this help message and exit\n");
    fprintf(out, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
    fprintf(out, " -cl N, --chunk-length N [%-7d] chunk length in milliseconds\n", params.chunk_length_ms);
    fprintf(out, " -lc N, --left-context N [%-7d] left context in milliseconds\n", params.left_context_ms);
    fprintf(out, " -rc N, --right-context N [%-7d] right context in milliseconds\n", params.right_context_ms);
    fprintf(out, " -m, --model FILE [%-7s] model path\n", params.model.c_str());
    fprintf(out, " -f, --file FILE [%-7s] input audio file\n", "");
    // the "disable" switches show the negation of the stored setting
    fprintf(out, " -ng, --no-gpu [%-7s] disable GPU\n", as_text(!params.use_gpu));
    fprintf(out, " -dev N, --device N [%-7d] GPU device to use\n", params.gpu_device);
    fprintf(out, " -fa, --flash-attn [%-7s] enable flash attention\n", as_text(params.flash_attn));
    fprintf(out, " -nfa, --no-flash-attn [%-7s] disable flash attention\n", as_text(!params.flash_attn));
    fprintf(out, " -ps, --print-segments [%-7s] print segment information\n", as_text(params.print_segments));
    fprintf(out, "\n");
}
98+
99+
void token_callback(parakeet_context * ctx, parakeet_state * state, const parakeet_token_data * token_data, void * user_data) {
100+
static bool is_first = true;
101+
102+
const char * token_str = parakeet_token_to_str(ctx, token_data->id);
103+
char text_buf[256];
104+
parakeet_token_to_text(token_str, is_first, text_buf, sizeof(text_buf));
105+
printf("%s", text_buf);
106+
fflush(stdout);
107+
108+
is_first = false;
109+
}
110+
111+
112+
int main(int argc, char ** argv) {
113+
parakeet_params params;
114+
115+
if (parakeet_params_parse(argc, argv, params) == false) {
116+
return 1;
117+
}
118+
119+
if (params.fname_inp.empty()) {
120+
fprintf(stderr, "error: no input files specified\n");
121+
parakeet_print_usage(argc, argv, params);
122+
return 1;
123+
}
124+
125+
// Process each input file
126+
for (const auto & fname : params.fname_inp) {
127+
fprintf(stderr, "\nProcessing file: %s\n", fname.c_str());
128+
129+
std::vector<float> pcmf32;
130+
std::vector<std::vector<float>> pcmf32s;
131+
if (!read_audio_data(fname.c_str(), pcmf32, pcmf32s, false)) {
132+
fprintf(stderr, "error: failed to read audio file '%s'\n", fname.c_str());
133+
continue;
134+
}
135+
136+
if (pcmf32.empty()) {
137+
fprintf(stderr, "error: no audio data in file '%s'\n", fname.c_str());
138+
continue;
139+
}
140+
141+
fprintf(stderr, "Loading Parakeet model from: %s\n", params.model.c_str());
142+
143+
struct parakeet_context_params ctx_params = parakeet_context_default_params();
144+
ctx_params.use_gpu = params.use_gpu;
145+
ctx_params.flash_attn = params.flash_attn;
146+
ctx_params.gpu_device = params.gpu_device;
147+
148+
struct parakeet_context * pctx = parakeet_init_from_file_with_params_no_state(params.model.c_str(), ctx_params);
149+
if (pctx == nullptr) {
150+
fprintf(stderr, "error: failed to load Parakeet model from '%s'\n", params.model.c_str());
151+
return 1;
152+
}
153+
struct parakeet_state * state = parakeet_init_state(pctx);
154+
if (state == nullptr) {
155+
fprintf(stderr, "error: failed to initialize parakeet state\n");
156+
parakeet_free(pctx);
157+
return 2;
158+
}
159+
160+
fprintf(stderr, "Successfully loaded Parakeet model\n");
161+
fprintf(stderr, "Processing audio (%zu samples, %.2f seconds)\n",
162+
pcmf32.size(), (float)pcmf32.size() / PARAKEET_SAMPLE_RATE);
163+
164+
struct parakeet_full_params full_params = parakeet_full_default_params(PARAKEET_SAMPLING_GREEDY);
165+
full_params.n_threads = params.n_threads;
166+
full_params.chunk_length_ms = params.chunk_length_ms;
167+
full_params.left_context_ms = params.left_context_ms;
168+
full_params.right_context_ms = params.right_context_ms;
169+
full_params.new_token_callback = token_callback;
170+
full_params.new_token_callback_user_data = nullptr;
171+
172+
const int mel_frames = pcmf32.size() / PARAKEET_HOP_LENGTH;
173+
const int model_max_ctx = parakeet_n_audio_ctx(pctx);
174+
const bool fits_single_chunk = mel_frames <= model_max_ctx;
175+
176+
int ret;
177+
if (fits_single_chunk) {
178+
ret = parakeet_chunk(pctx, state, full_params, pcmf32.data(), pcmf32.size());
179+
} else {
180+
ret = parakeet_full_with_state(pctx, state, full_params, pcmf32.data(), pcmf32.size());
181+
}
182+
183+
if (ret != 0) {
184+
fprintf(stderr, "error: failed to process audio file '%s'\n", fname.c_str());
185+
parakeet_free_state(state);
186+
parakeet_free(pctx);
187+
continue;
188+
}
189+
190+
printf("\n");
191+
192+
if (params.print_segments) {
193+
const int n_segments = parakeet_full_n_segments_from_state(state);
194+
fprintf(stderr, "\nSegments (%d):\n", n_segments);
195+
196+
for (int i = 0; i < n_segments; i++) {
197+
const char * text = parakeet_full_get_segment_text_from_state(state, i);
198+
const int64_t t0 = parakeet_full_get_segment_t0_from_state(state, i);
199+
const int64_t t1 = parakeet_full_get_segment_t1_from_state(state, i);
200+
const int n_tokens = parakeet_full_n_tokens_from_state(state, i);
201+
202+
fprintf(stderr, "Segment %d: [%lld -> %lld] \"%s\"\n", i, (long long)t0, (long long)t1, text);
203+
fprintf(stderr, "Tokens [%d]:\n", n_tokens);
204+
205+
for (int j = 0; j < n_tokens; j++) {
206+
parakeet_token_data token_data = parakeet_full_get_token_data_from_state(state, i, j);
207+
const char * token_str = parakeet_token_to_str(pctx, token_data.id);
208+
209+
fprintf(stderr, " [%2d] id=%5d frame=%3d dur_idx=%2d dur_val=%2d p=%.4f plog=%.4f t0=%4lld t1=%4lld word_start=%s \"%s\"\n",
210+
j,
211+
token_data.id,
212+
token_data.frame_index,
213+
token_data.duration_idx,
214+
token_data.duration_value,
215+
token_data.p,
216+
token_data.plog,
217+
(long long)token_data.t0,
218+
(long long)token_data.t1,
219+
token_data.is_word_start ? "true": "false",
220+
token_str);
221+
}
222+
}
223+
}
224+
225+
parakeet_free_state(state);
226+
parakeet_free(pctx);
227+
}
228+
229+
return 0;
230+
}

0 commit comments

Comments
 (0)