Skip to content

Commit 82c8ab7

Browse files
author
nemo
committed
Implement whisper_token_to_bytes
Not every token seems to be valid Unicode, but every token is interpreted as such in `pw.whisper_token_to_str`. While this can be caught with an exception handler, it might be worthwhile to have a way of getting the raw token bytes instead and decoding them explicitly with `.decode` (or the equivalent `str` constructor), e.g.: `str(pw.whisper_token_to_bytes(ctx, tid), 'utf8', 'ignore')`
1 parent 4ab9616 commit 82c8ab7

1 file changed

Lines changed: 7 additions & 0 deletions

File tree

src/main.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,12 @@ const char * whisper_token_to_str_wrapper(struct whisper_context_wrapper * ctx_w
133133
return whisper_token_to_str(ctx_w->ptr, token);
134134
};
135135

136+
// Returns the text of `token` as a Python `bytes` object rather than a `str`.
// Per the commit message, not every token is valid Unicode, so the caller
// can choose how to decode the bytes.
//
// ctx_w: wrapper whose `ptr` member is passed to whisper_token_to_str.
// token: id of the token to look up.
py::bytes whisper_token_to_bytes_wrapper(struct whisper_context_wrapper * ctx_w, whisper_token token){
137+
const char* str = whisper_token_to_str(ctx_w->ptr, token);
138+
// strlen stops at the first NUL byte, so the result holds the bytes up to
// (not including) that terminator.
size_t l = strlen(str);
139+
// Build the bytes object from the pointer and explicit length.
return py::bytes(str, l);
140+
}
141+
136142
// Forwards to whisper_token_eot using the raw context pointer held in
// `ctx_w->ptr` and returns its result unchanged.
whisper_token whisper_token_eot_wrapper(struct whisper_context_wrapper * ctx_w){
137143
return whisper_token_eot(ctx_w->ptr);
138144
}
@@ -488,6 +494,7 @@ PYBIND11_MODULE(_pywhispercpp, m) {
488494

489495

490496
m.def("whisper_token_to_str", &whisper_token_to_str_wrapper, "whisper_token_to_str");
497+
m.def("whisper_token_to_bytes", &whisper_token_to_bytes_wrapper, "whisper_token_to_bytes");
491498
m.def("whisper_token_eot", &whisper_token_eot_wrapper, "whisper_token_eot");
492499
m.def("whisper_token_sot", &whisper_token_sot_wrapper, "whisper_token_sot");
493500
m.def("whisper_token_prev", &whisper_token_prev_wrapper);

0 commit comments

Comments
 (0)