File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -671,6 +671,17 @@ extern "C" {
671671 WHISPER_API float whisper_full_get_token_p (struct whisper_context * ctx , int i_segment , int i_token );
672672 WHISPER_API float whisper_full_get_token_p_from_state (struct whisper_state * state , int i_segment , int i_token );
673673
674+ // Access the speech segments detected by the internal VAD (only when params.vad = true).
675+ // Times are on the original audio timeline, in centiseconds. Callers can reuse these
676+ // boundaries instead of running a separate VAD pass. The count is 0 when VAD was not
677+ // used. Accessors do not range-check i: keep 0 <= i < whisper_full_n_vad_segments().
678+ WHISPER_API int whisper_full_n_vad_segments (struct whisper_context * ctx );
679+ WHISPER_API int whisper_full_n_vad_segments_from_state (struct whisper_state * state );
680+ WHISPER_API int64_t whisper_full_get_vad_segment_t0 (struct whisper_context * ctx , int i );
681+ WHISPER_API int64_t whisper_full_get_vad_segment_t0_from_state (struct whisper_state * state , int i );
682+ WHISPER_API int64_t whisper_full_get_vad_segment_t1 (struct whisper_context * ctx , int i );
683+ WHISPER_API int64_t whisper_full_get_vad_segment_t1_from_state (struct whisper_state * state , int i );
684+
674685 //
675686 // Voice Activity Detection (VAD)
676687 //
Original file line number Diff line number Diff line change @@ -8075,6 +8075,30 @@ struct whisper_token_data whisper_full_get_token_data(struct whisper_context * c
80758075 return ctx->state ->result_all [i_segment].tokens [i_token];
80768076}
80778077
8078+ int whisper_full_n_vad_segments_from_state (struct whisper_state * state) {
8079+ return (int ) state->vad_segments .size ();
8080+ }
8081+
8082+ int whisper_full_n_vad_segments (struct whisper_context * ctx) {
8083+ return (int ) ctx->state ->vad_segments .size ();
8084+ }
8085+
8086+ int64_t whisper_full_get_vad_segment_t0_from_state (struct whisper_state * state, int i) {
8087+ return state->vad_segments [i].orig_start ;
8088+ }
8089+
8090+ int64_t whisper_full_get_vad_segment_t0 (struct whisper_context * ctx, int i) {
8091+ return ctx->state ->vad_segments [i].orig_start ;
8092+ }
8093+
8094+ int64_t whisper_full_get_vad_segment_t1_from_state (struct whisper_state * state, int i) {
8095+ return state->vad_segments [i].orig_end ;
8096+ }
8097+
8098+ int64_t whisper_full_get_vad_segment_t1 (struct whisper_context * ctx, int i) {
8099+ return ctx->state ->vad_segments [i].orig_end ;
8100+ }
8101+
80788102float whisper_full_get_token_p_from_state (struct whisper_state * state, int i_segment, int i_token) {
80798103 return state->result_all [i_segment].tokens [i_token].p ;
80808104}
Original file line number Diff line number Diff line change @@ -50,6 +50,18 @@ int main() {
5050 assert (whisper_full_get_segment_t0 (wctx, 0 ) == 32 );
5151 assert (whisper_full_get_segment_t1 (wctx, 0 ) == 1051 );
5252
53+ // internal VAD speech segments, on the original audio timeline (centiseconds)
54+ const int n_vad = whisper_full_n_vad_segments (wctx);
55+ assert (n_vad > 0 );
56+ int64_t vad_prev_end = -1 ;
57+ for (int i = 0 ; i < n_vad; ++i) {
58+ const int64_t t0 = whisper_full_get_vad_segment_t0 (wctx, i);
59+ const int64_t t1 = whisper_full_get_vad_segment_t1 (wctx, i);
60+ assert (t1 > t0);
61+ assert (t0 >= vad_prev_end); // segments are ordered and non-overlapping
62+ vad_prev_end = t1;
63+ }
64+
5365 whisper_free (wctx);
5466
5567 return 0 ;
You can’t perform that action at this time.
0 commit comments