Skip to content

Commit 99d2d51

Browse files
committed
whisper : expose internal VAD speech segments
When transcribing with params.vad = true, whisper already computes the speech segments and keeps them in the state. Expose them so callers can reuse those boundaries (for example, to align or clip subtitles to speech) instead of running a second, separate VAD pass. Times are on the original audio timeline, in centiseconds; the count is 0 when VAD was not used. test-vad-full.cpp checks that at least one segment is reported, that every segment has t1 > t0, and that the segments are ordered and do not overlap.
1 parent df7638d commit 99d2d51

3 files changed

Lines changed: 47 additions & 0 deletions

File tree

include/whisper.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -671,6 +671,17 @@ extern "C" {
671671
WHISPER_API float whisper_full_get_token_p (struct whisper_context * ctx, int i_segment, int i_token);
672672
WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
673673

674+
// Access the speech segments detected by the internal VAD (populated only when params.vad = true).
675+
// Times are on the original audio timeline, in centiseconds. The count is 0 when VAD was not used.
676+
// The index i must satisfy 0 <= i < whisper_full_n_vad_segments() (or the _from_state count).
677+
// These accessors let callers reuse whisper's own speech boundaries instead of running a separate VAD pass.
678+
WHISPER_API int whisper_full_n_vad_segments (struct whisper_context * ctx);
679+
WHISPER_API int whisper_full_n_vad_segments_from_state (struct whisper_state * state);
680+
WHISPER_API int64_t whisper_full_get_vad_segment_t0 (struct whisper_context * ctx, int i);
681+
WHISPER_API int64_t whisper_full_get_vad_segment_t0_from_state(struct whisper_state * state, int i);
682+
WHISPER_API int64_t whisper_full_get_vad_segment_t1 (struct whisper_context * ctx, int i);
683+
WHISPER_API int64_t whisper_full_get_vad_segment_t1_from_state(struct whisper_state * state, int i);
684+
674685
//
675686
// Voice Activity Detection (VAD)
676687
//

src/whisper.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8075,6 +8075,30 @@ struct whisper_token_data whisper_full_get_token_data(struct whisper_context * c
80758075
return ctx->state->result_all[i_segment].tokens[i_token];
80768076
}
80778077

8078+
int whisper_full_n_vad_segments_from_state(struct whisper_state * state) {
8079+
return (int) state->vad_segments.size();
8080+
}
8081+
8082+
int whisper_full_n_vad_segments(struct whisper_context * ctx) {
8083+
return (int) ctx->state->vad_segments.size();
8084+
}
8085+
8086+
int64_t whisper_full_get_vad_segment_t0_from_state(struct whisper_state * state, int i) {
8087+
return state->vad_segments[i].orig_start;
8088+
}
8089+
8090+
int64_t whisper_full_get_vad_segment_t0(struct whisper_context * ctx, int i) {
8091+
return ctx->state->vad_segments[i].orig_start;
8092+
}
8093+
8094+
int64_t whisper_full_get_vad_segment_t1_from_state(struct whisper_state * state, int i) {
8095+
return state->vad_segments[i].orig_end;
8096+
}
8097+
8098+
int64_t whisper_full_get_vad_segment_t1(struct whisper_context * ctx, int i) {
8099+
return ctx->state->vad_segments[i].orig_end;
8100+
}
8101+
80788102
float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token) {
80798103
return state->result_all[i_segment].tokens[i_token].p;
80808104
}

tests/test-vad-full.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,18 @@ int main() {
5050
assert(whisper_full_get_segment_t0(wctx, 0) == 32);
5151
assert(whisper_full_get_segment_t1(wctx, 0) == 1051);
5252

53+
// Internal VAD speech segments, reported on the original audio timeline (centiseconds).
// NOTE(review): these checks rely on assert(), which is compiled out when NDEBUG is defined — confirm the test target builds with assertions enabled.
54+
const int n_vad = whisper_full_n_vad_segments(wctx);
55+
assert(n_vad > 0); // VAD was enabled for this run, so at least one speech segment is expected
56+
int64_t vad_prev_end = -1; // sentinel: the ordering check passes for the first segment whenever its t0 >= 0
57+
for (int i = 0; i < n_vad; ++i) {
58+
const int64_t t0 = whisper_full_get_vad_segment_t0(wctx, i);
59+
const int64_t t1 = whisper_full_get_vad_segment_t1(wctx, i);
60+
assert(t1 > t0); // every segment has a strictly positive duration
61+
assert(t0 >= vad_prev_end); // segments are ordered and do not overlap (touching end/start is allowed)
62+
vad_prev_end = t1;
63+
}
64+
5365
whisper_free(wctx);
5466

5567
return 0;

0 commit comments

Comments
 (0)