Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions include/whisper.h
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,17 @@ extern "C" {
WHISPER_API float whisper_full_get_token_p (struct whisper_context * ctx, int i_segment, int i_token);
WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);

// Access the speech segments detected by the internal VAD (only when params.vad = true).
// This lets callers reuse whisper's own speech boundaries instead of running a separate
// VAD pass.
//
//   whisper_full_n_vad_segments*     - number of VAD segments; 0 when VAD was not used
//   whisper_full_get_vad_segment_t0* - start time of VAD segment i
//   whisper_full_get_vad_segment_t1* - end time of VAD segment i
//
// Times are on the original audio timeline, in centiseconds.
// i is a VAD segment index and must satisfy 0 <= i < whisper_full_n_vad_segments*().
WHISPER_API int whisper_full_n_vad_segments (struct whisper_context * ctx);
WHISPER_API int whisper_full_n_vad_segments_from_state (struct whisper_state * state);
WHISPER_API int64_t whisper_full_get_vad_segment_t0 (struct whisper_context * ctx, int i);
WHISPER_API int64_t whisper_full_get_vad_segment_t0_from_state(struct whisper_state * state, int i);
WHISPER_API int64_t whisper_full_get_vad_segment_t1 (struct whisper_context * ctx, int i);
WHISPER_API int64_t whisper_full_get_vad_segment_t1_from_state(struct whisper_state * state, int i);

//
// Voice Activity Detection (VAD)
//
Expand Down
24 changes: 24 additions & 0 deletions src/whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8139,6 +8139,30 @@ int64_t whisper_full_get_token_t1(struct whisper_context * ctx, int i_segment, i
return whisper_full_get_token_t1_from_state(ctx->state, i_segment, i_token);
}

// Number of entries currently held in `state->vad_segments`, narrowed to int.
int whisper_full_n_vad_segments_from_state(struct whisper_state * state) {
    const auto & segments = state->vad_segments;
    const int n_segments = (int) segments.size();
    return n_segments;
}

// Number of entries currently held in the VAD segment list of the context's
// own state (`ctx->state`), narrowed to int.
int whisper_full_n_vad_segments(struct whisper_context * ctx) {
    const auto & segments = ctx->state->vad_segments;
    const int n_segments = (int) segments.size();
    return n_segments;
}

// Start time (`orig_start`) of VAD segment `i` stored in `state`.
//
// Returns -1 when `i` is negative or not smaller than the number of stored VAD
// segments. Without this guard an invalid index would read past the vector,
// which is undefined behavior.
int64_t whisper_full_get_vad_segment_t0_from_state(struct whisper_state * state, int i) {
    if (i < 0 || i >= (int) state->vad_segments.size()) {
        return -1;
    }
    return state->vad_segments[i].orig_start;
}

// Start time (`orig_start`) of VAD segment `i` stored in the context's own
// state (`ctx->state`).
//
// Returns -1 when `i` is negative or not smaller than the number of stored VAD
// segments. Without this guard an invalid index would read past the vector,
// which is undefined behavior.
int64_t whisper_full_get_vad_segment_t0(struct whisper_context * ctx, int i) {
    if (i < 0 || i >= (int) ctx->state->vad_segments.size()) {
        return -1;
    }
    return ctx->state->vad_segments[i].orig_start;
}

// End time (`orig_end`) of VAD segment `i` stored in `state`.
//
// Returns -1 when `i` is negative or not smaller than the number of stored VAD
// segments. Without this guard an invalid index would read past the vector,
// which is undefined behavior.
int64_t whisper_full_get_vad_segment_t1_from_state(struct whisper_state * state, int i) {
    if (i < 0 || i >= (int) state->vad_segments.size()) {
        return -1;
    }
    return state->vad_segments[i].orig_end;
}

// End time (`orig_end`) of VAD segment `i` stored in the context's own state
// (`ctx->state`).
//
// Returns -1 when `i` is negative or not smaller than the number of stored VAD
// segments. Without this guard an invalid index would read past the vector,
// which is undefined behavior.
int64_t whisper_full_get_vad_segment_t1(struct whisper_context * ctx, int i) {
    if (i < 0 || i >= (int) ctx->state->vad_segments.size()) {
        return -1;
    }
    return ctx->state->vad_segments[i].orig_end;
}

// Looks up token `i_token` of result segment `i_segment` in `state->result_all`
// and returns its `p` field. Neither index is range-checked.
float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token) {
    const auto & segment = state->result_all[i_segment];
    const auto & token   = segment.tokens[i_token];
    return token.p;
}
Expand Down
12 changes: 12 additions & 0 deletions tests/test-vad-full.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,18 @@ int main() {
prev_t0 = t0;
}

// internal VAD speech segments, on the original audio timeline (centiseconds)
const int n_vad = whisper_full_n_vad_segments(wctx);
assert(n_vad > 0);
int64_t vad_prev_end = -1;
for (int i = 0; i < n_vad; ++i) {
const int64_t t0 = whisper_full_get_vad_segment_t0(wctx, i);
const int64_t t1 = whisper_full_get_vad_segment_t1(wctx, i);
assert(t1 > t0);
assert(t0 >= vad_prev_end); // segments are ordered and non-overlapping
vad_prev_end = t1;
}

whisper_free(wctx);

return 0;
Expand Down
Loading