Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions include/whisper.h
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,16 @@ extern "C" {
WHISPER_API whisper_token_data whisper_full_get_token_data (struct whisper_context * ctx, int i_segment, int i_token);
WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token);

// Get the start/end time of the specified token, in centiseconds. When VAD is enabled
// these are mapped back to the original audio timeline (a token landing in a removed
// inter-segment silence snaps to the nearest speech boundary), unlike
// whisper_full_get_token_data().t0/t1 which stay in VAD-processed time. Without VAD the
// raw token times are returned unchanged.
//
// Additional notes:
//  - With VAD active, the returned t1 is clamped so that it is at least (mapped t0) + 1,
//    i.e. the reported duration is always strictly positive.
//  - The whisper_context variants forward to the *_from_state variants using ctx->state.
//  - The *_from_state variants index the stored results directly with i_segment and
//    i_token and perform no range checks; callers must pass valid indices.
WHISPER_API int64_t whisper_full_get_token_t0 (struct whisper_context * ctx, int i_segment, int i_token);
WHISPER_API int64_t whisper_full_get_token_t0_from_state(struct whisper_state * state, int i_segment, int i_token);
WHISPER_API int64_t whisper_full_get_token_t1 (struct whisper_context * ctx, int i_segment, int i_token);
WHISPER_API int64_t whisper_full_get_token_t1_from_state(struct whisper_state * state, int i_segment, int i_token);

// Get the probability of the specified token in the specified segment
// The _from_state variant reads the stored token entry directly using i_segment and
// i_token as indices, without range checks; callers must pass valid indices.
WHISPER_API float whisper_full_get_token_p (struct whisper_context * ctx, int i_segment, int i_token);
WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
Expand Down
64 changes: 64 additions & 0 deletions src/whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8075,6 +8075,70 @@ struct whisper_token_data whisper_full_get_token_data(struct whisper_context * c
return ctx->state->result_all[i_segment].tokens[i_token];
}

// translate a token timestamp `t` (centiseconds, VAD-processed timeline) into the original
// audio timeline using the per-segment mapping table `segs`. the mapping is done segment by
// segment rather than by one global interpolation, so a result never lands in the middle of
// a silence that VAD cut out:
//   - empty table                         -> t is returned unchanged
//   - t at or before the first vad_start  -> first segment's orig_start
//   - t at or after the last vad_end      -> last segment's orig_end
//   - t inside a segment                  -> linear interpolation within that segment
//                                            (integer arithmetic, truncating division);
//                                            a segment with non-positive VAD length
//                                            collapses to its orig_start
//   - t strictly between two neighbours   -> snaps to the earlier segment's orig_end when
//                                            t is at or before the gap midpoint, otherwise
//                                            to the later segment's orig_start
//   - no segment or gap matched           -> t is returned unchanged
static int64_t whisper_map_token_time_segment_aware(
        int64_t t,
        const std::vector<whisper_state::vad_segment_info> & segs) {
    if (segs.empty()) {
        return t;
    }

    const auto & first = segs.front();
    const auto & last  = segs.back();

    // clamp to the outer boundaries of the speech that was kept
    if (t <= first.vad_start) {
        return first.orig_start;
    }
    if (t >= last.vad_end) {
        return last.orig_end;
    }

    const size_t n_segs = segs.size();

    for (size_t idx = 0; idx < n_segs; ++idx) {
        const auto & cur = segs[idx];

        // inside the current speech segment: scale the offset to the original duration
        if (t >= cur.vad_start && t <= cur.vad_end) {
            const int64_t vad_len = cur.vad_end - cur.vad_start;
            if (vad_len <= 0) {
                return cur.orig_start;
            }
            const int64_t orig_len = cur.orig_end - cur.orig_start;
            return cur.orig_start + (t - cur.vad_start) * orig_len / vad_len;
        }

        // the last segment has no following gap to inspect
        if (idx + 1 >= n_segs) {
            continue;
        }

        // inside the gap to the next segment: snap to whichever boundary is nearer
        const auto & next = segs[idx + 1];
        if (t > cur.vad_end && t < next.vad_start) {
            const int64_t gap_mid = (cur.vad_end + next.vad_start) / 2;
            if (t <= gap_mid) {
                return cur.orig_end;
            }
            return next.orig_start;
        }
    }

    return t;
}

// start time of token `i_token` in segment `i_segment`. when the state carries a non-empty
// VAD segment table the stored t0 is mapped through whisper_map_token_time_segment_aware();
// otherwise the stored value is returned as is. indices are not range-checked.
// NOTE(review): with VAD active, any stored t0 at or before the first segment's vad_start
// (negative values included) maps to that segment's orig_start - if a negative value is
// used elsewhere to mean "no timestamp", it is not preserved here; confirm with callers.
int64_t whisper_full_get_token_t0_from_state(struct whisper_state * state, int i_segment, int i_token) {
    const int64_t raw_t0 = state->result_all[i_segment].tokens[i_token].t0;

    const bool vad_active = state->has_vad_segments && !state->vad_segments.empty();
    if (vad_active) {
        return whisper_map_token_time_segment_aware(raw_t0, state->vad_segments);
    }

    return raw_t0;
}

// context-based variant: forwards to whisper_full_get_token_t0_from_state() with ctx->state
int64_t whisper_full_get_token_t0(struct whisper_context * ctx, int i_segment, int i_token) {
    struct whisper_state * state = ctx->state;
    return whisper_full_get_token_t0_from_state(state, i_segment, i_token);
}

// end time of token `i_token` in segment `i_segment`. without a non-empty VAD segment table
// the stored t1 is returned as is. otherwise t1 is mapped through
// whisper_map_token_time_segment_aware() and then raised, if necessary, to one unit past the
// mapped start time so the token keeps a strictly positive duration even when both ends
// snapped to the same boundary. indices are not range-checked.
int64_t whisper_full_get_token_t1_from_state(struct whisper_state * state, int i_segment, int i_token) {
    const int64_t raw_t1 = state->result_all[i_segment].tokens[i_token].t1;

    const bool vad_active = state->has_vad_segments && !state->vad_segments.empty();
    if (!vad_active) {
        return raw_t1;
    }

    const int64_t mapped_t0 = whisper_full_get_token_t0_from_state(state, i_segment, i_token);
    const int64_t mapped_t1 = whisper_map_token_time_segment_aware(raw_t1, state->vad_segments);

    // smallest end time that still yields a duration of at least one unit
    const int64_t min_t1 = mapped_t0 + 1;

    return mapped_t1 < min_t1 ? min_t1 : mapped_t1;
}

// context-based variant: forwards to whisper_full_get_token_t1_from_state() with ctx->state
int64_t whisper_full_get_token_t1(struct whisper_context * ctx, int i_segment, int i_token) {
    struct whisper_state * state = ctx->state;
    return whisper_full_get_token_t1_from_state(state, i_segment, i_token);
}

// probability stored for token `i_token` of segment `i_segment` in the given state.
// both indices are used directly, without range checks.
float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token) {
    const auto & segment = state->result_all[i_segment];
    const auto & token   = segment.tokens[i_token];
    return token.p;
}
Expand Down
12 changes: 12 additions & 0 deletions tests/test-vad-full.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,18 @@ int main() {
assert(whisper_full_get_segment_t0(wctx, 0) == 32);
assert(whisper_full_get_segment_t1(wctx, 0) == 1051);

// token times mapped back to the original timeline: ordered, non-negative duration
const int n_tokens = whisper_full_n_tokens(wctx, 0);
assert(n_tokens > 0);
int64_t prev_t0 = -1;
for (int j = 0; j < n_tokens; ++j) {
const int64_t t0 = whisper_full_get_token_t0(wctx, 0, j);
const int64_t t1 = whisper_full_get_token_t1(wctx, 0, j);
assert(t0 >= 0 && t1 >= t0);
assert(t0 >= prev_t0);
prev_t0 = t0;
}

whisper_free(wctx);

return 0;
Expand Down
Loading