ggml-org · danbev · Jul 1, 2026 · Jun 27, 2026
diff --git a/include/whisper.h b/include/whisper.h
@@ -681,6 +681,17 @@ extern "C" {
     WHISPER_API float whisper_full_get_token_p           (struct whisper_context * ctx, int i_segment, int i_token);
     WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
 
+    // Speech segments detected by the internal VAD (only available when params.vad = true), exposed
+    // so callers can reuse whisper's own speech boundaries instead of running a separate VAD pass.
+    // t0/t1 are a segment's start/end on the original audio timeline, in centiseconds.
+    // The count is 0 when VAD was not used. The index i is not range-checked: pass 0 <= i < count.
+    WHISPER_API int     whisper_full_n_vad_segments               (struct whisper_context * ctx);
+    WHISPER_API int     whisper_full_n_vad_segments_from_state    (struct whisper_state * state);
+    WHISPER_API int64_t whisper_full_get_vad_segment_t0           (struct whisper_context * ctx, int i);
+    WHISPER_API int64_t whisper_full_get_vad_segment_t0_from_state(struct whisper_state * state, int i);
+    WHISPER_API int64_t whisper_full_get_vad_segment_t1           (struct whisper_context * ctx, int i);
+    WHISPER_API int64_t whisper_full_get_vad_segment_t1_from_state(struct whisper_state * state, int i);
+
     //
     // Voice Activity Detection (VAD)
     //

diff --git a/src/whisper.cpp b/src/whisper.cpp
@@ -8139,6 +8139,30 @@ int64_t whisper_full_get_token_t1(struct whisper_context * ctx, int i_segment, i
     return whisper_full_get_token_t1_from_state(ctx->state, i_segment, i_token);
 }
 
+int whisper_full_n_vad_segments_from_state(struct whisper_state * state) {
+    return (int) state->vad_segments.size(); // number of stored VAD segments, narrowed to the int return type
+}
+
+int whisper_full_n_vad_segments(struct whisper_context * ctx) {
+    return whisper_full_n_vad_segments_from_state(ctx->state); // forward to the state variant, like whisper_full_get_token_t1
+}
+
+int64_t whisper_full_get_vad_segment_t0_from_state(struct whisper_state * state, int i) {
+    return state->vad_segments[i].orig_start; // segment start; i is not range-checked, caller must keep it below the segment count
+}
+
+int64_t whisper_full_get_vad_segment_t0(struct whisper_context * ctx, int i) {
+    return whisper_full_get_vad_segment_t0_from_state(ctx->state, i); // forward to the state variant; i is not range-checked
+}
+
+int64_t whisper_full_get_vad_segment_t1_from_state(struct whisper_state * state, int i) {
+    return state->vad_segments[i].orig_end; // segment end; i is not range-checked, caller must keep it below the segment count
+}
+
+int64_t whisper_full_get_vad_segment_t1(struct whisper_context * ctx, int i) {
+    return whisper_full_get_vad_segment_t1_from_state(ctx->state, i); // forward to the state variant; i is not range-checked
+}
+
 float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token) {
     return state->result_all[i_segment].tokens[i_token].p;
 }

diff --git a/tests/test-vad-full.cpp b/tests/test-vad-full.cpp
@@ -62,6 +62,18 @@ int main() {
         prev_t0 = t0;
     }
 
+    // internal VAD speech segments, on the original audio timeline (centiseconds)
+    const int n_vad = whisper_full_n_vad_segments(wctx);
+    assert(n_vad > 0);
+    int64_t vad_prev_end = -1;
+    for (int i = 0; i < n_vad; ++i) {
+        const int64_t t0 = whisper_full_get_vad_segment_t0(wctx, i);
+        const int64_t t1 = whisper_full_get_vad_segment_t1(wctx, i);
+        assert(t1 > t0);
+        assert(t0 >= vad_prev_end); // segments are ordered and non-overlapping
+        vad_prev_end = t1;
+    }
+
     whisper_free(wctx);
 
     return 0;