diff --git a/examples/addon.node/README.md b/examples/addon.node/README.md
index bb09ba104c6..da555b5059d 100644
--- a/examples/addon.node/README.md
+++ b/examples/addon.node/README.md
@@ -44,6 +44,39 @@ Run the VAD example with performance comparison:
 node vad-example.js
 ```
 
+### Cancellation Usage
+
+Run the cancellation example (cancels an in-flight transcription via `AbortSignal`):
+
+```shell
+node cancel-example.js
+```
+
+## Cancelling a transcription
+
+An in-flight transcription can be cancelled by passing an `AbortSignal` as the `signal` parameter:
+
+```javascript
+const ac = new AbortController();
+
+const promise = whisperAsync({
+  // ... other params ...
+  signal: ac.signal,
+});
+
+// cancel at any time
+ac.abort();
+
+const result = await promise;
+// result.cancelled === true
+// result.transcription contains the segments transcribed before cancellation
+```
+
+Cancellation is checked before each encoder run and before each ggml graph
+computation, so it usually takes effect within a fraction of a second.
+The promise resolves normally (it does not reject): `result.cancelled` is `true`
+and `result.transcription` contains the segments completed before the abort.
+
 ## Voice Activity Detection (VAD) Support
 
 VAD can significantly improve transcription performance by only processing speech segments, which is especially beneficial for audio files with long periods of silence.
@@ -112,4 +145,5 @@ Both traditional whisper.cpp parameters and new VAD parameters are supported:
 - `comma_in_time`: Use comma in timestamps (default: true)
 - `print_progress`: Print progress info (default: false)
 - `progress_callback`: Progress callback function
+- `signal`: `AbortSignal` used to cancel the transcription (see "Cancelling a transcription" above)
 - VAD parameters (see above section)
diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp
index 71f65b0423c..b068f779a79 100644
--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@@ -4,6 +4,8 @@
 
 #include "whisper.h"
 
+#include <atomic>
+#include <memory>
 #include <string>
 #include <thread>
 #include <vector>
@@ -11,6 +13,29 @@
 #include <cstdint>
 #include <cfloat>
 
+// Returns true when `s` does not stop part-way through a UTF-8 multi-byte sequence
+// (an empty string counts as complete). Only lead bytes are examined: continuation
+// bytes are skipped by count, not validated. Used to merge whisper byte-fallback tokens
+// (rare CJK chars split into 1-byte tokens) so partial bytes don't become U+FFFD in JS.
+static bool utf8_complete(const std::string & s) {
+    size_t i = 0; // offset of the next lead byte
+    const size_t n = s.size();
+    while (i < n) {
+        const unsigned char c = (unsigned char) s[i];
+        size_t len; // sequence length implied by the lead byte
+        if (c < 0x80)             len = 1; // 0xxxxxxx
+        else if ((c >> 5) == 0x6) len = 2; // 110xxxxx
+        else if ((c >> 4) == 0xE) len = 3; // 1110xxxx
+        else if ((c >> 3) == 0x1E) len = 4; // 11110xxx
+        else                      len = 1; // stray continuation/invalid lead: don't stall
+        if (i + len > n) {
+            return false; // not enough continuation bytes yet
+        }
+        i += len; // jump to the next lead byte
+    }
+    return true; // every sequence that was started fits inside the string
+}
+
 struct whisper_params {
     int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
     int32_t n_processors = 1;
@@ -43,6 +68,7 @@ struct whisper_params {
     bool use_gpu        = true;
     bool flash_attn     = false;
     bool comma_in_time  = true;
+    bool token_timestamps = false; // emit per-token text + segment-aware mapped times
 
     std::string language = "en";
     std::string prompt;
@@ -143,14 +169,32 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
 void cb_log_disable(enum ggml_log_level, const char *, void *) {}
 
 struct whisper_result {
+    struct token_result {
+        std::string text; // token text; byte-fallback pieces are merged until they form complete UTF-8
+        int64_t     t0; // ms, original timeline (segment-aware mapped when VAD is on)
+        int64_t     t1; // ms, same timeline as t0
+        float       p;  // token probability (averaged when several tokens were merged into one row)
+    };
+
     std::vector<std::vector<std::string>> segments; // one vector of strings per transcribed segment
+
+    // Per-token output (populated only when params.token_timestamps is set). Lets the
+    // caller build subtitle cues from real token boundaries instead of abusing max_len=1.
+    std::vector<token_result> tokens;
+
+    // Speech segments detected by the internal VAD, on the original timeline (ms).
+    // Empty when VAD was not used, so the caller can reuse these instead of running a
+    // second, separate VAD pass over the same audio.
+    std::vector<std::pair<int64_t, int64_t>> vad_segments; // (start, end) pairs
+
     std::string language; // emitted as `language` in the JS result only when non-empty
 };
 
 class ProgressWorker : public Napi::AsyncWorker {
  public:
-    ProgressWorker(Napi::Function& callback, whisper_params params, Napi::Function progress_callback, Napi::Env env)
-        : Napi::AsyncWorker(callback), params(params), env(env) {
+    ProgressWorker(Napi::Function& callback, whisper_params params, Napi::Function progress_callback, Napi::Env env,
+                   std::shared_ptr<std::atomic<bool>> is_aborted)
+        : Napi::AsyncWorker(callback), params(params), env(env), is_aborted(std::move(is_aborted)) {
         // Create thread-safe function
         if (!progress_callback.IsEmpty()) {
             tsfn = Napi::ThreadSafeFunction::New(
@@ -185,6 +229,7 @@ class ProgressWorker : public Napi::AsyncWorker {
         }
 
         Napi::Object returnObj = Napi::Object::New(Env());
+        returnObj.Set("cancelled", Napi::Boolean::New(Env(), is_aborted->load()));
         if (!result.language.empty()) {
             returnObj.Set("language", Napi::String::New(Env(), result.language));
         }
@@ -197,6 +242,30 @@ class ProgressWorker : public Napi::AsyncWorker {
             transcriptionArray[i] = tmp;
          }
          returnObj.Set("transcription", transcriptionArray);
+
+         // Per-token rows: { text, t0, t1, p } with t0/t1 in ms on the original timeline.
+         Napi::Array tokensArray = Napi::Array::New(Env(), result.tokens.size());
+         for (uint64_t i = 0; i < result.tokens.size(); ++i) {
+             const auto & t = result.tokens[i];
+             Napi::Object tokenObj = Napi::Object::New(Env());
+             tokenObj.Set("text", Napi::String::New(Env(), t.text));
+             tokenObj.Set("t0", Napi::Number::New(Env(), (double) t.t0));
+             tokenObj.Set("t1", Napi::Number::New(Env(), (double) t.t1));
+             tokenObj.Set("p", Napi::Number::New(Env(), (double) t.p));
+             tokensArray[i] = tokenObj;
+         }
+         returnObj.Set("tokens", tokensArray);
+
+         // Internal VAD speech segments: { t0, t1 } in ms on the original timeline.
+         Napi::Array vadArray = Napi::Array::New(Env(), result.vad_segments.size());
+         for (uint64_t i = 0; i < result.vad_segments.size(); ++i) {
+             Napi::Object vadObj = Napi::Object::New(Env());
+             vadObj.Set("t0", Napi::Number::New(Env(), (double) result.vad_segments[i].first));
+             vadObj.Set("t1", Napi::Number::New(Env(), (double) result.vad_segments[i].second));
+             vadArray[i] = vadObj;
+         }
+         returnObj.Set("vadSegments", vadArray);
+
          Callback().Call({Env().Null(), returnObj});
     }
 
@@ -217,6 +286,7 @@ class ProgressWorker : public Napi::AsyncWorker {
     whisper_result result;
     Napi::Env env;
     Napi::ThreadSafeFunction tsfn;
+    std::shared_ptr<std::atomic<bool>> is_aborted;
 
     // Custom run function with progress callback support
     int run_with_progress(whisper_params &params, whisper_result & result) {
@@ -315,7 +385,7 @@ class ProgressWorker : public Napi::AsyncWorker {
                 wparams.offset_ms        = params.offset_t_ms;
                 wparams.duration_ms      = params.duration_ms;
 
-                wparams.token_timestamps = params.output_wts || params.max_len > 0;
+                wparams.token_timestamps = params.output_wts || params.max_len > 0 || params.token_timestamps;
                 wparams.thold_pt         = params.word_thold;
                 wparams.entropy_thold    = params.entropy_thold;
                 wparams.logprob_thold    = params.logprob_thold;
@@ -344,6 +414,18 @@ class ProgressWorker : public Napi::AsyncWorker {
                 };
                 wparams.progress_callback_user_data = this;
 
+                // Cancellation support: checked before each encoder run (coarse)
+                // and before each ggml graph computation (fine)
+                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
+                    return !static_cast<std::atomic<bool>*>(user_data)->load();
+                };
+                wparams.encoder_begin_callback_user_data = is_aborted.get();
+
+                wparams.abort_callback = [](void * user_data) {
+                    return static_cast<std::atomic<bool>*>(user_data)->load();
+                };
+                wparams.abort_callback_user_data = is_aborted.get();
+
                 // Set VAD parameters
                 wparams.vad            = params.vad;
                 wparams.vad_model_path = params.vad_model.c_str();
@@ -355,8 +437,16 @@ class ProgressWorker : public Napi::AsyncWorker {
                 wparams.vad_params.speech_pad_ms           = params.vad_speech_pad_ms;
                 wparams.vad_params.samples_overlap         = params.vad_samples_overlap;
 
-                if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
+                const int ret = whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors);
+
+                if (is_aborted->load()) {
+                    // cancelled - keep the segments transcribed so far
+                    break;
+                }
+
+                if (ret != 0) {
                     fprintf(stderr, "failed to process audio\n");
+                    whisper_free(ctx);
                     return 10;
                 }
             }
@@ -378,6 +468,72 @@ class ProgressWorker : public Napi::AsyncWorker {
             result.segments[i].emplace_back(text);
         }
 
+        // Per-token output: token text + segment-aware mapped times (original timeline).
+        // Skips special/timestamp tokens (id >= eot). Times are converted cs -> ms.
+        //
+        // whisper emits rare CJK characters as byte-fallback tokens (1 raw byte each),
+        // so a single character is spread over 2-3 tokens whose individual bytes are not
+        // valid UTF-8. Emitting them one-by-one would corrupt the character into U+FFFD at
+        // the JS string boundary, so accumulate raw bytes and only flush a display-token
+        // once the buffer is complete UTF-8: t0 from the first contributing token, t1 from
+        // the last, p averaged over the contributors.
+        if (params.token_timestamps) {
+            const whisper_token eot = whisper_token_eot(ctx);
+            for (int i = 0; i < n_segments; ++i) {
+                const int n_tokens = whisper_full_n_tokens(ctx, i);
+
+                std::string acc_text;
+                int64_t     acc_t0   = 0;
+                int64_t     acc_t1   = 0;
+                float       acc_psum = 0.0f;
+                int         acc_n    = 0;
+
+                for (int j = 0; j < n_tokens; ++j) {
+                    if (whisper_full_get_token_id(ctx, i, j) >= eot) {
+                        continue;
+                    }
+                    if (acc_n == 0) {
+                        acc_t0 = whisper_full_get_token_t0(ctx, i, j) * 10;
+                    }
+                    acc_text += whisper_full_get_token_text(ctx, i, j);
+                    acc_t1    = whisper_full_get_token_t1(ctx, i, j) * 10;
+                    acc_psum += whisper_full_get_token_p(ctx, i, j);
+                    acc_n    += 1;
+
+                    if (utf8_complete(acc_text)) {
+                        whisper_result::token_result tr;
+                        tr.text = acc_text;
+                        tr.t0   = acc_t0;
+                        tr.t1   = acc_t1;
+                        tr.p    = acc_psum / acc_n;
+                        result.tokens.push_back(std::move(tr));
+                        acc_text.clear();
+                        acc_psum = 0.0f;
+                        acc_n    = 0;
+                    }
+                }
+
+                // Defensive flush of any dangling bytes at segment end (normally empty).
+                if (!acc_text.empty()) {
+                    whisper_result::token_result tr;
+                    tr.text = acc_text;
+                    tr.t0   = acc_t0;
+                    tr.t1   = acc_t1;
+                    tr.p    = acc_n > 0 ? acc_psum / acc_n : 0.0f;
+                    result.tokens.push_back(std::move(tr));
+                }
+            }
+        }
+
+        // Expose the internal VAD speech boundaries (original timeline, ms). Empty if VAD off.
+        const int n_vad = whisper_full_n_vad_segments(ctx);
+        result.vad_segments.reserve(n_vad);
+        for (int i = 0; i < n_vad; ++i) {
+            result.vad_segments.emplace_back(
+                whisper_full_get_vad_segment_t0(ctx, i) * 10,
+                whisper_full_get_vad_segment_t1(ctx, i) * 10);
+        }
+
         whisper_print_timings(ctx);
         whisper_free(ctx);
 
@@ -432,6 +588,11 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
     comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
   }
 
+  bool token_timestamps = false;
+  if (whisper_params.Has("token_timestamps") && whisper_params.Get("token_timestamps").IsBoolean()) {
+    token_timestamps = whisper_params.Get("token_timestamps").As<Napi::Boolean>();
+  }
+
   int32_t max_len = 0;
   if (whisper_params.Has("max_len") && whisper_params.Get("max_len").IsNumber()) {
     max_len = whisper_params.Get("max_len").As<Napi::Number>();
@@ -522,6 +683,7 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
   params.audio_ctx = audio_ctx;
   params.pcmf32 = pcmf32_vec;
   params.comma_in_time = comma_in_time;
+  params.token_timestamps = token_timestamps;
   params.max_len = max_len;
   params.max_context = max_context;
   params.print_progress = print_progress;
@@ -538,9 +700,29 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
   params.vad_speech_pad_ms = vad_speech_pad_ms;
   params.vad_samples_overlap = vad_samples_overlap;
 
+  // Cancellation support: an AbortSignal can be passed via params.signal.
+  // Its "abort" event sets a shared flag which is polled by the whisper.cpp
+  // abort callbacks on the worker thread.
+  auto is_aborted = std::make_shared<std::atomic<bool>>(false);
+  if (whisper_params.Has("signal") && whisper_params.Get("signal").IsObject()) {
+    Napi::Object signal = whisper_params.Get("signal").As<Napi::Object>();
+
+    if (signal.Get("aborted").ToBoolean().Value()) {
+      is_aborted->store(true);
+    } else if (signal.Has("addEventListener") && signal.Get("addEventListener").IsFunction()) {
+      Napi::Function add_listener = signal.Get("addEventListener").As<Napi::Function>();
+      Napi::Function on_abort = Napi::Function::New(env, [is_aborted](const Napi::CallbackInfo &) {
+        is_aborted->store(true);
+      });
+      Napi::Object options = Napi::Object::New(env);
+      options.Set("once", Napi::Boolean::New(env, true));
+      add_listener.Call(signal, { Napi::String::New(env, "abort"), on_abort, options });
+    }
+  }
+
   Napi::Function callback = info[1].As<Napi::Function>();
   // Create a new Worker class with progress callback support
-  ProgressWorker* worker = new ProgressWorker(callback, params, progress_callback, env);
+  ProgressWorker* worker = new ProgressWorker(callback, params, progress_callback, env, is_aborted);
   worker->Queue();
   return env.Undefined();
 }
diff --git a/examples/addon.node/cancel-example.js b/examples/addon.node/cancel-example.js
new file mode 100644
index 00000000000..2de382cb183
--- /dev/null
+++ b/examples/addon.node/cancel-example.js
@@ -0,0 +1,115 @@
+// Demonstrates cancelling an in-flight transcription via AbortSignal (params.signal).
+//
+// Usage: node cancel-example.js [--model=path/to/model.bin]
+
+const path = require("path");
+const os = require("os");
+const { promisify } = require("util");
+
+const isWindows = os.platform() === "win32";
+const buildPath = isWindows ? "../../build/bin/Release/addon.node" : "../../build/Release/addon.node"; // addon location differs on Windows
+const { whisper } = require(path.join(__dirname, buildPath)); // resolved relative to this script, not the cwd
+
+const whisperAsync = promisify(whisper); // whisper(params, callback) -> Promise
+
+const modelArg = process.argv.find((a) => a.startsWith("--model=")); // optional --model=<path> override
+const model = modelArg
+  ? modelArg.slice("--model=".length)
+  : path.join(__dirname, "../../models/ggml-base.en.bin"); // default model when no --model is given
+// Builds `seconds` of synthetic audio (440 Hz tone + random noise) as a Float32Array, so a
+// transcription of it runs long enough to be cancelled mid-flight.
+function syntheticAudio(seconds) {
+  const n = 16000 * seconds; // 16000 samples per second of audio
+  const pcm = new Float32Array(n);
+  for (let i = 0; i < n; i++) {
+    pcm[i] = 0.05 * Math.sin((2 * Math.PI * 440 * i) / 16000) + (Math.random() - 0.5) * 0.02; // tone (amplitude 0.05) + noise in [-0.01, 0.01)
+  }
+  return pcm;
+}
+
+const baseParams = { // parameters shared by all three tests; each test spreads these and adds its own input
+  language: "en",
+  model, // path chosen above (--model=... or the default)
+  use_gpu: true,
+  no_prints: true,
+  no_timestamps: false,
+  comma_in_time: false,
+};
+
+async function cancelMidFlight() { // test 1: abort from inside the first progress callback
+  console.log("--- test 1: cancel mid-transcription ---");
+  const ac = new AbortController();
+  const progressSeen = []; // every progress value reported, checked after the run
+
+  const t0 = Date.now();
+  const promise = whisperAsync({
+    ...baseParams,
+    fname_inp: "",
+    pcmf32: syntheticAudio(600), // 600 s of synthetic audio passed in memory
+    signal: ac.signal,
+    progress_callback: (p) => {
+      progressSeen.push(p);
+      console.log(`progress: ${p}%`);
+      if (!ac.signal.aborted) { // abort once, on the first progress report
+        console.log(">>> calling abort()");
+        ac.abort();
+      }
+    },
+  });
+
+  const result = await promise; // resolves even when cancelled
+  const elapsed = Date.now() - t0;
+
+  console.log(`cancelled = ${result.cancelled}, segments = ${result.transcription.length}, elapsed = ${elapsed} ms`);
+  if (result.cancelled !== true) throw new Error("FAIL: expected cancelled === true");
+  if (progressSeen.includes(100)) throw new Error("FAIL: transcription ran to completion, was not cancelled"); // 100% means the abort had no effect
+  console.log("PASS\n");
+}
+
+async function preAbortedSignal() { // test 2: the signal is already aborted before whisper is called
+  console.log("--- test 2: already-aborted signal ---");
+  const ac = new AbortController();
+  ac.abort(); // abort up front, before starting the transcription
+
+  const t0 = Date.now();
+  const result = await whisperAsync({
+    ...baseParams,
+    fname_inp: "",
+    pcmf32: syntheticAudio(600), // same 600 s synthetic input as test 1
+    signal: ac.signal,
+  });
+  const elapsed = Date.now() - t0;
+
+  console.log(`cancelled = ${result.cancelled}, segments = ${result.transcription.length}, elapsed = ${elapsed} ms`);
+  if (result.cancelled !== true) throw new Error("FAIL: expected cancelled === true");
+  if (result.transcription.length !== 0) throw new Error("FAIL: expected no segments"); // nothing should have been transcribed
+  console.log("PASS\n");
+}
+
+async function normalRun() { // test 3: no signal passed; the result must report cancelled === false
+  console.log("--- test 3: normal run without signal (regression) ---");
+  const t0 = Date.now();
+  const result = await whisperAsync({
+    ...baseParams,
+    fname_inp: path.join(__dirname, "../../samples/jfk.wav"), // sample file read by the addon
+  });
+  const elapsed = Date.now() - t0;
+
+  const text = result.transcription.map((s) => s[2]).join(" "); // element 2 of each segment row is joined as the text
+  console.log(`cancelled = ${result.cancelled}, segments = ${result.transcription.length}, elapsed = ${elapsed} ms`);
+  console.log(`text: ${text.trim()}`);
+  if (result.cancelled !== false) throw new Error("FAIL: expected cancelled === false");
+  if (!text.toLowerCase().includes("ask not")) throw new Error("FAIL: unexpected transcription"); // phrase expected in the sample
+  console.log("PASS\n");
+}
+
+(async () => { // run the three tests one after another; the first failure stops the run
+  await cancelMidFlight();
+  await preAbortedSignal();
+  await normalRun();
+  console.log("ALL TESTS PASSED");
+})().catch((err) => {
+  console.error(err);
+  process.exit(1); // non-zero exit code when any test throws
+});
diff --git a/include/whisper.h b/include/whisper.h
index b5dcdb2917a..efd79959b94 100644
--- a/include/whisper.h
+++ b/include/whisper.h
@@ -667,10 +667,31 @@ extern "C" {
     WHISPER_API whisper_token_data whisper_full_get_token_data           (struct whisper_context * ctx, int i_segment, int i_token);
     WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token);
 
+    // Get the start/end time of the specified token, in centiseconds. When VAD was used,
+    // the times are mapped back to the original audio timeline (segment-aware: a token in
+    // the artificial inter-segment silence snaps to the nearest speech boundary) and t1 is
+    // kept at least 1 cs after t0; whisper_full_get_token_data().t0/t1 stay in VAD-processed
+    // time. Without VAD the raw token times are returned unchanged. Indices are not checked.
+    WHISPER_API int64_t whisper_full_get_token_t0           (struct whisper_context * ctx, int i_segment, int i_token);
+    WHISPER_API int64_t whisper_full_get_token_t0_from_state(struct whisper_state * state, int i_segment, int i_token);
+    WHISPER_API int64_t whisper_full_get_token_t1           (struct whisper_context * ctx, int i_segment, int i_token);
+    WHISPER_API int64_t whisper_full_get_token_t1_from_state(struct whisper_state * state, int i_segment, int i_token);
+
     // Get the probability of the specified token in the specified segment
     WHISPER_API float whisper_full_get_token_p           (struct whisper_context * ctx, int i_segment, int i_token);
     WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
 
+    // Access the speech segments detected by the internal VAD (only when params.vad = true).
+    // Times are on the ORIGINAL audio timeline, in centiseconds. The count is 0 when VAD was
+    // not used, so callers can reuse whisper's own speech boundaries instead of running a
+    // separate VAD pass. Valid indices are 0 .. n_vad_segments - 1; `i` is not range-checked.
+    WHISPER_API int     whisper_full_n_vad_segments               (struct whisper_context * ctx);
+    WHISPER_API int     whisper_full_n_vad_segments_from_state    (struct whisper_state * state);
+    WHISPER_API int64_t whisper_full_get_vad_segment_t0           (struct whisper_context * ctx, int i);
+    WHISPER_API int64_t whisper_full_get_vad_segment_t0_from_state(struct whisper_state * state, int i);
+    WHISPER_API int64_t whisper_full_get_vad_segment_t1           (struct whisper_context * ctx, int i);
+    WHISPER_API int64_t whisper_full_get_vad_segment_t1_from_state(struct whisper_state * state, int i);
+
     //
     // Voice Activity Detection (VAD)
     //
diff --git a/src/whisper.cpp b/src/whisper.cpp
index 5ffc70af00e..3e5b6243f1d 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -8075,6 +8075,102 @@ struct whisper_token_data whisper_full_get_token_data(struct whisper_context * c
     return ctx->state->result_all[i_segment].tokens[i_token];
 }
 
+// Map a token timestamp `t` (centiseconds, VAD-processed timeline) back to the original
+// audio timeline using the speech segments `segs` recorded by the internal VAD. Returns
+// `t` unchanged when `segs` is empty.
+//
+// The mapping is "segment-aware": a time inside a speech segment is interpolated linearly
+// (integer arithmetic) onto that segment's original span, while a time in the artificial
+// silence inserted *between* segments snaps to the nearer real speech boundary. Times before
+// the first / after the last segment clamp to its original start / end. This preserves the
+// true inter-segment gaps, unlike a single global interpolation over vad_mapping_table.
+static int64_t whisper_map_token_time_segment_aware(
+        int64_t t,
+        const std::vector<whisper_state::vad_segment_info> & segs) {
+    if (segs.empty()) {
+        return t; // nothing to map against
+    }
+    if (t <= segs.front().vad_start) {
+        return segs.front().orig_start; // at or before the first segment: clamp to its start
+    }
+    if (t >= segs.back().vad_end) {
+        return segs.back().orig_end; // at or after the last segment: clamp to its end
+    }
+    for (size_t i = 0; i < segs.size(); ++i) {
+        const auto & s = segs[i];
+        // Inside this speech segment -> linear interpolation onto the original span.
+        if (t >= s.vad_start && t <= s.vad_end) {
+            const int64_t vd = s.vad_end - s.vad_start; // segment length on the VAD timeline
+            const int64_t od = s.orig_end - s.orig_start; // segment length on the original timeline
+            if (vd <= 0) {
+                return s.orig_start; // zero-length segment: avoid dividing by zero
+            }
+            return s.orig_start + (t - s.vad_start) * od / vd; // integer division truncates
+        }
+        // In the artificial silence between segment i and i+1 -> snap to the nearer
+        // real boundary so the token never sits in the middle of a removed silence.
+        if (i + 1 < segs.size() && t > s.vad_end && t < segs[i + 1].vad_start) {
+            const int64_t mid = (s.vad_end + segs[i + 1].vad_start) / 2; // midpoint of the gap
+            return (t <= mid) ? s.orig_end : segs[i + 1].orig_start; // ties go to the earlier segment
+        }
+    }
+    return t; // fallback when no segment or gap matched `t`
+}
+
+int64_t whisper_full_get_token_t0_from_state(struct whisper_state * state, int i_segment, int i_token) {
+    const int64_t t0 = state->result_all[i_segment].tokens[i_token].t0; // raw token start; indices are not range-checked
+    if (!state->has_vad_segments || state->vad_segments.empty()) {
+        return t0; // no VAD segments recorded: return the raw time unchanged
+    }
+    return whisper_map_token_time_segment_aware(t0, state->vad_segments); // map onto the original timeline
+}
+
+int64_t whisper_full_get_token_t0(struct whisper_context * ctx, int i_segment, int i_token) {
+    return whisper_full_get_token_t0_from_state(ctx->state, i_segment, i_token); // forwards to the _from_state variant with ctx->state
+}
+
+int64_t whisper_full_get_token_t1_from_state(struct whisper_state * state, int i_segment, int i_token) {
+    const int64_t t1 = state->result_all[i_segment].tokens[i_token].t1; // raw token end; indices are not range-checked
+    if (!state->has_vad_segments || state->vad_segments.empty()) {
+        return t1; // no VAD segments recorded: return the raw time unchanged (no clamping)
+    }
+    const int64_t orig_t0 = whisper_full_get_token_t0_from_state(state, i_segment, i_token); // mapped start of the same token
+    int64_t orig_t1 = whisper_map_token_time_segment_aware(t1, state->vad_segments);
+    // Keep a strictly positive duration after snapping (timestamps are centiseconds).
+    if (orig_t1 < orig_t0 + 1) {
+        orig_t1 = orig_t0 + 1; // start and end can snap to the same boundary
+    }
+    return orig_t1;
+}
+
+int64_t whisper_full_get_token_t1(struct whisper_context * ctx, int i_segment, int i_token) {
+    return whisper_full_get_token_t1_from_state(ctx->state, i_segment, i_token); // forwards to the _from_state variant with ctx->state
+}
+
+int whisper_full_n_vad_segments_from_state(struct whisper_state * state) {
+    return state->has_vad_segments ? (int) state->vad_segments.size() : 0; // 0 unless has_vad_segments is set, matching the token-time getters
+}
+
+int whisper_full_n_vad_segments(struct whisper_context * ctx) {
+    return ctx->state->has_vad_segments ? (int) ctx->state->vad_segments.size() : 0; // 0 unless has_vad_segments is set, matching the token-time getters
+}
+
+int64_t whisper_full_get_vad_segment_t0_from_state(struct whisper_state * state, int i) {
+    return state->vad_segments[i].orig_start; // orig_start of VAD segment i; `i` is not range-checked
+}
+
+int64_t whisper_full_get_vad_segment_t0(struct whisper_context * ctx, int i) {
+    return ctx->state->vad_segments[i].orig_start; // same lookup on ctx->state; `i` is not range-checked
+}
+
+int64_t whisper_full_get_vad_segment_t1_from_state(struct whisper_state * state, int i) {
+    return state->vad_segments[i].orig_end; // orig_end of VAD segment i; `i` is not range-checked
+}
+
+int64_t whisper_full_get_vad_segment_t1(struct whisper_context * ctx, int i) {
+    return ctx->state->vad_segments[i].orig_end; // same lookup on ctx->state; `i` is not range-checked
+}
+
 float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token) {
     return state->result_all[i_segment].tokens[i_token].p;
 }
@@ -8397,24 +8493,75 @@ static int64_t sample_to_timestamp(int i_sample) {
 
 // a cost-function / heuristic that is high for text that takes longer to pronounce
 // obviously, can be improved
+// Iterate over UTF-8 code points (not raw bytes): a CJK character is 3 bytes,
+// so the old per-byte loop weighted every Han/Kana/Hangul glyph ~3x and never
+// matched full-width CJK punctuation, skewing how segment time is distributed
+// across tokens for Chinese/Japanese. Decode one code point at a time and give
+// full-width punctuation the same pause weights as their ASCII counterparts.
+// Pure-ASCII text decodes to identical weights as before (no regression).
 static float voice_length(const std::string & text) {
     float res = 0.0f;
 
-    for (char c : text) {
-        if (c == ' ') {
-            res += 0.01f;
-        } else if (c == ',') {
-            res += 2.00f;
-        } else if (c == '.') {
-            res += 3.00f;
-        } else if (c == '!') {
-            res += 3.00f;
-        } else if (c == '?') {
-            res += 3.00f;
-        } else if (c >= '0' && c <= '9') {
-            res += 3.00f;
+    const unsigned char * s = (const unsigned char *) text.data();
+    const size_t n = text.size();
+
+    for (size_t i = 0; i < n; ) {
+        const unsigned char c = s[i];
+        uint32_t cp = c;
+        int len = 1;
+        if (c < 0x80) {
+            len = 1;                 // 0xxxxxxx
+        } else if ((c >> 5) == 0x6) {
+            cp = c & 0x1F; len = 2;  // 110xxxxx
+        } else if ((c >> 4) == 0xE) {
+            cp = c & 0x0F; len = 3;  // 1110xxxx
+        } else if ((c >> 3) == 0x1E) {
+            cp = c & 0x07; len = 4;  // 11110xxx
         } else {
-            res += 1.00f;
+            cp = c; len = 1;         // stray continuation / invalid lead byte
+        }
+        if (i + (size_t) len <= n) {
+            bool ok = true;
+            for (int k = 1; k < len; ++k) {
+                const unsigned char cc = s[i + k];
+                if ((cc & 0xC0) != 0x80) { ok = false; break; } // not 10xxxxxx
+                cp = (cp << 6) | (cc & 0x3F);
+            }
+            if (!ok) { cp = c; len = 1; }
+        } else {
+            cp = c; len = 1;
+        }
+        i += (size_t) len;
+
+        switch (cp) {
+            case ' ':
+            case 0x3000:                 // IDEOGRAPHIC SPACE
+                res += 0.01f;
+                break;
+            case ',':
+            case 0xFF0C:                 // ， FULLWIDTH COMMA
+            case 0x3001:                 // 、 IDEOGRAPHIC COMMA
+            case 0xFF1B:                 // ； FULLWIDTH SEMICOLON
+            case 0xFF1A:                 // ： FULLWIDTH COLON
+                res += 2.00f;            // short pause
+                break;
+            case '.':
+            case '!':
+            case '?':
+            case 0x3002:                 // 。 IDEOGRAPHIC FULL STOP
+            case 0xFF0E:                 // ． FULLWIDTH FULL STOP
+            case 0xFF01:                 // ！ FULLWIDTH EXCLAMATION MARK
+            case 0xFF1F:                 // ？ FULLWIDTH QUESTION MARK
+            case 0x2026:                 // … HORIZONTAL ELLIPSIS
+                res += 3.00f;            // sentence-final pause
+                break;
+            default:
+                if ((cp >= '0' && cp <= '9') || (cp >= 0xFF10 && cp <= 0xFF19)) {
+                    res += 3.00f;        // digits (half/full width)
+                } else {
+                    res += 1.00f;        // letters, CJK ideographs, kana, hangul, ...
+                }
+                break;
         }
     }