ggml-org · buxuku · Jun 12, 2026 · Jun 27, 2026
diff --git a/examples/addon.node/README.md b/examples/addon.node/README.md
@@ -44,6 +44,39 @@ Run the VAD example with performance comparison:
 node vad-example.js
 ```
 
+### Cancellation Usage
+
+Run the cancellation example (cancels an in-flight transcription via `AbortSignal`):
+
+```shell
+node cancel-example.js
+```
+
+## Cancelling a transcription
+
+An in-flight transcription can be cancelled by passing an `AbortSignal` as the `signal` parameter:
+
+```javascript
+const ac = new AbortController();
+
+const promise = whisperAsync({
+  // ... other params ...
+  signal: ac.signal,
+});
+
+// cancel at any time
+ac.abort();
+
+const result = await promise;
+// result.cancelled === true
+// result.transcription contains the segments transcribed before cancellation
+```
+
+Cancellation is checked before each encoder run and before each ggml graph
+computation, so it usually takes effect within a fraction of a second.
+The promise resolves normally (it does not reject): `result.cancelled` is `true`
+and `result.transcription` contains the segments completed before the abort.
+
 ## Voice Activity Detection (VAD) Support
 
 VAD can significantly improve transcription performance by only processing speech segments, which is especially beneficial for audio files with long periods of silence.
@@ -112,4 +145,5 @@ Both traditional whisper.cpp parameters and new VAD parameters are supported:
 - `comma_in_time`: Use comma in timestamps (default: true)
 - `print_progress`: Print progress info (default: false)
 - `progress_callback`: Progress callback function
+- `signal`: `AbortSignal` used to cancel the transcription (see "Cancelling a transcription" above)
 - VAD parameters (see above section)
diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp
@@ -4,13 +4,38 @@
 
 #include "whisper.h"
 
+#include <atomic>
+#include <memory>
 #include <string>
 #include <thread>
 #include <vector>
 #include <cmath>
 #include <cstdint>
 #include <cfloat>
 
+// True if `s` does not end in the middle of a UTF-8 multi-byte sequence. Used to
+// merge whisper byte-fallback tokens (rare CJK chars are split into 1-byte tokens)
+// back into whole characters before crossing the JS string boundary, which would
+// otherwise turn each partial byte into U+FFFD. A lead byte followed by a
+// non-continuation byte is broken, not incomplete, so it never delays the flush.
+static bool utf8_complete(const std::string & s) {
+    const size_t n = s.size();
+    for (size_t i = 0; i < n; ) {
+        const unsigned char c = (unsigned char) s[i];
+        size_t len = 1;                      // ASCII, stray continuation or invalid lead
+        if      ((c >> 5) == 0x06) len = 2;  // 110xxxxx
+        else if ((c >> 4) == 0x0E) len = 3;  // 1110xxxx
+        else if ((c >> 3) == 0x1E) len = 4;  // 11110xxx
+        size_t k = 1;                        // bytes of this sequence present so far
+        while (k < len && i + k < n && ((unsigned char) s[i + k] >> 6) == 0x2) { ++k; }
+        if (k < len && i + k == n) {
+            return false; // input ended mid-sequence: more bytes may still arrive
+        }
+        i += k; // full sequence, or the valid prefix of a broken one
+    }
+    return true;
+}
+
 struct whisper_params {
     int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
     int32_t n_processors = 1;
@@ -43,6 +68,7 @@ struct whisper_params {
     bool use_gpu        = true;
     bool flash_attn     = false;
     bool comma_in_time  = true;
+    bool token_timestamps = false; // emit per-token text + segment-aware mapped times
 
     std::string language = "en";
     std::string prompt;
@@ -143,14 +169,32 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
 void cb_log_disable(enum ggml_log_level, const char *, void *) {}
 
 struct whisper_result {
+    struct token_result {
+        std::string text;      // token text; may merge several raw tokens into one character
+        int64_t     t0 = 0;    // ms, original timeline (segment-aware mapped when VAD is on)
+        int64_t     t1 = 0;    // ms
+        float       p  = 0.0f; // token probability (zero-initialized like t0/t1)
+    };
+
     std::vector<std::vector<std::string>> segments;
+
+    // Per-token output (populated only when params.token_timestamps is set). Lets the
+    // caller build subtitle cues from real token boundaries instead of abusing max_len=1.
+    std::vector<token_result> tokens;
+
+    // Speech segments detected by the internal VAD, on the original timeline (ms).
+    // Empty when VAD was not used, so the caller can reuse these instead of running a
+    // second, separate VAD pass over the same audio.
+    std::vector<std::pair<int64_t, int64_t>> vad_segments;
+
     std::string language;
 };
 
 class ProgressWorker : public Napi::AsyncWorker {
  public:
-    ProgressWorker(Napi::Function& callback, whisper_params params, Napi::Function progress_callback, Napi::Env env)
-        : Napi::AsyncWorker(callback), params(params), env(env) {
+    ProgressWorker(Napi::Function& callback, whisper_params params, Napi::Function progress_callback, Napi::Env env,
+                   std::shared_ptr<std::atomic<bool>> is_aborted)
+        : Napi::AsyncWorker(callback), params(params), env(env), is_aborted(std::move(is_aborted)) {
         // Create thread-safe function
         if (!progress_callback.IsEmpty()) {
             tsfn = Napi::ThreadSafeFunction::New(
@@ -185,6 +229,7 @@ class ProgressWorker : public Napi::AsyncWorker {
         }
 
         Napi::Object returnObj = Napi::Object::New(Env());
+        returnObj.Set("cancelled", Napi::Boolean::New(Env(), is_aborted->load()));
         if (!result.language.empty()) {
             returnObj.Set("language", Napi::String::New(Env(), result.language));
         }
@@ -197,6 +242,30 @@ class ProgressWorker : public Napi::AsyncWorker {
             transcriptionArray[i] = tmp;
          }
          returnObj.Set("transcription", transcriptionArray);
+
+         // Per-token rows: { text, t0, t1, p } with t0/t1 in ms on the original timeline.
+         Napi::Array tokensArray = Napi::Array::New(Env(), result.tokens.size());
+         for (uint64_t i = 0; i < result.tokens.size(); ++i) {
+             const auto & t = result.tokens[i];
+             Napi::Object tokenObj = Napi::Object::New(Env());
+             tokenObj.Set("text", Napi::String::New(Env(), t.text));
+             tokenObj.Set("t0", Napi::Number::New(Env(), (double) t.t0));
+             tokenObj.Set("t1", Napi::Number::New(Env(), (double) t.t1));
+             tokenObj.Set("p", Napi::Number::New(Env(), (double) t.p));
+             tokensArray[i] = tokenObj;
+         }
+         returnObj.Set("tokens", tokensArray);
+
+         // Internal VAD speech segments: { t0, t1 } in ms on the original timeline.
+         Napi::Array vadArray = Napi::Array::New(Env(), result.vad_segments.size());
+         for (uint64_t i = 0; i < result.vad_segments.size(); ++i) {
+             Napi::Object vadObj = Napi::Object::New(Env());
+             vadObj.Set("t0", Napi::Number::New(Env(), (double) result.vad_segments[i].first));
+             vadObj.Set("t1", Napi::Number::New(Env(), (double) result.vad_segments[i].second));
+             vadArray[i] = vadObj;
+         }
+         returnObj.Set("vadSegments", vadArray);
+
          Callback().Call({Env().Null(), returnObj});
     }
 
@@ -217,6 +286,7 @@ class ProgressWorker : public Napi::AsyncWorker {
     whisper_result result;
     Napi::Env env;
     Napi::ThreadSafeFunction tsfn;
+    std::shared_ptr<std::atomic<bool>> is_aborted;
 
     // Custom run function with progress callback support
     int run_with_progress(whisper_params &params, whisper_result & result) {
@@ -315,7 +385,7 @@ class ProgressWorker : public Napi::AsyncWorker {
                 wparams.offset_ms        = params.offset_t_ms;
                 wparams.duration_ms      = params.duration_ms;
 
-                wparams.token_timestamps = params.output_wts || params.max_len > 0;
+                wparams.token_timestamps = params.output_wts || params.max_len > 0 || params.token_timestamps;
                 wparams.thold_pt         = params.word_thold;
                 wparams.entropy_thold    = params.entropy_thold;
                 wparams.logprob_thold    = params.logprob_thold;
@@ -344,6 +414,18 @@ class ProgressWorker : public Napi::AsyncWorker {
                 };
                 wparams.progress_callback_user_data = this;
 
+                // Cancellation support: checked before each encoder run (coarse)
+                // and before each ggml graph computation (fine)
+                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
+                    return !static_cast<std::atomic<bool>*>(user_data)->load();
+                };
+                wparams.encoder_begin_callback_user_data = is_aborted.get();
+
+                wparams.abort_callback = [](void * user_data) {
+                    return static_cast<std::atomic<bool>*>(user_data)->load();
+                };
+                wparams.abort_callback_user_data = is_aborted.get();
+
                 // Set VAD parameters
                 wparams.vad            = params.vad;
                 wparams.vad_model_path = params.vad_model.c_str();
@@ -355,8 +437,16 @@ class ProgressWorker : public Napi::AsyncWorker {
                 wparams.vad_params.speech_pad_ms           = params.vad_speech_pad_ms;
                 wparams.vad_params.samples_overlap         = params.vad_samples_overlap;
 
-                if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
+                const int ret = whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors);
+
+                if (is_aborted->load()) {
+                    // cancelled - keep the segments transcribed so far
+                    break;
+                }
+
+                if (ret != 0) {
                     fprintf(stderr, "failed to process audio\n");
+                    whisper_free(ctx);
                     return 10;
                 }
             }
@@ -378,6 +468,72 @@ class ProgressWorker : public Napi::AsyncWorker {
             result.segments[i].emplace_back(text);
         }
 
+        // Per-token output: token text + segment-aware mapped times (original timeline).
+        // Skips special/timestamp tokens (id >= eot). Times are converted cs -> ms.
+        //
+        // whisper emits rare CJK characters as byte-fallback tokens (1 raw byte each),
+        // so a single character is spread over 2-3 tokens whose individual bytes are not
+        // valid UTF-8. Emitting them one-by-one would corrupt the character into U+FFFD at
+        // the JS string boundary, so accumulate raw bytes and only flush a display-token
+        // once the buffer is complete UTF-8: t0 from the first contributing token, t1 from
+        // the last, p averaged over the contributors.
+        if (params.token_timestamps) {
+            const whisper_token eot = whisper_token_eot(ctx);
+            for (int i = 0; i < n_segments; ++i) {
+                const int n_tokens = whisper_full_n_tokens(ctx, i);
+
+                std::string acc_text;
+                int64_t     acc_t0   = 0;
+                int64_t     acc_t1   = 0;
+                float       acc_psum = 0.0f;
+                int         acc_n    = 0;
+
+                for (int j = 0; j < n_tokens; ++j) {
+                    if (whisper_full_get_token_id(ctx, i, j) >= eot) {
+                        continue;
+                    }
+                    if (acc_n == 0) {
+                        acc_t0 = whisper_full_get_token_t0(ctx, i, j) * 10;
+                    }
+                    acc_text += whisper_full_get_token_text(ctx, i, j);
+                    acc_t1    = whisper_full_get_token_t1(ctx, i, j) * 10;
+                    acc_psum += whisper_full_get_token_p(ctx, i, j);
+                    acc_n    += 1;
+
+                    if (utf8_complete(acc_text)) {
+                        whisper_result::token_result tr;
+                        tr.text = acc_text;
+                        tr.t0   = acc_t0;
+                        tr.t1   = acc_t1;
+                        tr.p    = acc_psum / acc_n;
+                        result.tokens.push_back(std::move(tr));
+                        acc_text.clear();
+                        acc_psum = 0.0f;
+                        acc_n    = 0;
+                    }
+                }
+
+                // Defensive flush of any dangling bytes at segment end (normally empty).
+                if (!acc_text.empty()) {
+                    whisper_result::token_result tr;
+                    tr.text = acc_text;
+                    tr.t0   = acc_t0;
+                    tr.t1   = acc_t1;
+                    tr.p    = acc_n > 0 ? acc_psum / acc_n : 0.0f;
+                    result.tokens.push_back(std::move(tr));
+                }
+            }
+        }
+
+        // Expose the internal VAD speech boundaries (original timeline, ms). Empty if VAD off.
+        const int n_vad = whisper_full_n_vad_segments(ctx);
+        result.vad_segments.reserve(n_vad);
+        for (int i = 0; i < n_vad; ++i) {
+            result.vad_segments.emplace_back(
+                whisper_full_get_vad_segment_t0(ctx, i) * 10,
+                whisper_full_get_vad_segment_t1(ctx, i) * 10);
+        }
+
         whisper_print_timings(ctx);
         whisper_free(ctx);
 
@@ -432,6 +588,11 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
     comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
   }
 
+  bool token_timestamps = false;
+  if (whisper_params.Has("token_timestamps") && whisper_params.Get("token_timestamps").IsBoolean()) {
+    token_timestamps = whisper_params.Get("token_timestamps").As<Napi::Boolean>();
+  }
+
   int32_t max_len = 0;
   if (whisper_params.Has("max_len") && whisper_params.Get("max_len").IsNumber()) {
     max_len = whisper_params.Get("max_len").As<Napi::Number>();
@@ -522,6 +683,7 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
   params.audio_ctx = audio_ctx;
   params.pcmf32 = pcmf32_vec;
   params.comma_in_time = comma_in_time;
+  params.token_timestamps = token_timestamps;
   params.max_len = max_len;
   params.max_context = max_context;
   params.print_progress = print_progress;
@@ -538,9 +700,29 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
   params.vad_speech_pad_ms = vad_speech_pad_ms;
   params.vad_samples_overlap = vad_samples_overlap;
 
+  // Cancellation support: an AbortSignal can be passed via params.signal.
+  // Its "abort" event sets a shared flag which is polled by the whisper.cpp
+  // abort callbacks on the worker thread.
+  auto is_aborted = std::make_shared<std::atomic<bool>>(false);
+  if (whisper_params.Has("signal") && whisper_params.Get("signal").IsObject()) {
+    Napi::Object signal = whisper_params.Get("signal").As<Napi::Object>();
+
+    if (signal.Get("aborted").ToBoolean().Value()) {
+      is_aborted->store(true);
+    } else if (signal.Has("addEventListener") && signal.Get("addEventListener").IsFunction()) {
+      Napi::Function add_listener = signal.Get("addEventListener").As<Napi::Function>();
+      Napi::Function on_abort = Napi::Function::New(env, [is_aborted](const Napi::CallbackInfo &) {
+        is_aborted->store(true);
+      });
+      Napi::Object options = Napi::Object::New(env);
+      options.Set("once", Napi::Boolean::New(env, true));
+      add_listener.Call(signal, { Napi::String::New(env, "abort"), on_abort, options });
+    }
+  }
+
   Napi::Function callback = info[1].As<Napi::Function>();
   // Create a new Worker class with progress callback support
-  ProgressWorker* worker = new ProgressWorker(callback, params, progress_callback, env);
+  ProgressWorker* worker = new ProgressWorker(callback, params, progress_callback, env, is_aborted);
   worker->Queue();
   return env.Undefined();
 }