diff --git a/examples/addon.node/README.md b/examples/addon.node/README.md index bb09ba104c6..da555b5059d 100644 --- a/examples/addon.node/README.md +++ b/examples/addon.node/README.md @@ -44,6 +44,39 @@ Run the VAD example with performance comparison: node vad-example.js ``` +### Cancellation Usage + +Run the cancellation example (cancels an in-flight transcription via `AbortSignal`): + +```shell +node cancel-example.js +``` + +## Cancelling a transcription + +An in-flight transcription can be cancelled by passing an `AbortSignal` as the `signal` parameter: + +```javascript +const ac = new AbortController(); + +const promise = whisperAsync({ + // ... other params ... + signal: ac.signal, +}); + +// cancel at any time +ac.abort(); + +const result = await promise; +// result.cancelled === true +// result.transcription contains the segments transcribed before cancellation +``` + +Cancellation is checked before each encoder run and before each ggml graph +computation, so it usually takes effect within a fraction of a second. +The promise resolves normally (it does not reject): `result.cancelled` is `true` +and `result.transcription` contains the segments completed before the abort. + ## Voice Activity Detection (VAD) Support VAD can significantly improve transcription performance by only processing speech segments, which is especially beneficial for audio files with long periods of silence. 
// Returns true if `s` does not end in the middle of a UTF-8 multi-byte sequence.
//
// Used to merge whisper byte-fallback tokens (rare CJK chars are split into 1-byte
// tokens) back into whole characters before crossing the JS string boundary, which
// would otherwise turn each partial byte into U+FFFD.
//
// Only a well-formed but truncated trailing sequence yields false. Malformed input
// (a stray continuation byte, an invalid lead byte, or a lead byte that is followed
// by something other than a continuation byte) is skipped one byte at a time, so it
// can neither stall the caller's accumulator nor make it swallow the tokens that
// follow the broken byte.
static bool utf8_complete(const std::string & s) {
    const size_t n = s.size();
    size_t i = 0;

    while (i < n) {
        const unsigned char c = static_cast<unsigned char>(s[i]);

        // Expected sequence length derived from the lead byte.
        size_t len = 1;                       // 0xxxxxxx, stray continuation or invalid lead
        if ((c >> 5) == 0x06) {
            len = 2;                          // 110xxxxx
        } else if ((c >> 4) == 0x0E) {
            len = 3;                          // 1110xxxx
        } else if ((c >> 3) == 0x1E) {
            len = 4;                          // 11110xxx
        }

        // Count how many of the expected bytes are present and well-formed (10xxxxxx).
        size_t k = 1;
        while (k < len && i + k < n &&
               (static_cast<unsigned char>(s[i + k]) & 0xC0) == 0x80) {
            ++k;
        }

        if (k == len) {
            i += len;                         // whole character consumed
            continue;
        }
        if (i + k == n) {
            return false;                     // valid prefix, but the string ends too early
        }

        // s[i + k] is not a continuation byte: the lead byte is malformed, skip only it.
        i += 1;
    }

    return true;
}
// Everything one transcription run hands back to the JS side.
struct whisper_result {
    // One merged display token. All numeric fields are value-initialized so a
    // default-constructed row never carries indeterminate values.
    struct token_result {
        std::string text;
        int64_t     t0 = 0;    // ms, original timeline (segment-aware mapped when VAD is on)
        int64_t     t1 = 0;    // ms
        float       p  = 0.0f; // token probability
    };

    // One row of strings per transcribed segment.
    std::vector<std::vector<std::string>> segments;

    // Per-token output (populated only when params.token_timestamps is set). Lets the
    // caller build subtitle cues from real token boundaries instead of abusing max_len=1.
    std::vector<token_result> tokens;

    // Speech segments detected by the internal VAD as (t0, t1) pairs on the original
    // timeline (ms). Empty when VAD was not used, so the caller can reuse these instead
    // of running a second, separate VAD pass over the same audio.
    std::vector<std::pair<int64_t, int64_t>> vad_segments;

    std::string language;
};
+ Napi::Array tokensArray = Napi::Array::New(Env(), result.tokens.size()); + for (uint64_t i = 0; i < result.tokens.size(); ++i) { + const auto & t = result.tokens[i]; + Napi::Object tokenObj = Napi::Object::New(Env()); + tokenObj.Set("text", Napi::String::New(Env(), t.text)); + tokenObj.Set("t0", Napi::Number::New(Env(), (double) t.t0)); + tokenObj.Set("t1", Napi::Number::New(Env(), (double) t.t1)); + tokenObj.Set("p", Napi::Number::New(Env(), (double) t.p)); + tokensArray[i] = tokenObj; + } + returnObj.Set("tokens", tokensArray); + + // Internal VAD speech segments: { t0, t1 } in ms on the original timeline. + Napi::Array vadArray = Napi::Array::New(Env(), result.vad_segments.size()); + for (uint64_t i = 0; i < result.vad_segments.size(); ++i) { + Napi::Object vadObj = Napi::Object::New(Env()); + vadObj.Set("t0", Napi::Number::New(Env(), (double) result.vad_segments[i].first)); + vadObj.Set("t1", Napi::Number::New(Env(), (double) result.vad_segments[i].second)); + vadArray[i] = vadObj; + } + returnObj.Set("vadSegments", vadArray); + Callback().Call({Env().Null(), returnObj}); } @@ -217,6 +286,7 @@ class ProgressWorker : public Napi::AsyncWorker { whisper_result result; Napi::Env env; Napi::ThreadSafeFunction tsfn; + std::shared_ptr> is_aborted; // Custom run function with progress callback support int run_with_progress(whisper_params ¶ms, whisper_result & result) { @@ -315,7 +385,7 @@ class ProgressWorker : public Napi::AsyncWorker { wparams.offset_ms = params.offset_t_ms; wparams.duration_ms = params.duration_ms; - wparams.token_timestamps = params.output_wts || params.max_len > 0; + wparams.token_timestamps = params.output_wts || params.max_len > 0 || params.token_timestamps; wparams.thold_pt = params.word_thold; wparams.entropy_thold = params.entropy_thold; wparams.logprob_thold = params.logprob_thold; @@ -344,6 +414,18 @@ class ProgressWorker : public Napi::AsyncWorker { }; wparams.progress_callback_user_data = this; + // Cancellation support: checked before 
each encoder run (coarse) + // and before each ggml graph computation (fine) + wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) { + return !static_cast*>(user_data)->load(); + }; + wparams.encoder_begin_callback_user_data = is_aborted.get(); + + wparams.abort_callback = [](void * user_data) { + return static_cast*>(user_data)->load(); + }; + wparams.abort_callback_user_data = is_aborted.get(); + // Set VAD parameters wparams.vad = params.vad; wparams.vad_model_path = params.vad_model.c_str(); @@ -355,8 +437,16 @@ class ProgressWorker : public Napi::AsyncWorker { wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms; wparams.vad_params.samples_overlap = params.vad_samples_overlap; - if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) { + const int ret = whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors); + + if (is_aborted->load()) { + // cancelled - keep the segments transcribed so far + break; + } + + if (ret != 0) { fprintf(stderr, "failed to process audio\n"); + whisper_free(ctx); return 10; } } @@ -378,6 +468,72 @@ class ProgressWorker : public Napi::AsyncWorker { result.segments[i].emplace_back(text); } + // Per-token output: token text + segment-aware mapped times (original timeline). + // Skips special/timestamp tokens (id >= eot). Times are converted cs -> ms. + // + // whisper emits rare CJK characters as byte-fallback tokens (1 raw byte each), + // so a single character is spread over 2-3 tokens whose individual bytes are not + // valid UTF-8. Emitting them one-by-one would corrupt the character into U+FFFD at + // the JS string boundary, so accumulate raw bytes and only flush a display-token + // once the buffer is complete UTF-8: t0 from the first contributing token, t1 from + // the last, p averaged over the contributors. 
+ if (params.token_timestamps) { + const whisper_token eot = whisper_token_eot(ctx); + for (int i = 0; i < n_segments; ++i) { + const int n_tokens = whisper_full_n_tokens(ctx, i); + + std::string acc_text; + int64_t acc_t0 = 0; + int64_t acc_t1 = 0; + float acc_psum = 0.0f; + int acc_n = 0; + + for (int j = 0; j < n_tokens; ++j) { + if (whisper_full_get_token_id(ctx, i, j) >= eot) { + continue; + } + if (acc_n == 0) { + acc_t0 = whisper_full_get_token_t0(ctx, i, j) * 10; + } + acc_text += whisper_full_get_token_text(ctx, i, j); + acc_t1 = whisper_full_get_token_t1(ctx, i, j) * 10; + acc_psum += whisper_full_get_token_p(ctx, i, j); + acc_n += 1; + + if (utf8_complete(acc_text)) { + whisper_result::token_result tr; + tr.text = acc_text; + tr.t0 = acc_t0; + tr.t1 = acc_t1; + tr.p = acc_psum / acc_n; + result.tokens.push_back(std::move(tr)); + acc_text.clear(); + acc_psum = 0.0f; + acc_n = 0; + } + } + + // Defensive flush of any dangling bytes at segment end (normally empty). + if (!acc_text.empty()) { + whisper_result::token_result tr; + tr.text = acc_text; + tr.t0 = acc_t0; + tr.t1 = acc_t1; + tr.p = acc_n > 0 ? acc_psum / acc_n : 0.0f; + result.tokens.push_back(std::move(tr)); + } + } + } + + // Expose the internal VAD speech boundaries (original timeline, ms). Empty if VAD off. 
+ const int n_vad = whisper_full_n_vad_segments(ctx); + result.vad_segments.reserve(n_vad); + for (int i = 0; i < n_vad; ++i) { + result.vad_segments.emplace_back( + whisper_full_get_vad_segment_t0(ctx, i) * 10, + whisper_full_get_vad_segment_t1(ctx, i) * 10); + } + whisper_print_timings(ctx); whisper_free(ctx); @@ -432,6 +588,11 @@ Napi::Value whisper(const Napi::CallbackInfo& info) { comma_in_time = whisper_params.Get("comma_in_time").As(); } + bool token_timestamps = false; + if (whisper_params.Has("token_timestamps") && whisper_params.Get("token_timestamps").IsBoolean()) { + token_timestamps = whisper_params.Get("token_timestamps").As(); + } + int32_t max_len = 0; if (whisper_params.Has("max_len") && whisper_params.Get("max_len").IsNumber()) { max_len = whisper_params.Get("max_len").As(); @@ -522,6 +683,7 @@ Napi::Value whisper(const Napi::CallbackInfo& info) { params.audio_ctx = audio_ctx; params.pcmf32 = pcmf32_vec; params.comma_in_time = comma_in_time; + params.token_timestamps = token_timestamps; params.max_len = max_len; params.max_context = max_context; params.print_progress = print_progress; @@ -538,9 +700,29 @@ Napi::Value whisper(const Napi::CallbackInfo& info) { params.vad_speech_pad_ms = vad_speech_pad_ms; params.vad_samples_overlap = vad_samples_overlap; + // Cancellation support: an AbortSignal can be passed via params.signal. + // Its "abort" event sets a shared flag which is polled by the whisper.cpp + // abort callbacks on the worker thread. 
+ auto is_aborted = std::make_shared>(false); + if (whisper_params.Has("signal") && whisper_params.Get("signal").IsObject()) { + Napi::Object signal = whisper_params.Get("signal").As(); + + if (signal.Get("aborted").ToBoolean().Value()) { + is_aborted->store(true); + } else if (signal.Has("addEventListener") && signal.Get("addEventListener").IsFunction()) { + Napi::Function add_listener = signal.Get("addEventListener").As(); + Napi::Function on_abort = Napi::Function::New(env, [is_aborted](const Napi::CallbackInfo &) { + is_aborted->store(true); + }); + Napi::Object options = Napi::Object::New(env); + options.Set("once", Napi::Boolean::New(env, true)); + add_listener.Call(signal, { Napi::String::New(env, "abort"), on_abort, options }); + } + } + Napi::Function callback = info[1].As(); // Create a new Worker class with progress callback support - ProgressWorker* worker = new ProgressWorker(callback, params, progress_callback, env); + ProgressWorker* worker = new ProgressWorker(callback, params, progress_callback, env, is_aborted); worker->Queue(); return env.Undefined(); } diff --git a/examples/addon.node/cancel-example.js b/examples/addon.node/cancel-example.js new file mode 100644 index 00000000000..2de382cb183 --- /dev/null +++ b/examples/addon.node/cancel-example.js @@ -0,0 +1,115 @@ +// Demonstrates cancelling an in-flight transcription via AbortSignal (params.signal). +// +// Usage: node cancel-example.js [--model=path/to/model.bin] + +const path = require("path"); +const os = require("os"); +const { promisify } = require("util"); + +const isWindows = os.platform() === "win32"; +const buildPath = isWindows ? "../../build/bin/Release/addon.node" : "../../build/Release/addon.node"; +const { whisper } = require(path.join(__dirname, buildPath)); + +const whisperAsync = promisify(whisper); + +const modelArg = process.argv.find((a) => a.startsWith("--model=")); +const model = modelArg + ? 
// Generates `seconds` worth of samples at 16000 samples per second: a quiet
// 440 Hz sine tone with a small amount of random noise on top. The clip is
// meant to be long enough that a transcription can be cancelled mid-flight.
function syntheticAudio(seconds) {
  const total = 16000 * seconds;
  const samples = new Float32Array(total);
  let idx = 0;
  while (idx < total) {
    const tone = 0.05 * Math.sin((2 * Math.PI * 440 * idx) / 16000);
    const noise = (Math.random() - 0.5) * 0.02;
    samples[idx] = tone + noise;
    idx += 1;
  }
  return samples;
}

// Test 1: start a transcription and abort it from the first progress callback.
// Passes when the result reports `cancelled === true` and progress never hit 100.
async function cancelMidFlight() {
  console.log("--- test 1: cancel mid-transcription ---");
  const controller = new AbortController();
  const seenProgress = [];

  // Abort on the first progress report; later reports only get recorded.
  const onProgress = (pct) => {
    seenProgress.push(pct);
    console.log(`progress: ${pct}%`);
    if (controller.signal.aborted) {
      return;
    }
    console.log(">>> calling abort()");
    controller.abort();
  };

  const startedAt = Date.now();
  const outcome = await whisperAsync({
    ...baseParams,
    fname_inp: "",
    pcmf32: syntheticAudio(600),
    signal: controller.signal,
    progress_callback: onProgress,
  });
  const tookMs = Date.now() - startedAt;

  console.log(`cancelled = ${outcome.cancelled}, segments = ${outcome.transcription.length}, elapsed = ${tookMs} ms`);
  if (outcome.cancelled !== true) {
    throw new Error("FAIL: expected cancelled === true");
  }
  if (seenProgress.includes(100)) {
    throw new Error("FAIL: transcription ran to completion, was not cancelled");
  }
  console.log("PASS\n");
}

// Test 2: pass a signal that is already aborted before the call is made.
// Passes when the result is cancelled and contains no segments.
async function preAbortedSignal() {
  console.log("--- test 2: already-aborted signal ---");
  const controller = new AbortController();
  controller.abort();

  const startedAt = Date.now();
  const outcome = await whisperAsync({
    ...baseParams,
    fname_inp: "",
    pcmf32: syntheticAudio(600),
    signal: controller.signal,
  });
  const tookMs = Date.now() - startedAt;

  console.log(`cancelled = ${outcome.cancelled}, segments = ${outcome.transcription.length}, elapsed = ${tookMs} ms`);
  if (outcome.cancelled !== true) {
    throw new Error("FAIL: expected cancelled === true");
  }
  if (outcome.transcription.length !== 0) {
    throw new Error("FAIL: expected no segments");
  }
  console.log("PASS\n");
}

// Test 3: regression check - a plain run without any signal must still
// transcribe the JFK sample and report `cancelled === false`.
async function normalRun() {
  console.log("--- test 3: normal run without signal (regression) ---");
  const startedAt = Date.now();
  const outcome = await whisperAsync({
    ...baseParams,
    fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
  });
  const tookMs = Date.now() - startedAt;

  // Each transcription row carries its text at index 2.
  const pieces = outcome.transcription.map((row) => row[2]);
  const joined = pieces.join(" ");

  console.log(`cancelled = ${outcome.cancelled}, segments = ${outcome.transcription.length}, elapsed = ${tookMs} ms`);
  console.log(`text: ${joined.trim()}`);
  if (outcome.cancelled !== false) {
    throw new Error("FAIL: expected cancelled === false");
  }
  if (!joined.toLowerCase().includes("ask not")) {
    throw new Error("FAIL: unexpected transcription");
  }
  console.log("PASS\n");
}
+ WHISPER_API int64_t whisper_full_get_token_t0 (struct whisper_context * ctx, int i_segment, int i_token); + WHISPER_API int64_t whisper_full_get_token_t0_from_state(struct whisper_state * state, int i_segment, int i_token); + WHISPER_API int64_t whisper_full_get_token_t1 (struct whisper_context * ctx, int i_segment, int i_token); + WHISPER_API int64_t whisper_full_get_token_t1_from_state(struct whisper_state * state, int i_segment, int i_token); + // Get the probability of the specified token in the specified segment WHISPER_API float whisper_full_get_token_p (struct whisper_context * ctx, int i_segment, int i_token); WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token); + // Access the speech segments detected by the internal VAD (only when params.vad = true). + // Times are on the ORIGINAL audio timeline, in centiseconds. The count is 0 when VAD was + // not used, letting callers reuse whisper's own speech boundaries instead of running a + // separate VAD pass. 
+ WHISPER_API int whisper_full_n_vad_segments (struct whisper_context * ctx); + WHISPER_API int whisper_full_n_vad_segments_from_state (struct whisper_state * state); + WHISPER_API int64_t whisper_full_get_vad_segment_t0 (struct whisper_context * ctx, int i); + WHISPER_API int64_t whisper_full_get_vad_segment_t0_from_state(struct whisper_state * state, int i); + WHISPER_API int64_t whisper_full_get_vad_segment_t1 (struct whisper_context * ctx, int i); + WHISPER_API int64_t whisper_full_get_vad_segment_t1_from_state(struct whisper_state * state, int i); + // // Voice Activity Detection (VAD) // diff --git a/src/whisper.cpp b/src/whisper.cpp index 5ffc70af00e..3e5b6243f1d 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -8075,6 +8075,102 @@ struct whisper_token_data whisper_full_get_token_data(struct whisper_context * c return ctx->state->result_all[i_segment].tokens[i_token]; } +// Map a token timestamp (centiseconds, in the VAD-processed timeline) back to the +// original audio timeline using the speech segments detected by the internal VAD. +// +// This is "segment-aware": a token that lands inside a real speech segment is +// interpolated linearly within that segment, while a token that lands in the +// artificial silence inserted *between* segments is snapped to the nearest real +// speech boundary. That keeps token times on actual speech and preserves the true +// inter-segment gaps, instead of smearing a token across a removed silence (which +// is what a single global linear interpolation over vad_mapping_table would do). +static int64_t whisper_map_token_time_segment_aware( + int64_t t, + const std::vector & segs) { + if (segs.empty()) { + return t; + } + if (t <= segs.front().vad_start) { + return segs.front().orig_start; + } + if (t >= segs.back().vad_end) { + return segs.back().orig_end; + } + for (size_t i = 0; i < segs.size(); ++i) { + const auto & s = segs[i]; + // Inside this speech segment -> linear interpolation onto the original span. 
+ if (t >= s.vad_start && t <= s.vad_end) { + const int64_t vd = s.vad_end - s.vad_start; + const int64_t od = s.orig_end - s.orig_start; + if (vd <= 0) { + return s.orig_start; + } + return s.orig_start + (t - s.vad_start) * od / vd; + } + // In the artificial silence between segment i and i+1 -> snap to the nearer + // real boundary so the token never sits in the middle of a removed silence. + if (i + 1 < segs.size() && t > s.vad_end && t < segs[i + 1].vad_start) { + const int64_t mid = (s.vad_end + segs[i + 1].vad_start) / 2; + return (t <= mid) ? s.orig_end : segs[i + 1].orig_start; + } + } + return t; +} + +int64_t whisper_full_get_token_t0_from_state(struct whisper_state * state, int i_segment, int i_token) { + const int64_t t0 = state->result_all[i_segment].tokens[i_token].t0; + if (!state->has_vad_segments || state->vad_segments.empty()) { + return t0; + } + return whisper_map_token_time_segment_aware(t0, state->vad_segments); +} + +int64_t whisper_full_get_token_t0(struct whisper_context * ctx, int i_segment, int i_token) { + return whisper_full_get_token_t0_from_state(ctx->state, i_segment, i_token); +} + +int64_t whisper_full_get_token_t1_from_state(struct whisper_state * state, int i_segment, int i_token) { + const int64_t t1 = state->result_all[i_segment].tokens[i_token].t1; + if (!state->has_vad_segments || state->vad_segments.empty()) { + return t1; + } + const int64_t orig_t0 = whisper_full_get_token_t0_from_state(state, i_segment, i_token); + int64_t orig_t1 = whisper_map_token_time_segment_aware(t1, state->vad_segments); + // Keep a strictly positive duration after snapping (timestamps are centiseconds). 
+ if (orig_t1 < orig_t0 + 1) { + orig_t1 = orig_t0 + 1; + } + return orig_t1; +} + +int64_t whisper_full_get_token_t1(struct whisper_context * ctx, int i_segment, int i_token) { + return whisper_full_get_token_t1_from_state(ctx->state, i_segment, i_token); +} + +int whisper_full_n_vad_segments_from_state(struct whisper_state * state) { + return (int) state->vad_segments.size(); +} + +int whisper_full_n_vad_segments(struct whisper_context * ctx) { + return (int) ctx->state->vad_segments.size(); +} + +int64_t whisper_full_get_vad_segment_t0_from_state(struct whisper_state * state, int i) { + return state->vad_segments[i].orig_start; +} + +int64_t whisper_full_get_vad_segment_t0(struct whisper_context * ctx, int i) { + return ctx->state->vad_segments[i].orig_start; +} + +int64_t whisper_full_get_vad_segment_t1_from_state(struct whisper_state * state, int i) { + return state->vad_segments[i].orig_end; +} + +int64_t whisper_full_get_vad_segment_t1(struct whisper_context * ctx, int i) { + return ctx->state->vad_segments[i].orig_end; +} + float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token) { return state->result_all[i_segment].tokens[i_token].p; } @@ -8397,24 +8493,75 @@ static int64_t sample_to_timestamp(int i_sample) { // a cost-function / heuristic that is high for text that takes longer to pronounce // obviously, can be improved +// Iterate over UTF-8 code points (not raw bytes): a CJK character is 3 bytes, +// so the old per-byte loop weighted every Han/Kana/Hangul glyph ~3x and never +// matched full-width CJK punctuation, skewing how segment time is distributed +// across tokens for Chinese/Japanese. Decode one code point at a time and give +// full-width punctuation the same pause weights as their ASCII counterparts. +// Pure-ASCII text decodes to identical weights as before (no regression). 
static float voice_length(const std::string & text) { float res = 0.0f; - for (char c : text) { - if (c == ' ') { - res += 0.01f; - } else if (c == ',') { - res += 2.00f; - } else if (c == '.') { - res += 3.00f; - } else if (c == '!') { - res += 3.00f; - } else if (c == '?') { - res += 3.00f; - } else if (c >= '0' && c <= '9') { - res += 3.00f; + const unsigned char * s = (const unsigned char *) text.data(); + const size_t n = text.size(); + + for (size_t i = 0; i < n; ) { + const unsigned char c = s[i]; + uint32_t cp = c; + int len = 1; + if (c < 0x80) { + len = 1; // 0xxxxxxx + } else if ((c >> 5) == 0x6) { + cp = c & 0x1F; len = 2; // 110xxxxx + } else if ((c >> 4) == 0xE) { + cp = c & 0x0F; len = 3; // 1110xxxx + } else if ((c >> 3) == 0x1E) { + cp = c & 0x07; len = 4; // 11110xxx } else { - res += 1.00f; + cp = c; len = 1; // stray continuation / invalid lead byte + } + if (i + (size_t) len <= n) { + bool ok = true; + for (int k = 1; k < len; ++k) { + const unsigned char cc = s[i + k]; + if ((cc & 0xC0) != 0x80) { ok = false; break; } // not 10xxxxxx + cp = (cp << 6) | (cc & 0x3F); + } + if (!ok) { cp = c; len = 1; } + } else { + cp = c; len = 1; + } + i += (size_t) len; + + switch (cp) { + case ' ': + case 0x3000: // IDEOGRAPHIC SPACE + res += 0.01f; + break; + case ',': + case 0xFF0C: // , FULLWIDTH COMMA + case 0x3001: // 、 IDEOGRAPHIC COMMA + case 0xFF1B: // ; FULLWIDTH SEMICOLON + case 0xFF1A: // : FULLWIDTH COLON + res += 2.00f; // short pause + break; + case '.': + case '!': + case '?': + case 0x3002: // 。 IDEOGRAPHIC FULL STOP + case 0xFF0E: // . FULLWIDTH FULL STOP + case 0xFF01: // ! FULLWIDTH EXCLAMATION MARK + case 0xFF1F: // ? FULLWIDTH QUESTION MARK + case 0x2026: // … HORIZONTAL ELLIPSIS + res += 3.00f; // sentence-final pause + break; + default: + if ((cp >= '0' && cp <= '9') || (cp >= 0xFF10 && cp <= 0xFF19)) { + res += 3.00f; // digits (half/full width) + } else { + res += 1.00f; // letters, CJK ideographs, kana, hangul, ... + } + break; } }