# Stable Timestamps - How whisper.cpp Works (Relevant Internals)

## Codebase Structure

- `include/whisper.h` (741 lines) -- Public C API
- `src/whisper.cpp` (9016 lines) -- Entire implementation in one file
- `src/whisper-arch.h` -- Tensor name maps (encoder/decoder/VAD)
- `ggml/` -- Tensor library backend
- `examples/cli/cli.cpp` -- Main CLI

## Key Data Structures (all in `src/whisper.cpp`)

### Token Data (`whisper_token_data`, whisper.h:131)
```c
whisper_token id;   // token id
whisper_token tid;  // forced timestamp token id
float   p;          // probability of the token
float   plog;       // log probability of the token
float   pt;         // probability of the timestamp token
float   ptsum;      // sum of probabilities of all timestamp tokens
int64_t t0, t1;     // start/end time of the token (centiseconds)
int64_t t_dtw;      // DTW-based timestamp
float   vlen;       // voice length of the token
```

### Segment (`whisper_segment`, line 460)
```c
t0, t1, text, no_speech_prob, tokens (vector<whisper_token_data>)
```

### State (`whisper_state`, line 834)
Holds: `mel`, `kv_self`/`kv_cross`, `decoders[8]`, `result_all` (segments), `energy` (PCM signal energy), `aheads_masks`, `aheads_cross_QKs`, `vad_context`/`vad_segments`/`vad_mapping`

## Decoding Pipeline

Entry: **`whisper_full_with_state()`** at line 6805

1. **PCM -> mel** (line 6818): `whisper_pcm_to_mel_with_state()` -- FFT + mel filterbank, 80 bands, hop = 160 samples (10 ms/frame)
2. **Signal energy** (line 6847): `get_signal_energy(samples, n_samples, 32)` -- smoothed absolute amplitude, used for token timestamps
3. **Main loop** (line 7012): `while (true)` over 30 s chunks, advancing `seek`
4. **Encoder** (line 7033): `whisper_encode_internal()` -- conv + encoder + cross-attention KV cache
5. **Prompt setup** (lines 7098-7157): `[<prev>] + past + [<sot>] + [<lang>] + [<transcribe>]`
6. **Token-by-token decoding** (line 7197): `for (i = 0; i < n_max; ++i)` where `n_max = n_text_ctx/2 - 4`
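
The signal energy in step 2 is just a smoothed absolute amplitude. A minimal standalone sketch of the idea (hypothetical reimplementation, not the exact code from `get_signal_energy()`; the half-window of 32 samples matches the call at line 6847):

```c
#include <math.h>
#include <stdlib.h>

// For each sample, average |x| over a centered window of hw samples on each
// side. Edges use a shorter window, mirroring the smoothing idea used for
// token timestamps.
static float *signal_energy(const float *samples, int n, int hw) {
    float *energy = malloc(n * sizeof(float));
    for (int i = 0; i < n; i++) {
        float sum = 0.0f;
        int cnt = 0;
        for (int j = -hw; j <= hw; j++) {
            if (i + j >= 0 && i + j < n) {
                sum += fabsf(samples[i + j]);
                cnt++;
            }
        }
        energy[i] = sum / cnt;
    }
    return energy;
}
```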
| 37 | + |
| 38 | +### Logit Processing -- `whisper_process_logits()` at line 6155 |
| 39 | + |
| 40 | +This is WHERE ALL LOGIT FILTERING HAPPENS: |
| 41 | + |
| 42 | +- **Line 6232**: `logits_filter_callback` -- user-supplied callback (external injection point) |
| 43 | +- **Line 6268-6308**: Timestamp pairing constraints (must come in pairs, must increase) |
| 44 | +- **Line 6291-6298**: `max_initial_ts` constraint -- limits first timestamp to <= 1.0s |
| 45 | + - **stable-ts removes this** by setting it to `None` |
| 46 | + - whisper.cpp param: `params.max_initial_ts` (default 1.0f, line 5950) |
| 47 | +- **Line 6300-6308**: Increasing timestamp enforcement via `decoder.seek_delta/2` |
| 48 | +- **Line 6314-6365**: Force timestamp when `sum(ts_probs) > max(text_probs)` |
| 49 | + |
| 50 | +**INJECTION POINT for constrained decoding:** Between lines 6300-6308 (after increasing-ts check), add `logits[token_beg + t] = -INFINITY` for silent positions. Or use the existing `logits_filter_callback` externally. |
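
A sketch of what that injection would look like as a standalone helper (hypothetical function and parameter names; assumes timestamp tokens are contiguous starting at `token_beg`, one per 20 ms position):

```c
#include <math.h>

// Suppress timestamp tokens that fall inside silence. `silence` holds one
// entry per timestamp position (1 = silent); n_ts is the number of
// timestamp tokens. Not an existing whisper.cpp function -- this is the
// proposed addition.
static void mask_silent_timestamps(float *logits, int token_beg,
                                   const int *silence, int n_ts) {
    for (int t = 0; t < n_ts; t++) {
        if (silence[t]) {
            logits[token_beg + t] = -INFINITY;
        }
    }
}
```

The same loop could live in a `logits_filter_callback` instead, which avoids patching whisper.cpp at all.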
| 51 | + |
| 52 | +### Sampling -- `whisper_sample_token()` at line 6438 |
| 53 | +Greedy: argmax. Also computes `tid` (best timestamp), `pt` (timestamp prob), `ptsum` (sum timestamp probs). |
| 54 | + |
| 55 | +## Word-Level Timestamps |
| 56 | + |
| 57 | +### Method 1: Non-DTW (simpler, existing) |
| 58 | + |
| 59 | +**`whisper_exp_compute_token_level_timestamps()`** at line 8433 |
| 60 | + |
| 61 | +1. Uses `state.energy` (smoothed PCM amplitude) |
| 62 | +2. Confident timestamps from `token.tid` when `pt > thold_pt && ptsum > thold_ptsum` |
| 63 | +3. Fills gaps by proportional splitting based on `vlen` |
| 64 | +4. **Energy-based refinement** (lines 8563-8631): Expands/contracts token boundaries using signal energy. This is a PRIMITIVE form of silence snapping already present -- but crude. |
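
Step 3's proportional split can be sketched as follows (hypothetical standalone helper; times are in centiseconds as elsewhere in the API):

```c
#include <stdint.h>

// Distribute an anchored span [t0, t1] across n tokens in proportion to
// each token's voice length vlen[i]. The last token absorbs any rounding
// remainder so the spans exactly tile [t0, t1].
static void fill_gap(int64_t t0, int64_t t1, const float *vlen,
                     int n, int64_t *tok_t0, int64_t *tok_t1) {
    float total = 0.0f;
    for (int i = 0; i < n; i++) total += vlen[i];
    int64_t t = t0;
    for (int i = 0; i < n; i++) {
        int64_t dt = (int64_t)((t1 - t0) * (vlen[i] / total));
        tok_t0[i] = t;
        tok_t1[i] = (i == n - 1) ? t1 : t + dt;
        t = tok_t1[i];
    }
}
```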
| 65 | + |
| 66 | +### Method 2: DTW (experimental, more accurate) |
| 67 | + |
| 68 | +**`whisper_exp_compute_token_level_timestamps_dtw()`** at line 8815 |
| 69 | + |
| 70 | +1. Build token sequence: `[sot] + [lang] + [no_timestamps] + all_text_tokens + [eot]` |
| 71 | +2. Full decoder pass with `save_alignment_heads_QKs=true` |
| 72 | +3. Copy cross-attention QKs to CPU: shape `[n_tokens, n_audio_tokens, n_heads]` |
| 73 | +4. Normalize (line 8907) |
| 74 | +5. Median filter width 7 over audio dimension (line 8914) |
| 75 | +6. **Mean across heads** (line 8919) -- all selected heads weighted equally |
| 76 | +7. Scale by -1 (line 8920) |
| 77 | +8. Standard DTW + backtrace via `dtw_and_backtrace()` (line 8690) |
| 78 | +9. Assign timestamps from DTW path (lines 8940-8963) |
| 79 | + |
| 80 | +**IMPORTANT:** DTW does NOT work with `flash_attn=true` (line 3708-3710) because flash attention doesn't expose intermediate attention weights. |
| 81 | + |
| 82 | +Called at lines 7725-7728 after all segments created for a 30s window. |
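
Step 8 is textbook DTW. A simplified standalone version of `dtw_and_backtrace()` (sketch, not the exact whisper.cpp code) over a negated-similarity cost matrix:

```c
#include <float.h>
#include <stdlib.h>

// Dynamic programming over cost matrix x[N][M] (row-major), then walk the
// cheapest path back from (N-1, M-1). Allowed moves: diagonal, up, left.
// path_i/path_j must hold up to N+M entries; returns the path length.
// The path comes out in reverse order (end -> start).
static int dtw(const float *x, int N, int M, int *path_i, int *path_j) {
    float *D = malloc((N + 1) * (M + 1) * sizeof(float));
    for (int i = 0; i <= N; i++)
        for (int j = 0; j <= M; j++)
            D[i * (M + 1) + j] = (i == 0 && j == 0) ? 0.0f : FLT_MAX;
    for (int i = 1; i <= N; i++) {
        for (int j = 1; j <= M; j++) {
            float best = D[(i - 1) * (M + 1) + (j - 1)];
            if (D[(i - 1) * (M + 1) + j] < best) best = D[(i - 1) * (M + 1) + j];
            if (D[i * (M + 1) + (j - 1)] < best) best = D[i * (M + 1) + (j - 1)];
            D[i * (M + 1) + j] = x[(i - 1) * M + (j - 1)] + best;
        }
    }
    int i = N, j = M, len = 0;
    while (i > 0 && j > 0) {
        path_i[len] = i - 1; path_j[len] = j - 1; len++;
        float d = D[(i - 1) * (M + 1) + (j - 1)];
        float u = D[(i - 1) * (M + 1) + j];
        float l = D[i * (M + 1) + (j - 1)];
        if (d <= u && d <= l) { i--; j--; }
        else if (u <= l)      { i--; }
        else                  { j--; }
    }
    free(D);
    return len;
}
```

In the real pipeline, N is the token count, M is the audio-token count, and x is the negated, median-filtered, head-averaged attention matrix from steps 4-7.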
| 83 | + |
| 84 | +### Alignment Heads -- Hardcoded (lines 384-409) |
| 85 | + |
| 86 | +```c |
| 87 | +static const whisper_ahead g_aheads_large_v3[] = { |
| 88 | + {7,0}, {10,17}, {12,18}, {13,12}, {16,1}, {17,14}, {19,11}, {21,4}, {24,1}, {25,6} |
| 89 | +}; |
| 90 | +static const whisper_ahead g_aheads_large_v3_turbo[] = { |
| 91 | + {2,4}, {2,11}, {3,3}, {3,6}, {3,11}, {3,14} |
| 92 | +}; |
| 93 | +``` |
| 94 | + |
| 95 | +Selected via `get_alignment_heads_by_layer()` (line 8666). Modes: preset-specific, N-top-most layers, or custom user-provided heads. |
| 96 | + |
| 97 | +Masks built in `aheads_masks_init()` (line 1160), used during decoder graph construction at lines 2720-2734 in the cross-attention block. |
| 98 | + |
| 99 | +### WHERE TO ADD IMPROVEMENTS: |
| 100 | + |
| 101 | +**Gap padding:** In DTW function at line 8843-8860 when building token sequence. Insert `" ..."` tokens after `no_timestamps` but before text tokens. Adjust `sot_sequence_length`. |
| 102 | + |
| 103 | +**Dynamic head selection:** At line 8919 (currently takes mean). Instead: score each head for monotonicity, select top-k, then average only those. Would need to expose all heads first (currently only preset heads captured). |
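
One plausible monotonicity score (a sketch of the proposed idea, not anything existing in whisper.cpp): the fraction of tokens whose attention argmax over the audio axis does not move backwards.

```c
// qk is an n_tokens x n_audio attention matrix (row-major) for one head.
// Returns the fraction of tokens whose argmax audio position is >= the
// previous token's, i.e. 1.0 for a perfectly monotonic head.
static float monotonicity(const float *qk, int n_tokens, int n_audio) {
    int prev = -1, good = 0;
    for (int t = 0; t < n_tokens; t++) {
        int arg = 0;
        for (int a = 1; a < n_audio; a++)
            if (qk[t * n_audio + a] > qk[t * n_audio + arg]) arg = a;
        if (arg >= prev) good++;
        prev = arg;
    }
    return (float)good / n_tokens;
}
```

Heads scoring above some cutoff (or the top-k) would then replace the unconditional mean at line 8919.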
| 104 | + |
| 105 | +## VAD Support (Already Exists!) |
| 106 | + |
| 107 | +whisper.cpp has full Silero-style neural VAD: |
| 108 | + |
| 109 | +- **`whisper_vad()`** at line 6621 -- called from `whisper_full()` when `params.vad == true` |
| 110 | +- Strips silence, concatenates speech segments with overlap |
| 111 | +- Builds `vad_mapping_table` to remap timestamps back to original audio |
| 112 | +- **Per-frame speech probabilities** available via `whisper_vad_probs()` API |
| 113 | +- Params: `threshold`, `min_speech_duration_ms`, `min_silence_duration_ms`, etc. |
| 114 | + |
| 115 | +This is relevant because: we could use the existing VAD probabilities as input for the silence mask instead of building our own loudness-based detector (or offer both options like stable-ts). |
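
Turning per-frame speech probabilities into the silence mask could look like this (hypothetical helper; assumes one probability per 10 ms frame, and mimics the `min_silence_duration_ms`-style hysteresis so short dips inside words are not marked silent):

```c
// Threshold per-frame speech probabilities into a binary silence mask,
// then erase silence runs shorter than min_silence_frames.
static void silence_mask(const float *probs, int n, float threshold,
                         int min_silence_frames, int *silent) {
    for (int i = 0; i < n; i++) silent[i] = probs[i] < threshold;
    int run = 0;
    for (int i = 0; i <= n; i++) {
        if (i < n && silent[i]) { run++; continue; }
        if (run > 0 && run < min_silence_frames)
            for (int j = i - run; j < i; j++) silent[j] = 0; // too short: keep as speech
        run = 0;
    }
}
```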
| 116 | + |
| 117 | +## Segment Creation & Output |
| 118 | + |
| 119 | +### How Segments Are Created (lines 7616-7718) |
| 120 | +1. Scan tokens for timestamp tokens (`id > whisper_token_beg()`) |
| 121 | +2. Text between timestamps -> segment with `t0`, `t1`, text, tokens |
| 122 | +3. Pushed to `result_all` |
| 123 | +4. If `token_timestamps == true`: per-segment token timestamps computed |
| 124 | +5. If DTW enabled: DTW timestamps computed per-window after all segments |
| 125 | + |
| 126 | +### WHERE TO HOOK POST-HOC SNAPPING: |
| 127 | + |
| 128 | +**Option A -- Internal:** After DTW (line 7735) or after non-DTW token timestamps (lines 7663/7708), iterate all segments and snap word boundaries to speech edges using silence mask. |
| 129 | + |
| 130 | +**Option B -- End of pipeline:** Before `whisper_full_with_state()` returns (line 7753), as a final pass over all `result_all`. |
| 131 | + |
| 132 | +**Option C -- New public API:** `whisper_snap_timestamps(ctx, state)` that callers invoke after `whisper_full()`. Cleanest, non-invasive. |
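
The core of any of the three options is the same per-word operation. A minimal sketch (hypothetical helper, not an existing API), exploiting the fact that 1 centisecond of timestamp equals one 10 ms mask frame:

```c
#include <stdint.h>

// Snap a word's [t0, t1] (centiseconds) inward to the nearest speech
// frames, given a silence mask with one entry per 10 ms frame. If the whole
// span is silent, the original boundaries are kept.
static void snap_to_speech(int64_t *t0, int64_t *t1,
                           const int *silent, int n_frames) {
    int f0 = (int)(*t0), f1 = (int)(*t1);   // 1 cs == 1 frame (10 ms)
    if (f1 > n_frames) f1 = n_frames;
    while (f0 < f1 && silent[f0])     f0++;  // trim leading silence
    while (f1 > f0 && silent[f1 - 1]) f1--;  // trim trailing silence
    if (f0 < f1) { *t0 = f0; *t1 = f1; }
}
```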
| 133 | + |
| 134 | +## Existing Energy-Based "Snapping" (Primitive) |
| 135 | + |
| 136 | +Lines 8563-8631 in `whisper_exp_compute_token_level_timestamps()`: |
| 137 | +- Computes energy sum in token's time range |
| 138 | +- Expands/contracts boundaries based on energy threshold |
| 139 | +- Already exists but is crude compared to stable-ts |
| 140 | + |
| 141 | +## Key Constants |
| 142 | + |
| 143 | +| Constant | Value | Meaning | |
| 144 | +|----------|-------|---------| |
| 145 | +| `WHISPER_SAMPLE_RATE` | 16000 | Hz | |
| 146 | +| `WHISPER_HOP_LENGTH` | 160 | samples per mel frame = 10ms | |
| 147 | +| `WHISPER_CHUNK_SIZE` | 30 | seconds per chunk | |
| 148 | +| `WHISPER_N_FFT` | 400 | FFT window size | |
| 149 | +| Audio token resolution | 320 samples = 20ms | Each audio ctx position | |
| 150 | +| Timestamp token resolution | 20ms | Each increment of timestamp token | |
| 151 | +| `n_audio_ctx` | 1500 | Audio tokens per 30s chunk | |
| 152 | +| `n_text_ctx` | 448 | Max text tokens | |
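
The unit conversions implied by the table, as helpers (hypothetical names, just encoding the arithmetic above): API timestamps are centiseconds (1 cs = 10 ms = one mel frame = 160 samples at 16 kHz), while audio-context positions and timestamp tokens are 20 ms each.

```c
#include <stdint.h>

static int64_t cs_to_samples(int64_t cs)   { return cs * 160; } // 10 ms @ 16 kHz
static int64_t cs_to_mel_frame(int64_t cs) { return cs;       } // both are 10 ms
static int64_t cs_to_audio_tok(int64_t cs) { return cs / 2;   } // 20 ms each
static int64_t ts_token_to_cs(int64_t t)   { return t * 2;    } // 20 ms each
```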
| 153 | + |
| 154 | +## Public API Surface (relevant) |
| 155 | + |
| 156 | +```c |
| 157 | +// After transcription: |
| 158 | +whisper_full_n_segments(ctx) |
| 159 | +whisper_full_get_segment_t0/t1(ctx, i) // centiseconds (1 = 10ms) |
| 160 | +whisper_full_get_segment_text(ctx, i) |
| 161 | +whisper_full_n_tokens(ctx, i) |
| 162 | +whisper_full_get_token_data(ctx, i, j) // -> whisper_token_data |
| 163 | +whisper_full_get_segment_no_speech_prob(ctx, i) |
| 164 | + |
| 165 | +// Params: |
| 166 | +params.token_timestamps // enable non-DTW word timestamps |
| 167 | +params.max_initial_ts // default 1.0s (stable-ts sets to 0) |
| 168 | +params.logits_filter_callback // can inject custom logit filters externally |
| 169 | +ctx_params.dtw_token_timestamps // enable DTW mode |
| 170 | +ctx_params.dtw_aheads_preset // which alignment heads |
| 171 | +params.vad // enable built-in VAD |
| 172 | +``` |