From 24a436d350f954bdbe41f788dd4e7d45ef6ebd2a Mon Sep 17 00:00:00 2001
From: Liz Fong-Jones <lizf@honeycomb.io>
Date: Fri, 3 Apr 2026 21:29:08 -0700
Subject: [PATCH 1/2] whisper : add --seg-len-hint to discourage progressively
 shorter segments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When processing long audio, whisper tends to produce progressively
shorter segments because timestamp tokens in the decoder prompt context
condition the model to insert more frequent segment breaks.

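Add a seg_len_hint parameter (in ms; 0, the default, turns it off) that
thins timestamp tokens in the rolling prompt context, keeping at most
one per seg_len_hint interval. The hint is converted to 20 ms timestamp
steps with integer division, so values below 20 also disable thinning.
This breaks the feedback loop while preserving text tokens for
continuity. The model can still break on natural boundaries (speaker
turns, pauses) — the hint only affects context conditioning, not the
actual segment creation.

Note: special non-timestamp tokens (eot, sot, etc.) are no longer copied
into the rolling context, regardless of the seg_len_hint value.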

Usage: --seg-len-hint 2000 (or -slh 2000) for ~2 second target segments

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 examples/cli/cli.cpp |  4 ++++
 include/whisper.h    |  1 +
 src/whisper.cpp      | 29 ++++++++++++++++++++++++++++-
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/examples/cli/cli.cpp b/examples/cli/cli.cpp
index 4e84c1b2750..6b50774def9 100644
--- a/examples/cli/cli.cpp
+++ b/examples/cli/cli.cpp
@@ -41,6 +41,7 @@ struct whisper_params {
     int32_t progress_step = 5;
     int32_t max_context   = -1;
     int32_t max_len       = 0;
+    int32_t seg_len_hint  = 0;
     int32_t best_of       = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
     int32_t beam_size     = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
     int32_t audio_ctx     = 0;
@@ -159,6 +160,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
         else if (arg == "-d"    || arg == "--duration")             { params.duration_ms     = std::stoi(ARGV_NEXT); }
         else if (arg == "-mc"   || arg == "--max-context")          { params.max_context     = std::stoi(ARGV_NEXT); }
         else if (arg == "-ml"   || arg == "--max-len")              { params.max_len         = std::stoi(ARGV_NEXT); }
+        else if (arg == "-slh"  || arg == "--seg-len-hint")         { params.seg_len_hint    = std::stoi(ARGV_NEXT); }
         else if (arg == "-bo"   || arg == "--best-of")              { params.best_of         = std::stoi(ARGV_NEXT); }
         else if (arg == "-bs"   || arg == "--beam-size")            { params.beam_size       = std::stoi(ARGV_NEXT); }
         else if (arg == "-ac"   || arg == "--audio-ctx")            { params.audio_ctx       = std::stoi(ARGV_NEXT); }
@@ -241,6 +243,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
     fprintf(stderr, "  -d  N,     --duration N           [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
     fprintf(stderr, "  -mc N,     --max-context N        [%-7d] maximum number of text context tokens to store\n", params.max_context);
     fprintf(stderr, "  -ml N,     --max-len N            [%-7d] maximum segment length in characters\n",           params.max_len);
+    fprintf(stderr, "  -slh N,    --seg-len-hint N       [%-7d] target segment length in ms\n",                    params.seg_len_hint);
     fprintf(stderr, "  -sow,      --split-on-word        [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
     fprintf(stderr, "  -bo N,     --best-of N            [%-7d] number of best candidates to keep\n",              params.best_of);
     fprintf(stderr, "  -bs N,     --beam-size N          [%-7d] beam size for beam search\n",                      params.beam_size);
@@ -1186,6 +1189,7 @@ int main(int argc, char ** argv) {
             wparams.thold_pt         = params.word_thold;
             wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
             wparams.split_on_word    = params.split_on_word;
+            wparams.seg_len_hint     = params.seg_len_hint;
             wparams.audio_ctx        = params.audio_ctx;
 
             wparams.debug_mode       = params.debug_mode;
diff --git a/include/whisper.h b/include/whisper.h
index f4cc6bf7abd..2dc76193adf 100644
--- a/include/whisper.h
+++ b/include/whisper.h
@@ -508,6 +508,7 @@ extern "C" {
         int   max_len;          // max segment length in characters
         bool  split_on_word;    // split on word rather than on token (when used with max_len)
         int   max_tokens;       // max tokens per segment (0 = no limit)
+        int   seg_len_hint;     // target segment length in ms — thins timestamps in context to discourage short segments (0 = off)
 
         // [EXPERIMENTAL] speed-up techniques
         // note: these can significantly reduce the quality of the output
diff --git a/src/whisper.cpp b/src/whisper.cpp
index 86bfafeaad8..6460bac3244 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -5927,6 +5927,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
         /*.max_len           =*/ 0,
         /*.split_on_word     =*/ false,
         /*.max_tokens        =*/ 0,
+        /*.seg_len_hint      =*/ 0,
 
         /*.debug_mode        =*/ false,
         /*.audio_ctx         =*/ 0,
@@ -6896,6 +6897,9 @@ int whisper_full_with_state(
     // calculate the maximum context budget for prompt history
     const int max_prompt_ctx = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
 
+    // track last timestamp kept in prompt context for seg_len_hint thinning
+    int last_prompt_ts = 0;
+
     // prepare prompt
     {
         std::vector<whisper_token> prompt_tokens;
@@ -7585,9 +7589,32 @@ int whisper_full_with_state(
             }
 
             // Add newly decoded tokens to the rolling context
+            // When seg_len_hint is set, thin out timestamp tokens in the context to prevent
+            // the model from conditioning on frequent segment breaks (which causes
+            // progressively shorter segments)
             if (!is_no_speech) {
+                const whisper_token token_beg = whisper_token_beg(ctx); // treated here as the first timestamp token
+                const whisper_token token_eot = whisper_token_eot(ctx);
+                // convert seg_len_hint from ms to 20ms timestamp steps (0 disables thinning)
+                const int min_timestamp_gap = params.seg_len_hint / 20;
+
                 for (int i = 0; i < result_len; ++i) {
-                    prompt_past1.push_back(tokens_cur[i].id);
+                    const whisper_token id = tokens_cur[i].id;
+                    if (id >= token_eot && id < token_beg) {
+                        // special non-timestamp token (eot, sot, etc.) — never copied into the context
+                        continue;
+                    }
+                    if (min_timestamp_gap > 0 && id >= token_beg) {
+                        // timestamp token: offsets restart in every window, so ts < last_prompt_ts marks a new window
+                        const int ts = id - token_beg;
+                        if (ts < last_prompt_ts || ts - last_prompt_ts >= min_timestamp_gap) {
+                            last_prompt_ts = ts;
+                            prompt_past1.push_back(id);
+                        }
+                        continue;
+                    }
+                    // regular text token (or timestamp when seg_len_hint=0) — always keep
+                    prompt_past1.push_back(id);
                 }
             }
 

From 4f2b6ff9eae3e6b0001dfd186a30202c30a503c2 Mon Sep 17 00:00:00 2001
From: Liz Fong-Jones <lizf@honeycomb.io>
Date: Fri, 10 Apr 2026 14:53:49 -0700
Subject: [PATCH 2/2] whisper-server : expose --seg-len-hint as CLI flag and
 POST form field

The initial --seg-len-hint commit wired the flag into whisper-cli but not
whisper-server. This commit mirrors the existing best_of / beam_size
pattern at server.cpp:221-222 (CLI) and :505-511 (POST form field), and
assigns the value to wparams.seg_len_hint during inference setup.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 examples/server/server.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f6a7a83181a..eacb8fff2f8 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -78,6 +78,7 @@ struct whisper_params {
     int32_t progress_step = 5;
     int32_t max_context   = -1;
     int32_t max_len       = 0;
+    int32_t seg_len_hint  = 0;
     int32_t best_of       = 2;
     int32_t beam_size     = -1;
     int32_t audio_ctx     = 0;
@@ -146,6 +147,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -d  N,     --duration N        [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
     fprintf(stderr, "  -mc N,     --max-context N     [%-7d] maximum number of text context tokens to store\n", params.max_context);
     fprintf(stderr, "  -ml N,     --max-len N         [%-7d] maximum segment length in characters\n",           params.max_len);
+    fprintf(stderr, "  -slh N,    --seg-len-hint N    [%-7d] target segment length in ms\n",                    params.seg_len_hint);
     fprintf(stderr, "  -sow,      --split-on-word     [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
     fprintf(stderr, "  -bo N,     --best-of N         [%-7d] number of best candidates to keep\n",              params.best_of);
     fprintf(stderr, "  -bs N,     --beam-size N       [%-7d] beam size for beam search\n",                      params.beam_size);
@@ -218,6 +220,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
         else if (arg == "-d"    || arg == "--duration")        { params.duration_ms     = std::stoi(argv[++i]); }
         else if (arg == "-mc"   || arg == "--max-context")     { params.max_context     = std::stoi(argv[++i]); }
         else if (arg == "-ml"   || arg == "--max-len")         { params.max_len         = std::stoi(argv[++i]); }
+        else if (arg == "-slh"  || arg == "--seg-len-hint")    { params.seg_len_hint    = std::stoi(argv[++i]); }
         else if (arg == "-bo"   || arg == "--best-of")         { params.best_of         = std::stoi(argv[++i]); }
         else if (arg == "-bs"   || arg == "--beam-size")       { params.beam_size       = std::stoi(argv[++i]); }
         else if (arg == "-ac"   || arg == "--audio-ctx")       { params.audio_ctx       = std::stoi(argv[++i]); }
@@ -502,6 +505,10 @@ void get_req_parameters(const Request & req, whisper_params & params)
     {
         params.max_len = std::stoi(req.get_file_value("max_len").content);
     }
+    if (req.has_file("seg_len_hint"))
+    {
+        params.seg_len_hint = std::stoi(req.get_file_value("seg_len_hint").content);
+    }
     if (req.has_file("best_of"))
     {
         params.best_of = std::stoi(req.get_file_value("best_of").content);
@@ -932,6 +939,8 @@ int main(int argc, char ** argv) {
             wparams.greedy.best_of        = params.best_of;
             wparams.beam_search.beam_size = params.beam_size;
 
+            wparams.seg_len_hint     = params.seg_len_hint;
+
             wparams.temperature      = params.temperature;
             wparams.no_speech_thold = params.no_speech_thold;
             wparams.temperature_inc  = params.temperature_inc;