ggml-org · lizthegrey · Apr 4, 2026 · Apr 10, 2026
diff --git a/examples/cli/cli.cpp b/examples/cli/cli.cpp
@@ -41,6 +41,7 @@ struct whisper_params {
     int32_t progress_step = 5;
     int32_t max_context   = -1;
     int32_t max_len       = 0;
+    int32_t seg_len_hint  = 0;
     int32_t best_of       = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
     int32_t beam_size     = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
     int32_t audio_ctx     = 0;
@@ -159,6 +160,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
         else if (arg == "-d"    || arg == "--duration")             { params.duration_ms     = std::stoi(ARGV_NEXT); }
         else if (arg == "-mc"   || arg == "--max-context")          { params.max_context     = std::stoi(ARGV_NEXT); }
         else if (arg == "-ml"   || arg == "--max-len")              { params.max_len         = std::stoi(ARGV_NEXT); }
+        else if (arg == "-slh"  || arg == "--seg-len-hint")         { params.seg_len_hint    = std::stoi(ARGV_NEXT); }
         else if (arg == "-bo"   || arg == "--best-of")              { params.best_of         = std::stoi(ARGV_NEXT); }
         else if (arg == "-bs"   || arg == "--beam-size")            { params.beam_size       = std::stoi(ARGV_NEXT); }
         else if (arg == "-ac"   || arg == "--audio-ctx")            { params.audio_ctx       = std::stoi(ARGV_NEXT); }
@@ -241,6 +243,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
     fprintf(stderr, "  -d  N,     --duration N           [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
     fprintf(stderr, "  -mc N,     --max-context N        [%-7d] maximum number of text context tokens to store\n", params.max_context);
     fprintf(stderr, "  -ml N,     --max-len N            [%-7d] maximum segment length in characters\n",           params.max_len);
+    fprintf(stderr, "  -slh N,    --seg-len-hint N       [%-7d] target segment length in ms\n",                    params.seg_len_hint);
     fprintf(stderr, "  -sow,      --split-on-word        [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
     fprintf(stderr, "  -bo N,     --best-of N            [%-7d] number of best candidates to keep\n",              params.best_of);
     fprintf(stderr, "  -bs N,     --beam-size N          [%-7d] beam size for beam search\n",                      params.beam_size);
@@ -1186,6 +1189,7 @@ int main(int argc, char ** argv) {
             wparams.thold_pt         = params.word_thold;
             wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
             wparams.split_on_word    = params.split_on_word;
+            wparams.seg_len_hint     = params.seg_len_hint;
             wparams.audio_ctx        = params.audio_ctx;
 
             wparams.debug_mode       = params.debug_mode;

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -78,6 +78,7 @@ struct whisper_params {
     int32_t progress_step = 5;
     int32_t max_context   = -1;
     int32_t max_len       = 0;
+    int32_t seg_len_hint  = 0;
     int32_t best_of       = 2;
     int32_t beam_size     = -1;
     int32_t audio_ctx     = 0;
@@ -146,6 +147,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -d  N,     --duration N        [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
     fprintf(stderr, "  -mc N,     --max-context N     [%-7d] maximum number of text context tokens to store\n", params.max_context);
     fprintf(stderr, "  -ml N,     --max-len N         [%-7d] maximum segment length in characters\n",           params.max_len);
+    fprintf(stderr, "  -slh N,    --seg-len-hint N    [%-7d] target segment length in ms\n",                    params.seg_len_hint);
     fprintf(stderr, "  -sow,      --split-on-word     [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
     fprintf(stderr, "  -bo N,     --best-of N         [%-7d] number of best candidates to keep\n",              params.best_of);
     fprintf(stderr, "  -bs N,     --beam-size N       [%-7d] beam size for beam search\n",                      params.beam_size);
@@ -218,6 +220,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
         else if (arg == "-d"    || arg == "--duration")        { params.duration_ms     = std::stoi(argv[++i]); }
         else if (arg == "-mc"   || arg == "--max-context")     { params.max_context     = std::stoi(argv[++i]); }
         else if (arg == "-ml"   || arg == "--max-len")         { params.max_len         = std::stoi(argv[++i]); }
+        else if (arg == "-slh"  || arg == "--seg-len-hint")    { params.seg_len_hint    = std::stoi(argv[++i]); }
         else if (arg == "-bo"   || arg == "--best-of")         { params.best_of         = std::stoi(argv[++i]); }
         else if (arg == "-bs"   || arg == "--beam-size")       { params.beam_size       = std::stoi(argv[++i]); }
         else if (arg == "-ac"   || arg == "--audio-ctx")       { params.audio_ctx       = std::stoi(argv[++i]); }
@@ -502,6 +505,10 @@ void get_req_parameters(const Request & req, whisper_params & params)
     {
         params.max_len = std::stoi(req.get_file_value("max_len").content);
     }
+    if (req.has_file("seg_len_hint"))
+    {
+        params.seg_len_hint = std::stoi(req.get_file_value("seg_len_hint").content);
+    }
     if (req.has_file("best_of"))
     {
         params.best_of = std::stoi(req.get_file_value("best_of").content);
@@ -932,6 +939,8 @@ int main(int argc, char ** argv) {
             wparams.greedy.best_of        = params.best_of;
             wparams.beam_search.beam_size = params.beam_size;
 
+            wparams.seg_len_hint     = params.seg_len_hint;
+
             wparams.temperature      = params.temperature;
             wparams.no_speech_thold = params.no_speech_thold;
             wparams.temperature_inc  = params.temperature_inc;

diff --git a/include/whisper.h b/include/whisper.h
@@ -508,6 +508,7 @@ extern "C" {
         int   max_len;          // max segment length in characters
         bool  split_on_word;    // split on word rather than on token (when used with max_len)
         int   max_tokens;       // max tokens per segment (0 = no limit)
+        int   seg_len_hint;     // target segment length in ms — thins timestamps in context to discourage short segments (0 = off)
 
         // [EXPERIMENTAL] speed-up techniques
         // note: these can significantly reduce the quality of the output

diff --git a/src/whisper.cpp b/src/whisper.cpp
@@ -5927,6 +5927,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
         /*.max_len           =*/ 0,
         /*.split_on_word     =*/ false,
         /*.max_tokens        =*/ 0,
+        /*.seg_len_hint      =*/ 0,
 
         /*.debug_mode        =*/ false,
         /*.audio_ctx         =*/ 0,
@@ -6896,6 +6897,9 @@ int whisper_full_with_state(
     // calculate the maximum context budget for prompt history
     const int max_prompt_ctx = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);
 
+    // last timestamp kept in the prompt context, stored as an offset from token_beg (20 ms steps); used for seg_len_hint thinning
+    int last_prompt_ts = 0;
+
     // prepare prompt
     {
         std::vector<whisper_token> prompt_tokens;
@@ -7585,9 +7589,32 @@ int whisper_full_with_state(
             }
 
             // Add newly decoded tokens to the rolling context
+            // When seg_len_hint is set, thin out timestamp tokens in the context to prevent
+            // the model from conditioning on frequent segment breaks (which causes
+            // progressively shorter segments)
             if (!is_no_speech) {
+                const whisper_token token_beg = whisper_token_beg(ctx);
+                const whisper_token token_eot = whisper_token_eot(ctx);
+                // convert seg_len_hint from ms to 20 ms timestamp steps (integer division: hints below 20 ms give a gap <= 0, which disables thinning)
+                const int min_timestamp_gap = params.seg_len_hint / 20;
+
                 for (int i = 0; i < result_len; ++i) {
-                    prompt_past1.push_back(tokens_cur[i].id);
+                    const whisper_token id = tokens_cur[i].id;
+                    if (id >= token_eot && id <= token_beg) {
+                        // special token in [token_eot, token_beg], bounds inclusive (eot, sot, etc.) — skipped even when seg_len_hint = 0
+                        continue;
+                    }
+                    if (min_timestamp_gap > 0 && id > token_beg) {
+                        // timestamp token — only keep if enough time since last one
+                        const int ts = id - token_beg;
+                        if (ts - last_prompt_ts >= min_timestamp_gap) {
+                            last_prompt_ts = ts;
+                            prompt_past1.push_back(id);
+                        }
+                        continue;
+                    }
+                    // regular text token (or timestamp when seg_len_hint=0) — always keep
+                    prompt_past1.push_back(id);
                 }
             }