From 24a436d350f954bdbe41f788dd4e7d45ef6ebd2a Mon Sep 17 00:00:00 2001 From: Liz Fong-Jones Date: Fri, 3 Apr 2026 21:29:08 -0700 Subject: [PATCH 1/2] whisper : add --seg-len-hint to discourage progressively shorter segments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When processing long audio, whisper tends to produce progressively shorter segments because timestamp tokens in the decoder prompt context condition the model to insert more frequent segment breaks. Add a seg_len_hint parameter (in ms) that thins timestamp tokens in the rolling prompt context, keeping at most one per seg_len_hint interval. This breaks the feedback loop while preserving text tokens for continuity. The model can still break on natural boundaries (speaker turns, pauses) — the hint only affects context conditioning, not the actual segment creation. Usage: --seg-len-hint 2000 (for ~2 second target segments) Co-Authored-By: Claude Opus 4.6 (1M context) --- examples/cli/cli.cpp | 4 ++++ include/whisper.h | 1 + src/whisper.cpp | 29 ++++++++++++++++++++++++++++- 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/examples/cli/cli.cpp b/examples/cli/cli.cpp index 4e84c1b2750..6b50774def9 100644 --- a/examples/cli/cli.cpp +++ b/examples/cli/cli.cpp @@ -41,6 +41,7 @@ struct whisper_params { int32_t progress_step = 5; int32_t max_context = -1; int32_t max_len = 0; + int32_t seg_len_hint = 0; int32_t best_of = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of; int32_t beam_size = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size; int32_t audio_ctx = 0; @@ -159,6 +160,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(ARGV_NEXT); } else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(ARGV_NEXT); } else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(ARGV_NEXT); } + else if (arg == "-slh" || arg == "--seg-len-hint") { params.seg_len_hint = std::stoi(ARGV_NEXT); } else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(ARGV_NEXT); } else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(ARGV_NEXT); } else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(ARGV_NEXT); } @@ -241,6 +243,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms); fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context); fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len); + fprintf(stderr, " -slh N, --seg-len-hint N [%-7d] target segment length in ms\n", params.seg_len_hint); fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false"); fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of); fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size); @@ -1186,6 +1189,7 @@ int main(int argc, char ** argv) { wparams.thold_pt = params.word_thold; wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len; wparams.split_on_word = params.split_on_word; + wparams.seg_len_hint = params.seg_len_hint; wparams.audio_ctx = params.audio_ctx; wparams.debug_mode = params.debug_mode; diff --git a/include/whisper.h b/include/whisper.h index f4cc6bf7abd..2dc76193adf 100644 --- a/include/whisper.h +++ b/include/whisper.h @@ -508,6 +508,7 @@ extern "C" { int max_len; // max segment length in characters bool split_on_word; // split on word rather than on token (when used with max_len) int max_tokens; // max tokens per segment (0 = no limit) + int seg_len_hint; // target segment length in ms — thins timestamps in context to discourage short segments (0 = off) // [EXPERIMENTAL] speed-up techniques // note: these can significantly reduce the quality of the output diff --git a/src/whisper.cpp b/src/whisper.cpp index 86bfafeaad8..6460bac3244 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -5927,6 +5927,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str /*.max_len =*/ 0, /*.split_on_word =*/ false, /*.max_tokens =*/ 0, + /*.seg_len_hint =*/ 0, /*.debug_mode =*/ false, /*.audio_ctx =*/ 0, @@ -6896,6 +6897,9 @@ int whisper_full_with_state( // calculate the maximum context budget for prompt history const int max_prompt_ctx = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2); + // track last timestamp kept in prompt context for seg_len_hint thinning + int last_prompt_ts = 0; + // prepare prompt { std::vector prompt_tokens; @@ -7585,9 +7589,32 @@ int whisper_full_with_state( } // Add newly decoded tokens to the rolling context + // When seg_len_hint is set, thin out timestamp tokens in the context to prevent + // the model from conditioning on frequent segment breaks (which causes + // progressively shorter segments) if (!is_no_speech) { + const whisper_token token_beg = whisper_token_beg(ctx); + const whisper_token token_eot = whisper_token_eot(ctx); + // convert seg_len_hint from ms to 20ms timestamp steps + const int min_timestamp_gap = params.seg_len_hint / 20; + for (int i = 0; i < result_len; ++i) { - prompt_past1.push_back(tokens_cur[i].id); + const whisper_token id = tokens_cur[i].id; + if (id >= token_eot && id <= token_beg) { + // special non-timestamp token (eot, sot, etc.) — skip + continue; + } + if (min_timestamp_gap > 0 && id > token_beg) { + // timestamp token — only keep if enough time since last one + const int ts = id - token_beg; + if (ts - last_prompt_ts >= min_timestamp_gap) { + last_prompt_ts = ts; + prompt_past1.push_back(id); + } + continue; + } + // regular text token (or timestamp when seg_len_hint=0) — always keep + prompt_past1.push_back(id); } } From 4f2b6ff9eae3e6b0001dfd186a30202c30a503c2 Mon Sep 17 00:00:00 2001 From: Liz Fong-Jones Date: Fri, 10 Apr 2026 14:53:49 -0700 Subject: [PATCH 2/2] whisper-server : expose --seg-len-hint as CLI flag and POST form field The initial --seg-len-hint commit wired the flag into whisper-cli but not whisper-server. Mirrors the existing best_of / beam_size pattern at server.cpp:221-222 (CLI) and :505-511 (POST form field) and assigns the value to wparams.seg_len_hint during inference setup. Co-Authored-By: Claude Opus 4.6 (1M context) --- examples/server/server.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f6a7a83181a..eacb8fff2f8 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -78,6 +78,7 @@ struct whisper_params { int32_t progress_step = 5; int32_t max_context = -1; int32_t max_len = 0; + int32_t seg_len_hint = 0; int32_t best_of = 2; int32_t beam_size = -1; int32_t audio_ctx = 0; @@ -146,6 +147,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms); fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context); fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len); + fprintf(stderr, " -slh N, --seg-len-hint N [%-7d] target segment length in ms\n", params.seg_len_hint); fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false"); fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of); fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size); @@ -218,6 +220,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); } else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); } else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); } + else if (arg == "-slh" || arg == "--seg-len-hint") { params.seg_len_hint = std::stoi(argv[++i]); } else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); } else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); } else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); } @@ -502,6 +505,10 @@ void get_req_parameters(const Request & req, whisper_params & params) { params.max_len = std::stoi(req.get_file_value("max_len").content); } + if (req.has_file("seg_len_hint")) + { + params.seg_len_hint = std::stoi(req.get_file_value("seg_len_hint").content); + } if (req.has_file("best_of")) { params.best_of = std::stoi(req.get_file_value("best_of").content); @@ -932,6 +939,8 @@ int main(int argc, char ** argv) { wparams.greedy.best_of = params.best_of; wparams.beam_search.beam_size = params.beam_size; + wparams.seg_len_hint = params.seg_len_hint; + wparams.temperature = params.temperature; wparams.no_speech_thold = params.no_speech_thold; wparams.temperature_inc = params.temperature_inc;