Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions examples/cli/cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ struct whisper_params {
int32_t progress_step = 5;
int32_t max_context = -1;
int32_t max_len = 0;
int32_t seg_len_hint = 0;
int32_t best_of = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
int32_t beam_size = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
int32_t audio_ctx = 0;
Expand Down Expand Up @@ -159,6 +160,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(ARGV_NEXT); }
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(ARGV_NEXT); }
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(ARGV_NEXT); }
else if (arg == "-slh" || arg == "--seg-len-hint") { params.seg_len_hint = std::stoi(ARGV_NEXT); }
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(ARGV_NEXT); }
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(ARGV_NEXT); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(ARGV_NEXT); }
Expand Down Expand Up @@ -241,6 +243,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
fprintf(stderr, " -slh N, --seg-len-hint N [%-7d] target segment length in ms\n", params.seg_len_hint);
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
Expand Down Expand Up @@ -1186,6 +1189,7 @@ int main(int argc, char ** argv) {
wparams.thold_pt = params.word_thold;
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
wparams.split_on_word = params.split_on_word;
wparams.seg_len_hint = params.seg_len_hint;
wparams.audio_ctx = params.audio_ctx;

wparams.debug_mode = params.debug_mode;
Expand Down
9 changes: 9 additions & 0 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ struct whisper_params {
int32_t progress_step = 5;
int32_t max_context = -1;
int32_t max_len = 0;
int32_t seg_len_hint = 0;
int32_t best_of = 2;
int32_t beam_size = -1;
int32_t audio_ctx = 0;
Expand Down Expand Up @@ -146,6 +147,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
fprintf(stderr, " -slh N, --seg-len-hint N [%-7d] target segment length in ms\n", params.seg_len_hint);
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
Expand Down Expand Up @@ -218,6 +220,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); }
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
else if (arg == "-slh" || arg == "--seg-len-hint") { params.seg_len_hint = std::stoi(argv[++i]); }
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
Expand Down Expand Up @@ -502,6 +505,10 @@ void get_req_parameters(const Request & req, whisper_params & params)
{
params.max_len = std::stoi(req.get_file_value("max_len").content);
}
// Optional per-request override: when the request carries a "seg_len_hint"
// entry, parse its content as an integer and store it in params.seg_len_hint.
// NOTE(review): std::stoi throws std::invalid_argument / std::out_of_range on
// non-numeric or oversized input, and no handling is visible in this hunk —
// confirm the caller catches it. Negative values are accepted as-is.
if (req.has_file("seg_len_hint"))
{
params.seg_len_hint = std::stoi(req.get_file_value("seg_len_hint").content);
}
if (req.has_file("best_of"))
{
params.best_of = std::stoi(req.get_file_value("best_of").content);
Expand Down Expand Up @@ -932,6 +939,8 @@ int main(int argc, char ** argv) {
wparams.greedy.best_of = params.best_of;
wparams.beam_search.beam_size = params.beam_size;

wparams.seg_len_hint = params.seg_len_hint;

wparams.temperature = params.temperature;
wparams.no_speech_thold = params.no_speech_thold;
wparams.temperature_inc = params.temperature_inc;
Expand Down
1 change: 1 addition & 0 deletions include/whisper.h
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,7 @@ extern "C" {
int max_len; // max segment length in characters
bool split_on_word; // split on word rather than on token (when used with max_len)
int max_tokens; // max tokens per segment (0 = no limit)
int seg_len_hint; // target segment length in ms — thins timestamps in context to discourage short segments (0 = off)

// [EXPERIMENTAL] speed-up techniques
// note: these can significantly reduce the quality of the output
Expand Down
29 changes: 28 additions & 1 deletion src/whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5927,6 +5927,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.max_len =*/ 0,
/*.split_on_word =*/ false,
/*.max_tokens =*/ 0,
/*.seg_len_hint =*/ 0,

/*.debug_mode =*/ false,
/*.audio_ctx =*/ 0,
Expand Down Expand Up @@ -6896,6 +6897,9 @@ int whisper_full_with_state(
// calculate the maximum context budget for prompt history
const int max_prompt_ctx = std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2);

// track last timestamp kept in prompt context for seg_len_hint thinning
int last_prompt_ts = 0;

// prepare prompt
{
std::vector<whisper_token> prompt_tokens;
Expand Down Expand Up @@ -7585,9 +7589,32 @@ int whisper_full_with_state(
}

// Add newly decoded tokens to the rolling context
// When seg_len_hint is set, thin out timestamp tokens in the context to prevent
// the model from conditioning on frequent segment breaks (which causes
// progressively shorter segments)
//
// Walks tokens_cur[0 .. result_len) and appends selected token ids to
// prompt_past1. Nothing is appended when is_no_speech is true.
if (!is_no_speech) {
const whisper_token token_beg = whisper_token_beg(ctx);
const whisper_token token_eot = whisper_token_eot(ctx);
// convert seg_len_hint from ms to 20ms timestamp steps
// Integer division: any seg_len_hint in 1..19 yields 0, which disables thinning
// exactly like seg_len_hint == 0. A negative hint also leaves thinning disabled
// because of the `min_timestamp_gap > 0` test below.
const int min_timestamp_gap = params.seg_len_hint / 20;

for (int i = 0; i < result_len; ++i) {
// NOTE(review): this unconditional push_back looks like the pre-change line of
// the hunk (the diff header reports 1 deletion). If it is kept together with the
// filtered push_back calls below, every kept token is appended twice — confirm
// against the real file.
prompt_past1.push_back(tokens_cur[i].id);
const whisper_token id = tokens_cur[i].id;
// Ids in the closed range [token_eot, token_beg] are dropped from the context.
// This runs before the seg_len_hint test, so it applies even when the hint is 0.
// NOTE(review): the upper bound is inclusive, so id == token_beg is dropped too.
// The branch below treats `id > token_beg` as a timestamp with ts = id - token_beg,
// which suggests token_beg itself is the ts == 0 timestamp rather than a
// "special non-timestamp token" — confirm whether `<` was intended here.
if (id >= token_eot && id <= token_beg) {
// special non-timestamp token (eot, sot, etc.) — skip
continue;
}
if (min_timestamp_gap > 0 && id > token_beg) {
// timestamp token — only keep if enough time since last one
// ts is the offset from token_beg, in the same units as min_timestamp_gap.
const int ts = id - token_beg;
// NOTE(review): last_prompt_ts is declared outside this block and is only ever
// raised here, never reset. If timestamp ids restart from a low value later
// (e.g. for a new decoding window — not visible in this hunk), the difference
// becomes negative and every timestamp is dropped until ts exceeds the old
// value by min_timestamp_gap — confirm this is intended.
if (ts - last_prompt_ts >= min_timestamp_gap) {
last_prompt_ts = ts;
prompt_past1.push_back(id);
}
// Timestamp handled (kept or dropped); do not fall through to the text path.
continue;
}
// regular text token (or timestamp when seg_len_hint=0) — always keep
prompt_past1.push_back(id);
}
}

Expand Down