Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions common/speculative.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -987,6 +987,15 @@ common_speculative * common_speculative_init(
llama_context * ctx_tgt) {
llama_context * ctx_dft = nullptr;
if (params.draft.model) {
// for SWA draft models, force swa_full on the draft context so that prefix
// reuse (seq_rm + seq_add) keeps working beyond the SWA window. otherwise the
// draft must re-decode from the window edge on every long-context request,
// and the acceptance length degrades significantly
if (llama_model_n_swa(params.draft.model) > 0 && !params.draft.cparams.swa_full) {
LOG_INF("%s: draft model uses SWA — enabling swa_full for the draft context\n", __func__);
params.draft.cparams.swa_full = true;
}

ctx_dft = llama_init_from_model(params.draft.model, params.draft.cparams);
if (ctx_dft == nullptr) {
LOG_ERR("%s", "failed to create draft context\n");
Expand Down
7 changes: 7 additions & 0 deletions tools/server/server-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -797,6 +797,13 @@ struct server_context_impl {
return false;
}

// for SWA draft models, force swa_full on the draft context so prefix
// reuse works beyond the SWA window during speculation. this must happen
// before params_dft is converted into the draft context params below
if (llama_model_n_swa(model_dft.get()) > 0 && !params_dft.swa_full) {
SRV_INF("%s", "draft model uses SWA — enabling swa_full for the draft context\n");
params_dft.swa_full = true;
}

params_base.speculative.draft.model = model_dft.get();
params_base.speculative.draft.cparams = common_context_params_to_llama(params_dft);
}
Expand Down