Skip to content

Commit ffdd983

Browse files
authored
server : fix swa-full logic (#22288)
1 parent 793d0a7 commit ffdd983

1 file changed

Lines changed: 8 additions & 5 deletions

File tree

tools/server/server-context.cpp

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -675,6 +675,10 @@ struct server_context_impl {
675675

676676
int32_t n_ctx; // total context for all clients / slots
677677

678+
// set to llama_model_n_swa(model)
679+
// if swa_full is enabled, this is set to 0 to simulate a non-SWA model
680+
int32_t n_swa;
681+
678682
// slots / clients
679683
std::vector<server_slot> slots;
680684

@@ -854,6 +858,8 @@ struct server_context_impl {
854858
}
855859
}
856860

861+
n_swa = params_base.swa_full ? 0 : llama_model_n_swa(model);
862+
857863
// Necessary similarity of prompt for slot selection
858864
slot_prompt_similarity = params_base.slot_prompt_similarity;
859865

@@ -2415,9 +2421,6 @@ struct server_context_impl {
24152421

24162422
llama_pos pos_next = slot.prompt.tokens.pos_next(n_past);
24172423

2418-
// note: when n_swa == 0, the model does not use SWA
2419-
const auto n_swa = std::max(0, llama_model_n_swa(model));
2420-
24212424
// the largest pos_min required for a checkpoint to be useful
24222425
const auto pos_min_thold = std::max(0, pos_next - n_swa);
24232426

@@ -2589,10 +2592,10 @@ struct server_context_impl {
25892592
// make a checkpoint of the parts of the memory that cannot be rolled back.
25902593
// checkpoints are created only if:
25912594
// - the model does not support partial sequence removal
2592-
// - the model uses SWA and we are not using `swa_full`
2595+
// - the model uses SWA (and we are not using `swa_full`)
25932596
do_checkpoint = do_checkpoint && (
25942597
(slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) ||
2595-
(llama_model_n_swa(model) > 0 && !params_base.swa_full));
2598+
(n_swa > 0));
25962599

25972600
bool has_mtmd = false;
25982601

0 commit comments

Comments (0)