@@ -675,6 +675,10 @@ struct server_context_impl {
 
     int32_t n_ctx; // total context for all clients / slots
 
+    // set to llama_model_n_swa(model)
+    // if swa_full is enabled, this is set to 0 to simulate a non-SWA model
+    int32_t n_swa;
+
     // slots / clients
     std::vector<server_slot> slots;
 
@@ -854,6 +858,8 @@ struct server_context_impl {
         }
     }
 
+    n_swa = params_base.swa_full ? 0 : llama_model_n_swa(model);
+
     // Necessary similarity of prompt for slot selection
     slot_prompt_similarity = params_base.slot_prompt_similarity;
 
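For reference, a minimal standalone sketch of the idea in this hunk (all names below are stand-ins, not the server's actual types): the effective SWA window is resolved once at load time, so every later check can treat `n_swa == 0` as "this model behaves like a non-SWA model".

    #include <algorithm>
    #include <cstdint>

    struct sketch_params {
        bool swa_full = false; // user requested a full (non-sliding) KV cache
    };

    // stand-in for llama_model_n_swa(model): the model's sliding-window
    // size, or 0 when the model does not use SWA at all
    int32_t sketch_model_n_swa() { return 512; }

    int32_t resolve_n_swa(const sketch_params & params) {
        // with swa_full the whole cache is kept, so downstream logic can
        // safely pretend the model has no sliding window
        return params.swa_full ? 0 : std::max<int32_t>(0, sketch_model_n_swa());
    }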
@@ -2415,9 +2421,6 @@ struct server_context_impl {
 
     llama_pos pos_next = slot.prompt.tokens.pos_next(n_past);
 
-    // note: when n_swa == 0, the model does not use SWA
-    const auto n_swa = std::max(0, llama_model_n_swa(model));
-
     // the largest pos_min required for a checkpoint to be useful
     const auto pos_min_thold = std::max(0, pos_next - n_swa);
 
@@ -2589,10 +2592,10 @@ struct server_context_impl {
     // make a checkpoint of the parts of the memory that cannot be rolled back.
     // checkpoints are created only if:
     // - the model does not support partial sequence removal
-    // - the model uses SWA and we are not using `swa_full`
+    // - the model uses SWA (and we are not using `swa_full`)
     do_checkpoint = do_checkpoint && (
         (slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) ||
-        (llama_model_n_swa(model) > 0 && !params_base.swa_full));
+        (n_swa > 0));
 
     bool has_mtmd = false;
 