@@ -675,6 +675,10 @@ struct server_context_impl {
 
     int32_t n_ctx; // total context for all clients / slots
 
+    // set to llama_model_n_swa(model)
+    // if swa_full is enabled, this is set to 0 to simulate a non-SWA model
+    int32_t n_swa;
+
     // slots / clients
     std::vector<server_slot> slots;
 
@@ -854,6 +858,8 @@ struct server_context_impl {
         }
     }
 
+    n_swa = params_base.swa_full ? 0 : llama_model_n_swa(model);
+
     // Necessary similarity of prompt for slot selection
     slot_prompt_similarity = params_base.slot_prompt_similarity;
 
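For reference, a minimal standalone sketch of the idea in this hunk (all names below are stand-ins, not the server's actual types): the effective SWA window is resolved once at load time, so every later check can treat `n_swa == 0` as "this model behaves like a non-SWA model".

    #include <algorithm>
    #include <cstdint>

    struct sketch_params {
        bool swa_full = false; // user requested a full (non-sliding) KV cache
    };

    // stand-in for llama_model_n_swa(model): the model's sliding-window
    // size, or 0 when the model does not use SWA at all
    int32_t sketch_model_n_swa() { return 512; }

    int32_t resolve_n_swa(const sketch_params & params) {
        // with swa_full the whole cache is kept, so downstream logic can
        // safely pretend the model has no sliding window
        return params.swa_full ? 0 : std::max<int32_t>(0, sketch_model_n_swa());
    }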
@@ -2415,9 +2421,6 @@ struct server_context_impl {
 
     llama_pos pos_next = slot.prompt.tokens.pos_next(n_past);
 
-    // note: when n_swa == 0, the model does not use SWA
-    const auto n_swa = std::max(0, llama_model_n_swa(model));
-
     // the largest pos_min required for a checkpoint to be useful
     const auto pos_min_thold = std::max(0, pos_next - n_swa);
 
@@ -2589,10 +2592,10 @@ struct server_context_impl {
     // make a checkpoint of the parts of the memory that cannot be rolled back.
     // checkpoints are created only if:
     // - the model does not support partial sequence removal
-    // - the model uses SWA and we are not using `swa_full`
+    // - the model uses SWA (and we are not using `swa_full`)
     do_checkpoint = do_checkpoint && (
         (slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) ||
-        (llama_model_n_swa(model) > 0 && !params_base.swa_full));
+        (n_swa > 0));
 
     bool has_mtmd = false;
 