Add ModelParameters.enableSwaFull() (--swa-full) launch flag

claude · claude · commit b049986dfab8 · 2026-06-28T12:14:17.000Z
Add a valueless boolean model launch flag mirroring the existing enableFlashAttn() / ModelFlag.FLASH_ATTN pattern. Default off.

The --swa-full flag keeps the full-size sliding-window-attention (SWA) KV cache so the SWA layers' KV becomes reusable across requests, restoring cross-request prompt-prefix cache reuse (pairs with setCacheReuse) at ~2x the SWA-layer KV RAM. It is beneficial only for multi-request sessions sharing a prompt prefix, so it is opt-in.

Changes:
- ModelFlag.SWA_FULL("--swa-full")
- ModelParameters.enableSwaFull()
- ModelFlagTest: enum->string mapping row + enum count 34->35
- ModelParametersExtendedTest: enableSwaFull + not-by-default tests
- CHANGELOG entry

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01SRXbL5RqW3B1XZRea3Rfc7
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,7 @@ from version 5.0.0 onward. Pre-fork releases (`1.x`–`4.2.0`) were authored by
 - Explicit `setMmprojAuto(boolean)` and `setMmprojOffload(boolean)` controls, including the upstream `--no-mmproj-auto` and `--no-mmproj-offload` flags.
 - Per-request KV controls: `InferenceParameters.withSlotId(int)` and `withCacheReuse(int)`.
 - Per-request DRY sampling to `InferenceParameters` (`dry_multiplier`/`dry_base`/`dry_allowed_length`/`dry_penalty_last_n`/`dry_sequence_breakers`).
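+- `ModelParameters.enableSwaFull()` (`--swa-full`): opt-in launch flag that keeps the full-size sliding-window-attention (SWA) KV cache so prompt-prefix KV can be reused across requests, at roughly 2x the SWA-layer KV memory.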
 - Typed cache observability through `Usage.getCachedTokens()`, `Usage.getProcessedPromptTokens()`, `SlotMetrics`, and `ServerMetrics.getSlotMetrics()`.
 - Authenticated JSON `GET /metrics` and `GET /slots` endpoints on the embedded server.
 
diff --git a/src/main/java/net/ladenthin/llama/args/ModelFlag.java b/src/main/java/net/ladenthin/llama/args/ModelFlag.java
@@ -22,6 +22,11 @@ public enum ModelFlag {
     /** Enable Flash Attention. */
     FLASH_ATTN("--flash-attn"),
 
+    /** Keep the full-size sliding-window-attention (SWA) KV cache, enabling cross-request
+     *  prompt-prefix reuse (pairs with {@code --cache-reuse}) at ~2x the SWA-layer KV RAM.
+     *  Default off. Env: {@code LLAMA_ARG_SWA_FULL}. */
+    SWA_FULL("--swa-full"),
+
     /** Disable internal libllama performance timings. */
     NO_PERF("--no-perf"),
 
diff --git a/src/main/java/net/ladenthin/llama/parameters/ModelParameters.java b/src/main/java/net/ladenthin/llama/parameters/ModelParameters.java
@@ -255,6 +255,17 @@ public ModelParameters enableFlashAttn() {
         return setFlag(ModelFlag.FLASH_ATTN);
     }
 
+    /**
+     * Keep the full-size SWA KV cache ({@code --swa-full}) so the sliding-window layers' KV is
+     * reusable across requests, restoring prompt-prefix reuse with {@link #setCacheReuse(int)};
+     * costs ~2x SWA-layer KV RAM. Off by default; only helps multi-request shared-prefix sessions.
+     *
+     * @return this builder
+     */
+    public ModelParameters enableSwaFull() {
+        return setFlag(ModelFlag.SWA_FULL);
+    }
+
     /**
      * Disable internal libllama performance timings (default: false).
      *
diff --git a/src/test/java/net/ladenthin/llama/args/ModelFlagTest.java b/src/test/java/net/ladenthin/llama/args/ModelFlagTest.java
@@ -19,6 +19,7 @@ public static Collection<Object[]> data() {
         return Arrays.asList(new Object[][] {
             {ModelFlag.NO_CONTEXT_SHIFT, "--no-context-shift"},
             {ModelFlag.FLASH_ATTN, "--flash-attn"},
+            {ModelFlag.SWA_FULL, "--swa-full"},
             {ModelFlag.NO_PERF, "--no-perf"},
             {ModelFlag.ESCAPE, "--escape"},
             {ModelFlag.NO_ESCAPE, "--no-escape"},
@@ -66,7 +67,7 @@ public void testGetCliFlag(ModelFlag flag, String expectedCliFlag) {
 
     @Test
     public void testEnumCount() {
-        assertEquals(34, ModelFlag.values().length);
+        assertEquals(35, ModelFlag.values().length); // bump when adding a ModelFlag
     }
 
     @ParameterizedTest(name = "{0} -> {1}")
diff --git a/src/test/java/net/ladenthin/llama/parameters/ModelParametersExtendedTest.java b/src/test/java/net/ladenthin/llama/parameters/ModelParametersExtendedTest.java
@@ -641,6 +641,18 @@ public void testEnableFlashAttn() {
         assertThat(p.parameters.get("--flash-attn"), is(nullValue()));
     }
 
+    @Test
+    public void testEnableSwaFull() {
+        ModelParameters p = new ModelParameters().enableSwaFull();
+        assertThat(p.parameters, hasKey("--swa-full")); // stored under its CLI flag name
+        assertThat(p.parameters.get("--swa-full"), is(nullValue())); // valueless flag
+    }
+
+    @Test
+    public void testSwaFullNotEnabledByDefault() {
+        assertThat(new ModelParameters().parameters, not(hasKey("--swa-full"))); // opt-in only
+    }
+
     @Test
     public void testDisablePerf() {
         ModelParameters p = new ModelParameters().disablePerf();