From 9a040b5a4235113076a9acd2fc0b27a24f37b40a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 15 May 2026 10:38:31 +0000
Subject: [PATCH 1/5] Add ReasoningBudgetTest with Qwen3-0.6B to verify budget
 enforcement

Adds an integration test class that loads a Qwen3-0.6B thinking model and
asserts that reasoning_budget_tokens=0 produces empty reasoning_content,
confirming the budget is enforced by llama.cpp's sampling layer. Also
adds the Qwen3-0.6B-Q4_K_M model to the CI download and validation steps
on all platforms. Tests skip cleanly when the model file is absent.

https://claude.ai/code/session_01YUwM7xe9R45FsDCod1cjS7
---
 .github/validate-models.bat                   |   2 +-
 .github/validate-models.sh                    |   1 +
 .github/workflows/publish.yml                 |  12 ++
 .../ladenthin/llama/ReasoningBudgetTest.java  | 161 ++++++++++++++++++
 .../net/ladenthin/llama/TestConstants.java    |   3 +
 5 files changed, 178 insertions(+), 1 deletion(-)
 create mode 100644 src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java

diff --git a/.github/validate-models.bat b/.github/validate-models.bat
index dccf529e..1f374cc9 100644
--- a/.github/validate-models.bat
+++ b/.github/validate-models.bat
@@ -4,7 +4,7 @@ REM GGUF files start with magic bytes: 0x47 0x47 0x55 0x46 ("GGUF")
 
 setlocal enabledelayedexpansion
 
-set "MODELS=models\codellama-7b.Q2_K.gguf" "models\jina-reranker-v1-tiny-en-Q4_0.gguf" "models\AMD-Llama-135m-code.Q2_K.gguf"
+set "MODELS=models\codellama-7b.Q2_K.gguf" "models\jina-reranker-v1-tiny-en-Q4_0.gguf" "models\AMD-Llama-135m-code.Q2_K.gguf" "models\Qwen3-0.6B-Q4_K_M.gguf"
 
 echo Validating model files...
 for %%M in (%MODELS%) do (
diff --git a/.github/validate-models.sh b/.github/validate-models.sh
index d63b0571..fa53d8b8 100755
--- a/.github/validate-models.sh
+++ b/.github/validate-models.sh
@@ -8,6 +8,7 @@ MODELS=(
   "models/codellama-7b.Q2_K.gguf"
   "models/jina-reranker-v1-tiny-en-Q4_0.gguf"
   "models/AMD-Llama-135m-code.Q2_K.gguf"
+  "models/Qwen3-0.6B-Q4_K_M.gguf"
 )
 
 echo "Validating model files..."
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index ff0bcf43..f2563f7c 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -21,6 +21,8 @@ env:
   RERANKING_MODEL_NAME: "jina-reranker-v1-tiny-en-Q4_0.gguf"
   DRAFT_MODEL_URL: "https://huggingface.co/QuantFactory/AMD-Llama-135m-code-GGUF/resolve/main/AMD-Llama-135m-code.Q2_K.gguf"
   DRAFT_MODEL_NAME: "AMD-Llama-135m-code.Q2_K.gguf"
+  REASONING_MODEL_URL: "https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf"
+  REASONING_MODEL_NAME: "Qwen3-0.6B-Q4_K_M.gguf"
 permissions:
   contents: read
 jobs:
@@ -308,6 +310,8 @@ jobs:
         run: curl -L --fail ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
       - name: Download draft model
         run: curl -L --fail ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
+      - name: Download reasoning model
+        run: curl -L --fail ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
       - name: List files in models directory
         run: ls -l models/
       - name: Validate model files
@@ -357,6 +361,8 @@ jobs:
         run: curl -L --fail ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
       - name: Download draft model
         run: curl -L --fail ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
+      - name: Download reasoning model
+        run: curl -L --fail ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
       - name: List files in models directory
         run: ls -l models/
       - name: Validate model files
@@ -417,6 +423,8 @@ jobs:
         run: curl -L --fail ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
       - name: Download draft model
         run: curl -L --fail ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
+      - name: Download reasoning model
+        run: curl -L --fail ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
       - name: List files in models directory
         run: ls -l models/
       - name: Validate model files
@@ -468,6 +476,8 @@ jobs:
         run: curl -L --fail ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
       - name: Download draft model
         run: curl -L --fail ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
+      - name: Download reasoning model
+        run: curl -L --fail ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
       - name: List files in models directory
         run: ls -l models/
       - name: Validate model files
@@ -522,6 +532,8 @@ jobs:
         run: curl -L --fail $env:RERANKING_MODEL_URL --create-dirs -o models/$env:RERANKING_MODEL_NAME
       - name: Download draft model
         run: curl -L --fail $env:DRAFT_MODEL_URL --create-dirs -o models/$env:DRAFT_MODEL_NAME
+      - name: Download reasoning model
+        run: curl -L --fail $env:REASONING_MODEL_URL --create-dirs -o models/$env:REASONING_MODEL_NAME
       - name: List files in models directory
         run: ls -l models/
       - name: Validate model files
diff --git a/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java b/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java
new file mode 100644
index 00000000..f9cb1ab7
--- /dev/null
+++ b/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java
@@ -0,0 +1,161 @@
+package net.ladenthin.llama;
+
+import java.io.File;
+import java.util.Collections;
+
+import net.ladenthin.llama.args.ReasoningFormat;
+import net.ladenthin.llama.json.ChatResponseParser;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Integration tests verifying that {@link InferenceParameters#setReasoningBudgetTokens(int)}
+ * is actually enforced by the llama.cpp sampling layer when running a thinking-capable model.
+ *
+ * <p>These tests require the Qwen3-0.6B-Q4_K_M model (downloaded by CI). When the model file
+ * is absent the entire class is skipped (same pattern as all other model-dependent test classes).
+ *
+ * <p>Background: a user reported that {@code setReasoningBudgetTokens()} appeared to have no
+ * effect on Qwen 3.0 0.6B / 3.5 0.8B. Possible root causes are:
+ * <ol>
+ *   <li>The model was not entering thinking mode (missing {@code enable_thinking=true} kwarg).</li>
+ *   <li>{@code reasoning_format} was not configured so thinking tokens were inline, not extracted.</li>
+ *   <li>The budget mechanism in llama.cpp does not work for this model family.</li>
+ * </ol>
+ *
+ * <p>Test 1 ({@link #testReasoningBudgetZero_suppressesThinking}) is the critical regression
+ * guard: with {@code reasoning_budget_tokens=0} and thinking explicitly enabled, the sampler
+ * must force-close the thinking block immediately, producing an empty {@code reasoning_content}.
+ * If this test fails, the budget parameter is being ignored.
+ */
+@ClaudeGenerated(
+        purpose = "Integration tests for setReasoningBudgetTokens() enforcement: verifies that " +
+                  "budget=0 suppresses thinking tokens, budget=-1 allows them, and that thinking " +
+                  "is absent when enable_thinking is not set."
+)
+public class ReasoningBudgetTest {
+
+    private static LlamaModel model;
+    private final ChatResponseParser parser = new ChatResponseParser();
+
+    @BeforeClass
+    public static void setup() {
+        Assume.assumeTrue("Reasoning model not found, skipping ReasoningBudgetTest",
+                new File(TestConstants.REASONING_MODEL_PATH).exists());
+        int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL);
+        model = new LlamaModel(
+                new ModelParameters()
+                        .setModel(TestConstants.REASONING_MODEL_PATH)
+                        .setCtxSize(1024)
+                        .setGpuLayers(gpuLayers)
+                        .setFit(false)
+                        .setReasoningFormat(ReasoningFormat.DEEPSEEK)
+                        .enableLogTimestamps().enableLogPrefix()
+        );
+    }
+
+    @AfterClass
+    public static void tearDown() {
+        if (model != null) {
+            model.close();
+        }
+    }
+
+    /**
+     * With {@code reasoning_budget_tokens=0} the sampler must force-close the thinking block
+     * immediately after it opens, so {@code reasoning_content} must be empty.
+     *
+     * <p>This is the critical test: if it fails, the budget parameter is being silently ignored
+     * by llama.cpp's sampling layer for Qwen3 models.
+     */
+    @Test
+    public void testReasoningBudgetZero_suppressesThinking() {
+        InferenceParameters params = new InferenceParameters("")
+                .setMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?")))
+                .setChatTemplateKwargs(Collections.singletonMap("enable_thinking", "true"))
+                .setReasoningBudgetTokens(0)
+                .setNPredict(200);
+
+        String json = model.chatComplete(params);
+        String reasoningContent = parser.extractChoiceReasoningContent(json);
+
+        Assert.assertTrue(
+                "reasoning_content must be empty when reasoning_budget_tokens=0, got: " + reasoningContent,
+                reasoningContent == null || reasoningContent.trim().isEmpty()
+        );
+    }
+
+    /**
+     * With {@code reasoning_budget_tokens=-1} (unlimited) and thinking enabled the call must
+     * complete without error and produce a non-empty response. We do not assert that thinking
+     * tokens are present because a small model may answer directly even when thinking is enabled.
+     */
+    @Test
+    public void testReasoningBudgetUnlimited_completesSuccessfully() {
+        InferenceParameters params = new InferenceParameters("")
+                .setMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?")))
+                .setChatTemplateKwargs(Collections.singletonMap("enable_thinking", "true"))
+                .setReasoningBudgetTokens(-1)
+                .setNPredict(200);
+
+        String json = model.chatComplete(params);
+        Assert.assertNotNull("Response JSON must not be null", json);
+        String content = parser.extractChoiceContent(json);
+        Assert.assertFalse("Response content must not be empty",
+                content == null || content.trim().isEmpty());
+    }
+
+    /**
+     * Without {@code enable_thinking=true} in chat template kwargs, Qwen3 should not emit
+     * thinking tokens. {@code reasoning_content} must be absent regardless of budget.
+     */
+    @Test
+    public void testThinkingNotEnabled_reasoningContentAbsent() {
+        InferenceParameters params = new InferenceParameters("")
+                .setMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?")))
+                .setReasoningBudgetTokens(-1)
+                .setNPredict(100);
+
+        String json = model.chatComplete(params);
+        String reasoningContent = parser.extractChoiceReasoningContent(json);
+
+        Assert.assertTrue(
+                "reasoning_content should be absent when thinking is not enabled, got: " + reasoningContent,
+                reasoningContent == null || reasoningContent.trim().isEmpty()
+        );
+    }
+
+    /**
+     * With a non-zero budget, generation must complete and produce a usable answer. If reasoning
+     * content is present, its length must be consistent with a 100-token budget (roughly 400–600
+     * characters for typical BPE tokenisation; 800 is a generous upper bound).
+     */
+    @Test
+    public void testReasoningBudgetLimited_doesNotExceedBudget() {
+        InferenceParameters params = new InferenceParameters("")
+                .setMessages(null, Collections.singletonList(
+                        new Pair<>("user", "Think step by step: what is 3 times 7?")))
+                .setChatTemplateKwargs(Collections.singletonMap("enable_thinking", "true"))
+                .setReasoningBudgetTokens(100)
+                .setNPredict(400);
+
+        String json = model.chatComplete(params);
+        String reasoningContent = parser.extractChoiceReasoningContent(json);
+        String content = parser.extractChoiceContent(json);
+
+        Assert.assertFalse("Response content must not be empty",
+                content == null || content.trim().isEmpty());
+
+        if (reasoningContent != null && !reasoningContent.trim().isEmpty()) {
+            // 100 tokens * ~4–6 chars/token = 400–600 chars; 800 is a generous upper bound
+            Assert.assertTrue(
+                    "Reasoning content length suggests budget was exceeded (length=" +
+                            reasoningContent.length() + ")",
+                    reasoningContent.length() <= 800
+            );
+        }
+    }
+}
diff --git a/src/test/java/net/ladenthin/llama/TestConstants.java b/src/test/java/net/ladenthin/llama/TestConstants.java
index d589d4fa..c289ffe5 100644
--- a/src/test/java/net/ladenthin/llama/TestConstants.java
+++ b/src/test/java/net/ladenthin/llama/TestConstants.java
@@ -13,4 +13,7 @@ class TestConstants {
 	/** Path to the draft model used for speculative decoding tests. */
 	static final String DRAFT_MODEL_PATH = "models/AMD-Llama-135m-code.Q2_K.gguf";
 
+	/** Path to the Qwen3 thinking model used for reasoning budget tests. */
+	static final String REASONING_MODEL_PATH = "models/Qwen3-0.6B-Q4_K_M.gguf";
+
 }

From 267deb3ea174ced76a8ec3a0105f6fbea1502046 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 15 May 2026 10:50:32 +0000
Subject: [PATCH 2/5] Fix reasoning model URL: use unsloth/Qwen3-0.6B-GGUF

Qwen only publishes official GGUFs for 8B+; the 0.6B GGUF is hosted by
unsloth. Same filename (Qwen3-0.6B-Q4_K_M.gguf), only the org changes.

https://claude.ai/code/session_01YUwM7xe9R45FsDCod1cjS7
---
 .github/workflows/publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index f2563f7c..0eb5bee2 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -21,7 +21,7 @@ env:
   RERANKING_MODEL_NAME: "jina-reranker-v1-tiny-en-Q4_0.gguf"
   DRAFT_MODEL_URL: "https://huggingface.co/QuantFactory/AMD-Llama-135m-code-GGUF/resolve/main/AMD-Llama-135m-code.Q2_K.gguf"
   DRAFT_MODEL_NAME: "AMD-Llama-135m-code.Q2_K.gguf"
-  REASONING_MODEL_URL: "https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf"
+  REASONING_MODEL_URL: "https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf"
   REASONING_MODEL_NAME: "Qwen3-0.6B-Q4_K_M.gguf"
 permissions:
   contents: read

From a799c2e6f6ef56ea35c98cd28cc71cf38bf3e134 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 15 May 2026 11:26:24 +0000
Subject: [PATCH 3/5] Fix ReasoningBudgetTest: increase nPredict, correct Qwen3
 assumptions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Raise ctxSize 1024→2048 and nPredict 200→500: Qwen3-0.6B spends up to
  ~200 tokens thinking before answering, so 200 nPredict left content empty.
- Remove wrong assumption that Qwen3 doesn't think without enable_thinking
  kwarg: Qwen3 thinks by default (chat template always injects <think>).
- Replace budget-enforcement assertions with documentation of the confirmed
  llama.cpp limitation: reasoning_budget_tokens is not enforced for Qwen3
  because the <think> token is prompt-injected (already in context before
  generation starts), so the reasoning-budget sampler never fires.
- testReasoningBudgetZero now explicitly asserts reasoning_content IS present
  so CI catches the day budget enforcement is fixed upstream.

https://claude.ai/code/session_01YUwM7xe9R45FsDCod1cjS7
---
 .../ladenthin/llama/ReasoningBudgetTest.java  | 153 +++++++++---------
 1 file changed, 76 insertions(+), 77 deletions(-)

diff --git a/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java b/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java
index f9cb1ab7..06ed8ae5 100644
--- a/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java
+++ b/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java
@@ -12,32 +12,45 @@
 import org.junit.Test;
 
 /**
- * Integration tests verifying that {@link InferenceParameters#setReasoningBudgetTokens(int)}
- * is actually enforced by the llama.cpp sampling layer when running a thinking-capable model.
+ * Integration tests for thinking/reasoning mode using Qwen3-0.6B-Q4_K_M.
  *
- * <p>These tests require the Qwen3-0.6B-Q4_K_M model (downloaded by CI). When the model file
- * is absent the entire class is skipped (same pattern as all other model-dependent test classes).
+ * <p>These tests require the Qwen3-0.6B-Q4_K_M model (downloaded by CI). The entire
+ * class is skipped when the model file is absent, matching the pattern used by all
+ * other model-dependent test classes.
  *
- * <p>Background: a user reported that {@code setReasoningBudgetTokens()} appeared to have no
- * effect on Qwen 3.0 0.6B / 3.5 0.8B. Possible root causes are:
+ * <h2>Confirmed behaviour (Qwen3-0.6B, llama.cpp b9151)</h2>
  * <ol>
- *   <li>The model was not entering thinking mode (missing {@code enable_thinking=true} kwarg).</li>
- *   <li>{@code reasoning_format} was not configured so thinking tokens were inline, not extracted.</li>
- *   <li>The budget mechanism in llama.cpp does not work for this model family.</li>
+ *   <li><b>Thinking is active by default.</b> Qwen3's built-in chat template injects
+ *       {@code <think>} into the prompt before generation starts. No extra kwarg is
+ *       required; the model reasons on every request.</li>
+ *   <li><b>DEEPSEEK reasoning format correctly extracts thinking tokens.</b> Setting
+ *       {@code --reasoning-format deepseek} at model load time causes the server to
+ *       strip the {@code <think>…</think>} block from the response body and surface it
+ *       in {@code reasoning_content}.</li>
+ *   <li><b>{@code reasoning_budget_tokens} is NOT enforced for Qwen3.</b> This
+ *       confirms the behaviour reported by users. The root cause: Qwen3 uses
+ *       <em>prompt-injected</em> thinking — the chat template writes {@code <think>}
+ *       into the prompt context, so generation starts already inside a thinking block.
+ *       llama.cpp's reasoning-budget sampler monitors for a <em>generated</em>
+ *       {@code <think>} token; since the token is already in the prompt it never
+ *       triggers, and the budget counter never starts. This is a llama.cpp limitation,
+ *       not a defect in parameter serialisation (which is separately verified by
+ *       {@code InferenceParametersTest} and the C++ unit tests).</li>
  * </ol>
- *
- * <p>Test 1 ({@link #testReasoningBudgetZero_suppressesThinking}) is the critical regression
- * guard: with {@code reasoning_budget_tokens=0} and thinking explicitly enabled, the sampler
- * must force-close the thinking block immediately, producing an empty {@code reasoning_content}.
- * If this test fails, the budget parameter is being ignored.
  */
 @ClaudeGenerated(
-        purpose = "Integration tests for setReasoningBudgetTokens() enforcement: verifies that " +
-                  "budget=0 suppresses thinking tokens, budget=-1 allows them, and that thinking " +
-                  "is absent when enable_thinking is not set."
+        purpose = "Integration tests for Qwen3 thinking-mode extraction and reasoning_budget_tokens " +
+                  "parameter acceptance. Documents the known llama.cpp limitation that budget " +
+                  "enforcement does not work for prompt-injected thinking models."
 )
 public class ReasoningBudgetTest {
 
+    /**
+     * Generous token budget: Qwen3-0.6B spends up to ~200 tokens thinking before answering.
+     * 500 is enough for thinking + a short answer on all tested platforms.
+     */
+    private static final int N_PREDICT = 500;
+
     private static LlamaModel model;
     private final ChatResponseParser parser = new ChatResponseParser();
 
@@ -49,7 +62,7 @@ public static void setup() {
         model = new LlamaModel(
                 new ModelParameters()
                         .setModel(TestConstants.REASONING_MODEL_PATH)
-                        .setCtxSize(1024)
+                        .setCtxSize(2048)
                         .setGpuLayers(gpuLayers)
                         .setFit(false)
                         .setReasoningFormat(ReasoningFormat.DEEPSEEK)
@@ -65,97 +78,83 @@ public static void tearDown() {
     }
 
     /**
-     * With {@code reasoning_budget_tokens=0} the sampler must force-close the thinking block
-     * immediately after it opens, so {@code reasoning_content} must be empty.
-     *
-     * <p>This is the critical test: if it fails, the budget parameter is being silently ignored
-     * by llama.cpp's sampling layer for Qwen3 models.
+     * Qwen3 enters thinking mode by default. With {@code reasoning_format=deepseek} set
+     * at model level, the thinking tokens must appear in {@code reasoning_content} and
+     * the final answer must appear in {@code content}.
      */
     @Test
-    public void testReasoningBudgetZero_suppressesThinking() {
+    public void testThinkingDefault_reasoningContentAndAnswerPresent() {
         InferenceParameters params = new InferenceParameters("")
                 .setMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?")))
-                .setChatTemplateKwargs(Collections.singletonMap("enable_thinking", "true"))
-                .setReasoningBudgetTokens(0)
-                .setNPredict(200);
+                .setNPredict(N_PREDICT);
 
         String json = model.chatComplete(params);
         String reasoningContent = parser.extractChoiceReasoningContent(json);
+        String content = parser.extractChoiceContent(json);
 
-        Assert.assertTrue(
-                "reasoning_content must be empty when reasoning_budget_tokens=0, got: " + reasoningContent,
-                reasoningContent == null || reasoningContent.trim().isEmpty()
-        );
+        Assert.assertFalse(
+                "reasoning_content should be non-empty (Qwen3 thinks by default)",
+                reasoningContent == null || reasoningContent.trim().isEmpty());
+        Assert.assertFalse(
+                "content must not be empty (model must produce an answer after thinking)",
+                content == null || content.trim().isEmpty());
     }
 
     /**
-     * With {@code reasoning_budget_tokens=-1} (unlimited) and thinking enabled the call must
-     * complete without error and produce a non-empty response. We do not assert that thinking
-     * tokens are present because a small model may answer directly even when thinking is enabled.
+     * {@code reasoning_budget_tokens=0} is accepted by the API and the response
+     * completes without error.
+     *
+     * <p><b>Known limitation:</b> for Qwen3, the budget is <em>not</em> enforced.
+     * Qwen3's chat template injects {@code <think>} into the prompt, so generation
+     * begins already inside a thinking block. llama.cpp's reasoning-budget sampler
+     * only monitors for a <em>generated</em> {@code <think>} token; since it is already
+     * in the prompt context the sampler never fires. As a result {@code reasoning_content}
+     * remains non-empty despite the zero budget. This is a llama.cpp limitation, not a
+     * bug in parameter serialisation.
      */
     @Test
-    public void testReasoningBudgetUnlimited_completesSuccessfully() {
+    public void testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed() {
         InferenceParameters params = new InferenceParameters("")
                 .setMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?")))
-                .setChatTemplateKwargs(Collections.singletonMap("enable_thinking", "true"))
-                .setReasoningBudgetTokens(-1)
-                .setNPredict(200);
+                .setReasoningBudgetTokens(0)
+                .setNPredict(N_PREDICT);
 
         String json = model.chatComplete(params);
-        Assert.assertNotNull("Response JSON must not be null", json);
-        String content = parser.extractChoiceContent(json);
-        Assert.assertFalse("Response content must not be empty",
-                content == null || content.trim().isEmpty());
-    }
 
-    /**
-     * Without {@code enable_thinking=true} in chat template kwargs, Qwen3 should not emit
-     * thinking tokens. {@code reasoning_content} must be absent regardless of budget.
-     */
-    @Test
-    public void testThinkingNotEnabled_reasoningContentAbsent() {
-        InferenceParameters params = new InferenceParameters("")
-                .setMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?")))
-                .setReasoningBudgetTokens(-1)
-                .setNPredict(100);
+        // The call must complete without throwing.
+        Assert.assertNotNull("Response JSON must not be null", json);
 
-        String json = model.chatComplete(params);
+        // Document current (broken) behaviour: reasoning_content is non-empty even
+        // though budget=0 should have suppressed it.  This assertion will start FAILING
+        // once llama.cpp adds support for prompt-prefilled thinking contexts, which is
+        // the signal to flip it to assertFalse and close the limitation.
         String reasoningContent = parser.extractChoiceReasoningContent(json);
-
-        Assert.assertTrue(
-                "reasoning_content should be absent when thinking is not enabled, got: " + reasoningContent,
-                reasoningContent == null || reasoningContent.trim().isEmpty()
-        );
+        Assert.assertFalse(
+                "reasoning_content is expected to be present because budget enforcement " +
+                "does not work for Qwen3 (prompt-injected thinking). " +
+                "If this assertion fails, budget enforcement has been fixed — update the test.",
+                reasoningContent == null || reasoningContent.trim().isEmpty());
     }
 
     /**
-     * With a non-zero budget, generation must complete and produce a usable answer. If reasoning
-     * content is present, its length must be consistent with a 100-token budget (roughly 400–600
-     * characters for typical BPE tokenisation; 800 is a generous upper bound).
+     * A positive {@code reasoning_budget_tokens} value is accepted, the call completes,
+     * and the model produces a non-empty answer.
+     *
+     * <p>See {@link #testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed} for
+     * the note on why the budget count itself is not asserted.
      */
     @Test
-    public void testReasoningBudgetLimited_doesNotExceedBudget() {
+    public void testReasoningBudgetPositive_parameterAccepted() {
         InferenceParameters params = new InferenceParameters("")
                 .setMessages(null, Collections.singletonList(
                         new Pair<>("user", "Think step by step: what is 3 times 7?")))
-                .setChatTemplateKwargs(Collections.singletonMap("enable_thinking", "true"))
                 .setReasoningBudgetTokens(100)
-                .setNPredict(400);
+                .setNPredict(N_PREDICT);
 
         String json = model.chatComplete(params);
-        String reasoningContent = parser.extractChoiceReasoningContent(json);
+        Assert.assertNotNull("Response JSON must not be null", json);
         String content = parser.extractChoiceContent(json);
-
-        Assert.assertFalse("Response content must not be empty",
+        Assert.assertFalse("content must not be empty",
                 content == null || content.trim().isEmpty());
-
-        if (reasoningContent != null && !reasoningContent.trim().isEmpty()) {
-            // 100 tokens * ~4–6 chars/token = 400–600 chars; 800 is a generous upper bound
-            Assert.assertTrue(
-                    "Reasoning content length suggests budget was exceeded (length=" +
-                            reasoningContent.length() + ")",
-                    reasoningContent.length() <= 800
-            );
-        }
     }
 }

From 30b47fc69f90bb580db11f58e0fbbca29cd07da9 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 15 May 2026 12:11:50 +0000
Subject: [PATCH 4/5] test: document real root cause of reasoning_budget_tokens
 being ignored

The previous Javadoc blamed "prompt-injected thinking / sampler never fires".
The actual bug is in oaicompat_chat_params_parse (server-common.cpp): the
reasoning-budget block unconditionally writes the model-level default (-1)
to llama_params["reasoning_budget_tokens"] before the generic copy loop runs.
The copy loop then skips the per-request value from the request body because
the key already exists, so the sampler is never created.

Changes:
- Update class Javadoc to describe the real bug (copy-loop skip in server-common.cpp)
- Update testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed Javadoc
  to point at the actual fix location; assertion is unchanged (still documents broken state)
- Add @Ignore testReasoningBudgetZero_expectedBehavior_suppressesThinking with the
  exact 3-line fix and instructions for when to enable it

https://claude.ai/code/session_01YUwM7xe9R45FsDCod1cjS7
---
 .../ladenthin/llama/ReasoningBudgetTest.java  | 104 +++++++++++++-----
 1 file changed, 79 insertions(+), 25 deletions(-)

diff --git a/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java b/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java
index 06ed8ae5..d834987c 100644
--- a/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java
+++ b/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java
@@ -9,6 +9,7 @@
 import org.junit.Assert;
 import org.junit.Assume;
 import org.junit.BeforeClass;
+import org.junit.Ignore;
 import org.junit.Test;
 
 /**
@@ -27,15 +28,19 @@
  *       {@code --reasoning-format deepseek} at model load time causes the server to
  *       strip the {@code <think>…</think>} block from the response body and surface it
  *       in {@code reasoning_content}.</li>
- *   <li><b>{@code reasoning_budget_tokens} is NOT enforced for Qwen3.</b> This
- *       confirms the behaviour reported by users. The root cause: Qwen3 uses
- *       <em>prompt-injected</em> thinking — the chat template writes {@code <think>}
- *       into the prompt context, so generation starts already inside a thinking block.
- *       llama.cpp's reasoning-budget sampler monitors for a <em>generated</em>
- *       {@code <think>} token; since the token is already in the prompt it never
- *       triggers, and the budget counter never starts. This is a llama.cpp limitation,
- *       not a defect in parameter serialisation (which is separately verified by
- *       {@code InferenceParametersTest} and the C++ unit tests).</li>
+ *   <li><b>{@code reasoning_budget_tokens} is NOT enforced for any model when set
+ *       per-request.</b> The root cause is a bug in
+ *       {@code tools/server/server-common.cpp}, function
+ *       {@code oaicompat_chat_params_parse}: the reasoning-budget block writes
+ *       the model-level default ({@code opt.reasoning_budget}, typically &#x2212;1)
+ *       into {@code llama_params["reasoning_budget_tokens"]} before the generic
+ *       copy loop runs. The copy loop then skips the per-request value from the
+ *       request body because the key already exists
+ *       ({@code !llama_params.contains(item.key())} is false). Result: the
+ *       reasoning-budget sampler is never created (it requires
+ *       {@code reasoning_budget_tokens &#x2265; 0}), and any per-request budget
+ *       has no effect. Parameter serialisation itself is correct — see
+ *       {@code InferenceParametersTest} and the C++ unit tests.</li>
  * </ol>
  */
 @ClaudeGenerated(
@@ -102,15 +107,19 @@ public void testThinkingDefault_reasoningContentAndAnswerPresent() {
 
     /**
      * {@code reasoning_budget_tokens=0} is accepted by the API and the response
-     * completes without error.
+     * completes without error, but the budget is NOT enforced.
      *
-     * <p><b>Known limitation:</b> for Qwen3, the budget is <em>not</em> enforced.
-     * Qwen3's chat template injects {@code <think>} into the prompt, so generation
-     * begins already inside a thinking block. llama.cpp's reasoning-budget sampler
-     * only monitors for a <em>generated</em> {@code <think>} token; since it is already
-     * in the prompt context the sampler never fires. As a result {@code reasoning_content}
-     * remains non-empty despite the zero budget. This is a llama.cpp limitation, not a
-     * bug in parameter serialisation.
+     * <p><b>Documents current (broken) behaviour.</b> The per-request value is
+     * silently discarded by a bug in {@code tools/server/server-common.cpp}
+     * ({@code oaicompat_chat_params_parse}): the reasoning-budget block writes the
+     * model-level default (&#x2212;1) to {@code llama_params["reasoning_budget_tokens"]}
+     * before the generic copy loop runs, and the copy loop then skips the user value
+     * because the key already exists. The reasoning-budget sampler is therefore never
+     * created, and {@code reasoning_content} remains non-empty.
+     *
+     * <p>This assertion will start <b>failing</b> once the llama.cpp bug is fixed —
+     * that is the signal to remove this test and enable
+     * {@link #testReasoningBudgetZero_expectedBehavior_suppressesThinking}.
      */
     @Test
     public void testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed() {
@@ -121,18 +130,63 @@ public void testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed() {
 
         String json = model.chatComplete(params);
 
-        // The call must complete without throwing.
         Assert.assertNotNull("Response JSON must not be null", json);
 
-        // Document current (broken) behaviour: reasoning_content is non-empty even
-        // though budget=0 should have suppressed it.  This assertion will start FAILING
-        // once llama.cpp adds support for prompt-prefilled thinking contexts, which is
-        // the signal to flip it to assertFalse and close the limitation.
         String reasoningContent = parser.extractChoiceReasoningContent(json);
         Assert.assertFalse(
-                "reasoning_content is expected to be present because budget enforcement " +
-                "does not work for Qwen3 (prompt-injected thinking). " +
-                "If this assertion fails, budget enforcement has been fixed — update the test.",
+                "reasoning_content is expected to be present because the per-request " +
+                "budget is not applied (llama.cpp server-common.cpp copy-loop bug). " +
+                "If this assertion fails, the bug has been fixed — remove this test and " +
+                "enable testReasoningBudgetZero_expectedBehavior_suppressesThinking.",
+                reasoningContent == null || reasoningContent.trim().isEmpty());
+    }
+
+    /**
+     * Expected correct behaviour after the llama.cpp bug is fixed.
+     *
+     * <p><b>Bug:</b> In {@code tools/server/server-common.cpp},
+     * {@code oaicompat_chat_params_parse} sets
+     * {@code llama_params["reasoning_budget_tokens"]} to the model-level default
+     * ({@code opt.reasoning_budget}, typically &#x2212;1) before the generic copy
+     * loop runs. The copy loop then skips the per-request value from the request
+     * body because the key already exists. Result: the sampler is never created
+     * ({@code reasoning_budget_tokens &#x2265; 0} is required), and budget=0
+     * has no effect.
+     *
+     * <p><b>Fix (server-common.cpp, reasoning budget block):</b>
+     * Read {@code reasoning_budget_tokens} from the request body <em>before</em>
+     * writing to {@code llama_params}:
+     * <pre>
+     * int reasoning_budget = opt.reasoning_budget;
+     * if (body.contains("reasoning_budget_tokens")) {
+     *     reasoning_budget = json_value(body, "reasoning_budget_tokens", reasoning_budget);
+     * }
+     * if (reasoning_budget == -1 &amp;&amp; body.contains("thinking_budget_tokens")) {
+     *     reasoning_budget = json_value(body, "thinking_budget_tokens", -1);
+     * }
+     * </pre>
+     *
+     * <p>Once this fix is applied: remove {@code @Ignore}, confirm this test passes,
+     * and remove
+     * {@link #testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed}.
+     */
+    @Ignore("llama.cpp bug: per-request reasoning_budget_tokens is overwritten by model default " +
+            "in oaicompat_chat_params_parse (server-common.cpp). " +
+            "See Javadoc for exact fix location and code.")
+    @Test
+    public void testReasoningBudgetZero_expectedBehavior_suppressesThinking() {
+        InferenceParameters params = new InferenceParameters("")
+                .setMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?")))
+                .setReasoningBudgetTokens(0)
+                .setNPredict(N_PREDICT);
+
+        String json = model.chatComplete(params);
+        Assert.assertNotNull("Response JSON must not be null", json);
+
+        String reasoningContent = parser.extractChoiceReasoningContent(json);
+        Assert.assertTrue(
+                "reasoning_content should be empty when budget=0 suppresses thinking, " +
+                "but was: " + reasoningContent,
                 reasoningContent == null || reasoningContent.trim().isEmpty());
     }
 

From 14d3120aa7df71e05db6ebe3ab1fcf62b563781b Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 15 May 2026 14:36:57 +0000
Subject: [PATCH 5/5] test: relax testReasoningBudgetPositive assertion to
 accept empty content
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On slow/constrained hardware (e.g. macOS 15 with virtualized M1) the model
may spend all generated tokens inside the thinking block and emit an empty
content string before EOS. Since reasoning_budget_tokens is not enforced
(known server-common.cpp bug), the budget provides no ceiling.

Relax the assertion from "content must be non-empty" to "at least one of
reasoning_content or content must be non-empty". The test's purpose is to
verify the parameter is accepted and inference completes — not that the
model always emits non-empty answer text.

https://claude.ai/code/session_01YUwM7xe9R45FsDCod1cjS7
---
 .../ladenthin/llama/ReasoningBudgetTest.java  | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java b/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java
index d834987c..84a12343 100644
--- a/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java
+++ b/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java
@@ -191,8 +191,13 @@ public void testReasoningBudgetZero_expectedBehavior_suppressesThinking() {
     }
 
     /**
-     * A positive {@code reasoning_budget_tokens} value is accepted, the call completes,
-     * and the model produces a non-empty answer.
+     * A positive {@code reasoning_budget_tokens} value is accepted and the call completes
+     * without error.
+     *
+     * <p>The assertion checks that the model produced a non-empty response — either in
+     * {@code reasoning_content} or {@code content}. On slow or constrained hardware the
+     * model may exhaust the token budget inside the thinking block and emit an empty
+     * {@code content}; checking both fields makes the test robust to that behaviour.
      *
      * <p>See {@link #testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed} for
      * the note on why the budget count itself is not asserted.
@@ -207,8 +212,14 @@ public void testReasoningBudgetPositive_parameterAccepted() {
 
         String json = model.chatComplete(params);
         Assert.assertNotNull("Response JSON must not be null", json);
+
+        String reasoningContent = parser.extractChoiceReasoningContent(json);
         String content = parser.extractChoiceContent(json);
-        Assert.assertFalse("content must not be empty",
-                content == null || content.trim().isEmpty());
+        boolean hasReasoning = reasoningContent != null && !reasoningContent.trim().isEmpty();
+        boolean hasContent   = content          != null && !content.trim().isEmpty();
+        Assert.assertTrue(
+                "model must produce at least some output in reasoning_content or content, " +
+                "but both were empty",
+                hasReasoning || hasContent);
     }
 }