Add ReasoningBudgetTest with Qwen3-0.6B to verify budget enforcement

claude · claude · commit 9a040b5a4235 · 2026-05-15T10:38:31.000Z
Adds an integration test class that loads a Qwen3-0.6B thinking model and asserts that reasoning_budget_tokens=0 produces empty reasoning_content, confirming the budget is enforced by llama.cpp's sampling layer. Also adds the Qwen3-0.6B-Q4_K_M model to the CI download and validation steps on all platforms. Tests skip cleanly when the model file is absent. https://claude.ai/code/session_01YUwM7xe9R45FsDCod1cjS7
diff --git a/.github/validate-models.bat b/.github/validate-models.bat
@@ -4,7 +4,7 @@ REM GGUF files start with magic bytes: 0x47 0x47 0x55 0x46 ("GGUF")
 
 setlocal enabledelayedexpansion
 
-set "MODELS=models\codellama-7b.Q2_K.gguf" "models\jina-reranker-v1-tiny-en-Q4_0.gguf" "models\AMD-Llama-135m-code.Q2_K.gguf"
+set "MODELS=models\codellama-7b.Q2_K.gguf" "models\jina-reranker-v1-tiny-en-Q4_0.gguf" "models\AMD-Llama-135m-code.Q2_K.gguf" "models\Qwen3-0.6B-Q4_K_M.gguf"
 
 echo Validating model files...
 for %%M in (%MODELS%) do (
diff --git a/.github/validate-models.sh b/.github/validate-models.sh
@@ -8,6 +8,7 @@ MODELS=(
   "models/codellama-7b.Q2_K.gguf"
   "models/jina-reranker-v1-tiny-en-Q4_0.gguf"
   "models/AMD-Llama-135m-code.Q2_K.gguf"
+  "models/Qwen3-0.6B-Q4_K_M.gguf"
 )
 
 echo "Validating model files..."
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -21,6 +21,8 @@ env:
   RERANKING_MODEL_NAME: "jina-reranker-v1-tiny-en-Q4_0.gguf"
   DRAFT_MODEL_URL: "https://huggingface.co/QuantFactory/AMD-Llama-135m-code-GGUF/resolve/main/AMD-Llama-135m-code.Q2_K.gguf"
   DRAFT_MODEL_NAME: "AMD-Llama-135m-code.Q2_K.gguf"
+  REASONING_MODEL_URL: "https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf"
+  REASONING_MODEL_NAME: "Qwen3-0.6B-Q4_K_M.gguf"
 permissions:
   contents: read
 jobs:
@@ -308,6 +310,8 @@ jobs:
         run: curl -L --fail ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
       - name: Download draft model
         run: curl -L --fail ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
+      - name: Download reasoning model
+        run: curl -L --fail ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
       - name: List files in models directory
         run: ls -l models/
       - name: Validate model files
@@ -357,6 +361,8 @@ jobs:
         run: curl -L --fail ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
       - name: Download draft model
         run: curl -L --fail ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
+      - name: Download reasoning model
+        run: curl -L --fail ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
       - name: List files in models directory
         run: ls -l models/
       - name: Validate model files
@@ -417,6 +423,8 @@ jobs:
         run: curl -L --fail ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
       - name: Download draft model
         run: curl -L --fail ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
+      - name: Download reasoning model
+        run: curl -L --fail ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
       - name: List files in models directory
         run: ls -l models/
       - name: Validate model files
@@ -468,6 +476,8 @@ jobs:
         run: curl -L --fail ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
       - name: Download draft model
         run: curl -L --fail ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
+      - name: Download reasoning model
+        run: curl -L --fail ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
       - name: List files in models directory
         run: ls -l models/
       - name: Validate model files
@@ -522,6 +532,8 @@ jobs:
         run: curl -L --fail $env:RERANKING_MODEL_URL --create-dirs -o models/$env:RERANKING_MODEL_NAME
       - name: Download draft model
         run: curl -L --fail $env:DRAFT_MODEL_URL --create-dirs -o models/$env:DRAFT_MODEL_NAME
+      - name: Download reasoning model
+        run: curl -L --fail $env:REASONING_MODEL_URL --create-dirs -o models/$env:REASONING_MODEL_NAME
       - name: List files in models directory
         run: ls -l models/
       - name: Validate model files
diff --git a/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java b/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java
@@ -0,0 +1,161 @@
+package net.ladenthin.llama;
+
+import java.io.File;
+import java.util.Collections;
+
+import net.ladenthin.llama.args.ReasoningFormat;
+import net.ladenthin.llama.json.ChatResponseParser;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.Assume;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Integration tests verifying that {@link InferenceParameters#setReasoningBudgetTokens(int)}
+ * is actually enforced by the llama.cpp sampling layer when running a thinking-capable model.
+ *
+ * <p>These tests require the Qwen3-0.6B-Q4_K_M model (downloaded by CI). When the model file
+ * is absent the entire class is skipped (same pattern as all other model-dependent test classes).
+ *
+ * <p>Background: a user reported that {@code setReasoningBudgetTokens()} appeared to have no
+ * effect on Qwen 3.0 0.6B / 3.5 0.8B. Possible root causes are:
+ * <ol>
+ *   <li>The model was not entering thinking mode (missing {@code enable_thinking=true} kwarg).</li>
+ *   <li>{@code reasoning_format} was not configured so thinking tokens were inline, not extracted.</li>
+ *   <li>The budget mechanism in llama.cpp does not work for this model family.</li>
+ * </ol>
+ *
+ * <p>Test 1 ({@link #testReasoningBudgetZero_suppressesThinking}) is the critical regression
+ * guard: with {@code reasoning_budget_tokens=0} and thinking explicitly enabled, the sampler
+ * must force-close the thinking block immediately, producing an empty {@code reasoning_content}.
+ * If this test fails, the budget parameter is being ignored.
+ */
+@ClaudeGenerated(
+        purpose = "Integration tests for setReasoningBudgetTokens() enforcement: verifies that " +
+                  "budget=0 suppresses thinking tokens, budget=-1 allows them, and that thinking " +
+                  "is absent when enable_thinking is false."
+)
+public class ReasoningBudgetTest {
+
+    private static LlamaModel model;
+    private final ChatResponseParser parser = new ChatResponseParser();
+
+    @BeforeClass
+    public static void setup() {
+        Assume.assumeTrue("Reasoning model not found, skipping ReasoningBudgetTest",
+                new File(TestConstants.REASONING_MODEL_PATH).exists());
+        int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL);
+        model = new LlamaModel(
+                new ModelParameters()
+                        .setModel(TestConstants.REASONING_MODEL_PATH)
+                        .setCtxSize(1024)
+                        .setGpuLayers(gpuLayers)
+                        .setFit(false)
+                        .setReasoningFormat(ReasoningFormat.DEEPSEEK)
+                        .enableLogTimestamps().enableLogPrefix()
+        );
+    }
+
+    @AfterClass
+    public static void tearDown() {
+        if (model != null) {
+            model.close();
+        }
+    }
+
+    /**
+     * With {@code reasoning_budget_tokens=0} the sampler must force-close the thinking block
+     * immediately after it opens, so {@code reasoning_content} must be empty.
+     *
+     * <p>This is the critical test: if it fails, the budget parameter is being silently ignored
+     * by llama.cpp's sampling layer for Qwen3 models.
+     */
+    @Test
+    public void testReasoningBudgetZero_suppressesThinking() {
+        InferenceParameters params = new InferenceParameters("")
+                .setMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?")))
+                .setChatTemplateKwargs(Collections.singletonMap("enable_thinking", "true"))
+                .setReasoningBudgetTokens(0)
+                .setNPredict(200);
+
+        String json = model.chatComplete(params);
+        String reasoningContent = parser.extractChoiceReasoningContent(json);
+
+        Assert.assertTrue(
+                "reasoning_content must be empty when reasoning_budget_tokens=0, got: " + reasoningContent,
+                reasoningContent == null || reasoningContent.trim().isEmpty()
+        );
+    }
+
+    /**
+     * With {@code reasoning_budget_tokens=-1} (unlimited) and thinking enabled the call must
+     * complete without error and produce a non-empty response. We do not assert that thinking
+     * tokens are present because a small model may answer directly even when thinking is enabled.
+     */
+    @Test
+    public void testReasoningBudgetUnlimited_completesSuccessfully() {
+        InferenceParameters params = new InferenceParameters("")
+                .setMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?")))
+                .setChatTemplateKwargs(Collections.singletonMap("enable_thinking", "true"))
+                .setReasoningBudgetTokens(-1)
+                .setNPredict(200);
+
+        String json = model.chatComplete(params);
+        Assert.assertNotNull("Response JSON must not be null", json);
+        String content = parser.extractChoiceContent(json);
+        Assert.assertFalse("Response content must not be empty",
+                content == null || content.trim().isEmpty());
+    }
+
+    /**
+     * Qwen3's chat template thinks by default, so thinking is switched off explicitly via
+     * {@code enable_thinking=false}; {@code reasoning_content} must then be absent regardless of budget.
+     */
+    @Test
+    public void testThinkingNotEnabled_reasoningContentAbsent() {
+        InferenceParameters params = new InferenceParameters("")
+                .setMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?")))
+                .setChatTemplateKwargs(Collections.singletonMap("enable_thinking", "false"))
+                .setReasoningBudgetTokens(-1).setNPredict(100);
+
+        String json = model.chatComplete(params);
+        String reasoningContent = parser.extractChoiceReasoningContent(json);
+
+        Assert.assertTrue(
+                "reasoning_content should be absent when thinking is not enabled, got: " + reasoningContent,
+                reasoningContent == null || reasoningContent.trim().isEmpty()
+        );
+    }
+
+    /**
+     * With a non-zero budget, generation must complete and produce a usable answer. If reasoning
+     * content is present, its length must be consistent with a 100-token budget (roughly 400–600
+     * characters for typical BPE tokenisation; 800 is a generous upper bound).
+     */
+    @Test
+    public void testReasoningBudgetLimited_doesNotExceedBudget() {
+        InferenceParameters params = new InferenceParameters("")
+                .setMessages(null, Collections.singletonList(
+                        new Pair<>("user", "Think step by step: what is 3 times 7?")))
+                .setChatTemplateKwargs(Collections.singletonMap("enable_thinking", "true"))
+                .setReasoningBudgetTokens(100)
+                .setNPredict(400);
+
+        String json = model.chatComplete(params);
+        String reasoningContent = parser.extractChoiceReasoningContent(json);
+        String content = parser.extractChoiceContent(json);
+
+        Assert.assertFalse("Response content must not be empty",
+                content == null || content.trim().isEmpty());
+
+        if (reasoningContent != null && !reasoningContent.trim().isEmpty()) {
+            // 100 tokens * ~4–6 chars/token = 400–600 chars; 800 is a generous upper bound
+            Assert.assertTrue(
+                    "Reasoning content length suggests budget was exceeded (length=" +
+                            reasoningContent.length() + ")",
+                    reasoningContent.length() <= 800
+            );
+        }
+    }
+}
diff --git a/src/test/java/net/ladenthin/llama/TestConstants.java b/src/test/java/net/ladenthin/llama/TestConstants.java
@@ -13,4 +13,7 @@ class TestConstants {
 	/** Path to the draft model used for speculative decoding tests. */
 	static final String DRAFT_MODEL_PATH = "models/AMD-Llama-135m-code.Q2_K.gguf";
 
+	/** Path to the Qwen3 thinking model used for reasoning budget tests. */
+	static final String REASONING_MODEL_PATH = "models/Qwen3-0.6B-Q4_K_M.gguf";
+
 }

Original file line number	Diff line number	Diff line change
`@@ -8,6 +8,7 @@ MODELS=(`
`8`	`8`	`"models/codellama-7b.Q2_K.gguf"`
`9`	`9`	`"models/jina-reranker-v1-tiny-en-Q4_0.gguf"`
`10`	`10`	`"models/AMD-Llama-135m-code.Q2_K.gguf"`
	`11`	`+ "models/Qwen3-0.6B-Q4_K_M.gguf"`
`11`	`12`	`)`
`12`	`13`
`13`	`14`	`echo "Validating model files..."`
Original file line number	Diff line number	Diff line change
`@@ -13,4 +13,7 @@ class TestConstants {`
`13`	`13`	`/** Path to the draft model used for speculative decoding tests. */`
`14`	`14`	`static final String DRAFT_MODEL_PATH = "models/AMD-Llama-135m-code.Q2_K.gguf";`
`15`	`15`
	`16`	`+ /** Path to the Qwen3 thinking model used for reasoning budget tests. */`
	`17`	`+ static final String REASONING_MODEL_PATH = "models/Qwen3-0.6B-Q4_K_M.gguf";`
	`18`	`+`
`16`	`19`	`}`