Skip to content

Commit 9a040b5

Browse files
committed
Add ReasoningBudgetTest with Qwen3-0.6B to verify budget enforcement
Adds an integration test class that loads a Qwen3-0.6B thinking model and asserts that reasoning_budget_tokens=0 produces empty reasoning_content, confirming the budget is enforced by llama.cpp's sampling layer. Also adds the Qwen3-0.6B-Q4_K_M model to the CI download and validation steps on all platforms. Tests skip cleanly when the model file is absent. https://claude.ai/code/session_01YUwM7xe9R45FsDCod1cjS7
1 parent 9e0c4f2 commit 9a040b5

5 files changed

Lines changed: 178 additions & 1 deletion

File tree

.github/validate-models.bat

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ REM GGUF files start with magic bytes: 0x47 0x47 0x55 0x46 ("GGUF")
44

55
setlocal enabledelayedexpansion
66

7-
set "MODELS=models\codellama-7b.Q2_K.gguf" "models\jina-reranker-v1-tiny-en-Q4_0.gguf" "models\AMD-Llama-135m-code.Q2_K.gguf"
7+
set "MODELS=models\codellama-7b.Q2_K.gguf" "models\jina-reranker-v1-tiny-en-Q4_0.gguf" "models\AMD-Llama-135m-code.Q2_K.gguf" "models\Qwen3-0.6B-Q4_K_M.gguf"
88

99
echo Validating model files...
1010
for %%M in (%MODELS%) do (

.github/validate-models.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ MODELS=(
88
"models/codellama-7b.Q2_K.gguf"
99
"models/jina-reranker-v1-tiny-en-Q4_0.gguf"
1010
"models/AMD-Llama-135m-code.Q2_K.gguf"
11+
"models/Qwen3-0.6B-Q4_K_M.gguf"
1112
)
1213

1314
echo "Validating model files..."

.github/workflows/publish.yml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ env:
2121
RERANKING_MODEL_NAME: "jina-reranker-v1-tiny-en-Q4_0.gguf"
2222
DRAFT_MODEL_URL: "https://huggingface.co/QuantFactory/AMD-Llama-135m-code-GGUF/resolve/main/AMD-Llama-135m-code.Q2_K.gguf"
2323
DRAFT_MODEL_NAME: "AMD-Llama-135m-code.Q2_K.gguf"
24+
REASONING_MODEL_URL: "https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf"
25+
REASONING_MODEL_NAME: "Qwen3-0.6B-Q4_K_M.gguf"
2426
permissions:
2527
contents: read
2628
jobs:
@@ -308,6 +310,8 @@ jobs:
308310
run: curl -L --fail ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
309311
- name: Download draft model
310312
run: curl -L --fail ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
313+
- name: Download reasoning model
314+
run: curl -L --fail ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
311315
- name: List files in models directory
312316
run: ls -l models/
313317
- name: Validate model files
@@ -357,6 +361,8 @@ jobs:
357361
run: curl -L --fail ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
358362
- name: Download draft model
359363
run: curl -L --fail ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
364+
- name: Download reasoning model
365+
run: curl -L --fail ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
360366
- name: List files in models directory
361367
run: ls -l models/
362368
- name: Validate model files
@@ -417,6 +423,8 @@ jobs:
417423
run: curl -L --fail ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
418424
- name: Download draft model
419425
run: curl -L --fail ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
426+
- name: Download reasoning model
427+
run: curl -L --fail ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
420428
- name: List files in models directory
421429
run: ls -l models/
422430
- name: Validate model files
@@ -468,6 +476,8 @@ jobs:
468476
run: curl -L --fail ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
469477
- name: Download draft model
470478
run: curl -L --fail ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
479+
- name: Download reasoning model
480+
run: curl -L --fail ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
471481
- name: List files in models directory
472482
run: ls -l models/
473483
- name: Validate model files
@@ -522,6 +532,8 @@ jobs:
522532
run: curl -L --fail $env:RERANKING_MODEL_URL --create-dirs -o models/$env:RERANKING_MODEL_NAME
523533
- name: Download draft model
524534
run: curl -L --fail $env:DRAFT_MODEL_URL --create-dirs -o models/$env:DRAFT_MODEL_NAME
535+
- name: Download reasoning model
536+
run: curl -L --fail $env:REASONING_MODEL_URL --create-dirs -o models/$env:REASONING_MODEL_NAME
525537
- name: List files in models directory
526538
run: ls -l models/
527539
- name: Validate model files
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
package net.ladenthin.llama;
2+
3+
import java.io.File;
4+
import java.util.Collections;
5+
6+
import net.ladenthin.llama.args.ReasoningFormat;
7+
import net.ladenthin.llama.json.ChatResponseParser;
8+
import org.junit.AfterClass;
9+
import org.junit.Assert;
10+
import org.junit.Assume;
11+
import org.junit.BeforeClass;
12+
import org.junit.Test;
13+
14+
/**
15+
* Integration tests verifying that {@link InferenceParameters#setReasoningBudgetTokens(int)}
16+
* is actually enforced by the llama.cpp sampling layer when running a thinking-capable model.
17+
*
18+
* <p>These tests require the Qwen3-0.6B-Q4_K_M model (downloaded by CI). When the model file
19+
* is absent the entire class is skipped (same pattern as all other model-dependent test classes).
20+
*
21+
* <p>Background: a user reported that {@code setReasoningBudgetTokens()} appeared to have no
22+
* effect on Qwen 3.0 0.6B / 3.5 0.8B. Possible root causes are:
23+
* <ol>
24+
* <li>The model was not entering thinking mode (missing {@code enable_thinking=true} kwarg).</li>
25+
* <li>{@code reasoning_format} was not configured so thinking tokens were inline, not extracted.</li>
26+
* <li>The budget mechanism in llama.cpp does not work for this model family.</li>
27+
* </ol>
28+
*
29+
* <p>Test 1 ({@link #testReasoningBudgetZero_suppressesThinking}) is the critical regression
30+
* guard: with {@code reasoning_budget_tokens=0} and thinking explicitly enabled, the sampler
31+
* must force-close the thinking block immediately, producing an empty {@code reasoning_content}.
32+
* If this test fails, the budget parameter is being ignored.
33+
*/
34+
@ClaudeGenerated(
35+
purpose = "Integration tests for setReasoningBudgetTokens() enforcement: verifies that " +
36+
"budget=0 suppresses thinking tokens, budget=-1 allows them, and that thinking " +
37+
"is absent when enable_thinking is not set."
38+
)
39+
public class ReasoningBudgetTest {
40+
41+
private static LlamaModel model;
42+
private final ChatResponseParser parser = new ChatResponseParser();
43+
44+
@BeforeClass
45+
public static void setup() {
46+
Assume.assumeTrue("Reasoning model not found, skipping ReasoningBudgetTest",
47+
new File(TestConstants.REASONING_MODEL_PATH).exists());
48+
int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL);
49+
model = new LlamaModel(
50+
new ModelParameters()
51+
.setModel(TestConstants.REASONING_MODEL_PATH)
52+
.setCtxSize(1024)
53+
.setGpuLayers(gpuLayers)
54+
.setFit(false)
55+
.setReasoningFormat(ReasoningFormat.DEEPSEEK)
56+
.enableLogTimestamps().enableLogPrefix()
57+
);
58+
}
59+
60+
@AfterClass
61+
public static void tearDown() {
62+
if (model != null) {
63+
model.close();
64+
}
65+
}
66+
67+
/**
68+
* With {@code reasoning_budget_tokens=0} the sampler must force-close the thinking block
69+
* immediately after it opens, so {@code reasoning_content} must be empty.
70+
*
71+
* <p>This is the critical test: if it fails, the budget parameter is being silently ignored
72+
* by llama.cpp's sampling layer for Qwen3 models.
73+
*/
74+
@Test
75+
public void testReasoningBudgetZero_suppressesThinking() {
76+
InferenceParameters params = new InferenceParameters("")
77+
.setMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?")))
78+
.setChatTemplateKwargs(Collections.singletonMap("enable_thinking", "true"))
79+
.setReasoningBudgetTokens(0)
80+
.setNPredict(200);
81+
82+
String json = model.chatComplete(params);
83+
String reasoningContent = parser.extractChoiceReasoningContent(json);
84+
85+
Assert.assertTrue(
86+
"reasoning_content must be empty when reasoning_budget_tokens=0, got: " + reasoningContent,
87+
reasoningContent == null || reasoningContent.trim().isEmpty()
88+
);
89+
}
90+
91+
/**
92+
* With {@code reasoning_budget_tokens=-1} (unlimited) and thinking enabled the call must
93+
* complete without error and produce a non-empty response. We do not assert that thinking
94+
* tokens are present because a small model may answer directly even when thinking is enabled.
95+
*/
96+
@Test
97+
public void testReasoningBudgetUnlimited_completesSuccessfully() {
98+
InferenceParameters params = new InferenceParameters("")
99+
.setMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?")))
100+
.setChatTemplateKwargs(Collections.singletonMap("enable_thinking", "true"))
101+
.setReasoningBudgetTokens(-1)
102+
.setNPredict(200);
103+
104+
String json = model.chatComplete(params);
105+
Assert.assertNotNull("Response JSON must not be null", json);
106+
String content = parser.extractChoiceContent(json);
107+
Assert.assertFalse("Response content must not be empty",
108+
content == null || content.trim().isEmpty());
109+
}
110+
111+
/**
112+
* Without {@code enable_thinking=true} in chat template kwargs, Qwen3 should not emit
113+
* thinking tokens. {@code reasoning_content} must be absent regardless of budget.
114+
*/
115+
@Test
116+
public void testThinkingNotEnabled_reasoningContentAbsent() {
117+
InferenceParameters params = new InferenceParameters("")
118+
.setMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?")))
119+
.setReasoningBudgetTokens(-1)
120+
.setNPredict(100);
121+
122+
String json = model.chatComplete(params);
123+
String reasoningContent = parser.extractChoiceReasoningContent(json);
124+
125+
Assert.assertTrue(
126+
"reasoning_content should be absent when thinking is not enabled, got: " + reasoningContent,
127+
reasoningContent == null || reasoningContent.trim().isEmpty()
128+
);
129+
}
130+
131+
/**
132+
* With a non-zero budget, generation must complete and produce a usable answer. If reasoning
133+
* content is present, its length must be consistent with a 100-token budget (roughly 400–600
134+
* characters for typical BPE tokenisation; 800 is a generous upper bound).
135+
*/
136+
@Test
137+
public void testReasoningBudgetLimited_doesNotExceedBudget() {
138+
InferenceParameters params = new InferenceParameters("")
139+
.setMessages(null, Collections.singletonList(
140+
new Pair<>("user", "Think step by step: what is 3 times 7?")))
141+
.setChatTemplateKwargs(Collections.singletonMap("enable_thinking", "true"))
142+
.setReasoningBudgetTokens(100)
143+
.setNPredict(400);
144+
145+
String json = model.chatComplete(params);
146+
String reasoningContent = parser.extractChoiceReasoningContent(json);
147+
String content = parser.extractChoiceContent(json);
148+
149+
Assert.assertFalse("Response content must not be empty",
150+
content == null || content.trim().isEmpty());
151+
152+
if (reasoningContent != null && !reasoningContent.trim().isEmpty()) {
153+
// 100 tokens * ~4–6 chars/token = 400–600 chars; 800 is a generous upper bound
154+
Assert.assertTrue(
155+
"Reasoning content length suggests budget was exceeded (length=" +
156+
reasoningContent.length() + ")",
157+
reasoningContent.length() <= 800
158+
);
159+
}
160+
}
161+
}

src/test/java/net/ladenthin/llama/TestConstants.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,7 @@ class TestConstants {
1313
/** Path to the draft model used for speculative decoding tests. */
1414
static final String DRAFT_MODEL_PATH = "models/AMD-Llama-135m-code.Q2_K.gguf";
1515

16+
/** Path to the Qwen3 thinking model used for reasoning budget tests. */
17+
static final String REASONING_MODEL_PATH = "models/Qwen3-0.6B-Q4_K_M.gguf";
18+
1619
}

0 commit comments

Comments
 (0)