99import org .junit .Assert ;
1010import org .junit .Assume ;
1111import org .junit .BeforeClass ;
12+ import org .junit .Ignore ;
1213import org .junit .Test ;
1314
1415/**
2728 * {@code --reasoning-format deepseek} at model load time causes the server to
2829 * strip the {@code <think>…</think>} block from the response body and surface it
2930 * in {@code reasoning_content}.</li>
30- * <li><b>{@code reasoning_budget_tokens} is NOT enforced for Qwen3.</b> This
31- * confirms the behaviour reported by users. The root cause: Qwen3 uses
32- * <em>prompt-injected</em> thinking — the chat template writes {@code <think>}
33- * into the prompt context, so generation starts already inside a thinking block.
34- * llama.cpp's reasoning-budget sampler monitors for a <em>generated</em>
35- * {@code <think>} token; since the token is already in the prompt it never
36- * triggers, and the budget counter never starts. This is a llama.cpp limitation,
37- * not a defect in parameter serialisation (which is separately verified by
38- * {@code InferenceParametersTest} and the C++ unit tests).</li>
31+ * <li><b>{@code reasoning_budget_tokens} is NOT enforced for any model when set
32+ * per-request.</b> The root cause is a bug in
33+ * {@code tools/server/server-common.cpp}, function
34+ * {@code oaicompat_chat_params_parse}: the reasoning-budget block writes
35+ * the model-level default ({@code opt.reasoning_budget}, typically −1)
36+ * into {@code llama_params["reasoning_budget_tokens"]} before the generic
37+ * copy loop runs. The copy loop then skips the per-request value from the
38+ * request body because the key already exists
39+ * ({@code !llama_params.contains(item.key())} is false). Result: the
40+ * reasoning-budget sampler is never created (it requires
41+ * {@code reasoning_budget_tokens ≥ 0}), and any per-request budget
42+ * has no effect. Parameter serialisation itself is correct — see
43+ * {@code InferenceParametersTest} and the C++ unit tests.</li>
3944 * </ol>
4045 */
4146@ ClaudeGenerated (
@@ -102,15 +107,19 @@ public void testThinkingDefault_reasoningContentAndAnswerPresent() {
102107
103108 /**
104109 * {@code reasoning_budget_tokens=0} is accepted by the API and the response
105- * completes without error.
110+ * completes without error, but the budget is NOT enforced.
106111 *
107- * <p><b>Known limitation:</b> for Qwen3, the budget is <em>not</em> enforced.
108- * Qwen3's chat template injects {@code <think>} into the prompt, so generation
109- * begins already inside a thinking block. llama.cpp's reasoning-budget sampler
110- * only monitors for a <em>generated</em> {@code <think>} token; since it is already
111- * in the prompt context the sampler never fires. As a result {@code reasoning_content}
112- * remains non-empty despite the zero budget. This is a llama.cpp limitation, not a
113- * bug in parameter serialisation.
112+ * <p><b>Documents current (broken) behaviour.</b> The per-request value is
113+ * silently discarded by a bug in {@code tools/server/server-common.cpp}
114+ * ({@code oaicompat_chat_params_parse}): the reasoning-budget block writes the
115+ * model-level default (−1) to {@code llama_params["reasoning_budget_tokens"]}
116+ * before the generic copy loop runs, and the copy loop then skips the user value
117+ * because the key already exists. The reasoning-budget sampler is therefore never
118+ * created, and {@code reasoning_content} remains non-empty.
119+ *
120+ * <p>This assertion will start <b>failing</b> once the llama.cpp bug is fixed —
121+ * that is the signal to remove this test and enable
122+ * {@link #testReasoningBudgetZero_expectedBehavior_suppressesThinking}.
114123 */
115124 @ Test
116125 public void testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed () {
@@ -121,18 +130,63 @@ public void testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed() {
121130
122131 String json = model .chatComplete (params );
123132
124- // The call must complete without throwing.
125133 Assert .assertNotNull ("Response JSON must not be null" , json );
126134
127- // Document current (broken) behaviour: reasoning_content is non-empty even
128- // though budget=0 should have suppressed it. This assertion will start FAILING
129- // once llama.cpp adds support for prompt-prefilled thinking contexts, which is
130- // the signal to flip it to assertFalse and close the limitation.
131135 String reasoningContent = parser .extractChoiceReasoningContent (json );
132136 Assert .assertFalse (
133- "reasoning_content is expected to be present because budget enforcement " +
134- "does not work for Qwen3 (prompt-injected thinking). " +
135- "If this assertion fails, budget enforcement has been fixed — update the test." ,
137+ "reasoning_content is expected to be present because the per-request " +
138+ "budget is not applied (llama.cpp server-common.cpp copy-loop bug). " +
139+ "If this assertion fails, the bug has been fixed — remove this test and " +
140+ "enable testReasoningBudgetZero_expectedBehavior_suppressesThinking." ,
141+ reasoningContent == null || reasoningContent .trim ().isEmpty ());
142+ }
143+
144+ /**
145+ * Expected correct behaviour after the llama.cpp bug is fixed.
146+ *
147+ * <p><b>Bug:</b> In {@code tools/server/server-common.cpp},
148+ * {@code oaicompat_chat_params_parse} sets
149+ * {@code llama_params["reasoning_budget_tokens"]} to the model-level default
150+ * ({@code opt.reasoning_budget}, typically −1) before the generic copy
151+ * loop runs. The copy loop then skips the per-request value from the request
152+ * body because the key already exists. Result: the sampler is never created
153+ * ({@code reasoning_budget_tokens ≥ 0} is required), and budget=0
154+ * has no effect.
155+ *
156+ * <p><b>Fix (server-common.cpp, reasoning budget block):</b>
157+ * Read {@code reasoning_budget_tokens} from the request body <em>before</em>
158+ * writing to {@code llama_params}:
159+ * <pre>{@code
160+ * int reasoning_budget = opt.reasoning_budget;
161+ * if (body.contains("reasoning_budget_tokens")) {
162+ * reasoning_budget = json_value(body, "reasoning_budget_tokens", reasoning_budget);
163+ * }
164+ * if (reasoning_budget == -1 && body.contains("thinking_budget_tokens")) {
165+ * reasoning_budget = json_value(body, "thinking_budget_tokens", -1);
166+ * }
167+ * }</pre>
168+ *
169+ * <p>Once this fix is applied: remove {@code @Ignore}, confirm this test passes,
170+ * and remove
171+ * {@link #testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed}.
172+ */
173+ @ Ignore ("llama.cpp bug: per-request reasoning_budget_tokens is overwritten by model default " +
174+ "in oaicompat_chat_params_parse (server-common.cpp). " +
175+ "See Javadoc for exact fix location and code." )
176+ @ Test
177+ public void testReasoningBudgetZero_expectedBehavior_suppressesThinking () {
178+ InferenceParameters params = new InferenceParameters ("" )
179+ .setMessages (null , Collections .singletonList (new Pair <>("user" , "What is 2+2?" )))
180+ .setReasoningBudgetTokens (0 )
181+ .setNPredict (N_PREDICT );
182+
183+ String json = model .chatComplete (params );
184+ Assert .assertNotNull ("Response JSON must not be null" , json );
185+
186+ String reasoningContent = parser .extractChoiceReasoningContent (json );
187+ Assert .assertTrue (
188+ "reasoning_content should be empty when budget=0 suppresses thinking, " +
189+ "but was: " + reasoningContent ,
136190 reasoningContent == null || reasoningContent .trim ().isEmpty ());
137191 }
138192
0 commit comments