test: document real root cause of reasoning_budget_tokens being ignored

claude · claude · commit 30b47fc69f90 · 2026-05-15T12:11:50.000Z
The previous Javadoc blamed "prompt-injected thinking / sampler never fires". The actual bug is in oaicompat_chat_params_parse (server-common.cpp): the reasoning-budget block unconditionally writes the model-level default (-1) to llama_params["reasoning_budget_tokens"] before the generic copy loop runs. The copy loop then skips the per-request value from the request body because the key already exists, so the sampler is never created. Changes: - Update class Javadoc to describe the real bug (copy-loop skip in server-common.cpp) - Update testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed Javadoc to point at the actual fix location; assertion is unchanged (still documents broken state) - Add @Ignore testReasoningBudgetZero_expectedBehavior_suppressesThinking with the exact 3-line fix and instructions for when to enable it https://claude.ai/code/session_01YUwM7xe9R45FsDCod1cjS7
diff --git a/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java b/src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java
@@ -9,6 +9,7 @@
 import org.junit.Assert;
 import org.junit.Assume;
 import org.junit.BeforeClass;
+import org.junit.Ignore;
 import org.junit.Test;
 
 /**
@@ -27,15 +28,19 @@
  *       {@code --reasoning-format deepseek} at model load time causes the server to
  *       strip the {@code <think>…</think>} block from the response body and surface it
  *       in {@code reasoning_content}.</li>
- *   <li><b>{@code reasoning_budget_tokens} is NOT enforced for Qwen3.</b> This
- *       confirms the behaviour reported by users. The root cause: Qwen3 uses
- *       <em>prompt-injected</em> thinking — the chat template writes {@code <think>}
- *       into the prompt context, so generation starts already inside a thinking block.
- *       llama.cpp's reasoning-budget sampler monitors for a <em>generated</em>
- *       {@code <think>} token; since the token is already in the prompt it never
- *       triggers, and the budget counter never starts. This is a llama.cpp limitation,
- *       not a defect in parameter serialisation (which is separately verified by
- *       {@code InferenceParametersTest} and the C++ unit tests).</li>
+ *   <li><b>{@code reasoning_budget_tokens} is NOT enforced for any model when set
+ *       per-request.</b> The root cause is a bug in
+ *       {@code tools/server/server-common.cpp}, function
+ *       {@code oaicompat_chat_params_parse}: the reasoning-budget block writes
+ *       the model-level default ({@code opt.reasoning_budget}, typically &#x2212;1)
+ *       into {@code llama_params["reasoning_budget_tokens"]} before the generic
+ *       copy loop runs. The copy loop then skips the per-request value from the
+ *       request body because the key already exists
+ *       ({@code !llama_params.contains(item.key())} is false). Result: the
+ *       reasoning-budget sampler is never created (it requires
+ *       {@code reasoning_budget_tokens >= 0}), and any per-request budget
+ *       has no effect. Parameter serialisation itself is correct — see
+ *       {@code InferenceParametersTest} and the C++ unit tests.</li>
  * </ol>
  */
 @ClaudeGenerated(
@@ -102,15 +107,19 @@ public void testThinkingDefault_reasoningContentAndAnswerPresent() {
 
     /**
      * {@code reasoning_budget_tokens=0} is accepted by the API and the response
-     * completes without error.
+     * completes without error, but the budget is NOT enforced.
      *
-     * <p><b>Known limitation:</b> for Qwen3, the budget is <em>not</em> enforced.
-     * Qwen3's chat template injects {@code <think>} into the prompt, so generation
-     * begins already inside a thinking block. llama.cpp's reasoning-budget sampler
-     * only monitors for a <em>generated</em> {@code <think>} token; since it is already
-     * in the prompt context the sampler never fires. As a result {@code reasoning_content}
-     * remains non-empty despite the zero budget. This is a llama.cpp limitation, not a
-     * bug in parameter serialisation.
+     * <p><b>Documents current (broken) behaviour.</b> The per-request value is
+     * silently discarded by a bug in {@code tools/server/server-common.cpp}
+     * ({@code oaicompat_chat_params_parse}): the reasoning-budget block writes the
+     * model-level default (&#x2212;1) to {@code llama_params["reasoning_budget_tokens"]}
+     * before the generic copy loop runs, and the copy loop then skips the user value
+     * because the key already exists. The reasoning-budget sampler is therefore never
+     * created, and {@code reasoning_content} remains non-empty.
+     *
+     * <p>This assertion will start <b>failing</b> once the llama.cpp bug is fixed —
+     * that is the signal to remove this test and enable
+     * {@link #testReasoningBudgetZero_expectedBehavior_suppressesThinking}.
      */
     @Test
     public void testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed() {
@@ -121,18 +130,63 @@ public void testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed() {
 
         String json = model.chatComplete(params);
 
-        // The call must complete without throwing.
         Assert.assertNotNull("Response JSON must not be null", json);
 
-        // Document current (broken) behaviour: reasoning_content is non-empty even
-        // though budget=0 should have suppressed it.  This assertion will start FAILING
-        // once llama.cpp adds support for prompt-prefilled thinking contexts, which is
-        // the signal to flip it to assertFalse and close the limitation.
         String reasoningContent = parser.extractChoiceReasoningContent(json);
         Assert.assertFalse(
-                "reasoning_content is expected to be present because budget enforcement " +
-                "does not work for Qwen3 (prompt-injected thinking). " +
-                "If this assertion fails, budget enforcement has been fixed — update the test.",
+                "reasoning_content is expected to be present because the per-request " +
+                "budget is not applied (llama.cpp server-common.cpp copy-loop bug). " +
+                "If this assertion fails, the bug has been fixed — remove this test and " +
+                "enable testReasoningBudgetZero_expectedBehavior_suppressesThinking.",
+                reasoningContent == null || reasoningContent.trim().isEmpty());
+    }
+
+    /**
+     * Expected correct behaviour after the llama.cpp bug is fixed.
+     *
+     * <p><b>Bug:</b> In {@code tools/server/server-common.cpp},
+     * {@code oaicompat_chat_params_parse} sets
+     * {@code llama_params["reasoning_budget_tokens"]} to the model-level default
+     * ({@code opt.reasoning_budget}, typically &#x2212;1) before the generic copy
+     * loop runs. The copy loop then skips the per-request value from the request
+     * body because the key already exists. Result: the sampler is never created
+     * ({@code reasoning_budget_tokens >= 0} is required), and budget=0
+     * has no effect.
+     *
+     * <p><b>Fix (server-common.cpp, reasoning budget block):</b>
+     * Read {@code reasoning_budget_tokens} from the request body <em>before</em>
+     * writing to {@code llama_params}:
+     * <pre>
+     * int reasoning_budget = opt.reasoning_budget;
+     * if (body.contains("reasoning_budget_tokens")) {
+     *     reasoning_budget = json_value(body, "reasoning_budget_tokens", reasoning_budget);
+     * }
+     * if (reasoning_budget == -1 &amp;&amp; body.contains("thinking_budget_tokens")) {
+     *     reasoning_budget = json_value(body, "thinking_budget_tokens", -1);
+     * }
+     * </pre>
+     *
+     * <p>Once this fix is applied: remove {@code @Ignore}, confirm this test passes,
+     * and remove
+     * {@link #testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed}.
+     */
+    @Ignore("llama.cpp bug: per-request reasoning_budget_tokens is overwritten by model default " +
+            "in oaicompat_chat_params_parse (server-common.cpp). " +
+            "See Javadoc for exact fix location and code.")
+    @Test
+    public void testReasoningBudgetZero_expectedBehavior_suppressesThinking() {
+        InferenceParameters params = new InferenceParameters("")
+                .setMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?")))
+                .setReasoningBudgetTokens(0)
+                .setNPredict(N_PREDICT);
+
+        String json = model.chatComplete(params);
+        Assert.assertNotNull("Response JSON must not be null", json);
+
+        String reasoningContent = parser.extractChoiceReasoningContent(json);
+        Assert.assertTrue(
+                "reasoning_content should be empty when budget=0 suppresses thinking, " +
+                "but was: " + reasoningContent,
                 reasoningContent == null || reasoningContent.trim().isEmpty());
     }