Skip to content

Commit ce9fcfc

Browse files
Merge pull request #257 from bernardladenthin/claude/funny-thompson-y9xgxw
Fix flaky reasoning budget tests on Metal by using greedy sampling
2 parents 42f2f2a + 5829f53 commit ce9fcfc

1 file changed

Lines changed: 12 additions & 0 deletions

File tree

src/test/java/net/ladenthin/llama/ReasoningBudgetTest.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,11 +97,17 @@ public static void tearDown() {
9797
* Qwen3 enters thinking mode by default. With {@code reasoning_format=deepseek} set
9898
* at model level, the thinking tokens must appear in {@code reasoning_content} and
9999
* the final answer must appear in {@code content}.
100+
*
101+
* <p>{@code temperature=0} (greedy sampling) is used so the model deterministically
102+
* enters the {@code <think>} block on every platform, including Metal (macOS arm64)
103+
* where GPU floating-point arithmetic can produce slightly different logit
104+
* distributions, occasionally causing a non-thinking first token to be sampled.
100105
*/
101106
@Test
102107
public void testThinkingDefault_reasoningContentAndAnswerPresent() {
103108
InferenceParameters params = new InferenceParameters("")
104109
.withMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?")))
110+
.withTemperature(0.0f)
105111
.withNPredict(N_PREDICT);
106112

107113
String json = model.chatComplete(params);
@@ -132,11 +138,17 @@ public void testThinkingDefault_reasoningContentAndAnswerPresent() {
132138
* that is the signal to remove this test and enable
133139
* {@link #testReasoningBudgetZero_expectedBehavior_suppressesThinking}.
134140
* Tracked in <a href="https://github.com/ggml-org/llama.cpp/pull/23116">llama.cpp PR #23116</a>.
141+
*
142+
* <p>{@code temperature=0} (greedy sampling) is used so the model deterministically
143+
* enters the {@code <think>} block on every platform. Without it, Metal (macOS arm64)
144+
* occasionally samples a non-thinking first token even though the bug leaves the
145+
* reasoning budget effectively unlimited, causing a spurious test failure.
135146
*/
136147
@Test
137148
public void testReasoningBudgetZero_parameterAccepted_thinkingNotSuppressed() {
138149
InferenceParameters params = new InferenceParameters("")
139150
.withMessages(null, Collections.singletonList(new Pair<>("user", "What is 2+2?")))
151+
.withTemperature(0.0f)
140152
.withReasoningBudgetTokens(0)
141153
.withNPredict(N_PREDICT);
142154

0 commit comments

Comments
 (0)