Skip to content

Commit 1c6c3ad

Browse files
localai-bot and mudler authored
fix(reasoning): stop <think> leaking into content when autoparser is in pure-content mode (#9991)
When LocalAI templates a thinking model outside of jinja (the default for the qwen3 gallery family), llama.cpp's chat parser falls back to a "pure content" PEG parser that dumps the entire raw response into ChatDelta.Content with an empty ReasoningContent. The Go side then trusted that content verbatim and overrode tokenCallback's correctly-split reasoning, so <think>...</think> blocks ended up in the OpenAI `content` field. This is a regression from v4.0.0, introduced when the autoparser ChatDeltas path was added (#9224).

The override now runs Go-side reasoning extraction defensively when the autoparser delivered content but no reasoning. The streaming worker gains a sticky preferAutoparser flag that flips on the first chunk where the autoparser classified reasoning_content; until then we use the streaming Go-side extractor. Realtime mirrors the non-streaming fallback. When the autoparser already populated ReasoningContent we trust it untouched, so jinja-enabled installs are not regressed.

gallery/qwen3.yaml now enables use_jinja, letting the autoparser classify <think> natively for all 20+ qwen3 family entries that share this template.

Fixes #9985

Assisted-by: Claude:opus-4-7 [Read] [Edit] [Bash] [Write]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
1 parent c2cd3b9 commit 1c6c3ad

6 files changed

Lines changed: 184 additions & 51 deletions

File tree

core/http/endpoints/openai/chat.go

Lines changed: 53 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,57 @@ func mergeToolCallDeltas(existing []schema.ToolCall, deltas []schema.ToolCall) [
6868
return existing
6969
}
7070

71+
// applyAutoparserOverride replaces the Go-side reasoning-extraction result with
72+
// the C++ autoparser's classified ChatDeltas when those deltas contain
73+
// actionable content or reasoning. It preserves the first existing choice's logprobs.
74+
//
75+
// When the autoparser did not classify any reasoning (deltaReasoning == "") but
76+
// deltaContent still carries an unparsed reasoning tag pair (e.g. the
77+
// non-jinja "pure content" fallback path on a <think> model — issue #9985),
78+
// the Go-side reasoning extractor is run on deltaContent as a defensive
79+
// fallback so <think>…</think> blocks do not leak into the OpenAI `content`
80+
// field.
//
// Parameters:
//   - chatDeltas: the autoparser deltas; content and reasoning are read from
//     them via functions.ContentFromChatDeltas and
//     functions.ReasoningFromChatDeltas.
//   - thinkingStartToken, reasoningConfig: passed through unchanged to
//     reason.ExtractReasoningWithConfig when the fallback extraction runs.
//   - existing: the result computed so far. It is returned as-is when there is
//     nothing to override; otherwise only existing[0].Logprobs is carried over.
//
// Returns existing untouched when chatDeltas is empty or yields neither content
// nor reasoning. Otherwise returns a single-element slice holding one assistant
// choice with Index 0 and FinishReason FinishReasonStop.
81+
func applyAutoparserOverride(
82+
chatDeltas []*pb.ChatDelta,
83+
thinkingStartToken string,
84+
reasoningConfig reason.Config,
85+
existing []schema.Choice,
86+
) []schema.Choice {
87+
if len(chatDeltas) == 0 {
88+
return existing
89+
}
90+
deltaContent := functions.ContentFromChatDeltas(chatDeltas)
91+
deltaReasoning := functions.ReasoningFromChatDeltas(chatDeltas)
92+
// Deltas were delivered but carry nothing usable: keep the existing result.
if deltaContent == "" && deltaReasoning == "" {
93+
return existing
94+
}
95+
// Fallback for non-jinja models (issue #9985): when the C++ autoparser
96+
// did not classify reasoning but the raw content still contains a known
97+
// reasoning tag pair, run Go-side extraction on the content so that the
98+
// <think>…</think> block does not leak into the OpenAI `content` field.
99+
// When the autoparser DID populate ReasoningContent, leave its
100+
// content/reasoning split alone — trust the parser. Inside this branch
101+
// deltaContent is overwritten without first checking for a tag pair,
102+
// because ExtractReasoningWithConfig is expected to be a no-op when none
103+
// matches; this also strips empty thinking blocks like "<think></think>"
104+
// that some models emit when reasoning is disabled.
105+
if deltaReasoning == "" && deltaContent != "" {
106+
deltaReasoning, deltaContent = reason.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, reasoningConfig)
107+
}
108+
// The log prefix names the non-streaming, no-tools call site; this helper
// does not check for tools itself, so callers are expected to gate on that.
xlog.Debug("[ChatDeltas] non-SSE no-tools: overriding result with C++ autoparser deltas",
109+
"content_len", len(deltaContent), "reasoning_len", len(deltaReasoning))
110+
// The override always reports a normal stop for a single assistant choice.
stopReason := FinishReasonStop
111+
// Content is always set (possibly to an empty string); Reasoning is only
// attached when it is non-empty.
message := &schema.Message{Role: "assistant", Content: &deltaContent}
112+
if deltaReasoning != "" {
113+
message.Reasoning = &deltaReasoning
114+
}
115+
newChoice := schema.Choice{FinishReason: &stopReason, Index: 0, Message: message}
116+
// Carry over logprobs from the first existing choice, if any; every other
// field of existing is dropped by the override.
if len(existing) > 0 && existing[0].Logprobs != nil {
117+
newChoice.Logprobs = existing[0].Logprobs
118+
}
119+
return []schema.Choice{newChoice}
120+
}
121+
71122
// ChatEndpoint is the OpenAI Completion API endpoint https://platform.openai.com/docs/api-reference/chat/create
72123
// @Summary Generate a chat completions for a given prompt and model.
73124
// @Tags inference
@@ -757,24 +808,8 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
757808
// For non-tool requests: prefer C++ autoparser chat deltas over
758809
// Go-side tag extraction (which can mangle output when thinkingStartToken
759810
// differs from the model's actual reasoning tags, e.g. Gemma 4).
760-
if !shouldUseFn && len(chatDeltas) > 0 {
761-
deltaContent := functions.ContentFromChatDeltas(chatDeltas)
762-
deltaReasoning := functions.ReasoningFromChatDeltas(chatDeltas)
763-
if deltaContent != "" || deltaReasoning != "" {
764-
xlog.Debug("[ChatDeltas] non-SSE no-tools: overriding result with C++ autoparser deltas",
765-
"content_len", len(deltaContent), "reasoning_len", len(deltaReasoning))
766-
stopReason := FinishReasonStop
767-
message := &schema.Message{Role: "assistant", Content: &deltaContent}
768-
if deltaReasoning != "" {
769-
message.Reasoning = &deltaReasoning
770-
}
771-
newChoice := schema.Choice{FinishReason: &stopReason, Index: 0, Message: message}
772-
// Preserve logprobs from the original result
773-
if len(result) > 0 && result[0].Logprobs != nil {
774-
newChoice.Logprobs = result[0].Logprobs
775-
}
776-
result = []schema.Choice{newChoice}
777-
}
811+
if !shouldUseFn {
812+
result = applyAutoparserOverride(chatDeltas, thinkingStartToken, config.ReasoningConfig, result)
778813
}
779814

780815
// Tool parsing is deferred here (only when shouldUseFn) so chat deltas are available

core/http/endpoints/openai/chat_stream_workers.go

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,13 @@ func processStream(
5252
thinkingStartToken := reason.DetectThinkingStartToken(template, &cfg.ReasoningConfig)
5353
extractor := reason.NewReasoningExtractor(thinkingStartToken, cfg.ReasoningConfig)
5454

55+
// preferAutoparser is sticky: once the C++ autoparser has ever classified
56+
// reasoning_content, we trust it for the rest of the stream. Until then we
57+
// fall back to Go-side extraction so that a "pure content" autoparser
58+
// (non-jinja path, issue #9985) does not leak <think>…</think> tokens
59+
// straight into the OpenAI `content` field.
60+
preferAutoparser := false
61+
5562
_, finalUsage, _, err := ComputeChoices(req, s, cfg, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
5663
var reasoningDelta, contentDelta string
5764

@@ -64,8 +71,16 @@ func processStream(
6471
// Otherwise fall back to Go-side extraction.
6572
if tokenUsage.HasChatDeltaContent() {
6673
rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
67-
contentDelta = cd
68-
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
74+
if rawReasoning != "" {
75+
preferAutoparser = true
76+
}
77+
if preferAutoparser {
78+
contentDelta = cd
79+
reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
80+
} else {
81+
reasoningDelta = goReasoning
82+
contentDelta = goContent
83+
}
6984
} else {
7085
reasoningDelta = goReasoning
7186
contentDelta = goContent

core/http/endpoints/openai/chat_test.go

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ package openai
33
import (
44
"github.com/mudler/LocalAI/core/config"
55
"github.com/mudler/LocalAI/pkg/functions"
6+
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
7+
reason "github.com/mudler/LocalAI/pkg/reasoning"
68
. "github.com/onsi/ginkgo/v2"
79
. "github.com/onsi/gomega"
810

@@ -94,6 +96,98 @@ var _ = Describe("handleQuestion", func() {
9496
})
9597
})
9698

99+
var _ = Describe("applyAutoparserOverride", func() {
100+
// Regression test for https://github.com/mudler/LocalAI/issues/9985.
101+
// When LocalAI templates a <think>-style reasoning model outside of jinja
102+
// (e.g. the gallery qwen3 entry), the llama.cpp autoparser falls back to
103+
// the "pure content" PEG parser which dumps the entire raw response,
104+
// including <think>…</think>, into ChatDelta.Content and leaves
105+
// ChatDelta.ReasoningContent empty. The Go side previously trusted that
106+
// content verbatim and clobbered the tokenCallback's correctly-split
107+
// reasoning, so <think> blocks leaked into the OpenAI `content` field.
//
// NOTE(review): every spec below is nested in this single Context, including
// ones that do not exercise the #9985 path (autoparser-populated reasoning,
// plain content, empty chatDeltas) — consider sibling Contexts so the
// reported spec names match what is being tested.
//
// NOTE(review): two paths of applyAutoparserOverride have no spec here: the
// early return when deltas are present but yield neither content nor
// reasoning, and the carry-over of existing[0].Logprobs.
108+
Context("autoparser delivered content with embedded <think> tags and empty reasoning (issue #9985)", func() {
109+
It("splits <think>…</think> out of content into the reasoning field", func() {
110+
raw := "<think>\nOkay, the user said \"Hello\". I should reply warmly.\n</think>\n\nHello! How can I assist you today? 😊"
111+
chatDeltas := []*pb.ChatDelta{
112+
{Content: raw, ReasoningContent: ""},
113+
}
114+
115+
// Empty thinkingStartToken and a zero-value reason.Config: presumably the
// extractor's defaults recognize the <think>…</think> pair — this spec
// depends on that.
result := applyAutoparserOverride(chatDeltas, "", reason.Config{}, nil)
116+
117+
Expect(result).To(HaveLen(1))
118+
Expect(result[0].Message).ToNot(BeNil())
119+
Expect(result[0].Message.Content).ToNot(BeNil())
120+
121+
// Message.Content is an interface value; the unchecked type assertion
// panics (failing the spec) if it does not hold a *string.
content := *(result[0].Message.Content.(*string))
122+
Expect(content).ToNot(ContainSubstring("<think>"),
123+
"raw <think> tag must not leak into OpenAI content field")
124+
Expect(content).ToNot(ContainSubstring("</think>"),
125+
"raw </think> tag must not leak into OpenAI content field")
126+
Expect(content).To(ContainSubstring("Hello! How can I assist you today?"),
127+
"the model's actual answer must still be in content")
128+
129+
Expect(result[0].Message.Reasoning).ToNot(BeNil(),
130+
"reasoning extracted from <think>…</think> must populate Reasoning")
131+
Expect(*result[0].Message.Reasoning).To(ContainSubstring("Okay, the user said"))
132+
})
133+
134+
It("does not run extraction when the autoparser already populated reasoning", func() {
135+
// When the autoparser actually classified reasoning, leave its
136+
// content/reasoning split untouched.
137+
content := "Hello! How can I assist you today?"
138+
reasoning := "Already split by the C++ autoparser."
139+
chatDeltas := []*pb.ChatDelta{
140+
{Content: content, ReasoningContent: reasoning},
141+
}
142+
143+
result := applyAutoparserOverride(chatDeltas, "", reason.Config{}, nil)
144+
145+
Expect(result).To(HaveLen(1))
146+
// Both fields must come back exactly as the autoparser delivered them.
Expect(*(result[0].Message.Content.(*string))).To(Equal(content))
147+
Expect(result[0].Message.Reasoning).ToNot(BeNil())
148+
Expect(*result[0].Message.Reasoning).To(Equal(reasoning))
149+
})
150+
151+
It("passes plain content through unchanged when no reasoning tags are present", func() {
152+
content := "Just a normal answer with no reasoning at all."
153+
chatDeltas := []*pb.ChatDelta{
154+
{Content: content, ReasoningContent: ""},
155+
}
156+
157+
result := applyAutoparserOverride(chatDeltas, "", reason.Config{}, nil)
158+
159+
Expect(result).To(HaveLen(1))
160+
Expect(*(result[0].Message.Content.(*string))).To(Equal(content))
161+
// With no tags to extract, the Reasoning pointer must stay unset.
Expect(result[0].Message.Reasoning).To(BeNil())
162+
})
163+
164+
It("strips an empty <think></think> block (qwen3 /no_think mode)", func() {
165+
// qwen3 with the /no_think directive still emits an empty thinking
166+
// block. The Go-side fallback must strip it from content rather than
167+
// pass <think></think> through verbatim. No reasoning is set because
168+
// the block has no body.
169+
raw := "<think>\n\n</think>\n\nHello! How can I assist you today?"
170+
chatDeltas := []*pb.ChatDelta{
171+
{Content: raw, ReasoningContent: ""},
172+
}
173+
174+
result := applyAutoparserOverride(chatDeltas, "", reason.Config{}, nil)
175+
176+
Expect(result).To(HaveLen(1))
177+
content := *(result[0].Message.Content.(*string))
178+
Expect(content).ToNot(ContainSubstring("<think>"))
179+
Expect(content).ToNot(ContainSubstring("</think>"))
180+
Expect(content).To(ContainSubstring("Hello! How can I assist you today?"))
// NOTE(review): the comment above states that no reasoning is set, but no
// expectation checks Message.Reasoning in this spec — consider asserting
// it is nil once that is confirmed against the extractor's behavior.
181+
})
182+
183+
It("returns the existing result when chatDeltas is empty", func() {
184+
// A nil delta slice must hand back the caller's slice unchanged.
existing := []schema.Choice{{Index: 7}}
185+
result := applyAutoparserOverride(nil, "", reason.Config{}, existing)
186+
Expect(result).To(Equal(existing))
187+
})
188+
})
189+
})
190+
97191
var _ = Describe("mergeToolCallDeltas", func() {
98192
Context("with new tool calls", func() {
99193
It("should append new tool calls", func() {

core/http/endpoints/openai/realtime.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1572,6 +1572,15 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
15721572
"tool_calls", len(deltaToolCalls),
15731573
"content_len", len(deltaContent),
15741574
"reasoning_len", len(deltaReasoning))
1575+
// Issue #9985: when the autoparser only delivered content (no
1576+
// reasoning_content), it may be running in the "pure content"
1577+
// PEG fallback (non-jinja path) which leaves <think>…</think>
1578+
// embedded in the content. Run Go-side extraction defensively.
1579+
// ExtractReasoningWithConfig is a no-op when no tag pair matches,
1580+
// so it's safe to apply unconditionally in the no-reasoning branch.
1581+
if deltaReasoning == "" && deltaContent != "" {
1582+
deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, config.ReasoningConfig)
1583+
}
15751584
reasoningText = deltaReasoning
15761585
responseWithoutReasoning = deltaContent
15771586
textContent = deltaContent

core/http/endpoints/openresponses/responses.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1971,6 +1971,10 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
19711971

19721972
// Source reasoning from: (1) ChatDeltas from C++ autoparser, (2) extractor's
19731973
// streaming state, (3) final extraction from the finetuned result.
1974+
// Issue #9985: when the autoparser delivered Content but no
1975+
// ReasoningContent, it was running in the "pure content" PEG fallback
1976+
// (non-jinja path) which leaves reasoning tags embedded in content.
1977+
// Fall back to the streaming Go-side extractor's split in that case.
19741978
if chatDeltaReasoning := functions.ReasoningFromChatDeltas(chatDeltas); chatDeltaReasoning != "" {
19751979
finalReasoning = chatDeltaReasoning
19761980
finalCleanedResult = functions.ContentFromChatDeltas(chatDeltas)

gallery/qwen3.yaml

Lines changed: 7 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -11,36 +11,12 @@ config_file: |
1111
- <dummy32000>
1212
- </s>
1313
- <|endoftext|>
14+
# Delegate templating to llama.cpp's jinja runtime so the C++ autoparser
15+
# can classify <think>…</think> blocks into reasoning_content natively
16+
# (issue #9985). Without use_jinja the autoparser falls back to a
17+
# "pure content" PEG parser that leaks reasoning tags into content.
18+
options:
19+
- use_jinja:true
1420
template:
15-
chat: |
16-
{{.Input -}}
17-
<|im_start|>assistant
18-
chat_message: |
19-
<|im_start|>{{if eq .RoleName "tool" }}user{{else}}{{ .RoleName }}{{end}}
20-
{{ if eq .RoleName "tool" -}}
21-
<tool_response>
22-
{{ end -}}
23-
{{ if .Content -}}
24-
{{.Content }}
25-
{{ end -}}
26-
{{ if eq .RoleName "tool" -}}
27-
</tool_response>
28-
{{ end -}}
29-
{{ if .FunctionCall -}}
30-
<tool_call>
31-
{{toJson .FunctionCall}}
32-
</tool_call>
33-
{{ end -}}<|im_end|>
34-
completion: |
35-
{{.Input}}
36-
function: |
37-
<|im_start|>system
38-
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
39-
{{range .Functions}}
40-
{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
41-
{{end}}
42-
For each function call return a json object with function name and arguments: {"name": <function-name>, "arguments": <json-arguments-object>}
43-
<|im_end|>
44-
{{.Input -}}
45-
<|im_start|>assistant
21+
use_tokenizer_template: true
4622
name: qwen3

0 commit comments

Comments
 (0)