fix(reasoning): stop <think> leaking into content when autoparser is in pure-content mode (#9991)

localai-bot · mudler · web-flow · commit 1c6c3adad654 · 2026-05-25T22:39:50.000+02:00
When LocalAI templates a thinking model outside of jinja (the default for the qwen3 gallery family), llama.cpp's chat parser falls back to a "pure content" PEG parser that dumps the entire raw response into ChatDelta.Content with an empty ReasoningContent. The Go side then trusted that content verbatim and overrode tokenCallback's correctly-split reasoning, so <think>...</think> blocks ended up in the OpenAI `content` field. This is a regression from v4.0.0, introduced when the autoparser ChatDeltas path was added (#9224).

The non-streaming override now runs Go-side reasoning extraction defensively when the autoparser delivered content but no reasoning. The streaming worker gains a sticky preferAutoparser flag that flips on the first chunk where the autoparser classified reasoning_content; until then we use the streaming Go-side extractor. Realtime mirrors the non-streaming fallback. When the autoparser already populated ReasoningContent we trust it untouched, so jinja-enabled installs are not regressed.

gallery/qwen3.yaml now enables use_jinja, letting the autoparser classify <think> natively for all 20+ qwen3 family entries that share this template.

Fixes #9985
Assisted-by: Claude:opus-4-7 [Read] [Edit] [Bash] [Write]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
@@ -68,6 +68,57 @@ func mergeToolCallDeltas(existing []schema.ToolCall, deltas []schema.ToolCall) [
 	return existing
 }
 
+// applyAutoparserOverride replaces existing (the Go-side extraction result)
+// with one assistant choice built from the C++ autoparser's chatDeltas. It
+// returns existing untouched when chatDeltas is empty or yields neither
+// content nor reasoning; logprobs of existing[0], if any, are carried over.
+//
+// When the autoparser classified no reasoning (deltaReasoning == "") but
+// delivered content — e.g. the non-jinja "pure content" fallback on a <think>
+// model, issue #9985 — reason.ExtractReasoningWithConfig is run on that
+// content with thinkingStartToken and reasoningConfig, so <think>…</think>
+// blocks do not leak into the OpenAI `content` field.
+func applyAutoparserOverride(
+	chatDeltas []*pb.ChatDelta,
+	thinkingStartToken string,
+	reasoningConfig reason.Config,
+	existing []schema.Choice,
+) []schema.Choice {
+	if len(chatDeltas) == 0 {
+		return existing
+	}
+	deltaContent := functions.ContentFromChatDeltas(chatDeltas)
+	deltaReasoning := functions.ReasoningFromChatDeltas(chatDeltas)
+	if deltaContent == "" && deltaReasoning == "" {
+		return existing
+	}
+	// Fallback for non-jinja models (issue #9985): when the C++ autoparser
+	// did not classify reasoning but the raw content still contains a known
+	// reasoning tag pair, run Go-side extraction on the content so that the
+	// <think>…</think> block does not leak into the OpenAI `content` field.
+	// When the autoparser DID populate ReasoningContent, leave its
+	// content/reasoning split alone — trust the parser. In the no-reasoning
+	// case we overwrite deltaContent without first probing for tags, since
+	// ExtractReasoningWithConfig is a no-op when no tag pair matches; this
+	// also strips empty thinking blocks like "<think></think>" that some
+	// models emit when reasoning is disabled.
+	if deltaReasoning == "" && deltaContent != "" {
+		deltaReasoning, deltaContent = reason.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, reasoningConfig)
+	}
+	xlog.Debug("[ChatDeltas] non-SSE no-tools: overriding result with C++ autoparser deltas",
+		"content_len", len(deltaContent), "reasoning_len", len(deltaReasoning))
+	stopReason := FinishReasonStop
+	message := &schema.Message{Role: "assistant", Content: &deltaContent}
+	if deltaReasoning != "" {
+		message.Reasoning = &deltaReasoning
+	}
+	newChoice := schema.Choice{FinishReason: &stopReason, Index: 0, Message: message}
+	if len(existing) > 0 && existing[0].Logprobs != nil {
+		newChoice.Logprobs = existing[0].Logprobs
+	}
+	return []schema.Choice{newChoice}
+}
+
 // ChatEndpoint is the OpenAI Completion API endpoint https://platform.openai.com/docs/api-reference/chat/create
 // @Summary Generate a chat completions for a given prompt and model.
 // @Tags inference
@@ -757,24 +808,8 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 				// For non-tool requests: prefer C++ autoparser chat deltas over
 				// Go-side tag extraction (which can mangle output when thinkingStartToken
 				// differs from the model's actual reasoning tags, e.g. Gemma 4).
-				if !shouldUseFn && len(chatDeltas) > 0 {
-					deltaContent := functions.ContentFromChatDeltas(chatDeltas)
-					deltaReasoning := functions.ReasoningFromChatDeltas(chatDeltas)
-					if deltaContent != "" || deltaReasoning != "" {
-						xlog.Debug("[ChatDeltas] non-SSE no-tools: overriding result with C++ autoparser deltas",
-							"content_len", len(deltaContent), "reasoning_len", len(deltaReasoning))
-						stopReason := FinishReasonStop
-						message := &schema.Message{Role: "assistant", Content: &deltaContent}
-						if deltaReasoning != "" {
-							message.Reasoning = &deltaReasoning
-						}
-						newChoice := schema.Choice{FinishReason: &stopReason, Index: 0, Message: message}
-						// Preserve logprobs from the original result
-						if len(result) > 0 && result[0].Logprobs != nil {
-							newChoice.Logprobs = result[0].Logprobs
-						}
-						result = []schema.Choice{newChoice}
-					}
+				if !shouldUseFn {
+					result = applyAutoparserOverride(chatDeltas, thinkingStartToken, config.ReasoningConfig, result)
 				}
 
 				// Tool parsing is deferred here (only when shouldUseFn) so chat deltas are available
diff --git a/core/http/endpoints/openai/chat_stream_workers.go b/core/http/endpoints/openai/chat_stream_workers.go
@@ -52,6 +52,13 @@ func processStream(
 	thinkingStartToken := reason.DetectThinkingStartToken(template, &cfg.ReasoningConfig)
 	extractor := reason.NewReasoningExtractor(thinkingStartToken, cfg.ReasoningConfig)
 
+	// preferAutoparser is sticky: once the C++ autoparser has ever classified
+	// reasoning_content, we trust it for the rest of the stream. Until then we
+	// fall back to Go-side extraction so that a "pure content" autoparser
+	// (non-jinja path, issue #9985) does not leak <think>…</think> tokens
+	// straight into the OpenAI `content` field.
+	preferAutoparser := false
+
 	_, finalUsage, _, err := ComputeChoices(req, s, cfg, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
 		var reasoningDelta, contentDelta string
 
@@ -64,8 +71,16 @@ func processStream(
 		// Otherwise fall back to Go-side extraction.
 		if tokenUsage.HasChatDeltaContent() {
 			rawReasoning, cd := tokenUsage.ChatDeltaReasoningAndContent()
-			contentDelta = cd
-			reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
+			if rawReasoning != "" {
+				preferAutoparser = true
+			}
+			if preferAutoparser {
+				contentDelta = cd
+				reasoningDelta = extractor.ProcessChatDeltaReasoning(rawReasoning)
+			} else {
+				reasoningDelta = goReasoning
+				contentDelta = goContent
+			}
 		} else {
 			reasoningDelta = goReasoning
 			contentDelta = goContent
diff --git a/core/http/endpoints/openai/chat_test.go b/core/http/endpoints/openai/chat_test.go
@@ -3,6 +3,8 @@ package openai
 import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/pkg/functions"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	reason "github.com/mudler/LocalAI/pkg/reasoning"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 
@@ -94,6 +96,98 @@ var _ = Describe("handleQuestion", func() {
 	})
 })
 
+var _ = Describe("applyAutoparserOverride", func() {
+	// Regression test for https://github.com/mudler/LocalAI/issues/9985.
+	// When LocalAI templates a <think>-style reasoning model outside of jinja
+	// (e.g. the gallery qwen3 entry), the llama.cpp autoparser falls back to
+	// the "pure content" PEG parser which dumps the entire raw response,
+	// including <think>…</think>, into ChatDelta.Content and leaves
+	// ChatDelta.ReasoningContent empty. The Go side previously trusted that
+	// content verbatim and clobbered the tokenCallback's correctly-split
+	// reasoning, so <think> blocks leaked into the OpenAI `content` field.
+	Context("autoparser delivered content with embedded <think> tags and empty reasoning (issue #9985)", func() {
+		It("splits <think>…</think> out of content into the reasoning field", func() {
+			raw := "<think>\nOkay, the user said \"Hello\". I should reply warmly.\n</think>\n\nHello! How can I assist you today? 😊"
+			chatDeltas := []*pb.ChatDelta{
+				{Content: raw, ReasoningContent: ""},
+			}
+
+			result := applyAutoparserOverride(chatDeltas, "", reason.Config{}, nil)
+
+			Expect(result).To(HaveLen(1))
+			Expect(result[0].Message).ToNot(BeNil())
+			Expect(result[0].Message.Content).ToNot(BeNil())
+
+			content := *(result[0].Message.Content.(*string))
+			Expect(content).ToNot(ContainSubstring("<think>"),
+				"raw <think> tag must not leak into OpenAI content field")
+			Expect(content).ToNot(ContainSubstring("</think>"),
+				"raw </think> tag must not leak into OpenAI content field")
+			Expect(content).To(ContainSubstring("Hello! How can I assist you today?"),
+				"the model's actual answer must still be in content")
+
+			Expect(result[0].Message.Reasoning).ToNot(BeNil(),
+				"reasoning extracted from <think>…</think> must populate Reasoning")
+			Expect(*result[0].Message.Reasoning).To(ContainSubstring("Okay, the user said"))
+		})
+
+		It("does not run extraction when the autoparser already populated reasoning", func() {
+			// When the autoparser actually classified reasoning, leave its
+			// content/reasoning split untouched.
+			content := "Hello! How can I assist you today?"
+			reasoning := "Already split by the C++ autoparser."
+			chatDeltas := []*pb.ChatDelta{
+				{Content: content, ReasoningContent: reasoning},
+			}
+
+			result := applyAutoparserOverride(chatDeltas, "", reason.Config{}, nil)
+
+			Expect(result).To(HaveLen(1))
+			Expect(*(result[0].Message.Content.(*string))).To(Equal(content))
+			Expect(result[0].Message.Reasoning).ToNot(BeNil())
+			Expect(*result[0].Message.Reasoning).To(Equal(reasoning))
+		})
+
+		It("passes plain content through unchanged when no reasoning tags are present", func() {
+			content := "Just a normal answer with no reasoning at all."
+			chatDeltas := []*pb.ChatDelta{
+				{Content: content, ReasoningContent: ""},
+			}
+
+			result := applyAutoparserOverride(chatDeltas, "", reason.Config{}, nil)
+
+			Expect(result).To(HaveLen(1))
+			Expect(*(result[0].Message.Content.(*string))).To(Equal(content))
+			Expect(result[0].Message.Reasoning).To(BeNil())
+		})
+
+		It("strips an empty <think></think> block (qwen3 /no_think mode)", func() {
+			// qwen3 with the /no_think directive still emits an empty thinking
+			// block. The Go-side fallback must strip it from content rather than
+			// pass <think></think> through verbatim. The block has no body, so no
+			// reasoning is expected; this test only asserts on content.
+			raw := "<think>\n\n</think>\n\nHello! How can I assist you today?"
+			chatDeltas := []*pb.ChatDelta{
+				{Content: raw, ReasoningContent: ""},
+			}
+
+			result := applyAutoparserOverride(chatDeltas, "", reason.Config{}, nil)
+
+			Expect(result).To(HaveLen(1))
+			content := *(result[0].Message.Content.(*string))
+			Expect(content).ToNot(ContainSubstring("<think>"))
+			Expect(content).ToNot(ContainSubstring("</think>"))
+			Expect(content).To(ContainSubstring("Hello! How can I assist you today?"))
+		})
+
+		It("returns the existing result when chatDeltas is empty", func() {
+			existing := []schema.Choice{{Index: 7}}
+			result := applyAutoparserOverride(nil, "", reason.Config{}, existing)
+			Expect(result).To(Equal(existing))
+		})
+	})
+})
+
 var _ = Describe("mergeToolCallDeltas", func() {
 	Context("with new tool calls", func() {
 		It("should append new tool calls", func() {
diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
@@ -1572,6 +1572,15 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 			"tool_calls", len(deltaToolCalls),
 			"content_len", len(deltaContent),
 			"reasoning_len", len(deltaReasoning))
+		// Issue #9985: when the autoparser only delivered content (no
+		// reasoning_content), it may be running in the "pure content"
+		// PEG fallback (non-jinja path) which leaves <think>…</think>
+		// embedded in the content. Run Go-side extraction defensively.
+		// ExtractReasoningWithConfig is a no-op when no tag pair matches,
+		// so it's safe to apply unconditionally in the no-reasoning branch.
+		if deltaReasoning == "" && deltaContent != "" {
+			deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, config.ReasoningConfig)
+		}
 		reasoningText = deltaReasoning
 		responseWithoutReasoning = deltaContent
 		textContent = deltaContent
diff --git a/core/http/endpoints/openresponses/responses.go b/core/http/endpoints/openresponses/responses.go
@@ -1971,6 +1971,10 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 
 			// Source reasoning from: (1) ChatDeltas from C++ autoparser, (2) extractor's
 			// streaming state, (3) final extraction from the finetuned result.
+			// Issue #9985: when the autoparser delivered Content but no
+			// ReasoningContent, it was running in the "pure content" PEG fallback
+			// (non-jinja path) which leaves reasoning tags embedded in content.
+			// Fall back to the streaming Go-side extractor's split in that case.
 			if chatDeltaReasoning := functions.ReasoningFromChatDeltas(chatDeltas); chatDeltaReasoning != "" {
 				finalReasoning = chatDeltaReasoning
 				finalCleanedResult = functions.ContentFromChatDeltas(chatDeltas)
diff --git a/gallery/qwen3.yaml b/gallery/qwen3.yaml
@@ -11,36 +11,12 @@ config_file: |
         - <dummy32000>
         - </s>
         - <|endoftext|>
+    # Delegate templating to llama.cpp's jinja runtime so the C++ autoparser
+    # can classify <think>…</think> blocks into reasoning_content natively
+    # (issue #9985). Without use_jinja the autoparser falls back to a
+    # "pure content" PEG parser that leaks reasoning tags into content.
+    options:
+        - use_jinja:true
     template:
-        chat: |
-            {{.Input -}}
-            <|im_start|>assistant
-        chat_message: |
-            <|im_start|>{{if eq .RoleName "tool" }}user{{else}}{{ .RoleName }}{{end}}
-            {{ if eq .RoleName "tool" -}}
-            <tool_response>
-            {{ end -}}
-            {{ if .Content -}}
-            {{.Content }}
-            {{ end -}}
-            {{ if eq .RoleName "tool" -}}
-            </tool_response>
-            {{ end -}}
-            {{ if .FunctionCall -}}
-            <tool_call>
-            {{toJson .FunctionCall}}
-            </tool_call>
-            {{ end -}}<|im_end|>
-        completion: |
-            {{.Input}}
-        function: |
-            <|im_start|>system
-            You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
-            {{range .Functions}}
-            {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
-            {{end}}
-            For each function call return a json object with function name and arguments: {"name": <function-name>, "arguments": <json-arguments-object>}
-            <|im_end|>
-            {{.Input -}}
-            <|im_start|>assistant
+        use_tokenizer_template: true
 name: qwen3