@@ -174,8 +174,78 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
174174
175175 result := ""
176176 lastEmittedCount := 0
177+
178+ // Track accumulated content for incremental reasoning and content extraction (mirrors process())
179+ accumulatedContent := ""
180+ lastEmittedReasoning := ""
181+ lastEmittedCleanedContent := ""
182+ sentInitialRole := false
183+
177184 _ , tokenUsage , chatDeltas , err := ComputeChoices (req , prompt , config , cl , startupOptions , loader , func (s string , c * []schema.Choice ) {}, func (s string , usage backend.TokenUsage ) bool {
178185 result += s
186+ accumulatedContent += s
187+
188+ // Incremental reasoning extraction — emit reasoning deltas in their own SSE chunks
189+ // before any tool-call chunks (OpenAI spec: reasoning and tool_calls never share a delta)
190+ currentReasoning , cleanedContent := reason .ExtractReasoningWithConfig (accumulatedContent , thinkingStartToken , config .ReasoningConfig )
191+
192+ var reasoningDelta * string
193+ if currentReasoning != lastEmittedReasoning {
194+ if len (currentReasoning ) > len (lastEmittedReasoning ) && strings .HasPrefix (currentReasoning , lastEmittedReasoning ) {
195+ newReasoning := currentReasoning [len (lastEmittedReasoning ):]
196+ reasoningDelta = & newReasoning
197+ lastEmittedReasoning = currentReasoning
198+ } else if currentReasoning != "" {
199+ reasoningDelta = & currentReasoning
200+ lastEmittedReasoning = currentReasoning
201+ }
202+ }
203+
204+ if reasoningDelta != nil && * reasoningDelta != "" {
205+ responses <- schema.OpenAIResponse {
206+ ID : id ,
207+ Created : created ,
208+ Model : req .Model ,
209+ Choices : []schema.Choice {{
210+ Delta : & schema.Message {Reasoning : reasoningDelta },
211+ Index : 0 ,
212+ }},
213+ Object : "chat.completion.chunk" ,
214+ }
215+ }
216+
217+ // Stream content deltas (cleaned of reasoning tags) while no tool calls
218+ // have been detected. Once the incremental parser finds tool calls,
219+ // content stops — per OpenAI spec, content and tool_calls don't mix.
220+ if lastEmittedCount == 0 && cleanedContent != "" {
221+ var deltaContent string
222+ if len (cleanedContent ) > len (lastEmittedCleanedContent ) && strings .HasPrefix (cleanedContent , lastEmittedCleanedContent ) {
223+ deltaContent = cleanedContent [len (lastEmittedCleanedContent ):]
224+ lastEmittedCleanedContent = cleanedContent
225+ } else if cleanedContent != lastEmittedCleanedContent {
226+ deltaContent = cleanedContent
227+ lastEmittedCleanedContent = cleanedContent
228+ }
229+ if deltaContent != "" {
230+ if ! sentInitialRole {
231+ responses <- schema.OpenAIResponse {
232+ ID : id , Created : created , Model : req .Model ,
233+ Choices : []schema.Choice {{Delta : & schema.Message {Role : "assistant" }, Index : 0 }},
234+ Object : "chat.completion.chunk" ,
235+ }
236+ sentInitialRole = true
237+ }
238+ responses <- schema.OpenAIResponse {
239+ ID : id , Created : created , Model : req .Model ,
240+ Choices : []schema.Choice {{
241+ Delta : & schema.Message {Content : & deltaContent },
242+ Index : 0 ,
243+ }},
244+ Object : "chat.completion.chunk" ,
245+ }
246+ }
247+ }
248+
179249 // Try incremental XML parsing for streaming support using iterative parser
180250 // This allows emitting partial tool calls as they're being generated
181251 cleanedResult := functions .CleanupLLMResult (result , config .FunctionsConfig )
@@ -306,20 +376,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
306376
307377 switch {
308378 case noActionToRun :
309- initialMessage := schema.OpenAIResponse {
310- ID : id ,
311- Created : created ,
312- Model : req .Model , // we have to return what the user sent here, due to OpenAI spec.
313- Choices : []schema.Choice {{Delta : & schema.Message {Role : "assistant" }, Index : 0 , FinishReason : nil }},
314- Object : "chat.completion.chunk" ,
315- }
316- responses <- initialMessage
317-
318- result , err := handleQuestion (config , functionResults , result , prompt )
319- if err != nil {
320- xlog .Error ("error handling question" , "error" , err )
321- return err
322- }
323379 usage := schema.OpenAIUsage {
324380 PromptTokens : tokenUsage .Prompt ,
325381 CompletionTokens : tokenUsage .Completion ,
@@ -330,25 +386,43 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
330386 usage .TimingPromptProcessing = tokenUsage .TimingPromptProcessing
331387 }
332388
333- var deltaReasoning * string
334- if reasoning != "" {
335- deltaReasoning = & reasoning
336- }
337- delta := & schema.Message {Content : & result }
338- if deltaReasoning != nil {
339- delta .Reasoning = deltaReasoning
340- }
389+ if sentInitialRole {
390+ // Content was already streamed during the callback — emit a final chunk carrying usage, plus the reasoning if none of it was streamed incrementally.
391+ delta := & schema.Message {}
392+ if reasoning != "" && lastEmittedReasoning == "" {
393+ delta .Reasoning = & reasoning
394+ }
395+ responses <- schema.OpenAIResponse {
396+ ID : id , Created : created , Model : req .Model ,
397+ Choices : []schema.Choice {{Delta : delta , Index : 0 }},
398+ Object : "chat.completion.chunk" ,
399+ Usage : usage ,
400+ }
401+ } else {
402+ // Content was NOT streamed — send everything at once (fallback).
403+ responses <- schema.OpenAIResponse {
404+ ID : id , Created : created , Model : req .Model ,
405+ Choices : []schema.Choice {{Delta : & schema.Message {Role : "assistant" }, Index : 0 }},
406+ Object : "chat.completion.chunk" ,
407+ }
341408
342- resp := schema.OpenAIResponse {
343- ID : id ,
344- Created : created ,
345- Model : req .Model , // we have to return what the user sent here, due to OpenAI spec.
346- Choices : []schema.Choice {{Delta : delta , Index : 0 , FinishReason : nil }},
347- Object : "chat.completion.chunk" ,
348- Usage : usage ,
349- }
409+ result , err := handleQuestion (config , functionResults , result , prompt )
410+ if err != nil {
411+ xlog .Error ("error handling question" , "error" , err )
412+ return err
413+ }
350414
351- responses <- resp
415+ delta := & schema.Message {Content : & result }
416+ if reasoning != "" {
417+ delta .Reasoning = & reasoning
418+ }
419+ responses <- schema.OpenAIResponse {
420+ ID : id , Created : created , Model : req .Model ,
421+ Choices : []schema.Choice {{Delta : delta , Index : 0 }},
422+ Object : "chat.completion.chunk" ,
423+ Usage : usage ,
424+ }
425+ }
352426
353427 default :
354428 for i , ss := range functionResults {
0 commit comments