Extend SSEDelta to contain information for both reasoning and output

nv-alicheng · nv-alicheng · commit e12f20b00d50 · 2025-12-16T12:56:53.000-08:00
diff --git a/src/inference_endpoint/endpoint_client/worker.py b/src/inference_endpoint/endpoint_client/worker.py
@@ -404,50 +404,80 @@ async def _iter_sse_lines(
     async def _handle_streaming_request(self, query: Query) -> None:
         """Handle streaming response."""
         async for response in self._make_http_request(query):
-            accumulated_content = []
+            output_chunks = []
+            reasoning_chunks = []
             first_chunk_sent = False
 
             # Process SSE stream - yields batches of chunks
             async for chunk_batch in self._iter_sse_lines(response):
-                accumulated_content.extend(chunk_batch)
-
-                # Determine which chunks to send: all or just first
-                chunks_to_send = (
-                    chunk_batch
-                    if self.http_config.stream_all_chunks
-                    else chunk_batch[:1]
-                    if not first_chunk_sent
-                    else []
-                )
-
-                # Send chunks
-                for content in chunks_to_send:
-                    await self._response_socket.send(
-                        StreamChunk(
-                            id=query.id,
-                            response_chunk=content,
-                            is_complete=False,
-                            metadata={
-                                "first_chunk": not first_chunk_sent,
-                                "final_chunk": False,
-                            },
-                        )
+                output_delta = []
+                reasoning_delta = []
+                for delta in chunk_batch:
+                    if delta.content:
+                        output_delta.append(delta.content)
+                    elif delta.reasoning:
+                        reasoning_delta.append(delta.reasoning)
+                    else:
+                        logger.debug("empty SSE delta")
+                        continue
+
+                for delta_batch, accumulator in (
+                    (reasoning_delta, reasoning_chunks),
+                    (output_delta, output_chunks),
+                ):
+                    if not delta_batch:
+                        continue
+                    accumulator.extend(delta_batch)
+
+                    # Determine which chunks to send: all or just first
+                    chunks_to_send = (
+                        delta_batch
+                        if self.http_config.stream_all_chunks
+                        else delta_batch[:1]
+                        if not first_chunk_sent
+                        else []
                     )
-                    first_chunk_sent = True
-                    if self.http_config.record_worker_events:
-                        EventRecorder.record_event(
-                            SampleEvent.ZMQ_RESPONSE_SENT,
-                            time.monotonic_ns(),
-                            sample_uuid=query.id,
-                            assert_active=True,
+
+                    # Send chunks
+                    for content in chunks_to_send:
+                        await self._response_socket.send(
+                            StreamChunk(
+                                id=query.id,
+                                response_chunk=content,
+                                is_complete=False,
+                                metadata={
+                                    "first_chunk": not first_chunk_sent,
+                                    "final_chunk": False,
+                                },
+                            )
                         )
+                        first_chunk_sent = True
+                        if self.http_config.record_worker_events:
+                            EventRecorder.record_event(
+                                SampleEvent.ZMQ_RESPONSE_SENT,
+                                time.monotonic_ns(),
+                                sample_uuid=query.id,
+                                assert_active=True,
+                            )
 
             # Send final complete response
-            response_output = []
-            if accumulated_content:
-                response_output.append(accumulated_content[0])
-                if len(accumulated_content) > 1:
-                    response_output.append("".join(accumulated_content[1:]))
+            if reasoning_chunks:
+                resp_reasoning = [reasoning_chunks[0]]
+                if len(reasoning_chunks) > 1:
+                    resp_reasoning.append("".join(reasoning_chunks[1:]))
+                response_output = {
+                    "output": "".join(output_chunks),
+                    "reasoning": resp_reasoning,
+                }
+            elif output_chunks:
+                resp_output = [output_chunks[0]]
+                if len(output_chunks) > 1:
+                    resp_output.append("".join(output_chunks[1:]))
+                response_output = {
+                    "output": resp_output,
+                }
+            else:
+                response_output = {"output": []}
 
             await self._response_socket.send(
                 QueryResult(
diff --git a/src/inference_endpoint/load_generator/sample.py b/src/inference_endpoint/load_generator/sample.py
@@ -186,7 +186,7 @@ def query_result_complete(self, result: QueryResult) -> None:
             SampleEvent.COMPLETE,
             timestamp_ns,
             sample_uuid=result.id,
-            data={"output": result.response_output},
+            data=result.response_output,
         )
 
         for hook in self.complete_hooks:
diff --git a/src/inference_endpoint/openai/openai_adapter.py b/src/inference_endpoint/openai/openai_adapter.py
@@ -41,6 +41,7 @@ class SSEDelta(msgspec.Struct):
     """SSE delta object containing content."""
 
     content: str = ""
+    reasoning: str = ""
 
 
 class SSEChoice(msgspec.Struct):
@@ -75,7 +76,7 @@ def decode_response(cls, response_bytes: bytes, query_id: str) -> QueryResult:
     def decode_sse_message(cls, json_bytes: bytes) -> str:
         """Decode SSE message and extract content string."""
         msg = msgspec.json.decode(json_bytes, type=SSEMessage)
-        return msg.choices[0].delta.content
+        return msg.choices[0].delta
 
     # ========================================================================
     # Internal APIs
diff --git a/src/inference_endpoint/openai/openai_msgspec_adapter.py b/src/inference_endpoint/openai/openai_msgspec_adapter.py
@@ -127,7 +127,7 @@ def decode_response(cls, response_bytes: bytes, query_id: str) -> QueryResult:
     def decode_sse_message(cls, json_bytes: bytes) -> str:
         """Decode SSE message and extract content string."""
         msg = cls._sse_decoder.decode(json_bytes)
-        return msg.choices[0].delta.content
+        return msg.choices[0].delta
 
     # ========================================================================
     # Internal APIs

Original file line number	Diff line number	Diff line change
`@@ -186,7 +186,7 @@ def query_result_complete(self, result: QueryResult) -> None:`
`186`	`186`	`SampleEvent.COMPLETE,`
`187`	`187`	`timestamp_ns,`
`188`	`188`	`sample_uuid=result.id,`
`189`		`- data={"output": result.response_output},`
	`189`	`+ data=result.response_output,`
`190`	`190`	`)`
`191`	`191`
`192`	`192`	`for hook in self.complete_hooks:`