Skip to content

Commit 34ff848

Browse files
authored
feat: #2669 add opt-in reasoning content replay for chat completion models (#2670)
1 parent f0df572 commit 34ff848

File tree

6 files changed

+581
-19
lines changed

6 files changed

+581
-19
lines changed

src/agents/extensions/models/litellm_model.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
from ...models.fake_id import FAKE_RESPONSES_ID
5050
from ...models.interface import Model, ModelTracing
5151
from ...models.openai_responses import Converter as OpenAIResponsesConverter
52+
from ...models.reasoning_content_replay import ShouldReplayReasoningContent
5253
from ...retry import ModelRetryAdvice, ModelRetryAdviceRequest
5354
from ...tool import Tool
5455
from ...tracing import generation_span
@@ -146,10 +147,12 @@ def __init__(
146147
model: str,
147148
base_url: str | None = None,
148149
api_key: str | None = None,
150+
should_replay_reasoning_content: ShouldReplayReasoningContent | None = None,
149151
):
150152
self.model = model
151153
self.base_url = base_url
152154
self.api_key = api_key
155+
self.should_replay_reasoning_content = should_replay_reasoning_content
153156

154157
def get_retry_advice(self, request: ModelRetryAdviceRequest) -> ModelRetryAdvice | None:
155158
# LiteLLM exceptions mirror OpenAI-style status/header fields.
@@ -383,9 +386,11 @@ async def _fetch_response(
383386

384387
converted_messages = Converter.items_to_messages(
385388
input,
389+
base_url=self.base_url,
386390
preserve_thinking_blocks=preserve_thinking_blocks,
387391
preserve_tool_output_all_content=True,
388392
model=self.model,
393+
should_replay_reasoning_content=self.should_replay_reasoning_content,
389394
)
390395

391396
# Fix message ordering: reorder to ensure tool_use comes before tool_result.

src/agents/models/chatcmpl_converter.py

Lines changed: 48 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,12 @@
5555
ensure_tool_choice_supports_backend,
5656
)
5757
from .fake_id import FAKE_RESPONSES_ID
58+
from .reasoning_content_replay import (
59+
ReasoningContentReplayContext,
60+
ReasoningContentSource,
61+
ShouldReplayReasoningContent,
62+
default_should_replay_reasoning_content,
63+
)
5864

5965
ResponseInputContentWithAudioParam = Union[
6066
ResponseInputContentParam,
@@ -422,6 +428,8 @@ def items_to_messages(
422428
model: str | None = None,
423429
preserve_thinking_blocks: bool = False,
424430
preserve_tool_output_all_content: bool = False,
431+
base_url: str | None = None,
432+
should_replay_reasoning_content: ShouldReplayReasoningContent | None = None,
425433
) -> list[ChatCompletionMessageParam]:
426434
"""
427435
Convert a sequence of 'Item' objects into a list of ChatCompletionMessageParam.
@@ -441,6 +449,12 @@ def items_to_messages(
441449
When True, all content types including images are preserved. This is useful
442450
for model providers (e.g. Anthropic via LiteLLM) that support processing
443451
non-text content in tool results.
452+
base_url: The request base URL, if the caller knows the concrete endpoint.
453+
This is used by reasoning-content replay hooks to distinguish direct
454+
provider calls from proxy or gateway requests.
455+
should_replay_reasoning_content: Optional hook that decides whether a
456+
reasoning item should be replayed into the next assistant message as
457+
`reasoning_content`.
444458
445459
Rules:
446460
- EasyInputMessage or InputMessage (role=user) => ChatCompletionUserMessageParam
@@ -464,8 +478,9 @@ def items_to_messages(
464478
current_assistant_msg: ChatCompletionAssistantMessageParam | None = None
465479
pending_thinking_blocks: list[dict[str, str]] | None = None
466480
pending_reasoning_content: str | None = None # For DeepSeek reasoning_content
481+
normalized_base_url = base_url.rstrip("/") if base_url is not None else None
467482

468-
def flush_assistant_message() -> None:
483+
def flush_assistant_message(*, clear_pending_reasoning_content: bool = True) -> None:
469484
nonlocal current_assistant_msg, pending_reasoning_content
470485
if current_assistant_msg is not None:
471486
# The API doesn't support empty arrays for tool_calls
@@ -475,7 +490,15 @@ def flush_assistant_message() -> None:
475490
pending_reasoning_content = None
476491
result.append(current_assistant_msg)
477492
current_assistant_msg = None
478-
else:
493+
elif clear_pending_reasoning_content:
494+
pending_reasoning_content = None
495+
496+
def apply_pending_reasoning_content(
497+
assistant_msg: ChatCompletionAssistantMessageParam,
498+
) -> None:
499+
nonlocal pending_reasoning_content
500+
if pending_reasoning_content:
501+
assistant_msg["reasoning_content"] = pending_reasoning_content # type: ignore[typeddict-unknown-key]
479502
pending_reasoning_content = None
480503

481504
def ensure_assistant_message() -> ChatCompletionAssistantMessageParam:
@@ -485,6 +508,8 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam:
485508
current_assistant_msg["content"] = None
486509
current_assistant_msg["tool_calls"] = []
487510

511+
apply_pending_reasoning_content(current_assistant_msg)
512+
488513
return current_assistant_msg
489514

490515
for item in items:
@@ -553,7 +578,9 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam:
553578

554579
# 3) response output message => assistant
555580
elif resp_msg := cls.maybe_response_output_message(item):
556-
flush_assistant_message()
581+
# A reasoning item can be followed by an assistant message and then tool calls
582+
# in the same turn, so preserve pending reasoning_content across this flush.
583+
flush_assistant_message(clear_pending_reasoning_content=False)
557584
new_asst = ChatCompletionAssistantMessageParam(role="assistant")
558585
contents = resp_msg["content"]
559586

@@ -594,6 +621,7 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam:
594621
pending_thinking_blocks = None # Clear after using
595622

596623
new_asst["tool_calls"] = []
624+
apply_pending_reasoning_content(new_asst)
597625
current_assistant_msg = new_asst
598626

599627
# 4) function/file-search calls => attach to assistant
@@ -619,11 +647,6 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam:
619647
elif func_call := cls.maybe_function_tool_call(item):
620648
asst = ensure_assistant_message()
621649

622-
# If we have pending reasoning content for DeepSeek, add it to the assistant message
623-
if pending_reasoning_content:
624-
asst["reasoning_content"] = pending_reasoning_content # type: ignore[typeddict-unknown-key]
625-
pending_reasoning_content = None # Clear after using
626-
627650
# If we have pending thinking blocks, use them as the content
628651
# This is required for Anthropic API tool calls with interleaved thinking
629652
if pending_thinking_blocks:
@@ -708,6 +731,7 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam:
708731

709732
item_provider_data: dict[str, Any] = reasoning_item.get("provider_data", {}) # type: ignore[assignment]
710733
item_model = item_provider_data.get("model", "")
734+
should_replay = False
711735

712736
if (
713737
model
@@ -740,17 +764,23 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam:
740764
# This preserves the original behavior
741765
pending_thinking_blocks = reconstructed_thinking_blocks
742766

743-
# DeepSeek requires reasoning_content field in assistant messages with tool calls
744-
# Items may not all originate from DeepSeek, so need to check for model match.
745-
# For backward compatibility, if provider_data is missing, ignore the check.
746-
elif (
747-
model
748-
and "deepseek" in model.lower()
749-
and (
750-
(item_model and "deepseek" in item_model.lower())
751-
or item_provider_data == {}
767+
if model is not None:
768+
replay_context = ReasoningContentReplayContext(
769+
model=model,
770+
base_url=normalized_base_url,
771+
reasoning=ReasoningContentSource(
772+
item=reasoning_item,
773+
origin_model=item_model or None,
774+
provider_data=item_provider_data,
775+
),
752776
)
753-
):
777+
should_replay = (
778+
should_replay_reasoning_content(replay_context)
779+
if should_replay_reasoning_content is not None
780+
else default_should_replay_reasoning_content(replay_context)
781+
)
782+
783+
if should_replay:
754784
summary_items = reasoning_item.get("summary", [])
755785
if summary_items:
756786
reasoning_texts = []

src/agents/models/openai_chatcompletions.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
from .fake_id import FAKE_RESPONSES_ID
4040
from .interface import Model, ModelTracing
4141
from .openai_responses import Converter as OpenAIResponsesConverter
42+
from .reasoning_content_replay import ShouldReplayReasoningContent
4243

4344
if TYPE_CHECKING:
4445
from ..model_settings import ModelSettings
@@ -53,9 +54,11 @@ def __init__(
5354
self,
5455
model: str | ChatModel,
5556
openai_client: AsyncOpenAI,
57+
should_replay_reasoning_content: ShouldReplayReasoningContent | None = None,
5658
) -> None:
5759
self.model = model
5860
self._client = openai_client
61+
self.should_replay_reasoning_content = should_replay_reasoning_content
5962

6063
def _non_null_or_omit(self, value: Any) -> Any:
6164
return value if value is not None else omit
@@ -314,7 +317,12 @@ async def _fetch_response(
314317
prompt: ResponsePromptParam | None = None,
315318
) -> ChatCompletion | tuple[Response, AsyncStream[ChatCompletionChunk]]:
316319
self._validate_official_openai_input_content_types(input)
317-
converted_messages = Converter.items_to_messages(input, model=self.model)
320+
converted_messages = Converter.items_to_messages(
321+
input,
322+
model=self.model,
323+
base_url=str(self._client.base_url),
324+
should_replay_reasoning_content=self.should_replay_reasoning_content,
325+
)
318326

319327
if system_instructions:
320328
converted_messages.insert(
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
from __future__ import annotations

from collections.abc import Callable, Mapping
from dataclasses import dataclass
from typing import Any


@dataclass
class ReasoningContentSource:
    """The reasoning item being considered for replay into the next request."""

    item: Any
    """The raw reasoning item."""

    origin_model: str | None
    """The model that originally produced the reasoning item, if known."""

    provider_data: Mapping[str, Any]
    """Provider-specific metadata captured on the reasoning item."""


@dataclass
class ReasoningContentReplayContext:
    """Context passed to reasoning-content replay hooks."""

    model: str
    """The model that will receive the next Chat Completions request."""

    base_url: str | None
    """The request base URL, if the SDK knows the concrete endpoint."""

    reasoning: ReasoningContentSource
    """The reasoning item candidate being evaluated for replay."""


# Hook signature: given the replay context, decide whether the reasoning item
# should be replayed into the next assistant message as `reasoning_content`.
ShouldReplayReasoningContent = Callable[[ReasoningContentReplayContext], bool]


def default_should_replay_reasoning_content(context: ReasoningContentReplayContext) -> bool:
    """Return whether the SDK should replay reasoning content by default.

    Args:
        context: The replay context describing the target model, the endpoint
            (if known), and the reasoning item candidate.

    Returns:
        True when the next request targets a DeepSeek model and the reasoning
        item either originated from a DeepSeek model or carries no provider
        data (i.e. it predates provider tracking); False otherwise.
    """
    if "deepseek" not in context.model.lower():
        return False

    origin_model = context.reasoning.origin_model
    # Replay only when the current request targets DeepSeek and the reasoning item either
    # came from a DeepSeek model or predates provider tracking. This avoids mixing reasoning
    # content from a different model family into the DeepSeek assistant message.
    return (
        origin_model is not None and "deepseek" in origin_model.lower()
    ) or context.reasoning.provider_data == {}


__all__ = [
    "ReasoningContentReplayContext",
    "ReasoningContentSource",
    "ShouldReplayReasoningContent",
    "default_should_replay_reasoning_content",
]

0 commit comments

Comments
 (0)