Skip to content

Commit 601ecf5

Browse files
authored
fix: #3123 avoid replaying assistant conversation item IDs for OpenAIConversationsSession (#3127)
1 parent 574a598 commit 601ecf5

2 files changed

Lines changed: 292 additions & 16 deletions

File tree

src/agents/run_internal/session_persistence.py

Lines changed: 79 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ async def prepare_input_with_session(
8686
history = await session.get_items(limit=resolved_settings.limit)
8787
else:
8888
history = await session.get_items()
89+
is_openai_conversation_session = isinstance(session, OpenAIConversationsSession)
8990
converted_history = [
9091
strip_internal_input_item_metadata(ensure_input_item_format(item)) for item in history
9192
]
@@ -122,28 +123,38 @@ async def prepare_input_with_session(
122123
# The callback may reorder, drop, or duplicate items. Keep separate reference maps for
123124
# the copied history and copied new-input lists so we can reconstruct which output items
124125
# belong to the new turn and therefore still need to be persisted.
125-
history_refs = _build_reference_map(history_for_callback)
126+
history_refs = _build_reference_map(
127+
history_for_callback,
128+
ignore_openai_conversation_item_ids=is_openai_conversation_session,
129+
)
126130
new_refs = _build_reference_map(new_items_for_callback)
127-
history_counts = _build_frequency_map(history_for_callback)
131+
history_counts = _build_frequency_map(
132+
history_for_callback,
133+
ignore_openai_conversation_item_ids=is_openai_conversation_session,
134+
)
128135
new_counts = _build_frequency_map(new_items_for_callback)
129136

130137
appended: list[Any] = []
131138
for combined_index, item in enumerate(combined):
132-
key = _session_item_key(item)
133-
if _consume_reference(new_refs, key, item):
134-
new_counts[key] = max(new_counts.get(key, 0) - 1, 0)
139+
history_key = _session_item_key(
140+
item,
141+
ignore_openai_conversation_item_ids=is_openai_conversation_session,
142+
)
143+
new_key = _session_item_key(item)
144+
if _consume_reference(new_refs, new_key, item):
145+
new_counts[new_key] = max(new_counts.get(new_key, 0) - 1, 0)
135146
appended.append(item)
136147
continue
137-
if _consume_reference(history_refs, key, item):
138-
history_counts[key] = max(history_counts.get(key, 0) - 1, 0)
148+
if _consume_reference(history_refs, history_key, item):
149+
history_counts[history_key] = max(history_counts.get(history_key, 0) - 1, 0)
139150
prune_history_indexes.add(combined_index)
140151
continue
141-
if history_counts.get(key, 0) > 0:
142-
history_counts[key] = history_counts.get(key, 0) - 1
152+
if history_counts.get(history_key, 0) > 0:
153+
history_counts[history_key] = history_counts.get(history_key, 0) - 1
143154
prune_history_indexes.add(combined_index)
144155
continue
145-
if new_counts.get(key, 0) > 0:
146-
new_counts[key] = max(new_counts.get(key, 0) - 1, 0)
156+
if new_counts.get(new_key, 0) > 0:
157+
new_counts[new_key] = max(new_counts.get(new_key, 0) - 1, 0)
147158
appended.append(item)
148159
continue
149160
appended.append(item)
@@ -159,6 +170,11 @@ async def prepare_input_with_session(
159170

160171
# Normalize exactly as the runtime does elsewhere so the prepared model input and the
161172
# persisted session items are derived from the same item shape and dedupe rules.
173+
if is_openai_conversation_session and prune_history_indexes:
174+
prepared_items_raw = _sanitize_openai_conversation_history_items_for_model_input(
175+
prepared_items_raw,
176+
prune_history_indexes,
177+
)
162178
prepared_as_inputs = [ensure_input_item_format(item) for item in prepared_items_raw]
163179
filtered = drop_orphan_function_calls(
164180
prepared_as_inputs,
@@ -555,6 +571,32 @@ def _sanitize_openai_conversation_item(item: TResponseInputItem) -> TResponseInp
555571
return item
556572

557573

574+
def _sanitize_openai_conversation_history_items_for_model_input(
575+
items: Sequence[TResponseInputItem],
576+
history_indexes: set[int],
577+
) -> list[TResponseInputItem]:
578+
"""Remove Conversation item metadata only from session-history items sent to the model."""
579+
sanitized_items: list[TResponseInputItem] = []
580+
for index, item in enumerate(items):
581+
if index in history_indexes:
582+
sanitized_items.append(_sanitize_openai_conversation_history_item_for_model_input(item))
583+
else:
584+
sanitized_items.append(item)
585+
return sanitized_items
586+
587+
588+
def _sanitize_openai_conversation_history_item_for_model_input(
589+
item: TResponseInputItem,
590+
) -> TResponseInputItem:
591+
"""Remove Conversation replay metadata from assistant messages only."""
592+
if isinstance(item, dict) and item.get("type") == "message" and item.get("role") == "assistant":
593+
clean_item = cast(dict[str, Any], strip_internal_input_item_metadata(item))
594+
clean_item.pop("id", None)
595+
clean_item.pop("provider_data", None)
596+
return cast(TResponseInputItem, clean_item)
597+
return item
598+
599+
558600
def _fingerprint_or_repr(item: TResponseInputItem, *, ignore_ids_for_matching: bool) -> str:
559601
"""Fingerprint an item or fall back to repr when unavailable."""
560602
return fingerprint_input_item(item, ignore_ids_for_matching=ignore_ids_for_matching) or repr(
@@ -677,7 +719,7 @@ def _collect_retry_owned_tail_serializations(
677719
return []
678720

679721

680-
def _session_item_key(item: Any) -> str:
722+
def _session_item_key(item: Any, *, ignore_openai_conversation_item_ids: bool = False) -> str:
681723
"""Return a stable representation of a session item for comparison."""
682724
try:
683725
if hasattr(item, "model_dump"):
@@ -691,16 +733,30 @@ def _session_item_key(item: Any) -> str:
691733
dict[str, Any],
692734
strip_internal_input_item_metadata(cast(TResponseInputItem, payload)),
693735
)
736+
if ignore_openai_conversation_item_ids:
737+
payload = cast(
738+
dict[str, Any],
739+
_sanitize_openai_conversation_history_item_for_model_input(
740+
cast(TResponseInputItem, payload)
741+
),
742+
)
694743
return json.dumps(payload, sort_keys=True, default=str)
695744
except Exception:
696745
return repr(item)
697746

698747

699-
def _build_reference_map(items: Sequence[Any]) -> dict[str, list[Any]]:
748+
def _build_reference_map(
749+
items: Sequence[Any],
750+
*,
751+
ignore_openai_conversation_item_ids: bool = False,
752+
) -> dict[str, list[Any]]:
700753
"""Map serialized keys to the concrete session items used to build them."""
701754
refs: dict[str, list[Any]] = {}
702755
for item in items:
703-
key = _session_item_key(item)
756+
key = _session_item_key(
757+
item,
758+
ignore_openai_conversation_item_ids=ignore_openai_conversation_item_ids,
759+
)
704760
refs.setdefault(key, []).append(item)
705761
return refs
706762

@@ -719,10 +775,17 @@ def _consume_reference(ref_map: dict[str, list[Any]], key: str, candidate: Any)
719775
return False
720776

721777

722-
def _build_frequency_map(items: Sequence[Any]) -> dict[str, int]:
778+
def _build_frequency_map(
779+
items: Sequence[Any],
780+
*,
781+
ignore_openai_conversation_item_ids: bool = False,
782+
) -> dict[str, int]:
723783
"""Count how many times each serialized key appears in a collection."""
724784
freq: dict[str, int] = {}
725785
for item in items:
726-
key = _session_item_key(item)
786+
key = _session_item_key(
787+
item,
788+
ignore_openai_conversation_item_ids=ignore_openai_conversation_item_ids,
789+
)
727790
freq[key] = freq.get(key, 0) + 1
728791
return freq

tests/test_agent_runner.py

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2117,6 +2117,219 @@ def callback(
21172117
assert [cast(dict[str, Any], item).get("content") for item in session_items] == ["new"]
21182118

21192119

2120+
@pytest.mark.asyncio
2121+
async def test_prepare_input_with_openai_conversation_strips_assistant_history_ids() -> None:
2122+
class DummyOpenAIConversationsSession(OpenAIConversationsSession):
2123+
def __init__(self, history: list[TResponseInputItem]) -> None:
2124+
self.history = history
2125+
2126+
async def get_items(self, limit: int | None = None) -> list[TResponseInputItem]:
2127+
if limit is None:
2128+
return list(self.history)
2129+
return self.history[-limit:]
2130+
2131+
async def add_items(self, items: list[TResponseInputItem]) -> None:
2132+
self.history.extend(items)
2133+
2134+
async def pop_item(self) -> TResponseInputItem | None:
2135+
return self.history.pop() if self.history else None
2136+
2137+
async def clear_session(self) -> None:
2138+
self.history.clear()
2139+
2140+
history_item = cast(
2141+
TResponseInputItem,
2142+
{
2143+
"id": "conv_item_assistant",
2144+
"type": "message",
2145+
"role": "assistant",
2146+
"content": "history",
2147+
"provider_data": {"server": "metadata"},
2148+
},
2149+
)
2150+
user_history_item = cast(
2151+
TResponseInputItem,
2152+
{
2153+
"id": "conv_item_user",
2154+
"type": "message",
2155+
"role": "user",
2156+
"content": "user history",
2157+
"provider_data": {"server": "metadata"},
2158+
},
2159+
)
2160+
function_call_item = cast(
2161+
TResponseInputItem,
2162+
{
2163+
"id": "conv_item_call",
2164+
"type": "function_call",
2165+
"call_id": "call_history",
2166+
"name": "lookup",
2167+
"arguments": "{}",
2168+
},
2169+
)
2170+
function_call_output_item = cast(
2171+
TResponseInputItem,
2172+
{
2173+
"id": "conv_item_output",
2174+
"type": "function_call_output",
2175+
"call_id": "call_history",
2176+
"output": "ok",
2177+
},
2178+
)
2179+
session = DummyOpenAIConversationsSession(
2180+
history=[user_history_item, history_item, function_call_item, function_call_output_item]
2181+
)
2182+
2183+
prepared, session_items = await prepare_input_with_session("new", session, None)
2184+
2185+
assert isinstance(prepared, list)
2186+
user_payload = cast(dict[str, Any], prepared[0])
2187+
history_payload = cast(dict[str, Any], prepared[1])
2188+
call_payload = cast(dict[str, Any], prepared[2])
2189+
output_payload = cast(dict[str, Any], prepared[3])
2190+
new_payload = cast(dict[str, Any], prepared[4])
2191+
assert user_payload["role"] == "user"
2192+
assert user_payload["id"] == "conv_item_user"
2193+
assert "provider_data" in user_payload
2194+
assert history_payload["role"] == "assistant"
2195+
assert "id" not in history_payload
2196+
assert "provider_data" not in history_payload
2197+
assert call_payload["id"] == "conv_item_call"
2198+
assert output_payload["id"] == "conv_item_output"
2199+
assert new_payload["role"] == "user"
2200+
assert new_payload["content"] == "new"
2201+
assert [cast(dict[str, Any], item).get("content") for item in session_items] == ["new"]
2202+
2203+
2204+
@pytest.mark.asyncio
2205+
async def test_prepare_input_with_regular_session_preserves_history_ids() -> None:
2206+
history_item = cast(
2207+
TResponseInputItem,
2208+
{
2209+
"id": "message_id",
2210+
"type": "message",
2211+
"role": "assistant",
2212+
"content": "history",
2213+
},
2214+
)
2215+
session = SimpleListSession(history=[history_item])
2216+
2217+
prepared, _ = await prepare_input_with_session("new", session, None)
2218+
2219+
assert isinstance(prepared, list)
2220+
history_payload = cast(dict[str, Any], prepared[0])
2221+
assert history_payload["id"] == "message_id"
2222+
2223+
2224+
@pytest.mark.asyncio
2225+
async def test_prepare_input_with_openai_conversation_callback_matches_assistant_no_ids() -> None:
2226+
class DummyOpenAIConversationsSession(OpenAIConversationsSession):
2227+
def __init__(self, history: list[TResponseInputItem]) -> None:
2228+
self.history = history
2229+
2230+
async def get_items(self, limit: int | None = None) -> list[TResponseInputItem]:
2231+
if limit is None:
2232+
return list(self.history)
2233+
return self.history[-limit:]
2234+
2235+
async def add_items(self, items: list[TResponseInputItem]) -> None:
2236+
self.history.extend(items)
2237+
2238+
async def pop_item(self) -> TResponseInputItem | None:
2239+
return self.history.pop() if self.history else None
2240+
2241+
async def clear_session(self) -> None:
2242+
self.history.clear()
2243+
2244+
history_item = cast(
2245+
TResponseInputItem,
2246+
{
2247+
"id": "conv_item_assistant",
2248+
"type": "message",
2249+
"role": "assistant",
2250+
"content": "history",
2251+
"provider_data": {"server": "metadata"},
2252+
},
2253+
)
2254+
session = DummyOpenAIConversationsSession(history=[history_item])
2255+
2256+
def callback(
2257+
history: list[TResponseInputItem], new_input: list[TResponseInputItem]
2258+
) -> list[TResponseInputItem]:
2259+
history_copy = dict(cast(dict[str, Any], history[0]))
2260+
history_copy.pop("id", None)
2261+
history_copy.pop("provider_data", None)
2262+
return [
2263+
cast(TResponseInputItem, history_copy),
2264+
cast(TResponseInputItem, dict(cast(dict[str, Any], new_input[0]))),
2265+
]
2266+
2267+
prepared, session_items = await prepare_input_with_session("new", session, callback)
2268+
2269+
assert isinstance(prepared, list)
2270+
assert [cast(dict[str, Any], item).get("content") for item in prepared] == [
2271+
"history",
2272+
"new",
2273+
]
2274+
assert [cast(dict[str, Any], item).get("content") for item in session_items] == ["new"]
2275+
2276+
2277+
@pytest.mark.asyncio
2278+
async def test_prepare_input_with_openai_conversation_callback_keeps_user_ids_distinct() -> None:
2279+
class DummyOpenAIConversationsSession(OpenAIConversationsSession):
2280+
def __init__(self, history: list[TResponseInputItem]) -> None:
2281+
self.history = history
2282+
2283+
async def get_items(self, limit: int | None = None) -> list[TResponseInputItem]:
2284+
if limit is None:
2285+
return list(self.history)
2286+
return self.history[-limit:]
2287+
2288+
async def add_items(self, items: list[TResponseInputItem]) -> None:
2289+
self.history.extend(items)
2290+
2291+
async def pop_item(self) -> TResponseInputItem | None:
2292+
return self.history.pop() if self.history else None
2293+
2294+
async def clear_session(self) -> None:
2295+
self.history.clear()
2296+
2297+
history_item = cast(
2298+
TResponseInputItem,
2299+
{
2300+
"id": "conv_item_user",
2301+
"type": "message",
2302+
"role": "user",
2303+
"content": "history",
2304+
"provider_data": {"server": "metadata"},
2305+
},
2306+
)
2307+
session = DummyOpenAIConversationsSession(history=[history_item])
2308+
2309+
def callback(
2310+
history: list[TResponseInputItem], new_input: list[TResponseInputItem]
2311+
) -> list[TResponseInputItem]:
2312+
history_copy = dict(cast(dict[str, Any], history[0]))
2313+
history_copy.pop("id", None)
2314+
history_copy.pop("provider_data", None)
2315+
return [
2316+
cast(TResponseInputItem, history_copy),
2317+
cast(TResponseInputItem, dict(cast(dict[str, Any], new_input[0]))),
2318+
]
2319+
2320+
prepared, session_items = await prepare_input_with_session("new", session, callback)
2321+
2322+
assert isinstance(prepared, list)
2323+
assert [cast(dict[str, Any], item).get("content") for item in prepared] == [
2324+
"history",
2325+
"new",
2326+
]
2327+
assert [cast(dict[str, Any], item).get("content") for item in session_items] == [
2328+
"history",
2329+
"new",
2330+
]
2331+
2332+
21202333
@pytest.mark.asyncio
21212334
async def test_persist_session_items_for_guardrail_trip_uses_original_input_when_missing() -> None:
21222335
session = SimpleListSession()

0 commit comments

Comments (0)