Skip to content

Commit 8c8a2eb

Browse files
authored
fix: #3104 stabilize chat completions tool call output indexes (#3161)
1 parent ff8e3db commit 8c8a2eb

2 files changed

Lines changed: 228 additions & 39 deletions

File tree

src/agents/models/chatcmpl_stream_handler.py

Lines changed: 23 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ class StreamingState:
6565
function_calls: dict[int, ResponseFunctionToolCall] = field(default_factory=dict)
6666
# Fields for real-time function call streaming
6767
function_call_streaming: dict[int, bool] = field(default_factory=dict)
68+
# Stable output indexes for function calls, including fallback calls.
6869
function_call_output_idx: dict[int, int] = field(default_factory=dict)
6970
# Store accumulated thinking text and signature for Anthropic compatibility
7071
thinking_text: str = ""
@@ -145,6 +146,17 @@ def _finish_reasoning_item(
145146
)
146147
state.reasoning_item_done = True
147148

149+
@staticmethod
150+
def _function_call_starting_index(state: StreamingState) -> int:
151+
starting_index = 0
152+
if state.reasoning_content_index_and_output:
153+
starting_index += 1
154+
if state.text_content_index_and_output:
155+
starting_index += 1
156+
if state.refusal_content_index_and_output:
157+
starting_index += 1
158+
return starting_index
159+
148160
@classmethod
149161
async def handle_stream(
150162
cls,
@@ -456,6 +468,10 @@ async def handle_stream(
456468
call_id="",
457469
)
458470
state.function_call_streaming[tc_delta.index] = False
471+
state.function_call_output_idx[tc_delta.index] = (
472+
cls._function_call_starting_index(state)
473+
+ len(state.function_call_output_idx)
474+
)
459475

460476
tc_function = tc_delta.function
461477

@@ -527,25 +543,10 @@ async def handle_stream(
527543
and function_call.name
528544
and function_call.call_id
529545
):
530-
# Calculate the output index for this function call
531-
function_call_starting_index = 0
532-
if state.reasoning_content_index_and_output:
533-
function_call_starting_index += 1
534-
if state.text_content_index_and_output:
535-
function_call_starting_index += 1
536-
if state.refusal_content_index_and_output:
537-
function_call_starting_index += 1
538-
539-
# Add offset for already started function calls
540-
function_call_starting_index += sum(
541-
1 for streaming in state.function_call_streaming.values() if streaming
542-
)
546+
output_index = state.function_call_output_idx[tc_delta.index]
543547

544-
# Mark this function call as streaming and store its output index
548+
# Mark this function call as streaming.
545549
state.function_call_streaming[tc_delta.index] = True
546-
state.function_call_output_idx[tc_delta.index] = (
547-
function_call_starting_index
548-
)
549550

550551
# Send initial function call added event
551552
func_call_item = ResponseFunctionToolCall(
@@ -570,7 +571,7 @@ async def handle_stream(
570571
func_call_item.provider_data = merged_provider_data # type: ignore[attr-defined]
571572
yield ResponseOutputItemAddedEvent(
572573
item=func_call_item,
573-
output_index=function_call_starting_index,
574+
output_index=output_index,
574575
type="response.output_item.added",
575576
sequence_number=sequence_number.get_and_increment(),
576577
)
@@ -593,12 +594,7 @@ async def handle_stream(
593594
for event in cls._finish_reasoning_item(state, sequence_number):
594595
yield event
595596

596-
function_call_starting_index = 0
597-
if state.reasoning_content_index_and_output:
598-
function_call_starting_index += 1
599-
600597
if state.text_content_index_and_output:
601-
function_call_starting_index += 1
602598
# Send end event for this content part
603599
yield ResponseContentPartDoneEvent(
604600
content_index=state.text_content_index_and_output[0],
@@ -611,7 +607,6 @@ async def handle_stream(
611607
)
612608

613609
if state.refusal_content_index_and_output:
614-
function_call_starting_index += 1
615610
# Send end event for this content part
616611
yield ResponseContentPartDoneEvent(
617612
content_index=state.refusal_content_index_and_output[0],
@@ -656,18 +651,7 @@ async def handle_stream(
656651
else:
657652
# Function call was not streamed (fallback to old behavior)
658653
# This handles edge cases where function name never arrived
659-
fallback_starting_index = 0
660-
if state.reasoning_content_index_and_output:
661-
fallback_starting_index += 1
662-
if state.text_content_index_and_output:
663-
fallback_starting_index += 1
664-
if state.refusal_content_index_and_output:
665-
fallback_starting_index += 1
666-
667-
# Add offset for already started function calls
668-
fallback_starting_index += sum(
669-
1 for streaming in state.function_call_streaming.values() if streaming
670-
)
654+
output_index = state.function_call_output_idx[index]
671655

672656
# Build function call kwargs, include provider_data if present
673657
fallback_func_call_kwargs: dict[str, Any] = {
@@ -690,20 +674,20 @@ async def handle_stream(
690674
# Send all events at once (backward compatibility)
691675
yield ResponseOutputItemAddedEvent(
692676
item=ResponseFunctionToolCall(**fallback_func_call_kwargs),
693-
output_index=fallback_starting_index,
677+
output_index=output_index,
694678
type="response.output_item.added",
695679
sequence_number=sequence_number.get_and_increment(),
696680
)
697681
yield ResponseFunctionCallArgumentsDeltaEvent(
698682
delta=function_call.arguments,
699683
item_id=FAKE_RESPONSES_ID,
700-
output_index=fallback_starting_index,
684+
output_index=output_index,
701685
type="response.function_call_arguments.delta",
702686
sequence_number=sequence_number.get_and_increment(),
703687
)
704688
yield ResponseOutputItemDoneEvent(
705689
item=ResponseFunctionToolCall(**fallback_func_call_kwargs),
706-
output_index=fallback_starting_index,
690+
output_index=output_index,
707691
type="response.output_item.done",
708692
sequence_number=sequence_number.get_and_increment(),
709693
)

tests/models/test_openai_chatcompletions_stream.py

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -553,3 +553,208 @@ async def patched_fetch_response(self, *args, **kwargs):
553553
assert isinstance(function_call_output, ResponseFunctionToolCall)
554554
assert function_call_output.name == "write_file"
555555
assert function_call_output.arguments == '{"filename": "test.py", "content": "print(hello)"}'
556+
557+
558+
@pytest.mark.allow_call_model_methods
559+
@pytest.mark.asyncio
560+
async def test_fallback_function_calls_have_unique_output_indexes(monkeypatch) -> None:
561+
tool_call_delta1 = ChoiceDeltaToolCall(
562+
index=0,
563+
function=ChoiceDeltaToolCallFunction(
564+
name="first_tool",
565+
arguments='{"a": 1}',
566+
),
567+
type="function",
568+
)
569+
tool_call_delta2 = ChoiceDeltaToolCall(
570+
index=1,
571+
function=ChoiceDeltaToolCallFunction(
572+
name="second_tool",
573+
arguments='{"b": 2}',
574+
),
575+
type="function",
576+
)
577+
578+
chunk1 = ChatCompletionChunk(
579+
id="chunk-id",
580+
created=1,
581+
model="fake",
582+
object="chat.completion.chunk",
583+
choices=[Choice(index=0, delta=ChoiceDelta(tool_calls=[tool_call_delta1]))],
584+
)
585+
chunk2 = ChatCompletionChunk(
586+
id="chunk-id",
587+
created=1,
588+
model="fake",
589+
object="chat.completion.chunk",
590+
choices=[Choice(index=0, delta=ChoiceDelta(tool_calls=[tool_call_delta2]))],
591+
usage=CompletionUsage(completion_tokens=1, prompt_tokens=1, total_tokens=2),
592+
)
593+
594+
async def fake_stream() -> AsyncIterator[ChatCompletionChunk]:
595+
for c in (chunk1, chunk2):
596+
yield c
597+
598+
async def patched_fetch_response(self, *args, **kwargs):
599+
resp = Response(
600+
id="resp-id",
601+
created_at=0,
602+
model="fake-model",
603+
object="response",
604+
output=[],
605+
tool_choice="none",
606+
tools=[],
607+
parallel_tool_calls=False,
608+
)
609+
return resp, fake_stream()
610+
611+
monkeypatch.setattr(OpenAIChatCompletionsModel, "_fetch_response", patched_fetch_response)
612+
model = OpenAIProvider(use_responses=False).get_model("gpt-4")
613+
614+
output_events = []
615+
async for event in model.stream_response(
616+
system_instructions=None,
617+
input="",
618+
model_settings=ModelSettings(),
619+
tools=[],
620+
output_schema=None,
621+
handoffs=[],
622+
tracing=ModelTracing.DISABLED,
623+
previous_response_id=None,
624+
conversation_id=None,
625+
prompt=None,
626+
):
627+
output_events.append(event)
628+
629+
added_indexes = [
630+
event.output_index for event in output_events if event.type == "response.output_item.added"
631+
]
632+
delta_indexes = [
633+
event.output_index
634+
for event in output_events
635+
if event.type == "response.function_call_arguments.delta"
636+
]
637+
done_indexes = [
638+
event.output_index for event in output_events if event.type == "response.output_item.done"
639+
]
640+
641+
assert added_indexes == [0, 1]
642+
assert delta_indexes == [0, 1]
643+
assert done_indexes == [0, 1]
644+
645+
646+
@pytest.mark.allow_call_model_methods
647+
@pytest.mark.asyncio
648+
async def test_fallback_function_call_keeps_index_before_streamed_call(monkeypatch) -> None:
649+
fallback_first = ChoiceDeltaToolCall(
650+
index=0,
651+
function=ChoiceDeltaToolCallFunction(
652+
name="fallback_first",
653+
arguments='{"a": 1}',
654+
),
655+
type="function",
656+
)
657+
streamed_second_start = ChoiceDeltaToolCall(
658+
index=1,
659+
id="tool-call-2",
660+
function=ChoiceDeltaToolCallFunction(
661+
name="streamed_second",
662+
arguments="",
663+
),
664+
type="function",
665+
)
666+
streamed_second_args = ChoiceDeltaToolCall(
667+
index=1,
668+
function=ChoiceDeltaToolCallFunction(arguments='{"b": 2}'),
669+
type="function",
670+
)
671+
672+
chunk1 = ChatCompletionChunk(
673+
id="chunk-id",
674+
created=1,
675+
model="fake",
676+
object="chat.completion.chunk",
677+
choices=[Choice(index=0, delta=ChoiceDelta(tool_calls=[fallback_first]))],
678+
)
679+
chunk2 = ChatCompletionChunk(
680+
id="chunk-id",
681+
created=1,
682+
model="fake",
683+
object="chat.completion.chunk",
684+
choices=[Choice(index=0, delta=ChoiceDelta(tool_calls=[streamed_second_start]))],
685+
)
686+
chunk3 = ChatCompletionChunk(
687+
id="chunk-id",
688+
created=1,
689+
model="fake",
690+
object="chat.completion.chunk",
691+
choices=[Choice(index=0, delta=ChoiceDelta(tool_calls=[streamed_second_args]))],
692+
usage=CompletionUsage(completion_tokens=1, prompt_tokens=1, total_tokens=2),
693+
)
694+
695+
async def fake_stream() -> AsyncIterator[ChatCompletionChunk]:
696+
for c in (chunk1, chunk2, chunk3):
697+
yield c
698+
699+
async def patched_fetch_response(self, *args, **kwargs):
700+
resp = Response(
701+
id="resp-id",
702+
created_at=0,
703+
model="fake-model",
704+
object="response",
705+
output=[],
706+
tool_choice="none",
707+
tools=[],
708+
parallel_tool_calls=False,
709+
)
710+
return resp, fake_stream()
711+
712+
monkeypatch.setattr(OpenAIChatCompletionsModel, "_fetch_response", patched_fetch_response)
713+
model = OpenAIProvider(use_responses=False).get_model("gpt-4")
714+
715+
output_events = []
716+
async for event in model.stream_response(
717+
system_instructions=None,
718+
input="",
719+
model_settings=ModelSettings(),
720+
tools=[],
721+
output_schema=None,
722+
handoffs=[],
723+
tracing=ModelTracing.DISABLED,
724+
previous_response_id=None,
725+
conversation_id=None,
726+
prompt=None,
727+
):
728+
output_events.append(event)
729+
730+
completed = next(
731+
event.response for event in output_events if event.type == "response.completed"
732+
)
733+
assert [
734+
item.name for item in completed.output if isinstance(item, ResponseFunctionToolCall)
735+
] == [
736+
"fallback_first",
737+
"streamed_second",
738+
]
739+
740+
added_by_name = {
741+
event.item.name: event.output_index
742+
for event in output_events
743+
if event.type == "response.output_item.added"
744+
and isinstance(event.item, ResponseFunctionToolCall)
745+
}
746+
delta_indexes = [
747+
event.output_index
748+
for event in output_events
749+
if event.type == "response.function_call_arguments.delta"
750+
]
751+
done_by_name = {
752+
event.item.name: event.output_index
753+
for event in output_events
754+
if event.type == "response.output_item.done"
755+
and isinstance(event.item, ResponseFunctionToolCall)
756+
}
757+
758+
assert added_by_name == {"fallback_first": 0, "streamed_second": 1}
759+
assert delta_indexes == [1, 0]
760+
assert done_by_name == {"streamed_second": 1, "fallback_first": 0}

0 commit comments

Comments (0)