Commit ba02386

fix: stabilize chat completions tool call indexes

1 parent d1d0abe commit ba02386

3 files changed
Lines changed: 324 additions & 40 deletions

File tree

src/agents/models/chatcmpl_stream_handler.py

Lines changed: 19 additions & 39 deletions
@@ -88,6 +88,19 @@ class ChatCmplStreamHandler:
     def _assistant_message_output_index(state: StreamingState) -> int:
         return 1 if state.reasoning_content_index_and_output is not None else 0

+    @staticmethod
+    def _function_call_output_base(state: StreamingState) -> int:
+        output_index = 0
+        if state.reasoning_content_index_and_output:
+            output_index += 1
+        if state.text_content_index_and_output or state.refusal_content_index_and_output:
+            output_index += 1
+        return output_index
+
+    @classmethod
+    def _next_function_call_output_index(cls, state: StreamingState) -> int:
+        return cls._function_call_output_base(state) + len(state.function_calls)
+
     @classmethod
     def _finish_reasoning_summary_part(
         cls,
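
The new helpers centralize index bookkeeping that was previously recomputed at each emission site: the base skips one output slot for a reasoning item and one for the assistant message (text and refusal parts share a single message), and each tool call then claims the next slot in arrival order. A minimal standalone sketch of that arithmetic, using a hypothetical `FakeState` stand-in for `StreamingState` with only the fields the helpers read:

```python
from dataclasses import dataclass, field


@dataclass
class FakeState:
    # Hypothetical stand-in for StreamingState: only the fields read below.
    reasoning_content_index_and_output: object = None
    text_content_index_and_output: object = None
    refusal_content_index_and_output: object = None
    function_calls: dict = field(default_factory=dict)


def function_call_output_base(state: FakeState) -> int:
    output_index = 0
    if state.reasoning_content_index_and_output:
        output_index += 1  # a reasoning item occupies one output slot
    if state.text_content_index_and_output or state.refusal_content_index_and_output:
        output_index += 1  # text and refusal parts share one assistant message
    return output_index


def next_function_call_output_index(state: FakeState) -> int:
    # Each tool call claims the next slot, in arrival order.
    return function_call_output_base(state) + len(state.function_calls)


state = FakeState()
assert next_function_call_output_index(state) == 0  # first call, no message content
state.function_calls[0] = "first_tool"
assert next_function_call_output_index(state) == 1  # second call follows at 1
# A text message ahead of the calls shifts the first call to index 1:
assert next_function_call_output_index(FakeState(text_content_index_and_output=(0, "hi"))) == 1
```
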
@@ -447,6 +460,9 @@ async def handle_stream(
             if delta.tool_calls:
                 for tc_delta in delta.tool_calls:
                     if tc_delta.index not in state.function_calls:
+                        state.function_call_output_idx[tc_delta.index] = (
+                            cls._next_function_call_output_index(state)
+                        )
                        state.function_calls[tc_delta.index] = ResponseFunctionToolCall(
                             id=FAKE_RESPONSES_ID,
                             arguments="",
@@ -526,25 +542,9 @@ async def handle_stream(
                         and function_call.name
                         and function_call.call_id
                     ):
-                        # Calculate the output index for this function call
-                        function_call_starting_index = 0
-                        if state.reasoning_content_index_and_output:
-                            function_call_starting_index += 1
-                        if state.text_content_index_and_output:
-                            function_call_starting_index += 1
-                        if state.refusal_content_index_and_output:
-                            function_call_starting_index += 1
-
-                        # Add offset for already started function calls
-                        function_call_starting_index += sum(
-                            1 for streaming in state.function_call_streaming.values() if streaming
-                        )
-
                         # Mark this function call as streaming and store its output index
                         state.function_call_streaming[tc_delta.index] = True
-                        state.function_call_output_idx[tc_delta.index] = (
-                            function_call_starting_index
-                        )
+                        function_call_output_index = state.function_call_output_idx[tc_delta.index]

                         # Send initial function call added event
                         func_call_item = ResponseFunctionToolCall(
@@ -569,7 +569,7 @@ async def handle_stream(
                             func_call_item.provider_data = merged_provider_data  # type: ignore[attr-defined]
                         yield ResponseOutputItemAddedEvent(
                             item=func_call_item,
-                            output_index=function_call_starting_index,
+                            output_index=function_call_output_index,
                             type="response.output_item.added",
                             sequence_number=sequence_number.get_and_increment(),
                         )
@@ -592,12 +592,7 @@ async def handle_stream(
         for event in cls._finish_reasoning_item(state, sequence_number):
             yield event

-        function_call_starting_index = 0
-        if state.reasoning_content_index_and_output:
-            function_call_starting_index += 1
-
         if state.text_content_index_and_output:
-            function_call_starting_index += 1
             # Send end event for this content part
             yield ResponseContentPartDoneEvent(
                 content_index=state.text_content_index_and_output[0],
@@ -609,7 +604,6 @@ async def handle_stream(
             )

         if state.refusal_content_index_and_output:
-            function_call_starting_index += 1
             # Send end event for this content part
             yield ResponseContentPartDoneEvent(
                 content_index=state.refusal_content_index_and_output[0],
@@ -621,7 +615,6 @@ async def handle_stream(
             )

         # Send completion events for function calls
-        fallback_emitted_count = 0
         for index, function_call in state.function_calls.items():
             if state.function_call_streaming.get(index, False):
                 # Function call was streamed, just send the completion event
@@ -654,19 +647,7 @@ async def handle_stream(
             else:
                 # Function call was not streamed (fallback to old behavior)
                 # This handles edge cases where function name never arrived
-                fallback_starting_index = 0
-                if state.reasoning_content_index_and_output:
-                    fallback_starting_index += 1
-                if state.text_content_index_and_output:
-                    fallback_starting_index += 1
-                if state.refusal_content_index_and_output:
-                    fallback_starting_index += 1
-
-                # Add offset for already started function calls
-                fallback_starting_index += sum(
-                    1 for streaming in state.function_call_streaming.values() if streaming
-                )
-                fallback_output_index = fallback_starting_index + fallback_emitted_count
+                fallback_output_index = state.function_call_output_idx[index]

                 # Build function call kwargs, include provider_data if present
                 fallback_func_call_kwargs: dict[str, Any] = {
@@ -706,7 +687,6 @@ async def handle_stream(
                     type="response.output_item.done",
                     sequence_number=sequence_number.get_and_increment(),
                 )
-                fallback_emitted_count += 1

         # Finally, send the Response completed event
         outputs: list[ResponseOutputItem] = []
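
To see why recording the index up front matters, here is a rough replay (not code from the repo) of the mixed scenario the new tests exercise: tool call 0 never meets the streaming preconditions (its `call_id` never arrives), while tool call 1 streams normally. The old arithmetic indexed streamed calls before fallback calls, so the emitted indexes disagreed with the final `response.output` ordering; the per-call bookkeeping keeps them aligned:

```python
# Hypothetical replay: two tool calls arrive in order, but only the second
# satisfies the streaming preconditions (name + call_id), so the first is
# emitted via the fallback path at the end of the stream.
calls = ["first_tool", "second_tool"]  # arrival order == final output order
streams = {"first_tool": False, "second_tool": True}
base = 0  # no reasoning/text/refusal parts in this scenario

# Old scheme: a streamed call's index counted only calls already streaming,
# and fallback calls were appended after every streamed call.
old_indexes = {}
already_streaming = 0
for name in calls:
    if streams[name]:
        old_indexes[name] = base + already_streaming
        already_streaming += 1
fallback_emitted = 0
for name in calls:
    if not streams[name]:
        old_indexes[name] = base + already_streaming + fallback_emitted
        fallback_emitted += 1

# New scheme: every call gets base + number of calls seen so far, assigned
# the moment its first delta arrives, so the index never shifts afterwards.
new_indexes = {name: base + i for i, name in enumerate(calls)}

print(old_indexes)  # {'second_tool': 0, 'first_tool': 1} -- disagrees with output order
print(new_indexes)  # {'first_tool': 0, 'second_tool': 1} -- matches output order
```
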

tests/models/test_openai_chatcompletions_stream.py

Lines changed: 208 additions & 0 deletions
@@ -628,3 +628,211 @@ async def patched_fetch_response(self, *args, **kwargs):
     assert [event.output_index for event in added_events] == [0, 1]
     assert [event.output_index for event in delta_events] == [0, 1]
     assert [event.output_index for event in done_events] == [0, 1]
+
+
+@pytest.mark.allow_call_model_methods
+@pytest.mark.asyncio
+async def test_stream_response_mixed_tool_calls_use_final_output_indexes(monkeypatch) -> None:
+    fallback_tool_call = ChoiceDeltaToolCall(
+        index=0,
+        function=ChoiceDeltaToolCallFunction(name="first_tool", arguments='{"a": 1}'),
+        type="function",
+    )
+    streamed_tool_call = ChoiceDeltaToolCall(
+        index=1,
+        id="second-tool-call-id",
+        function=ChoiceDeltaToolCallFunction(name="second_tool", arguments='{"b": 2}'),
+        type="function",
+    )
+    chunk1 = ChatCompletionChunk(
+        id="chunk-id",
+        created=1,
+        model="fake",
+        object="chat.completion.chunk",
+        choices=[Choice(index=0, delta=ChoiceDelta(tool_calls=[fallback_tool_call]))],
+    )
+    chunk2 = ChatCompletionChunk(
+        id="chunk-id",
+        created=1,
+        model="fake",
+        object="chat.completion.chunk",
+        choices=[Choice(index=0, delta=ChoiceDelta(tool_calls=[streamed_tool_call]))],
+        usage=CompletionUsage(completion_tokens=1, prompt_tokens=1, total_tokens=2),
+    )
+
+    async def fake_stream() -> AsyncIterator[ChatCompletionChunk]:
+        for chunk in (chunk1, chunk2):
+            yield chunk
+
+    async def patched_fetch_response(self, *args, **kwargs):
+        response = Response(
+            id="resp-id",
+            created_at=0,
+            model="fake-model",
+            object="response",
+            output=[],
+            tool_choice="none",
+            tools=[],
+            parallel_tool_calls=False,
+        )
+        return response, fake_stream()
+
+    monkeypatch.setattr(OpenAIChatCompletionsModel, "_fetch_response", patched_fetch_response)
+    model = OpenAIProvider(use_responses=False).get_model("gpt-4")
+    output_events = []
+
+    async for event in model.stream_response(
+        system_instructions=None,
+        input="",
+        model_settings=ModelSettings(),
+        tools=[],
+        output_schema=None,
+        handoffs=[],
+        tracing=ModelTracing.DISABLED,
+        previous_response_id=None,
+        conversation_id=None,
+        prompt=None,
+    ):
+        output_events.append(event)
+
+    added_events = [event for event in output_events if event.type == "response.output_item.added"]
+    delta_events = [
+        event for event in output_events if event.type == "response.function_call_arguments.delta"
+    ]
+    done_events = [event for event in output_events if event.type == "response.output_item.done"]
+    completed_event = next(event for event in output_events if event.type == "response.completed")
+
+    added_event_indexes = {}
+    for event in added_events:
+        assert isinstance(event.item, ResponseFunctionToolCall)
+        added_event_indexes[event.item.name] = event.output_index
+
+    done_event_indexes = {}
+    for event in done_events:
+        assert isinstance(event.item, ResponseFunctionToolCall)
+        done_event_indexes[event.item.name] = event.output_index
+
+    completed_output_names = []
+    for output in completed_event.response.output:
+        assert isinstance(output, ResponseFunctionToolCall)
+        completed_output_names.append(output.name)
+
+    assert added_event_indexes == {
+        "first_tool": 0,
+        "second_tool": 1,
+    }
+    assert {event.delta: event.output_index for event in delta_events} == {
+        '{"a": 1}': 0,
+        '{"b": 2}': 1,
+    }
+    assert done_event_indexes == {
+        "first_tool": 0,
+        "second_tool": 1,
+    }
+    assert completed_output_names == ["first_tool", "second_tool"]
+
+
+@pytest.mark.allow_call_model_methods
+@pytest.mark.asyncio
+async def test_stream_response_text_before_mixed_tool_calls_offsets_tool_indexes(
+    monkeypatch,
+) -> None:
+    fallback_tool_call = ChoiceDeltaToolCall(
+        index=0,
+        function=ChoiceDeltaToolCallFunction(name="first_tool", arguments='{"a": 1}'),
+        type="function",
+    )
+    streamed_tool_call = ChoiceDeltaToolCall(
+        index=1,
+        id="second-tool-call-id",
+        function=ChoiceDeltaToolCallFunction(name="second_tool", arguments='{"b": 2}'),
+        type="function",
+    )
+    chunk1 = ChatCompletionChunk(
+        id="chunk-id",
+        created=1,
+        model="fake",
+        object="chat.completion.chunk",
+        choices=[Choice(index=0, delta=ChoiceDelta(content="Preparing tools"))],
+    )
+    chunk2 = ChatCompletionChunk(
+        id="chunk-id",
+        created=1,
+        model="fake",
+        object="chat.completion.chunk",
+        choices=[Choice(index=0, delta=ChoiceDelta(tool_calls=[fallback_tool_call]))],
+    )
+    chunk3 = ChatCompletionChunk(
+        id="chunk-id",
+        created=1,
+        model="fake",
+        object="chat.completion.chunk",
+        choices=[Choice(index=0, delta=ChoiceDelta(tool_calls=[streamed_tool_call]))],
+        usage=CompletionUsage(completion_tokens=1, prompt_tokens=1, total_tokens=2),
+    )
+
+    async def fake_stream() -> AsyncIterator[ChatCompletionChunk]:
+        for chunk in (chunk1, chunk2, chunk3):
+            yield chunk
+
+    async def patched_fetch_response(self, *args, **kwargs):
+        response = Response(
+            id="resp-id",
+            created_at=0,
+            model="fake-model",
+            object="response",
+            output=[],
+            tool_choice="none",
+            tools=[],
+            parallel_tool_calls=False,
+        )
+        return response, fake_stream()
+
+    monkeypatch.setattr(OpenAIChatCompletionsModel, "_fetch_response", patched_fetch_response)
+    model = OpenAIProvider(use_responses=False).get_model("gpt-4")
+    output_events = []
+
+    async for event in model.stream_response(
+        system_instructions=None,
+        input="",
+        model_settings=ModelSettings(),
+        tools=[],
+        output_schema=None,
+        handoffs=[],
+        tracing=ModelTracing.DISABLED,
+        previous_response_id=None,
+        conversation_id=None,
+        prompt=None,
+    ):
+        output_events.append(event)
+
+    added_events = [event for event in output_events if event.type == "response.output_item.added"]
+    delta_events = [
+        event for event in output_events if event.type == "response.function_call_arguments.delta"
+    ]
+    done_events = [event for event in output_events if event.type == "response.output_item.done"]
+    completed_event = next(event for event in output_events if event.type == "response.completed")
+
+    added_tool_indexes = {}
+    for event in added_events:
+        if isinstance(event.item, ResponseFunctionToolCall):
+            added_tool_indexes[event.item.name] = event.output_index
+
+    done_tool_indexes = {}
+    for event in done_events:
+        if isinstance(event.item, ResponseFunctionToolCall):
+            done_tool_indexes[event.item.name] = event.output_index
+
+    assert added_tool_indexes == {"first_tool": 1, "second_tool": 2}
+    assert {event.delta: event.output_index for event in delta_events} == {
+        '{"a": 1}': 1,
+        '{"b": 2}': 2,
+    }
+    assert done_tool_indexes == {"first_tool": 1, "second_tool": 2}
+    assert isinstance(completed_event.response.output[0], ResponseOutputMessage)
+    completed_tool_outputs = completed_event.response.output[1:]
+    completed_tool_names = []
+    for output in completed_tool_outputs:
+        assert isinstance(output, ResponseFunctionToolCall)
+        completed_tool_names.append(output.name)
+    assert completed_tool_names == ["first_tool", "second_tool"]
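
Both regression tests monkeypatch `OpenAIChatCompletionsModel._fetch_response`, so they run offline against the fake chunk stream; assuming a standard checkout, something like `pytest tests/models/test_openai_chatcompletions_stream.py -k "indexes"` should select just the new cases.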
