Skip to content

Commit 6e7fffc

Browse files
ericapisani and claude committed
ref(openai): Fix token usage reporting for empty streams and non-streaming responses
Move _calculate_completions_token_usage outside the data_buf check so token usage from stream metadata is recorded even when no content chunks are produced (e.g. content filter). Also count output tokens from response.output when streaming_message_responses is absent in the Responses API path. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent cee1173 commit 6e7fffc

File tree

2 files changed

+221
-16
lines changed

2 files changed

+221
-16
lines changed

sentry_sdk/integrations/openai.py

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,12 @@ def _calculate_responses_token_usage(
293293
if streaming_message_responses is not None:
294294
for message in streaming_message_responses:
295295
output_tokens += count_tokens(message)
296+
elif hasattr(response, "output"):
297+
for output_item in response.output:
298+
if hasattr(output_item, "content"):
299+
for content_item in output_item.content:
300+
if hasattr(content_item, "text"):
301+
output_tokens += count_tokens(content_item.text)
296302

297303
# Do not set token data if it is 0
298304
input_tokens = input_tokens or None
@@ -794,18 +800,20 @@ def _wrap_synchronous_completions_chunk_iterator(
794800
set_data_normalized(
795801
span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
796802
)
803+
all_responses = None
797804
if len(data_buf) > 0:
798805
all_responses = ["".join(chunk) for chunk in data_buf]
799806
if should_send_default_pii() and integration.include_prompts:
800807
set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses)
801-
_calculate_completions_token_usage(
802-
messages=messages,
803-
response=response,
804-
span=span,
805-
streaming_message_responses=all_responses,
806-
streaming_message_total_token_usage=streaming_message_total_token_usage,
807-
count_tokens=integration.count_tokens,
808-
)
808+
809+
_calculate_completions_token_usage(
810+
messages=messages,
811+
response=response,
812+
span=span,
813+
streaming_message_responses=all_responses,
814+
streaming_message_total_token_usage=streaming_message_total_token_usage,
815+
count_tokens=integration.count_tokens,
816+
)
809817

810818
if finish_span:
811819
span.__exit__(None, None, None)
@@ -854,18 +862,20 @@ async def _wrap_asynchronous_completions_chunk_iterator(
854862
set_data_normalized(
855863
span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
856864
)
865+
all_responses = None
857866
if len(data_buf) > 0:
858867
all_responses = ["".join(chunk) for chunk in data_buf]
859868
if should_send_default_pii() and integration.include_prompts:
860869
set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses)
861-
_calculate_completions_token_usage(
862-
messages=messages,
863-
response=response,
864-
span=span,
865-
streaming_message_responses=all_responses,
866-
streaming_message_total_token_usage=streaming_message_total_token_usage,
867-
count_tokens=integration.count_tokens,
868-
)
870+
871+
_calculate_completions_token_usage(
872+
messages=messages,
873+
response=response,
874+
span=span,
875+
streaming_message_responses=all_responses,
876+
streaming_message_total_token_usage=streaming_message_total_token_usage,
877+
count_tokens=integration.count_tokens,
878+
)
869879

870880
if finish_span:
871881
span.__exit__(None, None, None)
@@ -921,6 +931,7 @@ def _wrap_synchronous_responses_event_iterator(
921931
all_responses = ["".join(chunk) for chunk in data_buf]
922932
if should_send_default_pii() and integration.include_prompts:
923933
set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses)
934+
924935
if count_tokens_manually:
925936
_calculate_responses_token_usage(
926937
input=input,

tests/integrations/openai/test_openai.py

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,136 @@ def test_streaming_chat_completion_with_usage_in_stream(
693693
assert span["data"]["gen_ai.usage.total_tokens"] == 30
694694

695695

696+
@pytest.mark.skipif(
697+
OPENAI_VERSION <= (1, 1, 0),
698+
reason="OpenAI versions <=1.1.0 do not support the stream_options parameter.",
699+
)
700+
def test_streaming_chat_completion_empty_content_preserves_token_usage(
701+
sentry_init,
702+
capture_events,
703+
get_model_response,
704+
server_side_event_chunks,
705+
):
706+
"""Token usage from the stream is recorded even when no content is produced (e.g. content filter)."""
707+
sentry_init(
708+
integrations=[OpenAIIntegration(include_prompts=False)],
709+
traces_sample_rate=1.0,
710+
send_default_pii=False,
711+
)
712+
events = capture_events()
713+
714+
client = OpenAI(api_key="z")
715+
returned_stream = get_model_response(
716+
server_side_event_chunks(
717+
[
718+
ChatCompletionChunk(
719+
id="1",
720+
choices=[],
721+
created=100000,
722+
model="model-id",
723+
object="chat.completion.chunk",
724+
usage=CompletionUsage(
725+
prompt_tokens=20,
726+
completion_tokens=0,
727+
total_tokens=20,
728+
),
729+
),
730+
],
731+
include_event_type=False,
732+
)
733+
)
734+
735+
with mock.patch.object(
736+
client.chat._client._client,
737+
"send",
738+
return_value=returned_stream,
739+
):
740+
with start_transaction(name="openai tx"):
741+
response_stream = client.chat.completions.create(
742+
model="some-model",
743+
messages=[{"role": "user", "content": "hello"}],
744+
stream=True,
745+
stream_options={"include_usage": True},
746+
)
747+
for _ in response_stream:
748+
pass
749+
750+
tx = events[0]
751+
assert tx["type"] == "transaction"
752+
span = tx["spans"][0]
753+
assert span["op"] == "gen_ai.chat"
754+
assert span["data"]["gen_ai.usage.input_tokens"] == 20
755+
assert "gen_ai.usage.output_tokens" not in span["data"]
756+
assert span["data"]["gen_ai.usage.total_tokens"] == 20
757+
758+
759+
@pytest.mark.skipif(
760+
OPENAI_VERSION <= (1, 1, 0),
761+
reason="OpenAI versions <=1.1.0 do not support the stream_options parameter.",
762+
)
763+
@pytest.mark.asyncio
764+
async def test_streaming_chat_completion_empty_content_preserves_token_usage_async(
765+
sentry_init,
766+
capture_events,
767+
get_model_response,
768+
async_iterator,
769+
server_side_event_chunks,
770+
):
771+
"""Token usage from the stream is recorded even when no content is produced - async variant."""
772+
sentry_init(
773+
integrations=[OpenAIIntegration(include_prompts=False)],
774+
traces_sample_rate=1.0,
775+
send_default_pii=False,
776+
)
777+
events = capture_events()
778+
779+
client = AsyncOpenAI(api_key="z")
780+
returned_stream = get_model_response(
781+
async_iterator(
782+
server_side_event_chunks(
783+
[
784+
ChatCompletionChunk(
785+
id="1",
786+
choices=[],
787+
created=100000,
788+
model="model-id",
789+
object="chat.completion.chunk",
790+
usage=CompletionUsage(
791+
prompt_tokens=20,
792+
completion_tokens=0,
793+
total_tokens=20,
794+
),
795+
),
796+
],
797+
include_event_type=False,
798+
)
799+
)
800+
)
801+
802+
with mock.patch.object(
803+
client.chat._client._client,
804+
"send",
805+
return_value=returned_stream,
806+
):
807+
with start_transaction(name="openai tx"):
808+
response_stream = await client.chat.completions.create(
809+
model="some-model",
810+
messages=[{"role": "user", "content": "hello"}],
811+
stream=True,
812+
stream_options={"include_usage": True},
813+
)
814+
async for _ in response_stream:
815+
pass
816+
817+
tx = events[0]
818+
assert tx["type"] == "transaction"
819+
span = tx["spans"][0]
820+
assert span["op"] == "gen_ai.chat"
821+
assert span["data"]["gen_ai.usage.input_tokens"] == 20
822+
assert "gen_ai.usage.output_tokens" not in span["data"]
823+
assert span["data"]["gen_ai.usage.total_tokens"] == 20
824+
825+
696826
@pytest.mark.skipif(
697827
OPENAI_VERSION <= (1, 1, 0),
698828
reason="OpenAI versions <=1.1.0 do not support the stream_options parameter.",
@@ -2247,6 +2377,70 @@ def count_tokens(msg):
22472377
)
22482378

22492379

2380+
@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available")
2381+
def test_responses_token_usage_manual_output_counting_response_output():
2382+
"""When output_tokens is missing, output tokens are counted from response.output."""
2383+
span = mock.MagicMock()
2384+
2385+
def count_tokens(msg):
2386+
return len(str(msg))
2387+
2388+
response = mock.MagicMock()
2389+
response.usage = mock.MagicMock()
2390+
response.usage.input_tokens = 20
2391+
response.usage.total_tokens = 20
2392+
response.output = [
2393+
ResponseOutputMessage(
2394+
id="msg-1",
2395+
content=[
2396+
ResponseOutputText(
2397+
annotations=[],
2398+
text="one",
2399+
type="output_text",
2400+
),
2401+
],
2402+
role="assistant",
2403+
status="completed",
2404+
type="message",
2405+
),
2406+
ResponseOutputMessage(
2407+
id="msg-2",
2408+
content=[
2409+
ResponseOutputText(
2410+
annotations=[],
2411+
text="two",
2412+
type="output_text",
2413+
),
2414+
ResponseOutputText(
2415+
annotations=[],
2416+
text="three",
2417+
type="output_text",
2418+
),
2419+
],
2420+
role="assistant",
2421+
status="completed",
2422+
type="message",
2423+
),
2424+
]
2425+
input = []
2426+
streaming_message_responses = None
2427+
2428+
with mock.patch(
2429+
"sentry_sdk.integrations.openai.record_token_usage"
2430+
) as mock_record_token_usage:
2431+
_calculate_responses_token_usage(
2432+
input, response, span, streaming_message_responses, count_tokens
2433+
)
2434+
mock_record_token_usage.assert_called_once_with(
2435+
span,
2436+
input_tokens=20,
2437+
input_tokens_cached=None,
2438+
output_tokens=11,
2439+
output_tokens_reasoning=None,
2440+
total_tokens=20,
2441+
)
2442+
2443+
22502444
@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available")
22512445
def test_ai_client_span_responses_api_no_pii(sentry_init, capture_events):
22522446
sentry_init(

0 commit comments

Comments (0)