Skip to content

Commit 96ebbf6

Browse files
fix(litellm): Avoid double span exits when streaming (#5933)
Avoid an unhandled exception by only exiting the span on the final invocation of `_success_callback` when litellm streams a response. Remove the shared span reference in the metadata dictionary immediately before the span is finished, to avoid race conditions between concurrent `_success_callback` invocations. The `litellm.success_callback` callbacks are fired multiple times when streaming a response with litellm.
1 parent 7e22b5d commit 96ebbf6

File tree

3 files changed: +181 −20 lines changed

sentry_sdk/integrations/litellm.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,8 @@ def _success_callback(
168168
) -> None:
169169
"""Handle successful completion."""
170170

171-
span = _get_metadata_dict(kwargs).get("_sentry_span")
171+
metadata = _get_metadata_dict(kwargs)
172+
span = metadata.get("_sentry_span")
172173
if span is None:
173174
return
174175

@@ -220,8 +221,13 @@ def _success_callback(
220221
)
221222

222223
finally:
223-
# Always finish the span and clean up
224-
span.__exit__(None, None, None)
224+
is_streaming = kwargs.get("stream")
225+
# Callback is fired multiple times when streaming a response.
226+
# Streaming flag checked at https://github.com/BerriAI/litellm/blob/33c3f13443eaf990ac8c6e3da78bddbc2b7d0e7a/litellm/litellm_core_utils/litellm_logging.py#L1603
227+
if is_streaming is not True or "complete_streaming_response" in kwargs:
228+
span = metadata.pop("_sentry_span", None)
229+
if span is not None:
230+
span.__exit__(None, None, None)
225231

226232

227233
def _failure_callback(

tests/conftest.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1110,6 +1110,120 @@ def inner(response_content, serialize_pydantic=False, request_headers=None):
11101110
return inner
11111111

11121112

1113+
@pytest.fixture
def streaming_chat_completions_model_response():
    """Return the chunk sequence of a streamed chat completion.

    The chunks mimic what the OpenAI API emits for a streamed response:
    an initial role-only delta, several content deltas that concatenate
    to ``"Test response"``, and a final empty delta carrying
    ``finish_reason="stop"`` plus token usage.
    """
    ChoiceDelta = openai.types.chat.chat_completion_chunk.ChoiceDelta

    def _chunk(delta, finish_reason=None, usage=None):
        # Every chunk shares the same envelope; only the delta, the
        # finish_reason, and (for the last chunk) usage differ.
        kwargs = {}
        if usage is not None:
            # Only pass `usage` when set so the intermediate chunks are
            # constructed exactly as before (field left at its default).
            kwargs["usage"] = usage
        return openai.types.chat.ChatCompletionChunk(
            id="chatcmpl-test",
            object="chat.completion.chunk",
            created=10000000,
            model="gpt-3.5-turbo",
            choices=[
                openai.types.chat.chat_completion_chunk.Choice(
                    index=0,
                    delta=delta,
                    finish_reason=finish_reason,
                ),
            ],
            **kwargs,
        )

    content_pieces = ("Tes", "t r", "esp", "ons", "e")
    return [
        # Opening chunk: announces the assistant role, no content yet.
        _chunk(ChoiceDelta(role="assistant")),
        # Content chunks: stream the text in small pieces.
        *(_chunk(ChoiceDelta(content=piece)) for piece in content_pieces),
        # Closing chunk: empty delta, stop reason, and token accounting.
        _chunk(
            ChoiceDelta(),
            finish_reason="stop",
            usage=openai.types.CompletionUsage(
                prompt_tokens=10,
                completion_tokens=20,
                total_tokens=30,
            ),
        ),
    ]
1225+
1226+
11131227
@pytest.fixture
11141228
def nonstreaming_responses_model_response():
11151229
return openai.types.responses.Response(

tests/integrations/litellm/test_litellm.py

Lines changed: 58 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,28 @@ async def __call__(self, *args, **kwargs):
3131
)
3232
from sentry_sdk.utils import package_version
3333

34+
from openai import OpenAI
35+
36+
from concurrent.futures import ThreadPoolExecutor
37+
38+
import litellm.utils as litellm_utils
39+
from litellm.litellm_core_utils import streaming_handler
40+
from litellm.litellm_core_utils import thread_pool_executor
41+
from litellm.litellm_core_utils import litellm_logging
42+
3443

3544
LITELLM_VERSION = package_version("litellm")
3645

3746

47+
@pytest.fixture()
def reset_litellm_executor():
    """Restore litellm's shared thread-pool executor after a test.

    Runs only teardown (code after ``yield``): it installs a fresh
    ``ThreadPoolExecutor`` on ``thread_pool_executor`` and points the
    other litellm modules' module-level ``executor`` references at it.

    NOTE(review): this assumes a test may shut down the shared executor
    (e.g. via ``executor.shutdown(wait=True)``) and that the listed
    modules all alias the same executor object — confirm against the
    installed litellm version.
    """
    yield
    # Recreate the pool first, then repoint every module-level alias at
    # the new instance so they stay in sync.
    thread_pool_executor.executor = ThreadPoolExecutor(max_workers=100)
    litellm_utils.executor = thread_pool_executor.executor
    streaming_handler.executor = thread_pool_executor.executor
    litellm_logging.executor = thread_pool_executor.executor
54+
55+
3856
@pytest.fixture
3957
def clear_litellm_cache():
4058
"""
@@ -212,7 +230,14 @@ def test_nonstreaming_chat_completion(
212230
],
213231
)
214232
def test_streaming_chat_completion(
215-
sentry_init, capture_events, send_default_pii, include_prompts
233+
reset_litellm_executor,
234+
sentry_init,
235+
capture_events,
236+
send_default_pii,
237+
include_prompts,
238+
get_model_response,
239+
server_side_event_chunks,
240+
streaming_chat_completions_model_response,
216241
):
217242
sentry_init(
218243
integrations=[LiteLLMIntegration(include_prompts=include_prompts)],
@@ -222,29 +247,45 @@ def test_streaming_chat_completion(
222247
events = capture_events()
223248

224249
messages = [{"role": "user", "content": "Hello!"}]
225-
mock_response = MockCompletionResponse()
226250

227-
with start_transaction(name="litellm test"):
228-
kwargs = {
229-
"model": "gpt-3.5-turbo",
230-
"messages": messages,
231-
"stream": True,
232-
}
251+
client = OpenAI(api_key="z")
233252

234-
_input_callback(kwargs)
235-
_success_callback(
236-
kwargs,
237-
mock_response,
238-
datetime.now(),
239-
datetime.now(),
240-
)
253+
model_response = get_model_response(
254+
server_side_event_chunks(
255+
streaming_chat_completions_model_response,
256+
include_event_type=False,
257+
),
258+
request_headers={"X-Stainless-Raw-Response": "True"},
259+
)
260+
261+
with mock.patch.object(
262+
client.completions._client._client,
263+
"send",
264+
return_value=model_response,
265+
):
266+
with start_transaction(name="litellm test"):
267+
response = litellm.completion(
268+
model="gpt-3.5-turbo",
269+
messages=messages,
270+
client=client,
271+
stream=True,
272+
)
273+
for _ in response:
274+
pass
275+
276+
streaming_handler.executor.shutdown(wait=True)
241277

242278
assert len(events) == 1
243279
(event,) = events
244280

245281
assert event["type"] == "transaction"
246-
assert len(event["spans"]) == 1
247-
(span,) = event["spans"]
282+
chat_spans = list(
283+
x
284+
for x in event["spans"]
285+
if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm"
286+
)
287+
assert len(chat_spans) == 1
288+
span = chat_spans[0]
248289

249290
assert span["op"] == OP.GEN_AI_CHAT
250291
assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True

0 commit comments

Comments (0)