Skip to content

Commit 99105ca

Browse files
feat(litellm): Add async callbacks
1 parent 9ae99be commit 99105ca

File tree

2 files changed

+204
-2
lines changed

2 files changed

+204
-2
lines changed

sentry_sdk/integrations/litellm.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,10 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None:
170170
set_data_normalized(span, f"gen_ai.litellm.{key}", value)
171171

172172

173+
async def _async_input_callback(kwargs: "Dict[str, Any]") -> None:
    """Async adapter for litellm's async input callback hook.

    The actual span setup is synchronous, so this simply delegates to
    ``_input_callback``.
    """
    _input_callback(kwargs)
175+
176+
173177
def _success_callback(
174178
kwargs: "Dict[str, Any]",
175179
completion_response: "Any",
@@ -233,10 +237,28 @@ def _success_callback(
233237
is_streaming = kwargs.get("stream")
234238
# Callback is fired multiple times when streaming a response.
235239
# Streaming flag checked at https://github.com/BerriAI/litellm/blob/33c3f13443eaf990ac8c6e3da78bddbc2b7d0e7a/litellm/litellm_core_utils/litellm_logging.py#L1603
236-
if is_streaming is not True or "complete_streaming_response" in kwargs:
240+
if (
241+
is_streaming is not True
242+
or "complete_streaming_response" in kwargs
243+
or "async_complete_streaming_response" in kwargs
244+
):
237245
span.__exit__(None, None, None)
238246

239247

248+
async def _async_success_callback(
    kwargs: "Dict[str, Any]",
    completion_response: "Any",
    start_time: "datetime",
    end_time: "datetime",
) -> None:
    """Async adapter for litellm's async success callback hook.

    Span finalization is synchronous, so delegate straight to
    ``_success_callback`` with the same arguments.
    """
    _success_callback(kwargs, completion_response, start_time, end_time)
260+
261+
240262
def _failure_callback(
241263
kwargs: "Dict[str, Any]",
242264
exception: Exception,
@@ -261,6 +283,20 @@ def _failure_callback(
261283
span.__exit__(type(exception), exception, None)
262284

263285

286+
async def _async_failure_callback(
    kwargs: "Dict[str, Any]",
    exception: Exception,
    start_time: "datetime",
    end_time: "datetime",
) -> None:
    """Async adapter for litellm's async failure callback hook.

    Error handling itself is synchronous, so delegate straight to
    ``_failure_callback`` with the same arguments.
    """
    _failure_callback(kwargs, exception, start_time, end_time)
298+
299+
264300
class LiteLLMIntegration(Integration):
265301
"""
266302
LiteLLM integration for Sentry.
@@ -318,11 +354,17 @@ def setup_once() -> None:
318354
litellm.input_callback = input_callback or []
319355
if _input_callback not in litellm.input_callback:
320356
litellm.input_callback.append(_input_callback)
357+
if _async_input_callback not in litellm.input_callback:
358+
litellm.input_callback.append(_async_input_callback)
321359

322360
litellm.success_callback = success_callback or []
323361
if _success_callback not in litellm.success_callback:
324362
litellm.success_callback.append(_success_callback)
363+
if _async_success_callback not in litellm.success_callback:
364+
litellm.success_callback.append(_async_success_callback)
325365

326366
litellm.failure_callback = failure_callback or []
327367
if _failure_callback not in litellm.failure_callback:
328368
litellm.failure_callback.append(_failure_callback)
369+
if _async_failure_callback not in litellm.failure_callback:
370+
litellm.failure_callback.append(_async_failure_callback)

tests/integrations/litellm/test_litellm.py

Lines changed: 161 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import json
33
import pytest
44
import time
5+
import asyncio
56
from unittest import mock
67
from datetime import datetime
78

@@ -31,13 +32,14 @@ async def __call__(self, *args, **kwargs):
3132
)
3233
from sentry_sdk.utils import package_version
3334

34-
from openai import OpenAI
35+
from openai import OpenAI, AsyncOpenAI
3536

3637
from concurrent.futures import ThreadPoolExecutor
3738

3839
import litellm.utils as litellm_utils
3940
from litellm.litellm_core_utils import streaming_handler
4041
from litellm.litellm_core_utils import thread_pool_executor
42+
from litellm.litellm_core_utils.logging_worker import GLOBAL_LOGGING_WORKER
4143
from litellm.llms.custom_httpx.http_handler import HTTPHandler
4244

4345

@@ -240,6 +242,89 @@ def test_nonstreaming_chat_completion(
240242
assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30
241243

242244

245+
@pytest.mark.asyncio(loop_scope="session")
@pytest.mark.parametrize(
    "send_default_pii, include_prompts",
    [
        (True, True),
        (True, False),
        (False, True),
        (False, False),
    ],
)
async def test_async_nonstreaming_chat_completion(
    sentry_init,
    capture_events,
    send_default_pii,
    include_prompts,
    get_model_response,
    nonstreaming_chat_completions_model_response,
):
    # Async counterpart of test_nonstreaming_chat_completion: verifies that
    # litellm.acompletion() is instrumented through the async Sentry callbacks.
    sentry_init(
        integrations=[LiteLLMIntegration(include_prompts=include_prompts)],
        traces_sample_rate=1.0,
        send_default_pii=send_default_pii,
    )
    events = capture_events()

    messages = [{"role": "user", "content": "Hello!"}]

    client = AsyncOpenAI(api_key="z")

    model_response = get_model_response(
        nonstreaming_chat_completions_model_response,
        serialize_pydantic=True,
        request_headers={"X-Stainless-Raw-Response": "true"},
    )

    # Stub the underlying HTTP transport so no real network request is made.
    with mock.patch.object(
        client.completions._client._client,
        "send",
        return_value=model_response,
    ):
        with start_transaction(name="litellm test"):
            await litellm.acompletion(
                model="gpt-3.5-turbo",
                messages=messages,
                client=client,
            )

    # litellm runs async callbacks through a background logging worker; flush
    # it (plus a short grace period) so the span is finished before we
    # inspect the captured events.
    await GLOBAL_LOGGING_WORKER.flush()
    await asyncio.sleep(0.5)

    assert len(events) == 1
    (event,) = events

    assert event["type"] == "transaction"
    assert event["transaction"] == "litellm test"

    chat_spans = list(
        x
        for x in event["spans"]
        if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm"
    )
    assert len(chat_spans) == 1
    span = chat_spans[0]

    assert span["op"] == OP.GEN_AI_CHAT
    assert span["description"] == "chat gpt-3.5-turbo"
    assert span["data"][SPANDATA.GEN_AI_REQUEST_MODEL] == "gpt-3.5-turbo"
    assert span["data"][SPANDATA.GEN_AI_RESPONSE_MODEL] == "gpt-3.5-turbo"
    assert span["data"][SPANDATA.GEN_AI_SYSTEM] == "openai"
    assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "chat"

    # Prompt/response content is only recorded when both PII sending and
    # prompt capture are enabled.
    if send_default_pii and include_prompts:
        assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"]
        assert SPANDATA.GEN_AI_RESPONSE_TEXT in span["data"]
    else:
        assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"]
        assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"]

    # Token counts come from the mocked model response fixture.
    assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 10
    assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 20
    assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 30
326+
327+
243328
@pytest.mark.parametrize(
244329
"send_default_pii, include_prompts",
245330
[
@@ -311,6 +396,81 @@ def test_streaming_chat_completion(
311396
assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True
312397

313398

399+
@pytest.mark.asyncio(loop_scope="session")
@pytest.mark.parametrize(
    "send_default_pii, include_prompts",
    [
        (True, True),
        (True, False),
        (False, True),
        (False, False),
    ],
)
async def test_async_streaming_chat_completion(
    sentry_init,
    capture_events,
    send_default_pii,
    include_prompts,
    get_model_response,
    async_iterator,
    server_side_event_chunks,
    streaming_chat_completions_model_response,
):
    # Async counterpart of test_streaming_chat_completion: verifies that a
    # streamed litellm.acompletion() produces exactly one finished chat span
    # (the callback fires multiple times while streaming; the span must only
    # be closed once the stream completes).
    sentry_init(
        integrations=[LiteLLMIntegration(include_prompts=include_prompts)],
        traces_sample_rate=1.0,
        send_default_pii=send_default_pii,
    )
    events = capture_events()

    messages = [{"role": "user", "content": "Hello!"}]

    client = AsyncOpenAI(api_key="z")

    # Build a mocked SSE stream of response chunks for the transport to emit.
    model_response = get_model_response(
        async_iterator(
            server_side_event_chunks(
                streaming_chat_completions_model_response,
                include_event_type=False,
            ),
        ),
        request_headers={"X-Stainless-Raw-Response": "true"},
    )

    # Stub the underlying HTTP transport so no real network request is made.
    with mock.patch.object(
        client.completions._client._client,
        "send",
        return_value=model_response,
    ):
        with start_transaction(name="litellm test"):
            response = await litellm.acompletion(
                model="gpt-3.5-turbo",
                messages=messages,
                client=client,
                stream=True,
            )
            # Consume the stream so the final (complete) callback fires.
            async for _ in response:
                pass

    # litellm runs async callbacks through a background logging worker; flush
    # it (plus a short grace period) so the span is finished before we
    # inspect the captured events.
    await GLOBAL_LOGGING_WORKER.flush()
    await asyncio.sleep(0.5)

    assert len(events) == 1
    (event,) = events

    assert event["type"] == "transaction"
    chat_spans = list(
        x
        for x in event["spans"]
        if x["op"] == OP.GEN_AI_CHAT and x["origin"] == "auto.ai.litellm"
    )
    assert len(chat_spans) == 1
    span = chat_spans[0]

    assert span["op"] == OP.GEN_AI_CHAT
    assert span["data"][SPANDATA.GEN_AI_RESPONSE_STREAMING] is True
472+
473+
314474
def test_embeddings_create(sentry_init, capture_events, clear_litellm_cache):
315475
"""
316476
Test that litellm.embedding() calls are properly instrumented.

0 commit comments

Comments
 (0)