fix(litellm): emit input_audio for audio inline_data parts

GWeale · copybara-github · commit 4073238151ee · 2026-04-24T17:04:12.000-07:00
LiteLLM/OpenAI reject `audio_url` content blocks; the API instead accepts `input_audio` blocks carrying raw base64 data plus a `format` field. As a result, audio inline_data was either silently dropped or rejected with a BadRequestError.

Closes #5406

Co-authored-by: George Weale <gweale@google.com>
PiperOrigin-RevId: 905286679
diff --git a/src/google/adk/models/lite_llm.py b/src/google/adk/models/lite_llm.py
@@ -100,10 +100,11 @@
 _JSON_DECODER = json.JSONDecoder()
 
 # Mapping of major MIME type prefixes to LiteLLM content types for URL blocks.
+# Audio is handled separately as `input_audio` content blocks because LiteLLM
+# (and OpenAI) do not accept an `audio_url` content type.
 _MEDIA_URL_CONTENT_TYPE_BY_MAJOR_MIME_TYPE = {
     "image": "image_url",
     "video": "video_url",
-    "audio": "audio_url",
 }
 
 # Mapping of LiteLLM finish_reason strings to FinishReason enum values
@@ -346,6 +347,18 @@ def _media_url_content_type(mime_type: str) -> str | None:
   return _MEDIA_URL_CONTENT_TYPE_BY_MAJOR_MIME_TYPE.get(major_mime_type)
 
 
+def _audio_format_from_mime_type(mime_type: str) -> str:
+  """Maps an audio MIME type to the format string for `input_audio` blocks."""
+  subtype = _normalize_mime_type(mime_type).split("/", 1)[1]
+  if subtype.startswith("x-"):
+    subtype = subtype[2:]
+  if subtype == "mpeg":
+    return "mp3"
+  if subtype in ("wave", "vnd.wave"):
+    return "wav"
+  return subtype
+
+
 def _iter_reasoning_texts(reasoning_value: Any) -> Iterable[str]:
   """Yields textual fragments from provider specific reasoning payloads."""
   if reasoning_value is None:
@@ -1038,6 +1051,15 @@ async def _get_content(
         })
         continue
       base64_string = base64.b64encode(part.inline_data.data).decode("utf-8")
+      if mime_type.startswith("audio/"):
+        content_objects.append({
+            "type": "input_audio",
+            "input_audio": {
+                "data": base64_string,
+                "format": _audio_format_from_mime_type(mime_type),
+            },
+        })
+        continue
       data_uri = f"data:{mime_type};base64,{base64_string}"
       # LiteLLM providers extract the MIME type from the data URI; avoid
       # passing a separate `format` field that some backends reject.
diff --git a/tests/unittests/models/test_litellm.py b/tests/unittests/models/test_litellm.py
@@ -2899,12 +2899,6 @@ async def test_get_content_file_uri_file_id_required_falls_back_to_text(
             "video_url",
             id="video",
         ),
-        pytest.param(
-            "https://example.com/audio.mp3",
-            "audio/mpeg",
-            "audio_url",
-            id="audio",
-        ),
     ],
 )
 async def test_get_content_file_uri_media_url_file_id_required_uses_url_type(
@@ -3169,17 +3163,57 @@ async def test_get_content_file_uri_mime_type_inference(
 
 
 @pytest.mark.asyncio
-async def test_get_content_audio():
+@pytest.mark.parametrize(
+    "mime_type,expected_format",
+    [
+        ("audio/mpeg", "mp3"),
+        ("audio/mp3", "mp3"),
+        ("audio/wav", "wav"),
+        ("audio/x-wav", "wav"),
+        ("audio/wave", "wav"),
+        ("audio/flac", "flac"),
+        ("audio/ogg", "ogg"),
+        ("audio/mp4", "mp4"),
+    ],
+)
+async def test_get_content_audio_inline_data_emits_input_audio(
+    mime_type, expected_format
+):
+  """Audio inline_data is serialised as `input_audio` with raw base64 + format."""
+  parts = [types.Part.from_bytes(data=b"test_audio_data", mime_type=mime_type)]
+  content = await _get_content(parts)
+  assert content == [{
+      "type": "input_audio",
+      "input_audio": {
+          "data": "dGVzdF9hdWRpb19kYXRh",
+          "format": expected_format,
+      },
+  }]
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "provider,model",
+    [
+        ("openai", "openai/gpt-4o"),
+        ("azure", "azure/gpt-4"),
+    ],
+)
+async def test_get_content_audio_file_uri_http_falls_back_to_text(
+    provider, model
+):
+  """Audio HTTP file_uri falls back to a text reference for openai/azure."""
+  file_uri = "https://example.com/audio.mp3"
   parts = [
-      types.Part.from_bytes(data=b"test_audio_data", mime_type="audio/mpeg")
+      types.Part(
+          file_data=types.FileData(file_uri=file_uri, mime_type="audio/mpeg")
+      )
   ]
-  content = await _get_content(parts)
-  assert content[0]["type"] == "audio_url"
-  assert (
-      content[0]["audio_url"]["url"]
-      == "data:audio/mpeg;base64,dGVzdF9hdWRpb19kYXRh"
-  )
-  assert "format" not in content[0]["audio_url"]
+  content = await _get_content(parts, provider=provider, model=model)
+  assert content == [{
+      "type": "text",
+      "text": f'[File reference: "{file_uri}"]',
+  }]
 
 
 def test_to_litellm_role():