Skip to content

Commit 4073238

Browse files
GWeale authored and copybara-github committed
fix(litellm): emit input_audio for audio inline_data parts
LiteLLM/OpenAI reject `audio_url` content blocks (the API accepts `input_audio` with raw base64 + a `format` field). Audio inline_data was either silently dropped or rejected with a BadRequestError.

Close #5406

Co-authored-by: George Weale <gweale@google.com>
PiperOrigin-RevId: 905286679
1 parent 02deeb9 commit 4073238

2 files changed

Lines changed: 72 additions & 16 deletions

File tree

src/google/adk/models/lite_llm.py

Lines changed: 23 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -100,10 +100,11 @@
100100
_JSON_DECODER = json.JSONDecoder()
101101

102102
# Mapping of major MIME type prefixes to LiteLLM content types for URL blocks.
103+
# Audio is handled separately as `input_audio` content blocks because LiteLLM
104+
# (and OpenAI) do not accept an `audio_url` content type.
103105
_MEDIA_URL_CONTENT_TYPE_BY_MAJOR_MIME_TYPE = {
104106
"image": "image_url",
105107
"video": "video_url",
106-
"audio": "audio_url",
107108
}
108109

109110
# Mapping of LiteLLM finish_reason strings to FinishReason enum values
@@ -346,6 +347,18 @@ def _media_url_content_type(mime_type: str) -> str | None:
346347
return _MEDIA_URL_CONTENT_TYPE_BY_MAJOR_MIME_TYPE.get(major_mime_type)
347348

348349

350+
def _audio_format_from_mime_type(mime_type: str) -> str:
  """Returns the `format` value of an `input_audio` block for a MIME type.

  The subtype of the normalized MIME type is used as the format name. A
  leading `x-` is dropped first; afterwards `mpeg` is reported as `mp3` and
  `wave` / `vnd.wave` are reported as `wav`. Every other subtype is returned
  unchanged.

  Args:
    mime_type: The audio MIME type to translate, e.g. `audio/mpeg`.

  Returns:
    The format string to place in the `input_audio` content block.
  """
  normalized = _normalize_mime_type(mime_type)
  audio_subtype = normalized.split("/", 1)[1]
  # Experimental `x-` subtypes (e.g. `x-wav`) share the plain subtype's format.
  if audio_subtype[:2] == "x-":
    audio_subtype = audio_subtype[2:]
  # Subtypes whose registered name differs from the expected format name.
  aliases = {"mpeg": "mp3", "wave": "wav", "vnd.wave": "wav"}
  return aliases.get(audio_subtype, audio_subtype)
360+
361+
349362
def _iter_reasoning_texts(reasoning_value: Any) -> Iterable[str]:
350363
"""Yields textual fragments from provider specific reasoning payloads."""
351364
if reasoning_value is None:
@@ -1038,6 +1051,15 @@ async def _get_content(
10381051
})
10391052
continue
10401053
base64_string = base64.b64encode(part.inline_data.data).decode("utf-8")
1054+
if mime_type.startswith("audio/"):
1055+
content_objects.append({
1056+
"type": "input_audio",
1057+
"input_audio": {
1058+
"data": base64_string,
1059+
"format": _audio_format_from_mime_type(mime_type),
1060+
},
1061+
})
1062+
continue
10411063
data_uri = f"data:{mime_type};base64,{base64_string}"
10421064
# LiteLLM providers extract the MIME type from the data URI; avoid
10431065
# passing a separate `format` field that some backends reject.

tests/unittests/models/test_litellm.py

Lines changed: 49 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -2899,12 +2899,6 @@ async def test_get_content_file_uri_file_id_required_falls_back_to_text(
28992899
"video_url",
29002900
id="video",
29012901
),
2902-
pytest.param(
2903-
"https://example.com/audio.mp3",
2904-
"audio/mpeg",
2905-
"audio_url",
2906-
id="audio",
2907-
),
29082902
],
29092903
)
29102904
async def test_get_content_file_uri_media_url_file_id_required_uses_url_type(
@@ -3169,17 +3163,57 @@ async def test_get_content_file_uri_mime_type_inference(
31693163

31703164

31713165
@pytest.mark.asyncio
3172-
async def test_get_content_audio():
3166+
@pytest.mark.parametrize(
3167+
"mime_type,expected_format",
3168+
[
3169+
("audio/mpeg", "mp3"),
3170+
("audio/mp3", "mp3"),
3171+
("audio/wav", "wav"),
3172+
("audio/x-wav", "wav"),
3173+
("audio/wave", "wav"),
3174+
("audio/flac", "flac"),
3175+
("audio/ogg", "ogg"),
3176+
("audio/mp4", "mp4"),
3177+
],
3178+
)
3179+
async def test_get_content_audio_inline_data_emits_input_audio(
3180+
mime_type, expected_format
3181+
):
3182+
"""Audio inline_data is serialised as `input_audio` with raw base64 + format."""
3183+
parts = [types.Part.from_bytes(data=b"test_audio_data", mime_type=mime_type)]
3184+
content = await _get_content(parts)
3185+
assert content == [{
3186+
"type": "input_audio",
3187+
"input_audio": {
3188+
"data": "dGVzdF9hdWRpb19kYXRh",
3189+
"format": expected_format,
3190+
},
3191+
}]
3192+
3193+
3194+
@pytest.mark.asyncio
3195+
@pytest.mark.parametrize(
3196+
"provider,model",
3197+
[
3198+
("openai", "openai/gpt-4o"),
3199+
("azure", "azure/gpt-4"),
3200+
],
3201+
)
3202+
async def test_get_content_audio_file_uri_http_falls_back_to_text(
3203+
provider, model
3204+
):
3205+
"""Audio HTTP file_uri falls back to a text reference for openai/azure."""
3206+
file_uri = "https://example.com/audio.mp3"
31733207
parts = [
3174-
types.Part.from_bytes(data=b"test_audio_data", mime_type="audio/mpeg")
3208+
types.Part(
3209+
file_data=types.FileData(file_uri=file_uri, mime_type="audio/mpeg")
3210+
)
31753211
]
3176-
content = await _get_content(parts)
3177-
assert content[0]["type"] == "audio_url"
3178-
assert (
3179-
content[0]["audio_url"]["url"]
3180-
== "data:audio/mpeg;base64,dGVzdF9hdWRpb19kYXRh"
3181-
)
3182-
assert "format" not in content[0]["audio_url"]
3212+
content = await _get_content(parts, provider=provider, model=model)
3213+
assert content == [{
3214+
"type": "text",
3215+
"text": f'[File reference: "{file_uri}"]',
3216+
}]
31833217

31843218

31853219
def test_to_litellm_role():

0 commit comments

Comments
 (0)