Skip to content

Commit 47ceeba

Browse files
committed
fix: preserve media blocks in ollama content flattening
Closes #4975
Change-Id: Ica3034cb5a38a2c00f75e9e242fd49b1f787535e
1 parent bdb5582 commit 47ceeba

2 files changed

Lines changed: 89 additions & 25 deletions

File tree

src/google/adk/models/lite_llm.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1274,14 +1274,19 @@ def _is_ollama_chat_provider(
12741274
return False
12751275

12761276

1277+
_MEDIA_BLOCK_TYPES = frozenset({"image_url", "video_url", "audio_url"})
1278+
1279+
12771280
def _flatten_ollama_content(
12781281
content: OpenAIMessageContent | str | None,
1279-
) -> str | None:
1282+
) -> OpenAIMessageContent | str | None:
12801283
"""Flattens multipart content to text for ollama_chat compatibility.
12811284
1282-
Ollama's chat endpoint rejects arrays for `content`. We keep textual parts,
1283-
join them with newlines, and fall back to a JSON string for non-text content.
1284-
If both text and non-text parts are present, only the text parts are kept.
1285+
Ollama's chat endpoint rejects arrays for `content` when it is text-only, so
1286+
text parts are joined with newlines and other non-media content falls back to
1287+
a JSON string. Multipart content with media blocks (image_url, video_url,
1288+
audio_url) is returned unchanged so LiteLLM's Ollama handler can convert it
1289+
to the native `images` field instead of silently dropping the media.
12851290
"""
12861291
if content is None or isinstance(content, str):
12871292
return content
@@ -1299,6 +1304,12 @@ def _flatten_ollama_content(
12991304
except TypeError:
13001305
return str(content)
13011306

1307+
if any(
1308+
isinstance(block, dict) and block.get("type") in _MEDIA_BLOCK_TYPES
1309+
for block in blocks
1310+
):
1311+
return blocks
1312+
13021313
text_parts = []
13031314
for block in blocks:
13041315
if isinstance(block, dict) and block.get("type") == "text":

tests/unittests/models/test_litellm.py

Lines changed: 74 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1716,7 +1716,7 @@ async def test_generate_content_async_with_usage_metadata(
17161716

17171717

17181718
@pytest.mark.asyncio
1719-
async def test_generate_content_async_ollama_chat_flattens_content(
1719+
async def test_generate_content_async_ollama_chat_preserves_multimodal_content(
17201720
mock_acompletion, mock_completion
17211721
):
17221722
llm_client = MockLLMClient(mock_acompletion, mock_completion)
@@ -1748,12 +1748,26 @@ async def test_generate_content_async_ollama_chat_flattens_content(
17481748
)
17491749
_, kwargs = mock_acompletion.call_args
17501750
message_content = kwargs["messages"][0]["content"]
1751-
assert isinstance(message_content, str)
1752-
assert "Describe this image." in message_content
1751+
# Multimodal content (text + image) should be kept as a list so LiteLLM
1752+
# can convert it to Ollama's native images field.
1753+
assert isinstance(message_content, list)
1754+
text_blocks = [
1755+
b
1756+
for b in message_content
1757+
if isinstance(b, dict) and b.get("type") == "text"
1758+
]
1759+
image_blocks = [
1760+
b
1761+
for b in message_content
1762+
if isinstance(b, dict) and b.get("type") == "image_url"
1763+
]
1764+
assert len(text_blocks) >= 1
1765+
assert "Describe this image." in text_blocks[0].get("text", "")
1766+
assert len(image_blocks) >= 1
17531767

17541768

17551769
@pytest.mark.asyncio
1756-
async def test_generate_content_async_custom_provider_flattens_content(
1770+
async def test_generate_content_async_custom_provider_preserves_multimodal(
17571771
mock_acompletion, mock_completion
17581772
):
17591773
llm_client = MockLLMClient(mock_acompletion, mock_completion)
@@ -1784,8 +1798,14 @@ async def test_generate_content_async_custom_provider_flattens_content(
17841798
assert kwargs["custom_llm_provider"] == "ollama_chat"
17851799
assert kwargs["model"] == "qwen2.5:7b"
17861800
message_content = kwargs["messages"][0]["content"]
1787-
assert isinstance(message_content, str)
1788-
assert "Describe this image." in message_content
1801+
# Multimodal content should be preserved as a list.
1802+
assert isinstance(message_content, list)
1803+
text_blocks = [
1804+
b
1805+
for b in message_content
1806+
if isinstance(b, dict) and b.get("type") == "text"
1807+
]
1808+
assert any("Describe this image." in b.get("text", "") for b in text_blocks)
17891809

17901810

17911811
def test_flatten_ollama_content_accepts_tuple_blocks():
@@ -1811,16 +1831,6 @@ def test_flatten_ollama_content_accepts_tuple_blocks():
18111831
],
18121832
"first\nsecond",
18131833
),
1814-
(
1815-
[
1816-
{"type": "text", "text": "Describe this image."},
1817-
{
1818-
"type": "image_url",
1819-
"image_url": {"url": "http://example.com"},
1820-
},
1821-
],
1822-
"Describe this image.",
1823-
),
18241834
],
18251835
)
18261836
def test_flatten_ollama_content_returns_str_or_none(content, expected):
@@ -1831,15 +1841,58 @@ def test_flatten_ollama_content_returns_str_or_none(content, expected):
18311841
assert flattened is None or isinstance(flattened, str)
18321842

18331843

1834-
def test_flatten_ollama_content_serializes_non_text_blocks_to_json():
1844+
def test_flatten_ollama_content_preserves_image_url_blocks():
1845+
"""Media blocks should be kept as a list so LiteLLM can convert them."""
18351846
from google.adk.models.lite_llm import _flatten_ollama_content
18361847

18371848
blocks = [
1838-
{"type": "image_url", "image_url": {"url": "http://example.com"}},
1849+
{"type": "image_url", "image_url": {"url": "http://example.com/img.png"}},
18391850
]
1840-
flattened = _flatten_ollama_content(blocks)
1841-
assert isinstance(flattened, str)
1842-
assert json.loads(flattened) == blocks
1851+
result = _flatten_ollama_content(blocks)
1852+
assert isinstance(result, list)
1853+
assert result == blocks
1854+
1855+
1856+
def test_flatten_ollama_content_preserves_mixed_text_and_image():
1857+
"""Text + image_url should return the full list, not just the text."""
1858+
from google.adk.models.lite_llm import _flatten_ollama_content
1859+
1860+
blocks = [
1861+
{"type": "text", "text": "Describe this image."},
1862+
{
1863+
"type": "image_url",
1864+
"image_url": {"url": "data:image/png;base64,iVBORw0KGgo="},
1865+
},
1866+
]
1867+
result = _flatten_ollama_content(blocks)
1868+
assert isinstance(result, list)
1869+
assert len(result) == 2
1870+
assert result[0]["type"] == "text"
1871+
assert result[1]["type"] == "image_url"
1872+
1873+
1874+
def test_flatten_ollama_content_preserves_video_url_blocks():
1875+
from google.adk.models.lite_llm import _flatten_ollama_content
1876+
1877+
blocks = [
1878+
{"type": "text", "text": "What happens in this clip?"},
1879+
{"type": "video_url", "video_url": {"url": "http://example.com/v.mp4"}},
1880+
]
1881+
result = _flatten_ollama_content(blocks)
1882+
assert isinstance(result, list)
1883+
assert len(result) == 2
1884+
1885+
1886+
def test_flatten_ollama_content_serializes_non_media_non_text_blocks_to_json():
1887+
"""Blocks with unknown types and no media should still serialize to JSON."""
1888+
from google.adk.models.lite_llm import _flatten_ollama_content
1889+
1890+
blocks = [
1891+
{"type": "custom_block", "data": "something"},
1892+
]
1893+
result = _flatten_ollama_content(blocks)
1894+
assert isinstance(result, str)
1895+
assert json.loads(result) == blocks
18431896

18441897

18451898
def test_flatten_ollama_content_serializes_dict_to_json():

0 commit comments

Comments
 (0)