Skip to content

Commit 4b9fb20

Browse files
feat: add image support to HuggingFaceAPIChatGenerator (#9680)
* feat(huggingface-api): #9671 add image support to HuggingFaceAPIChatGenerator
* docs: add release notes for image support in HuggingFaceAPIChatGenerator
* Fixed comments on PR: implementation, testing, default value for validation
* refinements
---------
Co-authored-by: anakin87 <stefanofiorucci@gmail.com>
1 parent 441b487 commit 4b9fb20

5 files changed

Lines changed: 118 additions & 10 deletions

File tree

haystack/components/generators/chat/hugging_face_api.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,33 @@ class HuggingFaceAPIChatGenerator:
223223
print(result)
224224
```
225225
226+
#### With the serverless inference API (Inference Providers) and text+image input
227+
228+
```python
229+
from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
230+
from haystack.dataclasses import ChatMessage, ImageContent
231+
from haystack.utils import Secret
232+
from haystack.utils.hf import HFGenerationAPIType
233+
234+
# Create an image from a file path (an image can also be created from a URL or a base64 string)
235+
image = ImageContent.from_file_path("path/to/your/image.jpg")
236+
237+
# Create a multimodal message with both text and image
238+
messages = [ChatMessage.from_user(content_parts=["Describe this image in detail", image])]
239+
240+
generator = HuggingFaceAPIChatGenerator(
241+
api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
242+
api_params={
243+
"model": "Qwen/Qwen2.5-VL-7B-Instruct", # Vision Language Model
244+
"provider": "hyperbolic"
245+
},
246+
token=Secret.from_token("<your-api-key>")
247+
)
248+
249+
result = generator.run(messages)
250+
print(result)
251+
```
252+
226253
#### With paid inference endpoints
227254
228255
```python

haystack/utils/hf.py

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@
1212
AsyncStreamingCallbackT,
1313
ChatMessage,
1414
ComponentInfo,
15+
ImageContent,
1516
StreamingChunk,
1617
SyncStreamingCallbackT,
18+
TextContent,
1719
)
1820
from haystack.lazy_imports import LazyImport
1921
from haystack.utils.auth import Secret
@@ -258,11 +260,17 @@ def convert_message_to_hf_format(message: ChatMessage) -> Dict[str, Any]:
258260
text_contents = message.texts
259261
tool_calls = message.tool_calls
260262
tool_call_results = message.tool_call_results
263+
images = message.images
261264

262-
if not text_contents and not tool_calls and not tool_call_results:
263-
raise ValueError("A `ChatMessage` must contain at least one `TextContent`, `ToolCall`, or `ToolCallResult`.")
264-
if len(text_contents) + len(tool_call_results) > 1:
265-
raise ValueError("A `ChatMessage` can only contain one `TextContent` or one `ToolCallResult`.")
265+
if not text_contents and not tool_calls and not tool_call_results and not images:
266+
raise ValueError(
267+
"A `ChatMessage` must contain at least one `TextContent`, `ToolCall`, `ToolCallResult`, or `ImageContent`."
268+
)
269+
if len(tool_call_results) > 0 and len(message._content) > 1:
270+
raise ValueError(
271+
"For compatibility with the Hugging Face API, a `ChatMessage` with a `ToolCallResult` "
272+
"cannot contain any other content."
273+
)
266274

267275
# HF always expects a content field, even if it is empty
268276
hf_msg: Dict[str, Any] = {"role": message._role.value, "content": ""}
@@ -275,8 +283,22 @@ def convert_message_to_hf_format(message: ChatMessage) -> Dict[str, Any]:
275283
# HF does not provide a way to communicate errors in tool invocations, so we ignore the error field
276284
return hf_msg
277285

278-
if text_contents:
279-
hf_msg["content"] = text_contents[0]
286+
# Handle multimodal content (text + images) preserving order
287+
if text_contents or images:
288+
content_parts: List[Dict[str, Any]] = []
289+
for part in message._content:
290+
if isinstance(part, TextContent):
291+
content_parts.append({"type": "text", "text": part.text})
292+
elif isinstance(part, ImageContent):
293+
image_url = f"data:{part.mime_type or 'image/jpeg'};base64,{part.base64_image}"
294+
content_parts.append({"type": "image_url", "image_url": {"url": image_url}})
295+
296+
if len(content_parts) == 1 and not images:
297+
# content is a string
298+
hf_msg["content"] = content_parts[0]["text"]
299+
else:
300+
hf_msg["content"] = content_parts
301+
280302
if tool_calls:
281303
hf_tool_calls = []
282304
for tc in tool_calls:
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
enhancements:
3+
- |
4+
Added multimodal support to `HuggingFaceAPIChatGenerator` to enable vision-language model (VLM) usage with images and text.
5+
Users can now send both text and images to VLMs through Hugging Face APIs. The implementation follows the HF VLM API format
6+
specification and maintains full backward compatibility with text-only messages.

test/components/generators/chat/test_hugging_face_api.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
_convert_hfapi_tool_calls,
3131
_convert_tools_to_hfapi_tools,
3232
)
33-
from haystack.dataclasses import ChatMessage, StreamingChunk, ToolCall
33+
from haystack.dataclasses import ChatMessage, ImageContent, StreamingChunk, ToolCall
3434
from haystack.tools import Tool
3535
from haystack.tools.toolset import Toolset
3636
from haystack.utils.auth import Secret
@@ -864,6 +864,34 @@ def test_live_run_with_tools(self, tools):
864864
assert len(final_message.text) > 0
865865
assert "paris" in final_message.text.lower() and "22" in final_message.text
866866

867+
@pytest.mark.integration
868+
@pytest.mark.slow
869+
@pytest.mark.skipif(
870+
not os.environ.get("HF_API_TOKEN", None),
871+
reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
872+
)
873+
def test_live_run_multimodal(self, test_files_path):
874+
image_path = test_files_path / "images" / "apple.jpg"
875+
# Resize the image to keep this test fast
876+
image_content = ImageContent.from_file_path(file_path=image_path, size=(100, 100))
877+
messages = [ChatMessage.from_user(content_parts=["What does this image show? Max 5 words", image_content])]
878+
879+
generator = HuggingFaceAPIChatGenerator(
880+
api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
881+
api_params={"model": "Qwen/Qwen2.5-VL-7B-Instruct", "provider": "hyperbolic"},
882+
generation_kwargs={"max_tokens": 20},
883+
)
884+
885+
response = generator.run(messages=messages)
886+
887+
assert "replies" in response
888+
assert isinstance(response["replies"], list)
889+
assert len(response["replies"]) > 0
890+
message = response["replies"][0]
891+
assert message.text
892+
assert len(message.text) > 0
893+
assert any(word in message.text.lower() for word in ["apple", "fruit", "red"])
894+
867895
@pytest.mark.asyncio
868896
async def test_run_async(self, mock_check_valid_model, mock_chat_completion_async, chat_messages):
869897
generator = HuggingFaceAPIChatGenerator(

test/utils/test_hf.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66

77
import pytest
88

9-
from haystack.dataclasses import ChatMessage, ChatRole, TextContent, ToolCall
9+
from haystack.dataclasses import ChatMessage, ChatRole, ImageContent, TextContent, ToolCall
10+
from haystack.dataclasses.chat_message import ToolCallResult
1011
from haystack.utils.device import ComponentDevice
1112
from haystack.utils.hf import convert_message_to_hf_format, resolve_hf_device_map
1213

@@ -76,8 +77,32 @@ def test_convert_message_to_hf_invalid():
7677
convert_message_to_hf_format(message)
7778

7879
message = ChatMessage(
79-
_role=ChatRole.ASSISTANT,
80-
_content=[TextContent(text="I have an answer"), TextContent(text="I have another answer")],
80+
_role=ChatRole.USER,
81+
_content=[
82+
TextContent(text="I have an answer"),
83+
ToolCallResult(
84+
result="result!",
85+
origin=ToolCall(id="123", tool_name="weather", arguments={"city": "Paris"}),
86+
error=None,
87+
),
88+
],
8189
)
8290
with pytest.raises(ValueError):
8391
convert_message_to_hf_format(message)
92+
93+
94+
def test_convert_message_to_hf_format_with_multiple_images(base64_image_string):
95+
image1 = ImageContent(base64_image=base64_image_string)
96+
image2 = ImageContent(base64_image=base64_image_string)
97+
message = ChatMessage.from_user(content_parts=["Compare these images", image1, image2])
98+
99+
result = convert_message_to_hf_format(message)
100+
expected = {
101+
"role": "user",
102+
"content": [
103+
{"type": "text", "text": "Compare these images"},
104+
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image_string}"}},
105+
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image_string}"}},
106+
],
107+
}
108+
assert result == expected

0 commit comments

Comments
 (0)