feat: add image support to HuggingFaceAPIChatGenerator (#9680)

ChinmayBansal · anakin87 · web-flow · commit 4b9fb20bab3a · 2025-08-06T16:35:32.000+02:00
* feat(huggingface-api): #9671 add image support to HuggingFaceAPIChatGenerator
* docs: add release notes for image support in HuggingFaceAPIChatGenerator
* Fixed comments on PR: implementation, testing, default value for validation
* refinements

---------

Co-authored-by: anakin87 <stefanofiorucci@gmail.com>
diff --git a/haystack/components/generators/chat/hugging_face_api.py b/haystack/components/generators/chat/hugging_face_api.py
@@ -223,6 +223,33 @@ class HuggingFaceAPIChatGenerator:
     print(result)
     ```
 
+    #### With the serverless inference API (Inference Providers) and text+image input
+
+    ```python
+    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
+    from haystack.dataclasses import ChatMessage, ImageContent
+    from haystack.utils import Secret
+    from haystack.utils.hf import HFGenerationAPIType
+
+    # Create an image from file path, URL, or base64
+    image = ImageContent.from_file_path("path/to/your/image.jpg")
+
+    # Create a multimodal message with both text and image
+    messages = [ChatMessage.from_user(content_parts=["Describe this image in detail", image])]
+
+    generator = HuggingFaceAPIChatGenerator(
+        api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
+        api_params={
+            "model": "Qwen/Qwen2.5-VL-7B-Instruct",  # Vision Language Model
+            "provider": "hyperbolic"
+        },
+        token=Secret.from_token("<your-api-key>")
+    )
+
+    result = generator.run(messages)
+    print(result)
+    ```
+
     #### With paid inference endpoints
 
     ```python
diff --git a/haystack/utils/hf.py b/haystack/utils/hf.py
@@ -12,8 +12,10 @@
     AsyncStreamingCallbackT,
     ChatMessage,
     ComponentInfo,
+    ImageContent,
     StreamingChunk,
     SyncStreamingCallbackT,
+    TextContent,
 )
 from haystack.lazy_imports import LazyImport
 from haystack.utils.auth import Secret
@@ -258,11 +260,17 @@ def convert_message_to_hf_format(message: ChatMessage) -> Dict[str, Any]:
     text_contents = message.texts
     tool_calls = message.tool_calls
     tool_call_results = message.tool_call_results
+    images = message.images
 
-    if not text_contents and not tool_calls and not tool_call_results:
-        raise ValueError("A `ChatMessage` must contain at least one `TextContent`, `ToolCall`, or `ToolCallResult`.")
-    if len(text_contents) + len(tool_call_results) > 1:
-        raise ValueError("A `ChatMessage` can only contain one `TextContent` or one `ToolCallResult`.")
+    if not text_contents and not tool_calls and not tool_call_results and not images:
+        raise ValueError(
+            "A `ChatMessage` must contain at least one `TextContent`, `ToolCall`, `ToolCallResult`, or `ImageContent`."
+        )
+    if len(tool_call_results) > 0 and len(message._content) > 1:
+        raise ValueError(
+            "For compatibility with the Hugging Face API, a `ChatMessage` with a `ToolCallResult` "
+            "cannot contain any other content."
+        )
 
     # HF always expects a content field, even if it is empty
     hf_msg: Dict[str, Any] = {"role": message._role.value, "content": ""}
@@ -275,8 +283,22 @@ def convert_message_to_hf_format(message: ChatMessage) -> Dict[str, Any]:
         # HF does not provide a way to communicate errors in tool invocations, so we ignore the error field
         return hf_msg
 
-    if text_contents:
-        hf_msg["content"] = text_contents[0]
+    # Handle multimodal content (text + images) preserving order
+    if text_contents or images:
+        content_parts: List[Dict[str, Any]] = []
+        for part in message._content:
+            if isinstance(part, TextContent):
+                content_parts.append({"type": "text", "text": part.text})
+            elif isinstance(part, ImageContent):
+                image_url = f"data:{part.mime_type or 'image/jpeg'};base64,{part.base64_image}"
+                content_parts.append({"type": "image_url", "image_url": {"url": image_url}})
+
+        if len(content_parts) == 1 and not images:
+            # content is a string
+            hf_msg["content"] = content_parts[0]["text"]
+        else:
+            hf_msg["content"] = content_parts
+
     if tool_calls:
         hf_tool_calls = []
         for tc in tool_calls:
diff --git a/releasenotes/notes/add-image-support-huggingface-api-chat-generator-9671.yaml b/releasenotes/notes/add-image-support-huggingface-api-chat-generator-9671.yaml
@@ -0,0 +1,6 @@
+---
+enhancements:
+  - |
+    Added multimodal support to `HuggingFaceAPIChatGenerator` to enable vision-language model (VLM) usage with images and text.
+    Users can now send both text and images to VLMs through Hugging Face APIs. The implementation follows the HF VLM API format
+    specification and maintains full backward compatibility with text-only messages.
diff --git a/test/components/generators/chat/test_hugging_face_api.py b/test/components/generators/chat/test_hugging_face_api.py
@@ -30,7 +30,7 @@
     _convert_hfapi_tool_calls,
     _convert_tools_to_hfapi_tools,
 )
-from haystack.dataclasses import ChatMessage, StreamingChunk, ToolCall
+from haystack.dataclasses import ChatMessage, ImageContent, StreamingChunk, ToolCall
 from haystack.tools import Tool
 from haystack.tools.toolset import Toolset
 from haystack.utils.auth import Secret
@@ -864,6 +864,34 @@ def test_live_run_with_tools(self, tools):
         assert len(final_message.text) > 0
         assert "paris" in final_message.text.lower() and "22" in final_message.text
 
+    @pytest.mark.integration
+    @pytest.mark.slow
+    @pytest.mark.skipif(
+        not os.environ.get("HF_API_TOKEN", None),
+        reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
+    )
+    def test_live_run_multimodal(self, test_files_path):
+        image_path = test_files_path / "images" / "apple.jpg"
+        # Resize the image to keep this test fast
+        image_content = ImageContent.from_file_path(file_path=image_path, size=(100, 100))
+        messages = [ChatMessage.from_user(content_parts=["What does this image show? Max 5 words", image_content])]
+
+        generator = HuggingFaceAPIChatGenerator(
+            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
+            api_params={"model": "Qwen/Qwen2.5-VL-7B-Instruct", "provider": "hyperbolic"},
+            generation_kwargs={"max_tokens": 20},
+        )
+
+        response = generator.run(messages=messages)
+
+        assert "replies" in response
+        assert isinstance(response["replies"], list)
+        assert len(response["replies"]) > 0
+        message = response["replies"][0]
+        assert message.text
+        assert len(message.text) > 0
+        assert any(word in message.text.lower() for word in ["apple", "fruit", "red"])
+
     @pytest.mark.asyncio
     async def test_run_async(self, mock_check_valid_model, mock_chat_completion_async, chat_messages):
         generator = HuggingFaceAPIChatGenerator(
diff --git a/test/utils/test_hf.py b/test/utils/test_hf.py
@@ -6,7 +6,8 @@
 
 import pytest
 
-from haystack.dataclasses import ChatMessage, ChatRole, TextContent, ToolCall
+from haystack.dataclasses import ChatMessage, ChatRole, ImageContent, TextContent, ToolCall
+from haystack.dataclasses.chat_message import ToolCallResult
 from haystack.utils.device import ComponentDevice
 from haystack.utils.hf import convert_message_to_hf_format, resolve_hf_device_map
 
@@ -76,8 +77,32 @@ def test_convert_message_to_hf_invalid():
         convert_message_to_hf_format(message)
 
     message = ChatMessage(
-        _role=ChatRole.ASSISTANT,
-        _content=[TextContent(text="I have an answer"), TextContent(text="I have another answer")],
+        _role=ChatRole.USER,
+        _content=[
+            TextContent(text="I have an answer"),
+            ToolCallResult(
+                result="result!",
+                origin=ToolCall(id="123", tool_name="weather", arguments={"city": "Paris"}),
+                error=None,
+            ),
+        ],
     )
     with pytest.raises(ValueError):
         convert_message_to_hf_format(message)
+
+
+def test_convert_message_to_hf_format_with_multiple_images(base64_image_string):
+    image1 = ImageContent(base64_image=base64_image_string)
+    image2 = ImageContent(base64_image=base64_image_string)
+    message = ChatMessage.from_user(content_parts=["Compare these images", image1, image2])
+
+    result = convert_message_to_hf_format(message)
+    expected = {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Compare these images"},
+            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image_string}"}},
+            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image_string}"}},
+        ],
+    }
+    assert result == expected