feat: add run_async for VertexAIGeminiChatGenerator (#1574)

Ryzhtus · julian-risch · web-flow · commit c8f0ca4cd4a2 · 2025-04-08T19:34:27.000+02:00
* Add async calls (`run_async`) for VertexAIGeminiChatGenerator

* Linter and typing fixes

* Replace Iterable with AsyncIterable type

* Pass tool_config to send_message_async

---------

Co-authored-by: Julian Risch <julianrisch@gmx.de>
diff --git a/integrations/google_vertex/pyproject.toml b/integrations/google_vertex/pyproject.toml
@@ -46,6 +46,7 @@ installer = "uv"
 dependencies = [
   "coverage[toml]>=6.5",
   "pytest",
+  "pytest-asyncio",
   "pytest-rerunfailures",
   "haystack-pydoc-tools",
 ]
diff --git a/integrations/google_vertex/src/haystack_integrations/components/generators/google_vertex/chat/gemini.py b/integrations/google_vertex/src/haystack_integrations/components/generators/google_vertex/chat/gemini.py
@@ -1,10 +1,10 @@
 import json
-from typing import Any, Callable, Dict, Iterable, List, Optional, Union
+from typing import Any, AsyncIterable, Dict, Iterable, List, Optional, Union
 
 from haystack import logging
 from haystack.core.component import component
 from haystack.core.serialization import default_from_dict, default_to_dict
-from haystack.dataclasses import StreamingChunk
+from haystack.dataclasses import AsyncStreamingCallbackT, StreamingCallbackT, StreamingChunk, select_streaming_callback
 from haystack.dataclasses.chat_message import ChatMessage, ChatRole, ToolCall
 from haystack.tools import Tool, _check_duplicate_tool_names
 from haystack.utils import deserialize_callable, serialize_callable
@@ -150,7 +150,7 @@ def __init__(
         safety_settings: Optional[Dict[HarmCategory, HarmBlockThreshold]] = None,
         tools: Optional[List[Tool]] = None,
         tool_config: Optional[ToolConfig] = None,
-        streaming_callback: Optional[Callable[[StreamingChunk], None]] = None,
+        streaming_callback: Optional[StreamingCallbackT] = None,
     ):
         """
         `VertexAIGeminiChatGenerator` enables chat completion using Google Gemini models.
@@ -300,7 +300,7 @@ def _convert_to_vertex_tools(tools: List[Tool]) -> List[VertexTool]:
     def run(
         self,
         messages: List[ChatMessage],
-        streaming_callback: Optional[Callable[[StreamingChunk], None]] = None,
+        streaming_callback: Optional[StreamingCallbackT] = None,
         *,
         tools: Optional[List[Tool]] = None,
     ):
@@ -355,6 +355,69 @@ def run(
 
         return {"replies": replies}
 
+    @component.output_types(replies=List[ChatMessage])
+    async def run_async(
+        self,
+        messages: List[ChatMessage],
+        streaming_callback: Optional[StreamingCallbackT] = None,
+        *,
+        tools: Optional[List[Tool]] = None,
+    ):
+        """
+        Async version of the `run` method. Generates chat replies based on the provided messages.
+        :param messages:
+            A non-empty list of `ChatMessage` instances; a leading system message is used as the system instruction.
+        :param streaming_callback:
+            An async callback that is awaited with each `StreamingChunk` received from the stream.
+        :param tools:
+            A list of tools for which the model can prepare calls. If set, it will override the `tools` parameter set
+            during component initialization.
+        :returns:
+            A dictionary with the key `replies`: a list of the generated responses as `ChatMessage` instances.
+        :raises ValueError: If a streaming callback is in use and `candidate_count` is greater than 1.
+        """
+        streaming_callback = select_streaming_callback(
+            self._streaming_callback, streaming_callback, requires_async=True
+        )
+
+        tools = tools or self._tools
+        _check_duplicate_tool_names(tools)
+        google_tools = self._convert_to_vertex_tools(tools) if tools else None
+
+        if messages[0].is_from(ChatRole.SYSTEM):
+            self._model._system_instruction = Part.from_text(messages[0].text)  # persists on self._model
+            messages = messages[1:]
+
+        google_messages = [_convert_chatmessage_to_google_content(m) for m in messages]
+
+        session = self._model.start_chat(history=google_messages[:-1])  # earlier messages become history
+
+        candidate_count = 1
+        if self._generation_config:
+            config_dict = self._generation_config_to_dict(self._generation_config)
+            candidate_count = config_dict.get("candidate_count", 1)
+
+        if streaming_callback and candidate_count > 1:
+            msg = "Streaming is not supported with multiple candidates. Set candidate_count to 1."
+            raise ValueError(msg)
+
+        res = await session.send_message_async(
+            content=google_messages[-1],  # the last message is the one sent
+            generation_config=self._generation_config,
+            safety_settings=self._safety_settings,
+            stream=streaming_callback is not None,
+            tools=google_tools,
+            tool_config=self._tool_config,
+        )
+
+        replies = (
+            await self._stream_response_and_convert_to_messages_async(res, streaming_callback)
+            if streaming_callback
+            else self._convert_response_to_messages(res)
+        )
+
+        return {"replies": replies}
+
     @staticmethod
     def _convert_response_to_messages(response_body: GenerationResponse) -> List[ChatMessage]:
         """
@@ -395,7 +458,7 @@ def _convert_response_to_messages(response_body: GenerationResponse) -> List[Cha
         return replies
 
     def _stream_response_and_convert_to_messages(
-        self, stream: Iterable[GenerationResponse], streaming_callback: Callable[[StreamingChunk], None]
+        self, stream: Iterable[GenerationResponse], streaming_callback: StreamingCallbackT
     ) -> List[ChatMessage]:
         """
         Streams the Google Vertex AI response and converts it to a list of `ChatMessage` instances.
@@ -446,3 +509,57 @@ def _stream_response_and_convert_to_messages(
         meta["usage"] = openai_usage
 
         return [ChatMessage.from_assistant(text=text or None, meta=meta, tool_calls=tool_calls)]
+
+    @staticmethod
+    async def _stream_response_and_convert_to_messages_async(
+        stream: AsyncIterable[GenerationResponse], streaming_callback: AsyncStreamingCallbackT
+    ) -> List[ChatMessage]:
+        """
+        Consumes the async Google Vertex AI stream and converts it to a list of `ChatMessage` instances.
+
+        :param stream: The async stream of response chunks from the Google Vertex AI request.
+        :param streaming_callback: The async handler awaited once per chunk with a `StreamingChunk`.
+        :returns: A one-item list holding the assistant `ChatMessage` built from the streamed text and tool calls.
+        """
+
+        text = ""
+        tool_calls = []
+        chunk_dict = {}
+
+        async for chunk in stream:
+            content_to_stream = ""
+            chunk_dict = chunk.to_dict()
+
+            # Only one candidate is supported with streaming
+            candidate = chunk_dict["candidates"][0]
+
+            for part in candidate["content"]["parts"]:
+                if new_text := part.get("text"):
+                    content_to_stream += new_text
+                    text += new_text
+                elif new_function_call := part.get("function_call"):
+                    content_to_stream += json.dumps(dict(new_function_call))
+                    tool_calls.append(
+                        ToolCall(
+                            tool_name=new_function_call["name"],
+                            arguments=new_function_call["args"],
+                        )
+                    )
+
+            await streaming_callback(StreamingChunk(content=content_to_stream, meta=chunk_dict))
+
+        # reuse the last chunk's dict as reply metadata ({} if the stream was empty); it is mutated below
+        meta = chunk_dict
+
+        # format the usage metadata to be compatible with OpenAI
+        usage_metadata = meta.pop("usage_metadata", {})
+
+        openai_usage = {
+            "prompt_tokens": usage_metadata.get("prompt_token_count", 0),
+            "completion_tokens": usage_metadata.get("candidates_token_count", 0),
+            "total_tokens": usage_metadata.get("total_token_count", 0),
+        }
+
+        meta["usage"] = openai_usage
+
+        return [ChatMessage.from_assistant(text=text or None, meta=meta, tool_calls=tool_calls)]
diff --git a/integrations/google_vertex/tests/chat/test_gemini.py b/integrations/google_vertex/tests/chat/test_gemini.py
@@ -1,6 +1,6 @@
 import json
 from typing import Annotated, Literal
-from unittest.mock import MagicMock, Mock, patch
+from unittest.mock import AsyncMock, MagicMock, Mock, patch
 
 import pytest
 from haystack import Pipeline
@@ -224,8 +224,7 @@ def test_from_dict(self, _mock_vertexai_init, _mock_generative_model):
         gemini = VertexAIGeminiChatGenerator.from_dict(
             {
                 "type": (
-                    "haystack_integrations.components.generators.google_vertex.chat.gemini."
-                    "VertexAIGeminiChatGenerator"
+                    "haystack_integrations.components.generators.google_vertex.chat.gemini.VertexAIGeminiChatGenerator"
                 ),
                 "init_parameters": {
                     "project_id": None,
@@ -253,8 +252,7 @@ def test_from_dict_with_param(self, _mock_vertexai_init, _mock_generative_model)
         gemini = VertexAIGeminiChatGenerator.from_dict(
             {
                 "type": (
-                    "haystack_integrations.components.generators.google_vertex.chat.gemini."
-                    "VertexAIGeminiChatGenerator"
+                    "haystack_integrations.components.generators.google_vertex.chat.gemini.VertexAIGeminiChatGenerator"
                 ),
                 "init_parameters": {
                     "project_id": "TestID123",
@@ -513,6 +511,159 @@ def streaming_callback(chunk: StreamingChunk) -> None:
         assert reply.tool_calls[1].arguments == {"city": "Munich"}
         assert reply.meta["usage"] == {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}
 
+    @patch("haystack_integrations.components.generators.google_vertex.chat.gemini.GenerativeModel")
+    @pytest.mark.asyncio
+    async def test_run_async(self, mock_generative_model):
+        mock_model = Mock()
+        mock_candidate = MagicMock(
+            content=Content(parts=[Part.from_text("This is a generated response.")], role="model")
+        )
+        mock_response = MagicMock(spec=GenerationResponse, candidates=[mock_candidate])
+
+        mock_model.send_message_async = AsyncMock(return_value=mock_response)  # awaited by run_async
+        mock_model.start_chat.return_value = mock_model  # the chat session is the model mock itself
+        mock_generative_model.return_value = mock_model  # patched GenerativeModel() returns the mock
+
+        messages = [
+            ChatMessage.from_system("You are a helpful assistant"),
+            ChatMessage.from_user("What's the capital of France?"),
+        ]
+        gemini = VertexAIGeminiChatGenerator()
+        response = await gemini.run_async(messages=messages)
+
+        mock_model.send_message_async.assert_called_once()
+        assert "replies" in response
+        reply = response["replies"][0]
+        assert reply.role == ChatRole.ASSISTANT
+        assert reply.text == "This is a generated response."
+
+    @patch("haystack_integrations.components.generators.google_vertex.chat.gemini.GenerativeModel")
+    @pytest.mark.asyncio
+    async def test_run_with_tools_async(self, mock_generative_model, tools):
+        mock_model = Mock()
+        mock_candidate = MagicMock(
+            content=Content(
+                parts=[
+                    Part.from_dict(
+                        {"function_call": {"name": "get_current_weather", "args": {"city": "Paris", "unit": "Celsius"}}}
+                    ),
+                ],
+                role="model",
+            )
+        )
+        mock_response = MagicMock(spec=GenerationResponse, candidates=[mock_candidate])
+
+        mock_model.send_message_async = AsyncMock(return_value=mock_response)  # awaited by run_async
+        mock_model.start_chat.return_value = mock_model  # the chat session is the model mock itself
+        mock_generative_model.return_value = mock_model  # patched GenerativeModel() returns the mock
+
+        messages = [
+            ChatMessage.from_user("What's the weather in Paris?"),
+        ]
+
+        gemini = VertexAIGeminiChatGenerator(tools=tools)
+        response = await gemini.run_async(messages=messages)
+
+        mock_model.send_message_async.assert_called_once()
+        call_kwargs = mock_model.send_message_async.call_args.kwargs
+        assert "tools" in call_kwargs  # tools must be forwarded to the async send call
+
+        assert "replies" in response
+        reply = response["replies"][0]
+        assert reply.role == ChatRole.ASSISTANT
+        assert not reply.texts
+        assert not reply.text
+        assert len(reply.tool_calls) == 1
+        assert reply.tool_calls[0].tool_name == "get_current_weather"
+        assert reply.tool_calls[0].arguments == {"city": "Paris", "unit": "Celsius"}
+
+    @patch("haystack_integrations.components.generators.google_vertex.chat.gemini.GenerativeModel")
+    @pytest.mark.asyncio
+    async def test_run_with_muliple_tools_and_streaming_async(self, mock_generative_model, tools):
+        """
+        Test that `run_async` handles tool calls that arrive in separate streaming chunks.
+        Note: this test case is made up because in practice I have always seen multiple function calls in a single
+        streaming chunk.
+        """
+
+        def population(city: Annotated[str, "the city for which to get the population, e.g. 'Munich'"] = "Munich"):
+            """A simple function to get the population for a location."""
+            return f"Population of {city}: 1,000,000"
+
+        multiple_tools = [tools[0], create_tool_from_function(population)]
+
+        mock_model = Mock()
+
+        mock_responses = [
+            MagicMock(
+                spec=GenerationResponse,
+                to_dict=lambda: {
+                    "candidates": [
+                        {
+                            "content": {
+                                "parts": [
+                                    {
+                                        "function_call": {
+                                            "name": "get_current_weather",
+                                            "args": {"city": "Munich", "unit": "Farenheit"},
+                                        }
+                                    }
+                                ]
+                            }
+                        }
+                    ]
+                },
+            ),
+            MagicMock(
+                spec=GenerationResponse,
+                to_dict=lambda: {
+                    "candidates": [
+                        {"content": {"parts": [{"function_call": {"name": "population", "args": {"city": "Munich"}}}]}}
+                    ],
+                    "usage_metadata": {"prompt_token_count": 10, "candidates_token_count": 5, "total_token_count": 15},
+                },
+            ),
+        ]
+
+        async def async_response_generator():
+            for response in mock_responses:
+                yield response
+
+        mock_model.send_message_async = AsyncMock(return_value=async_response_generator())  # async stream of chunks
+        mock_model.start_chat.return_value = mock_model  # the chat session is the model mock itself
+        mock_generative_model.return_value = mock_model  # patched GenerativeModel() returns the mock
+
+        received_chunks = []
+
+        async def async_streaming_callback(chunk: StreamingChunk) -> None:
+            received_chunks.append(chunk)
+
+        messages = [
+            ChatMessage.from_user("What's the weather in Munich (in Farenheit) and how many people live there?"),
+        ]
+
+        gemini = VertexAIGeminiChatGenerator(tools=multiple_tools, streaming_callback=async_streaming_callback)
+        response = await gemini.run_async(messages=messages)
+
+        assert len(received_chunks) == 2  # one callback invocation per streamed chunk
+        assert json.loads(received_chunks[0].content) == {
+            "name": "get_current_weather",
+            "args": {"city": "Munich", "unit": "Farenheit"},
+        }
+        assert json.loads(received_chunks[1].content) == {"name": "population", "args": {"city": "Munich"}}
+
+        assert "replies" in response
+        reply = response["replies"][0]
+        assert reply.role == ChatRole.ASSISTANT
+        assert not reply.texts
+        assert not reply.text
+        assert len(reply.tool_calls) == 2
+        assert reply.tool_calls[0].tool_name == "get_current_weather"
+        assert reply.tool_calls[0].arguments == {"city": "Munich", "unit": "Farenheit"}
+        assert reply.tool_calls[1].tool_name == "population"
+        assert reply.tool_calls[1].arguments == {"city": "Munich"}
+        assert reply.meta["usage"] == {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}
+
     def test_serde_in_pipeline(self):
         tool = Tool(name="name", description="description", parameters={"x": {"type": "string"}}, function=print)
 
@@ -538,8 +689,7 @@ def test_serde_in_pipeline(self):
             "components": {
                 "generator": {
                     "type": (
-                        "haystack_integrations.components.generators.google_vertex.chat.gemini."
-                        "VertexAIGeminiChatGenerator"
+                        "haystack_integrations.components.generators.google_vertex.chat.gemini.VertexAIGeminiChatGenerator"
                     ),
                     "init_parameters": {
                         "project_id": "TestID123",

Original file line number	Diff line number	Diff line change
`@@ -46,6 +46,7 @@ installer = "uv"`
`46`	`46`	`dependencies = [`
`47`	`47`	`"coverage[toml]>=6.5",`
`48`	`48`	`"pytest",`
	`49`	`+ "pytest-asyncio",`
`49`	`50`	`"pytest-rerunfailures",`
`50`	`51`	`"haystack-pydoc-tools",`
`51`	`52`	`]`