workflow + more tests

anakin87 · anakin87 · commit 64afd715ceca · 2026-04-03T10:16:24.000+02:00
diff --git a/.github/workflows/CI_coverage_comment.yml b/.github/workflows/CI_coverage_comment.yml
@@ -1,4 +1,4 @@
-name: Add comment about test coverage to PRs
+name: Core / Add comment about test coverage to PRs
 
 on:
   workflow_run:
diff --git a/.github/workflows/CI_workflows_linting.yml b/.github/workflows/CI_workflows_linting.yml
@@ -1,4 +1,4 @@
-name: Github workflows linter
+name: Core / Github workflows linter
 
 on:
   pull_request:
diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml
@@ -29,7 +29,9 @@ concurrency:
 env:
   PYTHONUNBUFFERED: "1"
   FORCE_COLOR: "1"
-  TEST_MATRIX_OS: '["ubuntu-latest", "windows-latest", "macos-latest"]'
+  VLLM_MODEL: "Qwen/Qwen3-0.6B"
+  # we only test on Ubuntu to keep vLLM server running simple
+  TEST_MATRIX_OS: '["ubuntu-latest"]'
   TEST_MATRIX_PYTHON: '["3.10", "3.14"]'
 
 jobs:
@@ -44,8 +46,8 @@ jobs:
     steps:
       - id: set
         run: |
-          echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> $GITHUB_OUTPUT
-          echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> $GITHUB_OUTPUT
+          echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> "$GITHUB_OUTPUT"
+          echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> "$GITHUB_OUTPUT"
 
   run:
     name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
@@ -61,11 +63,6 @@ jobs:
         python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }}
 
     steps:
-      - name: Support longpaths
-        if: matrix.os == 'windows-latest'
-        working-directory: .
-        run: git config --system core.longpaths true
-
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
 
       - name: Set up Python ${{ matrix.python-version }}
@@ -74,7 +71,37 @@ jobs:
           python-version: ${{ matrix.python-version }}
 
       - name: Install Hatch
-        run: pip install --upgrade hatch
+        run: pip install hatch
+
+      - name: Install vLLM and start server
+        run: |
+          pip install vllm
+
+          nohup vllm serve ${{ env.VLLM_MODEL }} \
+            --reasoning-parser qwen3 \
+            --max-model-len 1024 \
+            --enforce-eager \
+            --dtype half \
+            --enable-auto-tool-choice \
+            --tool-call-parser hermes \
+            > vllm.log 2>&1 &
+
+          # Wait up to 120 s for /health; decrement by assignment because `((...))` returns 1 at zero and would trip `bash -e`
+          timeout=120
+          while [ "$timeout" -gt 0 ] && ! curl -sSf http://localhost:8000/health > /dev/null 2>&1; do
+            echo "Waiting for vLLM server to start..."
+            sleep 5
+            timeout=$((timeout - 5))
+          done
+
+          if [ "$timeout" -le 0 ]; then
+            echo "Timed out waiting for vLLM server to start."
+            cat vllm.log
+            exit 1
+          fi
+
+          echo "vLLM server started successfully."
+
       - name: Lint
         if: matrix.python-version == '3.10' && runner.os == 'Linux'
         run: hatch run fmt-check && hatch run test:types
@@ -122,7 +149,6 @@ jobs:
           hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main
           hatch run test:unit
 
-
   notify-slack-on-failure:
     needs: run
     if: failure() && github.event_name == 'schedule'
diff --git a/integrations/vllm/src/haystack_integrations/components/generators/vllm/chat/chat_generator.py b/integrations/vllm/src/haystack_integrations/components/generators/vllm/chat/chat_generator.py
@@ -1,22 +1,17 @@
-from typing import Any
-
 import asyncio
+from typing import Any
 
-from openai import AsyncStream, Stream
-from openai.types.chat import ChatCompletion, ChatCompletionChunk
-from openai.types.chat.chat_completion import Choice
-
-from haystack.components.generators.chat.openai import OpenAIChatGenerator
+from haystack import default_from_dict, default_to_dict
 from haystack.components.generators.chat.openai import (
+    OpenAIChatGenerator,
     _check_finish_reason,
     _convert_chat_completion_chunk_to_streaming_chunk,
+)
+from haystack.components.generators.chat.openai import (
     _convert_chat_completion_to_chat_message as _openai_convert_chat_completion_to_chat_message,
 )
 from haystack.components.generators.utils import _convert_streaming_chunks_to_chat_message
-from haystack import default_from_dict, default_to_dict
 from haystack.core.component import component
-from haystack.tools import deserialize_tools_or_toolset_inplace, serialize_tools_or_toolset
-from haystack.utils import deserialize_callable, serialize_callable
 from haystack.dataclasses import ChatMessage
 from haystack.dataclasses.chat_message import ReasoningContent
 from haystack.dataclasses.streaming_chunk import (
@@ -27,8 +22,11 @@
     SyncStreamingCallbackT,
     select_streaming_callback,
 )
-from haystack.tools import ToolsType
-from haystack.utils import Secret
+from haystack.tools import ToolsType, deserialize_tools_or_toolset_inplace, serialize_tools_or_toolset
+from haystack.utils import Secret, deserialize_callable, serialize_callable
+from openai import AsyncStream, Stream
+from openai.types.chat import ChatCompletion, ChatCompletionChunk
+from openai.types.chat.chat_completion import Choice
 
 
 def _convert_chat_completion_to_chat_message(completion: ChatCompletion, choice: Choice) -> ChatMessage:
@@ -65,7 +63,7 @@ class VLLMChatGenerator(OpenAIChatGenerator):
     Before using this component, start a vLLM server:
 
     ```bash
-    vllm serve Qwen/Qwen/Qwen3-4B-Instruct-2507
+    vllm serve Qwen/Qwen3-4B-Instruct-2507
     ```
 
     For reasoning models, start the server with the appropriate reasoning parser:
@@ -74,6 +72,15 @@ class VLLMChatGenerator(OpenAIChatGenerator):
     vllm serve Qwen/Qwen3-0.6B --reasoning-parser qwen3
     ```
 
+    For tool calling, the server must be started with `--enable-auto-tool-choice` and `--tool-call-parser`:
+
+    ```bash
+    vllm serve Qwen/Qwen3-0.6B --enable-auto-tool-choice --tool-call-parser hermes
+    ```
+
+    The available tool call parsers depend on the model. See the
+    [vLLM tool calling docs](https://docs.vllm.ai/en/stable/features/tool_calling/) for the full list.
+
     For details on server options, see the [vLLM CLI docs](https://docs.vllm.ai/en/stable/cli/serve/).
 
     ### Usage example
@@ -112,6 +119,27 @@ class VLLMChatGenerator(OpenAIChatGenerator):
     )
     ```
 
+    ### Usage example with tool calling
+
+    To use tool calling, start the vLLM server with `--enable-auto-tool-choice` and `--tool-call-parser`.
+
+    ```python
+    from haystack.dataclasses import ChatMessage
+    from haystack.tools import tool
+    from haystack_integrations.components.generators.vllm import VLLMChatGenerator
+
+    @tool
+    def weather(city: str) -> str:
+        \"\"\"Get the weather in a given city.\"\"\"
+        return f"The weather in {city} is sunny"
+
+    generator = VLLMChatGenerator(model="Qwen/Qwen3-0.6B", tools=[weather])
+
+    messages = [ChatMessage.from_user("What is the weather in Paris?")]
+    response = generator.run(messages=messages)
+    print(response["replies"][0].tool_calls)
+    ```
+
     ### Usage example with reasoning models
 
     To use reasoning models, start the vLLM server with `--reasoning-parser`.
@@ -135,7 +163,7 @@ def __init__(
         self,
         *,
         model: str,
-        api_key: Secret | None = Secret.from_env_var("VLLM_API_KEY", strict=False),  # noqa: B008
+        api_key: Secret | None = Secret.from_env_var("VLLM_API_KEY", strict=False),
         streaming_callback: StreamingCallbackT | None = None,
         api_base_url: str = "http://localhost:8000/v1",
         generation_kwargs: dict[str, Any] | None = None,
@@ -198,7 +226,7 @@ def __init__(
     def to_dict(self) -> dict[str, Any]:
         """
         Serialize this component to a dictionary.
-        
+
         :returns:
             The serialized component as a dictionary.
         """
@@ -220,7 +248,7 @@ def to_dict(self) -> dict[str, Any]:
     def from_dict(cls, data: dict[str, Any]) -> "VLLMChatGenerator":
         """
         Deserialize this component from a dictionary.
-        
+
         :param data: The dictionary representation of this component.
         :returns:
             The deserialized component instance.
@@ -232,16 +260,14 @@ def from_dict(cls, data: dict[str, Any]) -> "VLLMChatGenerator":
             data["init_parameters"]["streaming_callback"] = deserialize_callable(serialized_callback_handler)
         return default_from_dict(cls, data)
 
-    def _handle_stream_response(
-        self, chat_completion: Stream, callback: SyncStreamingCallbackT
-    ) -> list[ChatMessage]:
+    def _handle_stream_response(self, chat_completion: Stream, callback: SyncStreamingCallbackT) -> list[ChatMessage]:
         """
         Handle a synchronous streaming response, extracting reasoning content from vLLM's reasoning chunks.
         """
         component_info = ComponentInfo.from_component(self)
         chunks: list[StreamingChunk] = []
         for chunk in chat_completion:
-            assert len(chunk.choices) <= 1
+            assert len(chunk.choices) <= 1  # noqa: S101
 
             reasoning_text = None
             if chunk.choices:
@@ -254,8 +280,11 @@ def _handle_stream_response(
                     index=0,
                     start=not any(c.reasoning for c in chunks),
                     component_info=component_info,
-                    meta={"model": chunk.model, "index": chunk.choices[0].index,
-                          "finish_reason": chunk.choices[0].finish_reason},
+                    meta={
+                        "model": chunk.model,
+                        "index": chunk.choices[0].index,
+                        "finish_reason": chunk.choices[0].finish_reason,
+                    },
                 )
             else:
                 # delegate non-reasoning chunks to OpenAIChatGenerator converter
@@ -278,7 +307,7 @@ async def _handle_async_stream_response(
         chunks: list[StreamingChunk] = []
         try:
             async for chunk in chat_completion:
-                assert len(chunk.choices) <= 1
+                assert len(chunk.choices) <= 1  # noqa: S101
 
                 reasoning_text = None
                 if chunk.choices:
@@ -291,8 +320,11 @@ async def _handle_async_stream_response(
                         index=0,
                         start=not any(c.reasoning for c in chunks),
                         component_info=component_info,
-                        meta={"model": chunk.model, "index": chunk.choices[0].index,
-                              "finish_reason": chunk.choices[0].finish_reason},
+                        meta={
+                            "model": chunk.model,
+                            "index": chunk.choices[0].index,
+                            "finish_reason": chunk.choices[0].finish_reason,
+                        },
                     )
                 else:
                     # delegate non-reasoning chunks to OpenAIChatGenerator converter
@@ -309,7 +341,8 @@ async def _handle_async_stream_response(
         return [_convert_streaming_chunks_to_chat_message(chunks=chunks)]
 
     @component.output_types(replies=list[ChatMessage])
-    def run(
+    # tools_strict is intentionally omitted: vLLM does not support it
+    def run(  # type: ignore[override]
         self,
         messages: list[ChatMessage],
         streaming_callback: StreamingCallbackT | None = None,
@@ -362,8 +395,7 @@ def run(
         openai_endpoint = api_args.pop("openai_endpoint")
         chat_completion = getattr(self.client.chat.completions, openai_endpoint)(**api_args)
         completions = [
-            _convert_chat_completion_to_chat_message(chat_completion, choice)
-            for choice in chat_completion.choices
+            _convert_chat_completion_to_chat_message(chat_completion, choice) for choice in chat_completion.choices
         ]
 
         for message in completions:
@@ -372,7 +404,8 @@ def run(
         return {"replies": completions}
 
     @component.output_types(replies=list[ChatMessage])
-    async def run_async(
+    # tools_strict is intentionally omitted: vLLM does not support it
+    async def run_async(  # type: ignore[override]
         self,
         messages: list[ChatMessage],
         streaming_callback: StreamingCallbackT | None = None,
@@ -428,8 +461,7 @@ async def run_async(
         openai_endpoint = api_args.pop("openai_endpoint")
         chat_completion = await getattr(self.async_client.chat.completions, openai_endpoint)(**api_args)
         completions = [
-            _convert_chat_completion_to_chat_message(chat_completion, choice)
-            for choice in chat_completion.choices
+            _convert_chat_completion_to_chat_message(chat_completion, choice) for choice in chat_completion.choices
         ]
 
         for message in completions:
diff --git a/integrations/vllm/tests/test_chat_generator.py b/integrations/vllm/tests/test_chat_generator.py

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-name: Add comment about test coverage to PRs`
	`1`	`+name: Core / Add comment about test coverage to PRs`
`2`	`2`
`3`	`3`	`on:`
`4`	`4`	`workflow_run:`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-name: Github workflows linter`
	`1`	`+name: Core / Github workflows linter`
`2`	`2`
`3`	`3`	`on:`
`4`	`4`	`pull_request:`