Skip to content

Commit 64afd71

Browse files
committed
workflow + more tests
1 parent d07592a commit 64afd71

5 files changed

Lines changed: 180 additions & 91 deletions

File tree

.github/workflows/CI_coverage_comment.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Add comment about test coverage to PRs
1+
name: Core / Add comment about test coverage to PRs
22

33
on:
44
workflow_run:

.github/workflows/CI_workflows_linting.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Github workflows linter
1+
name: Core / Github workflows linter
22

33
on:
44
pull_request:

.github/workflows/vllm.yml

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,9 @@ concurrency:
2929
env:
3030
PYTHONUNBUFFERED: "1"
3131
FORCE_COLOR: "1"
32-
TEST_MATRIX_OS: '["ubuntu-latest", "windows-latest", "macos-latest"]'
32+
VLLM_MODEL: "Qwen/Qwen3-0.6B"
33+
# We only test on Ubuntu to keep running the vLLM server simple
34+
TEST_MATRIX_OS: '["ubuntu-latest"]'
3335
TEST_MATRIX_PYTHON: '["3.10", "3.14"]'
3436

3537
jobs:
@@ -44,8 +46,8 @@ jobs:
4446
steps:
4547
- id: set
4648
run: |
47-
echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> $GITHUB_OUTPUT
48-
echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> $GITHUB_OUTPUT
49+
echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> "$GITHUB_OUTPUT"
50+
echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> "$GITHUB_OUTPUT"
4951
5052
run:
5153
name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
@@ -61,11 +63,6 @@ jobs:
6163
python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }}
6264

6365
steps:
64-
- name: Support longpaths
65-
if: matrix.os == 'windows-latest'
66-
working-directory: .
67-
run: git config --system core.longpaths true
68-
6966
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
7067

7168
- name: Set up Python ${{ matrix.python-version }}
@@ -74,7 +71,37 @@ jobs:
7471
python-version: ${{ matrix.python-version }}
7572

7673
- name: Install Hatch
77-
run: pip install --upgrade hatch
74+
run: pip install hatch
75+
76+
- name: Install vLLM and start server
77+
run: |
78+
pip install vllm
79+
80+
nohup vllm serve ${{ env.VLLM_MODEL }} \
81+
--reasoning-parser qwen3 \
82+
--max-model-len 1024 \
83+
--enforce-eager \
84+
--dtype half \
85+
--enable-auto-tool-choice \
86+
--tool-call-parser hermes \
87+
> vllm.log 2>&1 &
88+
89+
# Wait for the vLLM server to be ready with a timeout of 120 seconds
90+
timeout=120
91+
while [ $timeout -gt 0 ] && ! curl -sSf http://localhost:8000/health > /dev/null 2>&1; do
92+
echo "Waiting for vLLM server to start..."
93+
sleep 5
94+
timeout=$((timeout - 5))  # plain assignment: ((timeout-=5)) exits with status 1 at zero, which aborts the step under bash -e before vllm.log is printed
95+
done
96+
97+
if [ $timeout -eq 0 ]; then
98+
echo "Timed out waiting for vLLM server to start."
99+
cat vllm.log
100+
exit 1
101+
fi
102+
103+
echo "vLLM server started successfully."
104+
78105
- name: Lint
79106
if: matrix.python-version == '3.10' && runner.os == 'Linux'
80107
run: hatch run fmt-check && hatch run test:types
@@ -122,7 +149,6 @@ jobs:
122149
hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main
123150
hatch run test:unit
124151
125-
126152
notify-slack-on-failure:
127153
needs: run
128154
if: failure() && github.event_name == 'schedule'

integrations/vllm/src/haystack_integrations/components/generators/vllm/chat/chat_generator.py

Lines changed: 63 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,17 @@
1-
from typing import Any
2-
31
import asyncio
2+
from typing import Any
43

5-
from openai import AsyncStream, Stream
6-
from openai.types.chat import ChatCompletion, ChatCompletionChunk
7-
from openai.types.chat.chat_completion import Choice
8-
9-
from haystack.components.generators.chat.openai import OpenAIChatGenerator
4+
from haystack import default_from_dict, default_to_dict
105
from haystack.components.generators.chat.openai import (
6+
OpenAIChatGenerator,
117
_check_finish_reason,
128
_convert_chat_completion_chunk_to_streaming_chunk,
9+
)
10+
from haystack.components.generators.chat.openai import (
1311
_convert_chat_completion_to_chat_message as _openai_convert_chat_completion_to_chat_message,
1412
)
1513
from haystack.components.generators.utils import _convert_streaming_chunks_to_chat_message
16-
from haystack import default_from_dict, default_to_dict
1714
from haystack.core.component import component
18-
from haystack.tools import deserialize_tools_or_toolset_inplace, serialize_tools_or_toolset
19-
from haystack.utils import deserialize_callable, serialize_callable
2015
from haystack.dataclasses import ChatMessage
2116
from haystack.dataclasses.chat_message import ReasoningContent
2217
from haystack.dataclasses.streaming_chunk import (
@@ -27,8 +22,11 @@
2722
SyncStreamingCallbackT,
2823
select_streaming_callback,
2924
)
30-
from haystack.tools import ToolsType
31-
from haystack.utils import Secret
25+
from haystack.tools import ToolsType, deserialize_tools_or_toolset_inplace, serialize_tools_or_toolset
26+
from haystack.utils import Secret, deserialize_callable, serialize_callable
27+
from openai import AsyncStream, Stream
28+
from openai.types.chat import ChatCompletion, ChatCompletionChunk
29+
from openai.types.chat.chat_completion import Choice
3230

3331

3432
def _convert_chat_completion_to_chat_message(completion: ChatCompletion, choice: Choice) -> ChatMessage:
@@ -65,7 +63,7 @@ class VLLMChatGenerator(OpenAIChatGenerator):
6563
Before using this component, start a vLLM server:
6664
6765
```bash
68-
vllm serve Qwen/Qwen/Qwen3-4B-Instruct-2507
66+
vllm serve Qwen/Qwen3-4B-Instruct-2507
6967
```
7068
7169
For reasoning models, start the server with the appropriate reasoning parser:
@@ -74,6 +72,15 @@ class VLLMChatGenerator(OpenAIChatGenerator):
7472
vllm serve Qwen/Qwen3-0.6B --reasoning-parser qwen3
7573
```
7674
75+
For tool calling, the server must be started with `--enable-auto-tool-choice` and `--tool-call-parser`:
76+
77+
```bash
78+
vllm serve Qwen/Qwen3-0.6B --enable-auto-tool-choice --tool-call-parser hermes
79+
```
80+
81+
The available tool call parsers depend on the model. See the
82+
[vLLM tool calling docs](https://docs.vllm.ai/en/stable/features/tool_calling/) for the full list.
83+
7784
For details on server options, see the [vLLM CLI docs](https://docs.vllm.ai/en/stable/cli/serve/).
7885
7986
### Usage example
@@ -112,6 +119,27 @@ class VLLMChatGenerator(OpenAIChatGenerator):
112119
)
113120
```
114121
122+
### Usage example with tool calling
123+
124+
To use tool calling, start the vLLM server with `--enable-auto-tool-choice` and `--tool-call-parser`.
125+
126+
```python
127+
from haystack.dataclasses import ChatMessage
128+
from haystack.tools import tool
129+
from haystack_integrations.components.generators.vllm import VLLMChatGenerator
130+
131+
@tool
132+
def weather(city: str) -> str:
133+
\"\"\"Get the weather in a given city.\"\"\"
134+
return f"The weather in {city} is sunny"
135+
136+
generator = VLLMChatGenerator(model="Qwen/Qwen3-0.6B", tools=[weather])
137+
138+
messages = [ChatMessage.from_user("What is the weather in Paris?")]
139+
response = generator.run(messages=messages)
140+
print(response["replies"][0].tool_calls)
141+
```
142+
115143
### Usage example with reasoning models
116144
117145
To use reasoning models, start the vLLM server with `--reasoning-parser`.
@@ -135,7 +163,7 @@ def __init__(
135163
self,
136164
*,
137165
model: str,
138-
api_key: Secret | None = Secret.from_env_var("VLLM_API_KEY", strict=False), # noqa: B008
166+
api_key: Secret | None = Secret.from_env_var("VLLM_API_KEY", strict=False),
139167
streaming_callback: StreamingCallbackT | None = None,
140168
api_base_url: str = "http://localhost:8000/v1",
141169
generation_kwargs: dict[str, Any] | None = None,
@@ -198,7 +226,7 @@ def __init__(
198226
def to_dict(self) -> dict[str, Any]:
199227
"""
200228
Serialize this component to a dictionary.
201-
229+
202230
:returns:
203231
The serialized component as a dictionary.
204232
"""
@@ -220,7 +248,7 @@ def to_dict(self) -> dict[str, Any]:
220248
def from_dict(cls, data: dict[str, Any]) -> "VLLMChatGenerator":
221249
"""
222250
Deserialize this component from a dictionary.
223-
251+
224252
:param data: The dictionary representation of this component.
225253
:returns:
226254
The deserialized component instance.
@@ -232,16 +260,14 @@ def from_dict(cls, data: dict[str, Any]) -> "VLLMChatGenerator":
232260
data["init_parameters"]["streaming_callback"] = deserialize_callable(serialized_callback_handler)
233261
return default_from_dict(cls, data)
234262

235-
def _handle_stream_response(
236-
self, chat_completion: Stream, callback: SyncStreamingCallbackT
237-
) -> list[ChatMessage]:
263+
def _handle_stream_response(self, chat_completion: Stream, callback: SyncStreamingCallbackT) -> list[ChatMessage]:
238264
"""
239265
Handle a synchronous streaming response, extracting reasoning content from vLLM's reasoning chunks.
240266
"""
241267
component_info = ComponentInfo.from_component(self)
242268
chunks: list[StreamingChunk] = []
243269
for chunk in chat_completion:
244-
assert len(chunk.choices) <= 1
270+
assert len(chunk.choices) <= 1 # noqa: S101
245271

246272
reasoning_text = None
247273
if chunk.choices:
@@ -254,8 +280,11 @@ def _handle_stream_response(
254280
index=0,
255281
start=not any(c.reasoning for c in chunks),
256282
component_info=component_info,
257-
meta={"model": chunk.model, "index": chunk.choices[0].index,
258-
"finish_reason": chunk.choices[0].finish_reason},
283+
meta={
284+
"model": chunk.model,
285+
"index": chunk.choices[0].index,
286+
"finish_reason": chunk.choices[0].finish_reason,
287+
},
259288
)
260289
else:
261290
# delegate non-reasoning chunks to OpenAIChatGenerator converter
@@ -278,7 +307,7 @@ async def _handle_async_stream_response(
278307
chunks: list[StreamingChunk] = []
279308
try:
280309
async for chunk in chat_completion:
281-
assert len(chunk.choices) <= 1
310+
assert len(chunk.choices) <= 1 # noqa: S101
282311

283312
reasoning_text = None
284313
if chunk.choices:
@@ -291,8 +320,11 @@ async def _handle_async_stream_response(
291320
index=0,
292321
start=not any(c.reasoning for c in chunks),
293322
component_info=component_info,
294-
meta={"model": chunk.model, "index": chunk.choices[0].index,
295-
"finish_reason": chunk.choices[0].finish_reason},
323+
meta={
324+
"model": chunk.model,
325+
"index": chunk.choices[0].index,
326+
"finish_reason": chunk.choices[0].finish_reason,
327+
},
296328
)
297329
else:
298330
# delegate non-reasoning chunks to OpenAIChatGenerator converter
@@ -309,7 +341,8 @@ async def _handle_async_stream_response(
309341
return [_convert_streaming_chunks_to_chat_message(chunks=chunks)]
310342

311343
@component.output_types(replies=list[ChatMessage])
312-
def run(
344+
# tools_strict is intentionally omitted: vLLM does not support it
345+
def run( # type: ignore[override]
313346
self,
314347
messages: list[ChatMessage],
315348
streaming_callback: StreamingCallbackT | None = None,
@@ -362,8 +395,7 @@ def run(
362395
openai_endpoint = api_args.pop("openai_endpoint")
363396
chat_completion = getattr(self.client.chat.completions, openai_endpoint)(**api_args)
364397
completions = [
365-
_convert_chat_completion_to_chat_message(chat_completion, choice)
366-
for choice in chat_completion.choices
398+
_convert_chat_completion_to_chat_message(chat_completion, choice) for choice in chat_completion.choices
367399
]
368400

369401
for message in completions:
@@ -372,7 +404,8 @@ def run(
372404
return {"replies": completions}
373405

374406
@component.output_types(replies=list[ChatMessage])
375-
async def run_async(
407+
# tools_strict is intentionally omitted: vLLM does not support it
408+
async def run_async( # type: ignore[override]
376409
self,
377410
messages: list[ChatMessage],
378411
streaming_callback: StreamingCallbackT | None = None,
@@ -428,8 +461,7 @@ async def run_async(
428461
openai_endpoint = api_args.pop("openai_endpoint")
429462
chat_completion = await getattr(self.async_client.chat.completions, openai_endpoint)(**api_args)
430463
completions = [
431-
_convert_chat_completion_to_chat_message(chat_completion, choice)
432-
for choice in chat_completion.choices
464+
_convert_chat_completion_to_chat_message(chat_completion, choice) for choice in chat_completion.choices
433465
]
434466

435467
for message in completions:

0 commit comments

Comments
 (0)