1- from typing import Any
2-
31import asyncio
2+ from typing import Any
43
5- from openai import AsyncStream , Stream
6- from openai .types .chat import ChatCompletion , ChatCompletionChunk
7- from openai .types .chat .chat_completion import Choice
8-
9- from haystack .components .generators .chat .openai import OpenAIChatGenerator
4+ from haystack import default_from_dict , default_to_dict
105from haystack .components .generators .chat .openai import (
6+ OpenAIChatGenerator ,
117 _check_finish_reason ,
128 _convert_chat_completion_chunk_to_streaming_chunk ,
9+ )
10+ from haystack .components .generators .chat .openai import (
1311 _convert_chat_completion_to_chat_message as _openai_convert_chat_completion_to_chat_message ,
1412)
1513from haystack .components .generators .utils import _convert_streaming_chunks_to_chat_message
16- from haystack import default_from_dict , default_to_dict
1714from haystack .core .component import component
18- from haystack .tools import deserialize_tools_or_toolset_inplace , serialize_tools_or_toolset
19- from haystack .utils import deserialize_callable , serialize_callable
2015from haystack .dataclasses import ChatMessage
2116from haystack .dataclasses .chat_message import ReasoningContent
2217from haystack .dataclasses .streaming_chunk import (
2722 SyncStreamingCallbackT ,
2823 select_streaming_callback ,
2924)
30- from haystack .tools import ToolsType
31- from haystack .utils import Secret
25+ from haystack .tools import ToolsType , deserialize_tools_or_toolset_inplace , serialize_tools_or_toolset
26+ from haystack .utils import Secret , deserialize_callable , serialize_callable
27+ from openai import AsyncStream , Stream
28+ from openai .types .chat import ChatCompletion , ChatCompletionChunk
29+ from openai .types .chat .chat_completion import Choice
3230
3331
3432def _convert_chat_completion_to_chat_message (completion : ChatCompletion , choice : Choice ) -> ChatMessage :
@@ -65,7 +63,7 @@ class VLLMChatGenerator(OpenAIChatGenerator):
6563 Before using this component, start a vLLM server:
6664
6765 ```bash
68- vllm serve Qwen/Qwen/ Qwen3-4B-Instruct-2507
66+ vllm serve Qwen/Qwen3-4B-Instruct-2507
6967 ```
7068
7169 For reasoning models, start the server with the appropriate reasoning parser:
@@ -74,6 +72,15 @@ class VLLMChatGenerator(OpenAIChatGenerator):
7472 vllm serve Qwen/Qwen3-0.6B --reasoning-parser qwen3
7573 ```
7674
75+ For tool calling, the server must be started with `--enable-auto-tool-choice` and `--tool-call-parser`:
76+
77+ ```bash
78+ vllm serve Qwen/Qwen3-0.6B --enable-auto-tool-choice --tool-call-parser hermes
79+ ```
80+
81+ The available tool call parsers depend on the model. See the
82+ [vLLM tool calling docs](https://docs.vllm.ai/en/stable/features/tool_calling/) for the full list.
83+
7784 For details on server options, see the [vLLM CLI docs](https://docs.vllm.ai/en/stable/cli/serve/).
7885
7986 ### Usage example
@@ -112,6 +119,27 @@ class VLLMChatGenerator(OpenAIChatGenerator):
112119 )
113120 ```
114121
122+ ### Usage example with tool calling
123+
124+ To use tool calling, start the vLLM server with `--enable-auto-tool-choice` and `--tool-call-parser`.
125+
126+ ```python
127+ from haystack.dataclasses import ChatMessage
128+ from haystack.tools import tool
129+ from haystack_integrations.components.generators.vllm import VLLMChatGenerator
130+
131+ @tool
132+ def weather(city: str) -> str:
133+ \" \" \" Get the weather in a given city.\" \" \"
134+ return f"The weather in {city} is sunny"
135+
136+ generator = VLLMChatGenerator(model="Qwen/Qwen3-0.6B", tools=[weather])
137+
138+ messages = [ChatMessage.from_user("What is the weather in Paris?")]
139+ response = generator.run(messages=messages)
140+ print(response["replies"][0].tool_calls)
141+ ```
142+
115143 ### Usage example with reasoning models
116144
117145 To use reasoning models, start the vLLM server with `--reasoning-parser`.
@@ -135,7 +163,7 @@ def __init__(
135163 self ,
136164 * ,
137165 model : str ,
138- api_key : Secret | None = Secret .from_env_var ("VLLM_API_KEY" , strict = False ), # noqa: B008
166+ api_key : Secret | None = Secret .from_env_var ("VLLM_API_KEY" , strict = False ),
139167 streaming_callback : StreamingCallbackT | None = None ,
140168 api_base_url : str = "http://localhost:8000/v1" ,
141169 generation_kwargs : dict [str , Any ] | None = None ,
@@ -198,7 +226,7 @@ def __init__(
198226 def to_dict (self ) -> dict [str , Any ]:
199227 """
200228 Serialize this component to a dictionary.
201-
229+
202230 :returns:
203231 The serialized component as a dictionary.
204232 """
@@ -220,7 +248,7 @@ def to_dict(self) -> dict[str, Any]:
220248 def from_dict (cls , data : dict [str , Any ]) -> "VLLMChatGenerator" :
221249 """
222250 Deserialize this component from a dictionary.
223-
251+
224252 :param data: The dictionary representation of this component.
225253 :returns:
226254 The deserialized component instance.
@@ -232,16 +260,14 @@ def from_dict(cls, data: dict[str, Any]) -> "VLLMChatGenerator":
232260 data ["init_parameters" ]["streaming_callback" ] = deserialize_callable (serialized_callback_handler )
233261 return default_from_dict (cls , data )
234262
235- def _handle_stream_response (
236- self , chat_completion : Stream , callback : SyncStreamingCallbackT
237- ) -> list [ChatMessage ]:
263+ def _handle_stream_response (self , chat_completion : Stream , callback : SyncStreamingCallbackT ) -> list [ChatMessage ]:
238264 """
239265 Handle a synchronous streaming response, extracting reasoning content from vLLM's reasoning chunks.
240266 """
241267 component_info = ComponentInfo .from_component (self )
242268 chunks : list [StreamingChunk ] = []
243269 for chunk in chat_completion :
244- assert len (chunk .choices ) <= 1
270+ assert len (chunk .choices ) <= 1 # noqa: S101
245271
246272 reasoning_text = None
247273 if chunk .choices :
@@ -254,8 +280,11 @@ def _handle_stream_response(
254280 index = 0 ,
255281 start = not any (c .reasoning for c in chunks ),
256282 component_info = component_info ,
257- meta = {"model" : chunk .model , "index" : chunk .choices [0 ].index ,
258- "finish_reason" : chunk .choices [0 ].finish_reason },
283+ meta = {
284+ "model" : chunk .model ,
285+ "index" : chunk .choices [0 ].index ,
286+ "finish_reason" : chunk .choices [0 ].finish_reason ,
287+ },
259288 )
260289 else :
261290 # delegate non-reasoning chunks to OpenAIChatGenerator converter
@@ -278,7 +307,7 @@ async def _handle_async_stream_response(
278307 chunks : list [StreamingChunk ] = []
279308 try :
280309 async for chunk in chat_completion :
281- assert len (chunk .choices ) <= 1
310+ assert len (chunk .choices ) <= 1 # noqa: S101
282311
283312 reasoning_text = None
284313 if chunk .choices :
@@ -291,8 +320,11 @@ async def _handle_async_stream_response(
291320 index = 0 ,
292321 start = not any (c .reasoning for c in chunks ),
293322 component_info = component_info ,
294- meta = {"model" : chunk .model , "index" : chunk .choices [0 ].index ,
295- "finish_reason" : chunk .choices [0 ].finish_reason },
323+ meta = {
324+ "model" : chunk .model ,
325+ "index" : chunk .choices [0 ].index ,
326+ "finish_reason" : chunk .choices [0 ].finish_reason ,
327+ },
296328 )
297329 else :
298330 # delegate non-reasoning chunks to OpenAIChatGenerator converter
@@ -309,7 +341,8 @@ async def _handle_async_stream_response(
309341 return [_convert_streaming_chunks_to_chat_message (chunks = chunks )]
310342
311343 @component .output_types (replies = list [ChatMessage ])
312- def run (
344+ # `tools_strict` is intentionally omitted from this method's signature: vLLM does not support it
345+ def run ( # type: ignore[override]
313346 self ,
314347 messages : list [ChatMessage ],
315348 streaming_callback : StreamingCallbackT | None = None ,
@@ -362,8 +395,7 @@ def run(
362395 openai_endpoint = api_args .pop ("openai_endpoint" )
363396 chat_completion = getattr (self .client .chat .completions , openai_endpoint )(** api_args )
364397 completions = [
365- _convert_chat_completion_to_chat_message (chat_completion , choice )
366- for choice in chat_completion .choices
398+ _convert_chat_completion_to_chat_message (chat_completion , choice ) for choice in chat_completion .choices
367399 ]
368400
369401 for message in completions :
@@ -372,7 +404,8 @@ def run(
372404 return {"replies" : completions }
373405
374406 @component .output_types (replies = list [ChatMessage ])
375- async def run_async (
407+ # `tools_strict` is intentionally omitted from this method's signature: vLLM does not support it
408+ async def run_async ( # type: ignore[override]
376409 self ,
377410 messages : list [ChatMessage ],
378411 streaming_callback : StreamingCallbackT | None = None ,
@@ -428,8 +461,7 @@ async def run_async(
428461 openai_endpoint = api_args .pop ("openai_endpoint" )
429462 chat_completion = await getattr (self .async_client .chat .completions , openai_endpoint )(** api_args )
430463 completions = [
431- _convert_chat_completion_to_chat_message (chat_completion , choice )
432- for choice in chat_completion .choices
464+ _convert_chat_completion_to_chat_message (chat_completion , choice ) for choice in chat_completion .choices
433465 ]
434466
435467 for message in completions :
0 commit comments