Skip to content

Commit 312f02a

Browse files
authored
refactor!: OllamaChatGenerator - refine reasoning support + refactoring (#2200)
* draft * test refactoring * small refinements * smaller embedding for tests * set start + more tests * fmt
1 parent 5e5846d commit 312f02a

6 files changed

Lines changed: 670 additions & 568 deletions

File tree

.github/workflows/ollama.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ env:
2424
FORCE_COLOR: "1"
2525
LLM_FOR_TESTS: "qwen3:0.6b"
2626
VISION_LLM_FOR_TESTS: "moondream:1.8b"
27-
EMBEDDER_FOR_TESTS: "nomic-embed-text"
27+
EMBEDDER_FOR_TESTS: "all-minilm"
2828

2929
jobs:
3030
run:

integrations/ollama/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ classifiers = [
2727
"Programming Language :: Python :: Implementation :: CPython",
2828
"Programming Language :: Python :: Implementation :: PyPy",
2929
]
30-
dependencies = ["haystack-ai>=2.16.1", "ollama>=0.4.0", "pydantic"]
30+
dependencies = ["haystack-ai>=2.17.1", "ollama>=0.4.0", "pydantic"]
3131

3232
[project.urls]
3333
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/ollama#readme"

integrations/ollama/src/haystack_integrations/components/generators/ollama/chat/chat_generator.py

Lines changed: 67 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ def _convert_chatmessage_to_ollama_format(message: ChatMessage) -> Dict[str, Any
5858
{"type": "function", "function": {"name": tool_call.tool_name, "arguments": tool_call.arguments}}
5959
for tool_call in tool_calls
6060
]
61+
if message.reasoning:
62+
ollama_msg["thinking"] = message.reasoning.reasoning_text
6163
return ollama_msg
6264

6365

@@ -131,81 +133,84 @@ def _convert_ollama_response_to_chatmessage(ollama_response: ChatResponse) -> Ch
131133
)
132134
)
133135

134-
chat_msg = ChatMessage.from_assistant(text=text, tool_calls=tool_calls)
136+
reasoning = ollama_message.get("thinking", None)
137+
138+
chat_msg = ChatMessage.from_assistant(text=text or None, tool_calls=tool_calls, reasoning=reasoning)
135139

136140
chat_msg._meta = _convert_ollama_meta_to_openai_format(response_dict)
137141

138-
thinking = ollama_message.get("thinking", None)
142+
return chat_msg
139143

140-
if thinking is not None:
141-
chat_msg._meta["thinking"] = thinking
142144

143-
return chat_msg
145+
def _build_chunk(
    chunk_response: ChatResponse, component_info: ComponentInfo, index: int, tool_call_index: int
) -> StreamingChunk:
    """
    Convert one Ollama stream-chunk to Haystack StreamingChunk.

    :param chunk_response:
        A single response object taken from the Ollama streaming iterator.
    :param component_info:
        Passed through unchanged to the resulting `StreamingChunk`.
    :param index:
        Passed through unchanged as the `index` of the resulting `StreamingChunk`.
    :param tool_call_index:
        Index assigned to every `ToolCallDelta` built from this chunk.
    :returns:
        A `StreamingChunk` holding the chunk text, its tool call deltas, the mapped finish reason and a `meta`
        dict with every top-level response field except `message`, plus the message `role` and `reasoning`.
    """
    chunk_response_dict = chunk_response.model_dump()
    message = chunk_response_dict["message"]

    meta = {key: value for key, value in chunk_response_dict.items() if key != "message"}
    meta["role"] = message["role"]
    # until a specific field in StreamingChunk is available, we store the thinking in the meta
    meta["reasoning"] = message.get("thinking", None)

    # NOTE(review): all tool calls found in the same chunk share `tool_call_index`.
    tool_calls_list = [
        ToolCallDelta(
            index=tool_call_index,
            tool_name=tool_call["function"]["name"],
            # empty/missing arguments are represented as an empty string rather than a JSON literal
            arguments=json.dumps(tool_call["function"]["arguments"])
            if tool_call["function"]["arguments"]
            else "",
        )
        for tool_call in message.get("tool_calls") or []
    ]

    return StreamingChunk(
        # Normalize a None content to "" so that chunk contents can always be concatenated as strings
        # (presumably `content` is optional in the Ollama message type - TODO confirm).
        content=message["content"] or "",
        meta=meta,
        index=index,
        finish_reason=FINISH_REASON_MAPPING.get(chunk_response.done_reason or ""),
        component_info=component_info,
        tool_calls=tool_calls_list,
    )
144183

145184

146185
@component
147186
class OllamaChatGenerator:
148187
"""
149-
Haystack generator for models served by Ollama (https://ollama.ai).
188+
Haystack Chat Generator for models served with Ollama (https://ollama.ai).
150189
151-
* Fully supports streaming.
152-
* Correctly passes tool-calls to Haystack when `stream=True`.
190+
Supports streaming, tool calls, reasoning, and structured outputs.
153191
154192
Usage example:
155193
```python
156-
from haystack.components.generators.utils import print_streaming_chunk
157-
from haystack.components.agents import Agent
158194
from haystack_integrations.components.generators.ollama.chat import OllamaChatGenerator
159195
from haystack.dataclasses import ChatMessage
160-
from haystack.tools import Tool
161-
162-
def echo(query: str) -> str:
163-
print(f"Tool executed with QUERY: {query}")
164-
return query
165196
166-
echo_tool = Tool(
167-
name="echo_tool",
168-
description="Echoes the query (demo tool).",
169-
function=echo,
170-
parameters={"query": {"type": "string", "description": "Search query"}},
171-
)
172-
agent = Agent(
173-
chat_generator=OllamaChatGenerator(model="mistral-small3.1:24b"),
174-
tools=[echo_tool],
175-
system_prompt=(
176-
"Use tool to print the query to test tools. Do not answer the question, just send the query to the tool"
177-
),
178-
max_agent_steps=5,
179-
raise_on_tool_invocation_failure=True,
180-
streaming_callback=print_streaming_chunk,
181-
)
182-
messages = [ChatMessage.from_user("This is stream test of tool usage")]
183-
result = agent.run(messages=messages)
184-
for message in result["messages"]:
185-
print("\n======")
186-
if message.role == "system":
187-
continue
188-
elif message.role == "tool":
189-
print(f"{message.role}:")
190-
print(f"Tool Results: {[tool.result for tool in message.tool_call_results]}")
191-
print(f"Used Tools: {[tool.origin.tool_name for tool in message.tool_call_results]}\n")
192-
else:
193-
print(f"{message.role}: {message.text}")
194-
print(f"Used Tools: {[tool.tool_name for tool in message.tool_calls]}\n")
197+
llm = OllamaChatGenerator(model="qwen3:0.6b")
198+
result = llm.run(messages=[ChatMessage.from_user("What is the capital of France?")])
199+
print(result)
195200
```
196201
"""
197202

198203
def __init__(
199204
self,
200-
model: str = "orca-mini",
205+
model: str = "qwen3:0.6b",
201206
url: str = "http://localhost:11434",
202207
generation_kwargs: Optional[Dict[str, Any]] = None,
203208
timeout: int = 120,
204209
keep_alive: Optional[Union[float, str]] = None,
205210
streaming_callback: Optional[Callable[[StreamingChunk], None]] = None,
206211
tools: Optional[Union[List[Tool], Toolset]] = None,
207212
response_format: Optional[Union[None, Literal["json"], JsonSchemaValue]] = None,
208-
think: bool = False,
213+
think: Union[bool, Literal["low", "medium", "high"]] = False,
209214
):
210215
"""
211216
:param model:
@@ -219,9 +224,11 @@ def __init__(
219224
:param timeout:
220225
The number of seconds before throwing a timeout error from the Ollama API.
221226
:param think:
222-
If True, the modell will "think" before producing a response.
227+
If True, the model will "think" before producing a response.
223228
Only [thinking models](https://ollama.com/search?c=thinking) support this feature.
224-
The intermediate "thinking" output can be found in the `meta` property of the returned `ChatMessage`.
229+
Some models like gpt-oss support different levels of thinking: "low", "medium", "high".
230+
The intermediate "thinking" output can be found by inspecting the `reasoning` property of the returned
231+
`ChatMessage`.
225232
:param keep_alive:
226233
The option that controls how long the model will stay loaded into memory following the request.
227234
If not set, Ollama's default value (5 minutes) is used.
@@ -295,42 +302,6 @@ def from_dict(cls, data: Dict[str, Any]) -> "OllamaChatGenerator":
295302
data["init_parameters"]["streaming_callback"] = deserialize_callable(callback_ser)
296303
return default_from_dict(cls, data)
297304

298-
@staticmethod
299-
def _build_chunk(
300-
chunk_response: ChatResponse, component_info: ComponentInfo, index: int, tool_call_index: int
301-
) -> StreamingChunk:
302-
"""
303-
Convert one Ollama stream-chunk to Haystack StreamingChunk.
304-
"""
305-
chunk_response_dict = chunk_response.model_dump()
306-
finish_reason = FINISH_REASON_MAPPING.get(chunk_response.done_reason or "")
307-
tool_calls_list = []
308-
309-
content = chunk_response_dict["message"]["content"]
310-
311-
meta = {key: value for key, value in chunk_response_dict.items() if key != "message"}
312-
meta["role"] = chunk_response_dict["message"]["role"]
313-
if tool_calls := chunk_response_dict["message"].get("tool_calls"):
314-
for tool_call in tool_calls:
315-
tool_calls_list.append(
316-
ToolCallDelta(
317-
index=tool_call_index,
318-
tool_name=tool_call["function"]["name"],
319-
arguments=json.dumps(tool_call["function"]["arguments"])
320-
if tool_call["function"]["arguments"]
321-
else "",
322-
)
323-
)
324-
325-
return StreamingChunk(
326-
content=content,
327-
meta=meta,
328-
index=index,
329-
finish_reason=finish_reason,
330-
component_info=component_info,
331-
tool_calls=tool_calls_list,
332-
)
333-
334305
def _handle_streaming_response(
335306
self,
336307
response_iter: Iterator[ChatResponse],
@@ -355,13 +326,15 @@ def _handle_streaming_response(
355326
for index, raw in enumerate(response_iter):
356327
if raw.message.tool_calls:
357328
tool_call_index += 1
358-
chunk = self._build_chunk(
329+
chunk = _build_chunk(
359330
chunk_response=raw, component_info=component_info, index=index, tool_call_index=tool_call_index
360331
)
361332
chunks.append(chunk)
362333

334+
start = index == 0 or bool(chunk.tool_calls)
335+
chunk.start = start
336+
363337
if chunk.tool_calls:
364-
chunk.start = True
365338
for tool_call in chunk.tool_calls:
366339
# the Ollama server doesn't guarantee an id field in every tool_calls entry.
367340
# OpenAI-compatible endpoint (/v1/chat/completions) - recent releases do add an auto-generated id
@@ -389,8 +362,13 @@ def _handle_streaming_response(
389362

390363
if callback:
391364
callback(chunk)
365+
392366
# Compose final reply
393-
text = "".join(c.content for c in chunks)
367+
text = ""
368+
reasoning = ""
369+
for c in chunks:
370+
text += c.content
371+
reasoning += c.meta.get("reasoning", None) or ""
394372

395373
tool_calls = []
396374
for tool_call_id in id_order:
@@ -400,8 +378,9 @@ def _handle_streaming_response(
400378
# We can't use _convert_streaming_chunks_to_chat_message because
401379
# we need to map tool_call name and args by order.
402380
reply = ChatMessage.from_assistant(
403-
text=text,
381+
text=text or None,
404382
tool_calls=tool_calls or None,
383+
reasoning=reasoning or None,
405384
meta=_convert_ollama_meta_to_openai_format(chunks[-1].meta) if chunks else {},
406385
)
407386

0 commit comments

Comments
 (0)