@@ -58,6 +58,8 @@ def _convert_chatmessage_to_ollama_format(message: ChatMessage) -> Dict[str, Any
5858 {"type" : "function" , "function" : {"name" : tool_call .tool_name , "arguments" : tool_call .arguments }}
5959 for tool_call in tool_calls
6060 ]
61+ if message .reasoning :
62+ ollama_msg ["thinking" ] = message .reasoning .reasoning_text
6163 return ollama_msg
6264
6365
@@ -131,81 +133,84 @@ def _convert_ollama_response_to_chatmessage(ollama_response: ChatResponse) -> Ch
131133 )
132134 )
133135
134- chat_msg = ChatMessage .from_assistant (text = text , tool_calls = tool_calls )
136+ reasoning = ollama_message .get ("thinking" , None )
137+
138+ chat_msg = ChatMessage .from_assistant (text = text or None , tool_calls = tool_calls , reasoning = reasoning )
135139
136140 chat_msg ._meta = _convert_ollama_meta_to_openai_format (response_dict )
137141
138- thinking = ollama_message . get ( "thinking" , None )
142+ return chat_msg
139143
140- if thinking is not None :
141- chat_msg ._meta ["thinking" ] = thinking
142144
143- return chat_msg
def _build_chunk(
    chunk_response: ChatResponse, component_info: ComponentInfo, index: int, tool_call_index: int
) -> StreamingChunk:
    """
    Convert one Ollama stream-chunk to Haystack StreamingChunk.

    :param chunk_response:
        A single response object produced while iterating the Ollama streaming response.
    :param component_info:
        Forwarded unchanged to the resulting `StreamingChunk`.
    :param index:
        Forwarded unchanged as the `index` of the resulting `StreamingChunk`.
    :param tool_call_index:
        Index assigned to every `ToolCallDelta` built from this chunk.
    :returns:
        A `StreamingChunk` holding the message content, any tool call deltas, the mapped finish reason
        and a `meta` dict with all non-message response fields plus `role` and `reasoning`.
    """
    chunk_response_dict = chunk_response.model_dump()
    finish_reason = FINISH_REASON_MAPPING.get(chunk_response.done_reason or "")
    message = chunk_response_dict["message"]

    # The message content may be absent/None (e.g. chunks that only carry tool calls or thinking).
    # Normalize it to an empty string so that callers concatenating chunk contents never hit a TypeError.
    content = message.get("content") or ""

    # Everything except the message itself is kept as metadata.
    meta = {key: value for key, value in chunk_response_dict.items() if key != "message"}
    meta["role"] = message["role"]

    # until a specific field in StreamingChunk is available, we store the thinking in the meta
    meta["reasoning"] = message.get("thinking", None)

    tool_calls_list = [
        ToolCallDelta(
            index=tool_call_index,
            tool_name=tool_call["function"]["name"],
            # Empty/missing arguments are represented as an empty string rather than "{}" or "null".
            arguments=json.dumps(tool_call["function"]["arguments"]) if tool_call["function"]["arguments"] else "",
        )
        for tool_call in message.get("tool_calls") or []
    ]

    return StreamingChunk(
        content=content,
        meta=meta,
        index=index,
        finish_reason=finish_reason,
        component_info=component_info,
        tool_calls=tool_calls_list,
    )
144183
145184
146185@component
147186class OllamaChatGenerator :
148187 """
149- Haystack generator for models served by Ollama (https://ollama.ai).
188+ Haystack Chat Generator for models served with Ollama (https://ollama.ai).
150189
151- * Fully supports streaming.
152- * Correctly passes tool-calls to Haystack when `stream=True`.
190+ Supports streaming, tool calls, reasoning, and structured outputs.
153191
154192 Usage example:
155193 ```python
156- from haystack.components.generators.utils import print_streaming_chunk
157- from haystack.components.agents import Agent
158194 from haystack_integrations.components.generators.ollama.chat import OllamaChatGenerator
159195 from haystack.dataclasses import ChatMessage
160- from haystack.tools import Tool
161-
162- def echo(query: str) -> str:
163- print(f"Tool executed with QUERY: {query}")
164- return query
165196
166- echo_tool = Tool(
167- name="echo_tool",
168- description="Echoes the query (demo tool).",
169- function=echo,
170- parameters={"query": {"type": "string", "description": "Search query"}},
171- )
172- agent = Agent(
173- chat_generator=OllamaChatGenerator(model="mistral-small3.1:24b"),
174- tools=[echo_tool],
175- system_prompt=(
176- "Use tool to print the query to test tools. Do not answer the question, just send the query to the tool"
177- ),
178- max_agent_steps=5,
179- raise_on_tool_invocation_failure=True,
180- streaming_callback=print_streaming_chunk,
181- )
182- messages = [ChatMessage.from_user("This is stream test of tool usage")]
183- result = agent.run(messages=messages)
184- for message in result["messages"]:
185- print("\n ======")
186- if message.role == "system":
187- continue
188- elif message.role == "tool":
189- print(f"{message.role}:")
190- print(f"Tool Results: {[tool.result for tool in message.tool_call_results]}")
191- print(f"Used Tools: {[tool.origin.tool_name for tool in message.tool_call_results]}\n ")
192- else:
193- print(f"{message.role}: {message.text}")
194- print(f"Used Tools: {[tool.tool_name for tool in message.tool_calls]}\n ")
197+ llm = OllamaChatGenerator(model="qwen3:0.6b")
198+ result = llm.run(messages=[ChatMessage.from_user("What is the capital of France?")])
199+ print(result)
195200 ```
196201 """
197202
198203 def __init__ (
199204 self ,
200- model : str = "orca-mini " ,
205+ model : str = "qwen3:0.6b " ,
201206 url : str = "http://localhost:11434" ,
202207 generation_kwargs : Optional [Dict [str , Any ]] = None ,
203208 timeout : int = 120 ,
204209 keep_alive : Optional [Union [float , str ]] = None ,
205210 streaming_callback : Optional [Callable [[StreamingChunk ], None ]] = None ,
206211 tools : Optional [Union [List [Tool ], Toolset ]] = None ,
207212 response_format : Optional [Union [None , Literal ["json" ], JsonSchemaValue ]] = None ,
208- think : bool = False ,
213+ think : Union [ bool , Literal [ "low" , "medium" , "high" ]] = False ,
209214 ):
210215 """
211216 :param model:
@@ -219,9 +224,11 @@ def __init__(
219224 :param timeout:
220225 The number of seconds before throwing a timeout error from the Ollama API.
221226 :param think:
222- If True, the modell will "think" before producing a response.
227+ If True, the model will "think" before producing a response.
223228 Only [thinking models](https://ollama.com/search?c=thinking) support this feature.
224- The intermediate "thinking" output can be found in the `meta` property of the returned `ChatMessage`.
229+ Some models like gpt-oss support different levels of thinking: "low", "medium", "high".
230+ The intermediate "thinking" output can be found by inspecting the `reasoning` property of the returned
231+ `ChatMessage`.
225232 :param keep_alive:
226233 The option that controls how long the model will stay loaded into memory following the request.
227234 If not set, it will use the default value from the Ollama (5 minutes).
@@ -295,42 +302,6 @@ def from_dict(cls, data: Dict[str, Any]) -> "OllamaChatGenerator":
295302 data ["init_parameters" ]["streaming_callback" ] = deserialize_callable (callback_ser )
296303 return default_from_dict (cls , data )
297304
298- @staticmethod
299- def _build_chunk (
300- chunk_response : ChatResponse , component_info : ComponentInfo , index : int , tool_call_index : int
301- ) -> StreamingChunk :
302- """
303- Convert one Ollama stream-chunk to Haystack StreamingChunk.
304- """
305- chunk_response_dict = chunk_response .model_dump ()
306- finish_reason = FINISH_REASON_MAPPING .get (chunk_response .done_reason or "" )
307- tool_calls_list = []
308-
309- content = chunk_response_dict ["message" ]["content" ]
310-
311- meta = {key : value for key , value in chunk_response_dict .items () if key != "message" }
312- meta ["role" ] = chunk_response_dict ["message" ]["role" ]
313- if tool_calls := chunk_response_dict ["message" ].get ("tool_calls" ):
314- for tool_call in tool_calls :
315- tool_calls_list .append (
316- ToolCallDelta (
317- index = tool_call_index ,
318- tool_name = tool_call ["function" ]["name" ],
319- arguments = json .dumps (tool_call ["function" ]["arguments" ])
320- if tool_call ["function" ]["arguments" ]
321- else "" ,
322- )
323- )
324-
325- return StreamingChunk (
326- content = content ,
327- meta = meta ,
328- index = index ,
329- finish_reason = finish_reason ,
330- component_info = component_info ,
331- tool_calls = tool_calls_list ,
332- )
333-
334305 def _handle_streaming_response (
335306 self ,
336307 response_iter : Iterator [ChatResponse ],
@@ -355,13 +326,15 @@ def _handle_streaming_response(
355326 for index , raw in enumerate (response_iter ):
356327 if raw .message .tool_calls :
357328 tool_call_index += 1
358- chunk = self . _build_chunk (
329+ chunk = _build_chunk (
359330 chunk_response = raw , component_info = component_info , index = index , tool_call_index = tool_call_index
360331 )
361332 chunks .append (chunk )
362333
334+ start = index == 0 or bool (chunk .tool_calls )
335+ chunk .start = start
336+
363337 if chunk .tool_calls :
364- chunk .start = True
365338 for tool_call in chunk .tool_calls :
366339 # the Ollama server doesn't guarantee an id field in every tool_calls entry.
367340 # OpenAI-compatible endpoint (/v1/chat/completions) - recent releases do add an auto-generated id
@@ -389,8 +362,13 @@ def _handle_streaming_response(
389362
390363 if callback :
391364 callback (chunk )
365+
392366 # Compose final reply
393- text = "" .join (c .content for c in chunks )
367+ text = ""
368+ reasoning = ""
369+ for c in chunks :
370+ text += c .content
371+ reasoning += c .meta .get ("reasoning" , None ) or ""
394372
395373 tool_calls = []
396374 for tool_call_id in id_order :
@@ -400,8 +378,9 @@ def _handle_streaming_response(
400378 # We can't use _convert_streaming_chunks_to_chat_message because
401379 # we need to map tool_call name and args by order.
402380 reply = ChatMessage .from_assistant (
403- text = text ,
381+ text = text or None ,
404382 tool_calls = tool_calls or None ,
383+ reasoning = reasoning or None ,
405384 meta = _convert_ollama_meta_to_openai_format (chunks [- 1 ].meta ) if chunks else {},
406385 )
407386
0 commit comments