@@ -91,7 +91,10 @@ LLMResponse LLMClient::runToolLoop (const Request& request, LLMToolRegistry& too
9191 for (const auto & toolCall : response.getToolCalls ())
9292 {
9393 auto result = tools.dispatchToolCall (toolCall.name , toolCall.arguments );
94- current.messages .push_back (LLMMessage::toolResult (toolCall.id , JSON::toString (result, true )));
94+
95+ auto toolResultMsg = LLMMessage::toolResult (toolCall.id , JSON::toString (result, true ));
96+ toolResultMsg.name = toolCall.name ; // preserved for providers that need name + id separately (e.g. Gemini)
97+ current.messages .push_back (std::move (toolResultMsg));
9598 }
9699
97100 response = complete (current);
@@ -124,14 +127,19 @@ String LLMClient::buildChatCompletionBody (const Request& request, bool stream)
124127 if (request.toolChoice .has_value ())
125128 setLLMClientProperty (object, " tool_choice" , toolChoiceToVar (*request.toolChoice ));
126129
127- if (request.temperature .has_value ())
128- setLLMClientProperty (object, " temperature" , static_cast <double > (*request.temperature ));
130+ if (! options.noTemperature )
131+ {
132+ if (request.temperature .has_value ())
133+ setLLMClientProperty (object, " temperature" , static_cast <double > (*request.temperature ));
134+ }
129135
130136 if (request.topP .has_value ())
131137 setLLMClientProperty (object, " top_p" , static_cast <double > (*request.topP ));
132138
133- if (request.maxTokens .has_value ())
134- setLLMClientProperty (object, " max_tokens" , *request.maxTokens );
139+ // Per-request maxTokens overrides options.maxTokens; use max_completion_tokens for OpenAI-compatible APIs.
140+ const int effectiveMaxTokens = request.maxTokens .value_or (options.maxTokens );
141+ if (effectiveMaxTokens > 0 )
142+ setLLMClientProperty (object, " max_completion_tokens" , effectiveMaxTokens);
135143
136144 if (request.stopSequences .has_value ())
137145 {
@@ -143,6 +151,40 @@ String LLMClient::buildChatCompletionBody (const Request& request, bool stream)
143151 setLLMClientProperty (object, " stop" , stop);
144152 }
145153
154+ // Reasoning effort for o-series / GPT-5 models.
155+ if (options.reasoningEffort .isNotEmpty ())
156+ setLLMClientProperty (object, " reasoning_effort" , options.reasoningEffort );
157+
158+ // GBNF grammar for llama-server constrained decoding (per-request overrides config).
159+ const auto & effectiveGrammar = request.grammar .isNotEmpty () ? request.grammar : options.grammar ;
160+ if (effectiveGrammar.isNotEmpty ())
161+ setLLMClientProperty (object, " grammar" , effectiveGrammar);
162+
163+ // Prompt caching — bucket by application identity, retain for 24h.
164+ if (options.userAgent .isNotEmpty ())
165+ {
166+ setLLMClientProperty (object, " prompt_cache_key" , options.userAgent );
167+ setLLMClientProperty (object, " prompt_cache_retention" , String (" 24h" ));
168+ }
169+
170+ // Structured output via JSON Schema (built with LLMSchema helpers).
171+ if (! request.schema .isVoid ())
172+ {
173+ auto schemaWrapper = makeLLMClientObject ();
174+ setLLMClientProperty (schemaWrapper, " name" , String (" response" ));
175+ setLLMClientProperty (schemaWrapper, " strict" , true );
176+ setLLMClientProperty (schemaWrapper, " schema" , request.schema );
177+
178+ auto responseFormat = makeLLMClientObject ();
179+ setLLMClientProperty (responseFormat, " type" , String (" json_schema" ));
180+ setLLMClientProperty (responseFormat, " json_schema" , schemaWrapper);
181+
182+ setLLMClientProperty (object, " response_format" , responseFormat);
183+ }
184+
185+ // OpenRouter — application identification headers are injected at HTTP level,
186+ // but some frontends read X-Title from the body; we skip that here.
187+
146188 return JSON::toString (object, true );
147189}
148190
0 commit comments