Skip to content

Commit 99b08c3

Browse files
author
mkulakow
committed
uts
1 parent 0aa9a48 commit 99b08c3

3 files changed

Lines changed: 799 additions & 3 deletions

File tree

demos/continuous_batching/agentic_ai/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@ Pull and start OVMS:
330330
```bash
331331
mkdir -p ${HOME}/models
332332
docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \
333-
--rest_port 8000 --model_repository_path /models --source_model Junrui2021/Qwen3-VL-8B-Instruct-int4 --tool_parser hermes3 --target_device GPU --task text_generation --pipeline_type VLM_CB --allowed_media_domains raw.githubusercontent.com
333+
--rest_port 8000 --model_repository_path /models --source_model Junrui2021/Qwen3-VL-8B-Instruct-int4 --model_name ovms-model --tool_parser hermes3 --target_device GPU --task text_generation --pipeline_type VLM_CB --allowed_media_domains raw.githubusercontent.com
334334
```
335335

336336
Use MCP server, with additional image of Gdańsk old town. VLM model deduces location and calls `get_weather` tool to summarize the weather conditions in the city.

src/llm/apis/openai_responses.cpp

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,13 +224,20 @@ static absl::StatusOr<ResponsesInputItemKind> classifyInputItem(const rapidjson:
224224
// "Message has tool role, but there was no previous assistant message with a
225225
// tool call!").
226226
//
227+
// Reasoning that is not followed by an assistant or function_call item is
228+
// emitted as a standalone assistant turn with no `content` field and the buffered
229+
// reasoning attached as `reasoning_content`. This preserves the model's
230+
// chain-of-thought across turns even when the prior turn produced no visible
231+
// output.
232+
//
227233
// The algorithm is sink-agnostic; concrete output (ov::genai::ChatHistory vs a
228234
// rapidjson messages array) is provided by the Sink template parameter, which
229235
// must implement:
230236
// absl::Status extractContent(itemObj, index, std::string& outText);
231237
// void emitToolMessage(callId, output);
232238
// void emitMessage(role, contentText, reasoning); // reasoning empty -> skip
233239
// void emitAssistantWithToolCalls(contentText, reasoning, toolCalls);
240+
// void emitStandaloneReasoning(reasoning); // assistant turn carrying only reasoning_content
234241
// absl::Status onMissingRole(itemObj);
235242
template <typename Sink>
236243
class ResponsesInputBuilder {
@@ -313,7 +320,8 @@ class ResponsesInputBuilder {
313320
return absl::OkStatus();
314321
}
315322
// Non-assistant items must not absorb pending tool_calls; flush first.
316-
// (flushPendingFunctionCalls also clears any orphan reasoning content.)
323+
// (flushPendingFunctionCalls also emits any pending reasoning content
324+
// as a standalone assistant turn.)
317325
if (role != "assistant") {
318326
flushPendingFunctionCalls("");
319327
}
@@ -329,7 +337,16 @@ class ResponsesInputBuilder {
329337

330338
void flushPendingFunctionCalls(const std::string& assistantText) {
331339
if (pendingFunctionCalls.empty()) {
332-
pendingReasoningContent.clear();
340+
// No tool calls, but possibly buffered reasoning to flush as a
341+
// standalone assistant turn carrying only reasoning_content (no
342+
// `content` field at all, so templates that gate on `message.content`
343+
// skip the content branch and templates that gate on
344+
// `message.reasoning_content` still see the buffered text).
345+
if (!pendingReasoningContent.empty()) {
346+
std::string reasoning = std::move(pendingReasoningContent);
347+
pendingReasoningContent.clear();
348+
sink.emitStandaloneReasoning(reasoning);
349+
}
333350
return;
334351
}
335352
std::string reasoning = std::move(pendingReasoningContent);
@@ -427,6 +444,15 @@ class ChatHistorySink {
427444
chatHistory.last()["tool_calls"] = rapidJsonValueToJsonContainer(toolCallsArray);
428445
}
429446

447+
// Emit an assistant turn that carries only reasoning_content (no content,
448+
// no tool_calls). Used when reasoning is not followed by an assistant or
449+
// function_call item.
450+
void emitStandaloneReasoning(const std::string& reasoning) {
451+
chatHistory.push_back({});
452+
chatHistory.last()["role"] = "assistant";
453+
chatHistory.last()["reasoning_content"] = reasoning;
454+
}
455+
430456
absl::Status onMissingRole(const rapidjson::Value::ConstObject&) {
431457
return absl::InvalidArgumentError("input item role is missing or invalid");
432458
}
@@ -515,6 +541,15 @@ class ProcessedJsonSink {
515541
messagesArray.PushBack(msgObj, alloc);
516542
}
517543

544+
// Emit an assistant turn that carries only reasoning_content (no content,
545+
// no tool_calls). See ChatHistorySink::emitStandaloneReasoning for rationale.
546+
void emitStandaloneReasoning(const std::string& reasoning) {
547+
rapidjson::Value msgObj(rapidjson::kObjectType);
548+
msgObj.AddMember("role", rapidjson::Value("assistant", alloc), alloc);
549+
msgObj.AddMember("reasoning_content", rapidjson::Value(reasoning.c_str(), alloc), alloc);
550+
messagesArray.PushBack(msgObj, alloc);
551+
}
552+
518553
void emitAssistantWithToolCalls(const std::string& contentText, const std::string& reasoning,
519554
const std::vector<const rapidjson::Value*>& toolCalls) {
520555
rapidjson::Value msgObj(rapidjson::kObjectType);

0 commit comments

Comments
 (0)