21 changes: 14 additions & 7 deletions src/llm/apis/openai_completions.cpp
@@ -409,17 +409,20 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco

// choices: array of size N, where N is related to n request parameter
jsonResponse.StartArray("choices");
-int index = 0;
-for (int i = 0; i < results.tokens.size(); i++) {
+for (size_t i = 0; i < results.tokens.size(); ++i) {
const std::vector<int64_t>& tokens = results.tokens[i];
SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", tokens);
ParsedOutput parsedOutput = parseOutputIfNeeded(tokens);
jsonResponse.StartObject();
-// finish_reason: "stop" in regular scenario, "tool_calls" if output contains tool calls
-auto finishReason = mapFinishReason(ov::genai::GenerationFinishReason::STOP, !parsedOutput.toolCalls.empty());
+if (results.finish_reasons.empty()) {
+throw std::runtime_error("Missing finish reason in unary LM generation result");
+}
+// Current generation flow uses batch=1, so only finish_reasons[0] is expected here.
+const ov::genai::GenerationFinishReason finishReasonRaw = results.finish_reasons[0];
+auto finishReason = mapFinishReason(finishReasonRaw, !parsedOutput.toolCalls.empty());
jsonResponse.FinishReason(finishReason.value_or("unknown"));
// index: integer; Choice index, only n=1 supported anyway
-jsonResponse.Index(index++);
+jsonResponse.Index(static_cast<int>(i));

if (endpoint == Endpoint::CHAT_COMPLETIONS) {
jsonResponse.MessageObject(parsedOutput);
@@ -480,8 +483,12 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD
SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", generatedTokens);
ParsedOutput parsedOutput = parseOutputIfNeeded(generatedTokens);
jsonResponse.StartObject();
-// finish_reason: "stop" in regular scenario, "tool_calls" if output contains tool calls
-auto finishReason = mapFinishReason(ov::genai::GenerationFinishReason::STOP, !parsedOutput.toolCalls.empty());
+if (results.finish_reasons.empty()) {
+throw std::runtime_error("Missing finish reason in unary VLM generation result");
+}
+// Current generation flow uses batch=1, so only finish_reasons[0] is expected here.
+const ov::genai::GenerationFinishReason finishReasonRaw = results.finish_reasons[0];
+auto finishReason = mapFinishReason(finishReasonRaw, !parsedOutput.toolCalls.empty());
jsonResponse.FinishReason(finishReason.value_or("unknown"));
// index: integer; Choice index, only n=1 supported anyway
jsonResponse.Index(index++);
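Side note on the mapping these hunks rely on: below is a minimal, self-contained sketch of what a helper like mapFinishReason plausibly does, reconstructed only from the call sites and comments visible in this diff. The stand-in enum and the exact behavior are assumptions, not the actual OVMS implementation.

    #include <optional>
    #include <string>

    // Stand-in for ov::genai::GenerationFinishReason (assumed values: NONE/STOP/LENGTH).
    enum class FinishReason { NONE, STOP, LENGTH };

    // Hypothetical mapper mirroring the mapFinishReason call sites above.
    std::optional<std::string> mapFinishReasonSketch(FinishReason reason, bool hasToolCalls) {
        // Tool calls take precedence: "stop" in the regular scenario,
        // "tool_calls" if the parsed output contains tool calls.
        if (reason == FinishReason::STOP)
            return hasToolCalls ? "tool_calls" : "stop";
        if (reason == FinishReason::LENGTH)
            return "length";
        return std::nullopt;  // rendered as "unknown" via finishReason.value_or("unknown")
    }

With the hardcoded STOP removed, a generation truncated by the token limit can now surface as "length" instead of always reporting "stop".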
24 changes: 22 additions & 2 deletions src/llm/apis/openai_responses.cpp
@@ -652,17 +652,30 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::EncodedRes
OVMS_PROFILE_FUNCTION();
usage.promptTokens = results.perf_metrics.get_num_input_tokens();
usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
+if (results.finish_reasons.empty()) {
+throw std::runtime_error("Missing finish reason in unary LM responses generation result");
+}
std::vector<ParsedOutput> parsedOutputs;
+ov::genai::GenerationFinishReason responsesFinishReason = ov::genai::GenerationFinishReason::STOP;
for (const auto& tokens : results.tokens) {
parsedOutputs.push_back(parseOutputIfNeeded(tokens));
}
-return serializeUnaryResponseImpl(parsedOutputs);
+for (const auto& finishReason : results.finish_reasons) {
dkalinowski (Collaborator) commented on May 8, 2026:
Why do we have a different implementation than chat/completions? Can't we just take [0]? I think we also have batch size = 1 here always.

Author (Collaborator) replied:
For now we could use the same implementation for both APIs, but this approach highlights the difference: for chat/completions there is one finish reason per output, while for responses there is a single incomplete_details for the whole response.
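To make the contrast concrete, here is a minimal, self-contained sketch of the response-level aggregation performed by the new loop just below (the enum and function name are stand-ins; the diff itself is authoritative). chat/completions keeps one finish reason per choice, while the Responses API collapses all per-sequence reasons into a single status, whose LENGTH value later surfaces as incomplete_details.

    #include <vector>

    // Stand-in for ov::genai::GenerationFinishReason (assumed values: NONE/STOP/LENGTH).
    enum class FinishReason { NONE, STOP, LENGTH };

    // Any sequence that hit the token limit marks the whole response as truncated.
    FinishReason aggregateForResponses(const std::vector<FinishReason>& reasons) {
        for (FinishReason r : reasons) {
            if (r == FinishReason::LENGTH)
                return FinishReason::LENGTH;
        }
        return FinishReason::STOP;
    }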

+if (finishReason == ov::genai::GenerationFinishReason::LENGTH) {
+responsesFinishReason = ov::genai::GenerationFinishReason::LENGTH;
+break;
+}
+}
+return serializeUnaryResponseImpl(parsedOutputs, responsesFinishReason);
}

std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecodedResults& results, const std::string& textResponse) {
OVMS_PROFILE_FUNCTION();
usage.promptTokens = results.perf_metrics.get_num_input_tokens();
usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
+if (results.finish_reasons.empty()) {
+throw std::runtime_error("Missing finish reason in unary VLM responses generation result");
+}
// Usage is already correctly set from perf_metrics above — no need for updateUsage.
std::vector<ParsedOutput> parsedOutputs;
if (!textResponse.empty()) {
@@ -677,7 +690,14 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecoded
parsedOutputs.push_back(std::move(output));
}
}
-return serializeUnaryResponseImpl(parsedOutputs);
+ov::genai::GenerationFinishReason responsesFinishReason = ov::genai::GenerationFinishReason::STOP;
+for (const auto& finishReason : results.finish_reasons) {
+if (finishReason == ov::genai::GenerationFinishReason::LENGTH) {
+responsesFinishReason = ov::genai::GenerationFinishReason::LENGTH;
+break;
+}
+}
+return serializeUnaryResponseImpl(parsedOutputs, responsesFinishReason);
}

// --- Streaming event building blocks ---
7 changes: 6 additions & 1 deletion src/llm/language_model/legacy/servable.cpp
@@ -229,7 +229,12 @@ absl::Status LegacyServable::preparePartialResponse(std::shared_ptr<GenAiServabl
if (!executionContext->lastStreamerCallbackOutput.empty()) {
lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
}
-std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, ov::genai::GenerationFinishReason::STOP);
+if (legacyExecutionContext->results.finish_reasons.empty()) {
+return absl::InternalError("Missing finish reason in legacy LM streaming generation result");
+}
+// Legacy generation path always runs with batch=1, so we read the single finish reason at index 0.
+ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons[0];
+std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
if (!serializedChunk.empty()) {
executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
}
7 changes: 6 additions & 1 deletion src/llm/visual_language_model/legacy/servable.cpp
@@ -245,7 +245,12 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar
if (!executionContext->lastStreamerCallbackOutput.empty()) {
lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
}
-std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, ov::genai::GenerationFinishReason::STOP);
+if (legacyExecutionContext->results.finish_reasons.empty()) {
+return absl::InternalError("Missing finish reason in legacy VLM streaming generation result");
+}
+// Legacy generation path always runs with batch=1, so we read the single finish reason at index 0.
+ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons[0];
+std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
if (!serializedChunk.empty()) {
executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
}
8 changes: 8 additions & 0 deletions src/test/http_openai_handler_test.cpp
@@ -1018,6 +1018,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseEncodedResultsReturns

ov::genai::EncodedResults results;
results.tokens = {createHermes3ToolCallTokens(*tokenizer)};
+results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
std::string serialized = apiHandler->serializeUnaryResponse(results);

ASSERT_NE(serialized.find("\"finish_reason\":\"tool_calls\""), std::string::npos) << serialized;
@@ -1049,6 +1050,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseVLMSupportsToolCallsF
ov::genai::VLMDecodedResults results;
std::string toolCall = R"(<tool_call>{"name": "example_tool", "arguments": {"arg1": "value1", "arg2": 42}}</tool_call>)";
results.texts = {toolCall};
+results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
std::string serialized = apiHandler->serializeUnaryResponse(results, toolCall);

ASSERT_NE(serialized.find("\"finish_reason\":\"tool_calls\""), std::string::npos) << serialized;
@@ -1076,6 +1078,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesContainsO
ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};

std::string serialized = apiHandler->serializeUnaryResponse(results);
ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
@@ -1107,6 +1110,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesContainsR
ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};

std::string serialized = apiHandler->serializeUnaryResponse(results);
ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
@@ -1145,6 +1149,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesOmitsReas
ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};

std::string serialized = apiHandler->serializeUnaryResponse(results);
ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
@@ -2741,6 +2746,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, SerializeResponsesUnaryResponseContainsFunc
ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};

std::string serialized = apiHandler->serializeUnaryResponse(results);
ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
@@ -2786,6 +2792,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, SerializeResponsesUnaryResponseContainsFunc
ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};

std::string serialized = apiHandler->serializeUnaryResponse(results);
ASSERT_NE(serialized.find("\"tool_choice\":{"), std::string::npos) << serialized;
@@ -3095,6 +3102,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, SerializeUnaryResponseVLMDecodedResultsWith
std::string vlmText =
"I will call a tool.<tool_call>{\"name\":\"get_weather\",\"arguments\":{\"location\":\"Paris\"}}</tool_call>";
results.texts.push_back(vlmText);
+results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};

std::string serialized = apiHandler->serializeUnaryResponse(results, vlmText);

2 changes: 1 addition & 1 deletion src/test/llm/llmnode_test.cpp
@@ -2685,7 +2685,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(
// params: model name, generate expected output, check logprobs, check finish reason, test speculative decoding, supports empty handshake msg
TestParameters{"lm_cb_regular", true, true, true, false, true},
TestParameters{"lm_legacy_regular", false, false, false, false, false},
TestParameters{"lm_legacy_regular", false, false, true, false, false},
TestParameters{"vlm_cb_regular", false, true, true, false, true},
TestParameters{"vlm_legacy_regular", false, false, false, false, false}));
