Skip to content

Commit 6241d97

Browse files
michalkulakowski, porlows1
authored and committed
Mkulakow/legacy pipeline finish reason (#4169)
1 parent 6da92ad commit 6241d97

6 files changed

Lines changed: 220 additions & 14 deletions

File tree

src/llm/apis/openai_completions.cpp

Lines changed: 16 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -409,17 +409,22 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco
409409

410410
// choices: array of size N, where N is related to n request parameter
411411
jsonResponse.StartArray("choices");
412-
int index = 0;
413-
for (int i = 0; i < results.tokens.size(); i++) {
412+
if (results.finish_reasons.empty()) {
413+
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in unary LM generation result, defaulting to STOP for all choices");
414+
} else if (results.finish_reasons.size() != results.tokens.size()) {
415+
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Finish reasons size ({}) does not match tokens size ({}) in unary LM generation result, defaulting missing entries to STOP",
416+
results.finish_reasons.size(), results.tokens.size());
417+
}
418+
for (size_t i = 0; i < results.tokens.size(); ++i) {
414419
const std::vector<int64_t>& tokens = results.tokens[i];
415420
SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", tokens);
416421
ParsedOutput parsedOutput = parseOutputIfNeeded(tokens);
417422
jsonResponse.StartObject();
418-
// finish_reason: "stop" in regular scenario, "tool_calls" if output contains tool calls
419-
auto finishReason = mapFinishReason(ov::genai::GenerationFinishReason::STOP, !parsedOutput.toolCalls.empty());
423+
const ov::genai::GenerationFinishReason finishReasonRaw = i < results.finish_reasons.size() ? results.finish_reasons[i] : ov::genai::GenerationFinishReason::STOP;
424+
auto finishReason = mapFinishReason(finishReasonRaw, !parsedOutput.toolCalls.empty());
420425
jsonResponse.FinishReason(finishReason.value_or("unknown"));
421426
// index: integer; Choice index, only n=1 supported anyway
422-
jsonResponse.Index(index++);
427+
jsonResponse.Index(static_cast<int>(i));
423428

424429
if (endpoint == Endpoint::CHAT_COMPLETIONS) {
425430
jsonResponse.MessageObject(parsedOutput);
@@ -480,8 +485,12 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD
480485
SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", generatedTokens);
481486
ParsedOutput parsedOutput = parseOutputIfNeeded(generatedTokens);
482487
jsonResponse.StartObject();
483-
// finish_reason: "stop" in regular scenario, "tool_calls" if output contains tool calls
484-
auto finishReason = mapFinishReason(ov::genai::GenerationFinishReason::STOP, !parsedOutput.toolCalls.empty());
488+
if (results.finish_reasons.empty()) {
489+
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in unary VLM generation result, defaulting to STOP");
490+
}
491+
// Current generation flow uses batch=1, so only finish_reasons[0] is expected here.
492+
const ov::genai::GenerationFinishReason finishReasonRaw = results.finish_reasons.empty() ? ov::genai::GenerationFinishReason::STOP : results.finish_reasons[0];
493+
auto finishReason = mapFinishReason(finishReasonRaw, !parsedOutput.toolCalls.empty());
485494
jsonResponse.FinishReason(finishReason.value_or("unknown"));
486495
// index: integer; Choice index, only n=1 supported anyway
487496
jsonResponse.Index(index++);

src/llm/apis/openai_responses.cpp

Lines changed: 22 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -652,17 +652,30 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::EncodedRes
652652
OVMS_PROFILE_FUNCTION();
653653
usage.promptTokens = results.perf_metrics.get_num_input_tokens();
654654
usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
655+
if (results.finish_reasons.empty()) {
656+
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in unary LM responses generation result, defaulting to STOP");
657+
}
655658
std::vector<ParsedOutput> parsedOutputs;
659+
ov::genai::GenerationFinishReason responsesFinishReason = ov::genai::GenerationFinishReason::STOP;
656660
for (const auto& tokens : results.tokens) {
657661
parsedOutputs.push_back(parseOutputIfNeeded(tokens));
658662
}
659-
return serializeUnaryResponseImpl(parsedOutputs);
663+
for (const auto& finishReason : results.finish_reasons) {
664+
if (finishReason == ov::genai::GenerationFinishReason::LENGTH) {
665+
responsesFinishReason = ov::genai::GenerationFinishReason::LENGTH;
666+
break;
667+
}
668+
}
669+
return serializeUnaryResponseImpl(parsedOutputs, responsesFinishReason);
660670
}
661671

662672
std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecodedResults& results, const std::string& textResponse) {
663673
OVMS_PROFILE_FUNCTION();
664674
usage.promptTokens = results.perf_metrics.get_num_input_tokens();
665675
usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
676+
if (results.finish_reasons.empty()) {
677+
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in unary VLM responses generation result, defaulting to STOP");
678+
}
666679
// Usage is already correctly set from perf_metrics above — no need for updateUsage.
667680
std::vector<ParsedOutput> parsedOutputs;
668681
if (!textResponse.empty()) {
@@ -677,7 +690,14 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecoded
677690
parsedOutputs.push_back(std::move(output));
678691
}
679692
}
680-
return serializeUnaryResponseImpl(parsedOutputs);
693+
ov::genai::GenerationFinishReason responsesFinishReason = ov::genai::GenerationFinishReason::STOP;
694+
for (const auto& finishReason : results.finish_reasons) {
695+
if (finishReason == ov::genai::GenerationFinishReason::LENGTH) {
696+
responsesFinishReason = ov::genai::GenerationFinishReason::LENGTH;
697+
break;
698+
}
699+
}
700+
return serializeUnaryResponseImpl(parsedOutputs, responsesFinishReason);
681701
}
682702

683703
// --- Streaming event building blocks ---

src/llm/language_model/legacy/servable.cpp

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -229,7 +229,12 @@ absl::Status LegacyServable::preparePartialResponse(std::shared_ptr<GenAiServabl
229229
if (!executionContext->lastStreamerCallbackOutput.empty()) {
230230
lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
231231
}
232-
std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, ov::genai::GenerationFinishReason::STOP);
232+
if (legacyExecutionContext->results.finish_reasons.empty()) {
233+
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in legacy LM streaming generation result, defaulting to STOP");
234+
}
235+
// Legacy generation path always runs with batch=1, so we read the single finish reason at index 0.
236+
ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons.empty() ? ov::genai::GenerationFinishReason::STOP : legacyExecutionContext->results.finish_reasons[0];
237+
std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
233238
if (!serializedChunk.empty()) {
234239
executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
235240
}

src/llm/visual_language_model/legacy/servable.cpp

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -245,7 +245,12 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar
245245
if (!executionContext->lastStreamerCallbackOutput.empty()) {
246246
lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
247247
}
248-
std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, ov::genai::GenerationFinishReason::STOP);
248+
if (legacyExecutionContext->results.finish_reasons.empty()) {
249+
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in legacy VLM streaming generation result, defaulting to STOP");
250+
}
251+
// Legacy generation path always runs with batch=1, so we read the single finish reason at index 0.
252+
ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons.empty() ? ov::genai::GenerationFinishReason::STOP : legacyExecutionContext->results.finish_reasons[0];
253+
std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
249254
if (!serializedChunk.empty()) {
250255
executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
251256
}

src/test/http_openai_handler_test.cpp

Lines changed: 167 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1715,6 +1715,173 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesCompleted
17151715
ASSERT_NE(serialized.find("\"metadata\":{}"), std::string::npos) << serialized;
17161716
}
17171717

1718+
TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesEncodedResultsIncompleteOnLength) {
1719+
std::string json = R"({
1720+
"model": "llama",
1721+
"input": "What is OpenVINO?",
1722+
"max_output_tokens": 5
1723+
})";
1724+
doc.Parse(json.c_str());
1725+
ASSERT_FALSE(doc.HasParseError());
1726+
1727+
auto apiHandler = std::make_shared<ovms::OpenAIResponsesHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
1728+
std::optional<uint32_t> maxTokensLimit;
1729+
uint32_t bestOfLimit = 0;
1730+
std::optional<uint32_t> maxModelLength;
1731+
ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
1732+
1733+
ov::genai::EncodedResults results;
1734+
ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids;
1735+
const auto& shape = outputIds.get_shape();
1736+
ASSERT_EQ(shape.size(), 2);
1737+
ASSERT_EQ(shape[0], 1);
1738+
ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
1739+
int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
1740+
results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + shape[1])};
1741+
results.finish_reasons = {ov::genai::GenerationFinishReason::LENGTH};
1742+
1743+
std::string serialized = apiHandler->serializeUnaryResponse(results);
1744+
1745+
ASSERT_NE(serialized.find("\"status\":\"incomplete\""), std::string::npos) << serialized;
1746+
ASSERT_NE(serialized.find("\"incomplete_details\""), std::string::npos) << serialized;
1747+
ASSERT_NE(serialized.find("\"reason\":\"max_tokens\""), std::string::npos) << serialized;
1748+
ASSERT_EQ(serialized.find("\"completed_at\""), std::string::npos) << serialized;
1749+
ASSERT_EQ(serialized.find("\"status\":\"completed\""), std::string::npos) << serialized;
1750+
}
1751+
1752+
TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesEncodedResultsCompletedOnStop) {
1753+
std::string json = R"({
1754+
"model": "llama",
1755+
"input": "What is OpenVINO?",
1756+
"max_output_tokens": 5
1757+
})";
1758+
doc.Parse(json.c_str());
1759+
ASSERT_FALSE(doc.HasParseError());
1760+
1761+
auto apiHandler = std::make_shared<ovms::OpenAIResponsesHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
1762+
std::optional<uint32_t> maxTokensLimit;
1763+
uint32_t bestOfLimit = 0;
1764+
std::optional<uint32_t> maxModelLength;
1765+
ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
1766+
1767+
ov::genai::EncodedResults results;
1768+
ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids;
1769+
int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
1770+
results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
1771+
results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
1772+
1773+
std::string serialized = apiHandler->serializeUnaryResponse(results);
1774+
1775+
ASSERT_NE(serialized.find("\"status\":\"completed\""), std::string::npos) << serialized;
1776+
ASSERT_NE(serialized.find("\"completed_at\""), std::string::npos) << serialized;
1777+
ASSERT_EQ(serialized.find("\"incomplete_details\""), std::string::npos) << serialized;
1778+
}
1779+
1780+
TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesVLMDecodedResultsIncompleteOnLength) {
1781+
std::string json = R"({
1782+
"model": "llama",
1783+
"input": "What is OpenVINO?",
1784+
"max_output_tokens": 5
1785+
})";
1786+
doc.Parse(json.c_str());
1787+
ASSERT_FALSE(doc.HasParseError());
1788+
1789+
auto apiHandler = std::make_shared<ovms::OpenAIResponsesHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
1790+
std::optional<uint32_t> maxTokensLimit;
1791+
uint32_t bestOfLimit = 0;
1792+
std::optional<uint32_t> maxModelLength;
1793+
ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
1794+
1795+
ov::genai::VLMDecodedResults results;
1796+
std::string text = "OVMS";
1797+
results.texts = {text};
1798+
results.finish_reasons = {ov::genai::GenerationFinishReason::LENGTH};
1799+
1800+
std::string serialized = apiHandler->serializeUnaryResponse(results, text);
1801+
1802+
ASSERT_NE(serialized.find("\"status\":\"incomplete\""), std::string::npos) << serialized;
1803+
ASSERT_NE(serialized.find("\"incomplete_details\""), std::string::npos) << serialized;
1804+
ASSERT_NE(serialized.find("\"reason\":\"max_tokens\""), std::string::npos) << serialized;
1805+
ASSERT_EQ(serialized.find("\"completed_at\""), std::string::npos) << serialized;
1806+
ASSERT_EQ(serialized.find("\"status\":\"completed\""), std::string::npos) << serialized;
1807+
}
1808+
1809+
TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesVLMDecodedResultsCompletedOnStop) {
1810+
std::string json = R"({
1811+
"model": "llama",
1812+
"input": "What is OpenVINO?",
1813+
"max_output_tokens": 5
1814+
})";
1815+
doc.Parse(json.c_str());
1816+
ASSERT_FALSE(doc.HasParseError());
1817+
1818+
auto apiHandler = std::make_shared<ovms::OpenAIResponsesHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
1819+
std::optional<uint32_t> maxTokensLimit;
1820+
uint32_t bestOfLimit = 0;
1821+
std::optional<uint32_t> maxModelLength;
1822+
ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
1823+
1824+
ov::genai::VLMDecodedResults results;
1825+
std::string text = "OVMS";
1826+
results.texts = {text};
1827+
results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
1828+
1829+
std::string serialized = apiHandler->serializeUnaryResponse(results, text);
1830+
1831+
ASSERT_NE(serialized.find("\"status\":\"completed\""), std::string::npos) << serialized;
1832+
ASSERT_NE(serialized.find("\"completed_at\""), std::string::npos) << serialized;
1833+
ASSERT_EQ(serialized.find("\"incomplete_details\""), std::string::npos) << serialized;
1834+
}
1835+
1836+
TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseChatCompletionsEncodedResultsLengthFinishReason) {
1837+
std::string json = R"({
1838+
"model": "llama",
1839+
"stream": false,
1840+
"messages": [{"role": "user", "content": "What is OpenVINO?"}]
1841+
})";
1842+
doc.Parse(json.c_str());
1843+
ASSERT_FALSE(doc.HasParseError());
1844+
1845+
auto apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
1846+
uint32_t maxTokensLimit = 100;
1847+
uint32_t bestOfLimit = 0;
1848+
std::optional<uint32_t> maxModelLength;
1849+
ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
1850+
1851+
ov::genai::EncodedResults results;
1852+
ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids;
1853+
int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
1854+
results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
1855+
results.finish_reasons = {ov::genai::GenerationFinishReason::LENGTH};
1856+
1857+
std::string serialized = apiHandler->serializeUnaryResponse(results);
1858+
ASSERT_NE(serialized.find("\"finish_reason\":\"length\""), std::string::npos) << serialized;
1859+
}
1860+
1861+
TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseChatCompletionsVLMDecodedResultsLengthFinishReason) {
1862+
std::string json = R"({
1863+
"model": "llama",
1864+
"stream": false,
1865+
"messages": [{"role": "user", "content": "What is OpenVINO?"}]
1866+
})";
1867+
doc.Parse(json.c_str());
1868+
ASSERT_FALSE(doc.HasParseError());
1869+
1870+
auto apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
1871+
uint32_t maxTokensLimit = 100;
1872+
uint32_t bestOfLimit = 0;
1873+
std::optional<uint32_t> maxModelLength;
1874+
ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
1875+
1876+
ov::genai::VLMDecodedResults results;
1877+
std::string text = "OVMS";
1878+
results.texts = {text};
1879+
results.finish_reasons = {ov::genai::GenerationFinishReason::LENGTH};
1880+
1881+
std::string serialized = apiHandler->serializeUnaryResponse(results, text);
1882+
ASSERT_NE(serialized.find("\"finish_reason\":\"length\""), std::string::npos) << serialized;
1883+
}
1884+
17181885
TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsBase64) {
17191886
std::string json = R"({
17201887
"model": "llama",

src/test/llm/llmnode_test.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2685,9 +2685,9 @@ INSTANTIATE_TEST_SUITE_P(
26852685
::testing::Values(
26862686
// params: model name, generate expected output, check logprobs, check finish reason, test speculative decoding, supports empty handshake msg
26872687
TestParameters{"lm_cb_regular", true, true, true, false, true},
2688-
TestParameters{"lm_legacy_regular", false, false, false, false, false},
2688+
TestParameters{"lm_legacy_regular", false, false, true, false, false},
26892689
TestParameters{"vlm_cb_regular", false, true, true, false, true},
2690-
TestParameters{"vlm_legacy_regular", false, false, false, false, false}));
2690+
TestParameters{"vlm_legacy_regular", false, false, true, false, false}));
26912691

26922692
const std::string validRequestBodyWithParameter(const std::string& modelName, const std::string& parameter, const std::string& value) {
26932693
std::string requestBody = R"(
@@ -3611,7 +3611,7 @@ INSTANTIATE_TEST_SUITE_P(
36113611
TestParameters{"lm_cb_regular", true, true, true, false, true},
36123612
TestParameters{"lm_legacy_regular", false, false, false, false, false},
36133613
TestParameters{"vlm_cb_regular", false, true, true, false, true},
3614-
TestParameters{"vlm_legacy_regular", false, false, false, false, false}));
3614+
TestParameters{"vlm_legacy_regular", false, false, true, false, false}));
36153615

36163616
// Common tests for all pipeline types (testing logic executed prior pipeline type selection)
36173617
class LLMConfigHttpTest : public ::testing::Test {};

0 commit comments

Comments
 (0)