
Commit 230cfd9

fix: apply to non-streaming path too
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
1 parent: 3f0f799

1 file changed: backend/cpp/llama-cpp/grpc-server.cpp (27 additions, 1 deletion)
@@ -2377,6 +2377,8 @@ class BackendServiceImpl final : public backend::Backend::Service {
     // reasoning, tool calls, and content are classified into ChatDeltas.
     // Without this, the PEG parser never produces diffs and the Go side
     // cannot detect tool calls or separate reasoning from content.
+    // OAI-compat: enable autoparser (PEG-based chat parsing) so that
+    // reasoning, tool calls, and content are classified into ChatDeltas.
     task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
     task.params.oaicompat_cmpl_id = completion_id;
     // oaicompat_model is already populated by params_from_json_cmpl
@@ -2408,7 +2410,22 @@ class BackendServiceImpl final : public backend::Backend::Service {
     auto* final_res = dynamic_cast<server_task_result_cmpl_final*>(all_results.results[0].get());
     GGML_ASSERT(final_res != nullptr);
     json result_json = all_results.results[0]->to_json();
-    reply->set_message(result_json.value("content", ""));
+
+    // Handle both native format ({"content": "..."}) and OAI chat
+    // format ({"choices": [{"message": {"content": "..."}}]}).
+    std::string completion_text;
+    if (result_json.contains("choices")) {
+        const auto & choices = result_json.at("choices");
+        if (!choices.empty()) {
+            const auto & msg = choices[0].value("message", json::object());
+            if (msg.contains("content") && !msg.at("content").is_null()) {
+                completion_text = msg.at("content").get<std::string>();
+            }
+        }
+    } else {
+        completion_text = result_json.value("content", "");
+    }
+    reply->set_message(completion_text);
 
     int32_t tokens_predicted = result_json.value("tokens_predicted", 0);
     reply->set_tokens(tokens_predicted);
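
Note: the new extraction branch can be exercised standalone. Below is a minimal sketch (not part of the commit) assuming nlohmann::json, the JSON library used by the llama.cpp server; extract_content is a hypothetical helper that wraps the logic added in the hunk above.

#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Mirrors the added branch: prefer the OAI chat shape, fall back to native.
static std::string extract_content(const json & result_json) {
    std::string completion_text;
    if (result_json.contains("choices")) {
        const auto & choices = result_json.at("choices");
        if (!choices.empty()) {
            // "message" may be absent; default to an empty object.
            const auto & msg = choices[0].value("message", json::object());
            // "content" can be null in OAI results (e.g. pure tool calls),
            // hence the explicit null check before the string conversion.
            if (msg.contains("content") && !msg.at("content").is_null()) {
                completion_text = msg.at("content").get<std::string>();
            }
        }
    } else {
        // Native llama.cpp result: content sits at the top level.
        completion_text = result_json.value("content", "");
    }
    return completion_text;
}

int main() {
    json native = json::parse(R"({"content":"hello"})");
    json oai    = json::parse(R"({"choices":[{"message":{"content":"hello"}}]})");
    std::cout << extract_content(native) << "\n"; // hello
    std::cout << extract_content(oai)    << "\n"; // hello
}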
@@ -2420,6 +2437,15 @@ class BackendServiceImpl final : public backend::Backend::Service {
         reply->set_timing_prompt_processing(timing_prompt_processing);
         double timing_token_generation = result_json.at("timings").value("predicted_ms", 0.0);
         reply->set_timing_token_generation(timing_token_generation);
+    } else if (result_json.contains("usage")) {
+        // OAI chat format stores timings in usage
+        const auto & usage = result_json.at("usage");
+        if (usage.contains("prompt_ms")) {
+            reply->set_timing_prompt_processing(usage.value("prompt_ms", 0.0));
+        }
+        if (usage.contains("predicted_ms")) {
+            reply->set_timing_token_generation(usage.value("predicted_ms", 0.0));
+        }
     }
 
     // Extract and set logprobs if present
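
Note: the timings fallback can likewise be checked in isolation. A minimal sketch (not part of the commit) assuming nlohmann::json and that the native "timings" object uses the same prompt_ms/predicted_ms field names as the "usage" object in the diff:

#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    // Native results carry timings under "timings"; OAI chat results under "usage".
    json native = json::parse(R"({"timings":{"prompt_ms":12.5,"predicted_ms":87.0}})");
    json oai    = json::parse(R"({"usage":{"prompt_ms":12.5,"predicted_ms":87.0}})");

    for (const auto & result_json : {native, oai}) {
        double prompt_ms = 0.0, predicted_ms = 0.0;
        if (result_json.contains("timings")) {
            prompt_ms    = result_json.at("timings").value("prompt_ms", 0.0);
            predicted_ms = result_json.at("timings").value("predicted_ms", 0.0);
        } else if (result_json.contains("usage")) {
            const auto & usage = result_json.at("usage");
            prompt_ms    = usage.value("prompt_ms", 0.0);
            predicted_ms = usage.value("predicted_ms", 0.0);
        }
        std::cout << prompt_ms << " / " << predicted_ms << "\n"; // 12.5 / 87 for both
    }
}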
