Skip to content

Commit 8966fa6

Browse files
committed
map also other fields
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
1 parent 230cfd9 commit 8966fa6

File tree

1 file changed

+47
-24
lines changed

1 file changed

+47
-24
lines changed

backend/cpp/llama-cpp/grpc-server.cpp

Lines changed: 47 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1667,14 +1667,25 @@ class BackendServiceImpl final : public backend::Backend::Service {
1667 1667
}
1668 1668

1669 1669
reply.set_message(completion_text);
1670-
reply.set_tokens(res_json.value("tokens_predicted", 0));
1671-
reply.set_prompt_tokens(res_json.value("tokens_evaluated", 0));
1672 1670

1671+
// Token counts: native format has top-level fields,
1672+
// OAI format has them in "usage" (final chunk only)
1673+
if (res_json.contains("usage")) {
1674+
const auto & usage = res_json.at("usage");
1675+
reply.set_tokens(usage.value("completion_tokens", 0));
1676+
reply.set_prompt_tokens(usage.value("prompt_tokens", 0));
1677+
} else {
1678+
reply.set_tokens(res_json.value("tokens_predicted", 0));
1679+
reply.set_prompt_tokens(res_json.value("tokens_evaluated", 0));
1680+
}
1681+
1682+
// Timings: present as top-level "timings" in both formats
1673 1683
if (res_json.contains("timings")) {
1674 1684
reply.set_timing_prompt_processing(res_json.at("timings").value("prompt_ms", 0.0));
1675 1685
reply.set_timing_token_generation(res_json.at("timings").value("predicted_ms", 0.0));
1676 1686
}
1677 1687

1688+
// Logprobs: extract_logprobs_from_json handles both formats
1678 1689
json logprobs_json = extract_logprobs_from_json(res_json);
1679 1690
if (!logprobs_json.empty() && !logprobs_json.is_null()) {
1680 1691
reply.set_logprobs(logprobs_json.dump());
@@ -2411,48 +2422,47 @@ class BackendServiceImpl final : public backend::Backend::Service {
2411 2422
GGML_ASSERT(final_res != nullptr);
2412 2423
json result_json = all_results.results[0]->to_json();
2413 2424

2414-
// Handle both native format ({"content": "..."}) and OAI chat
2415-
// format ({"choices": [{"message": {"content": "..."}}]}).
2425+
// Handle both native format ({"content": "...", "tokens_predicted": N})
2426+
// and OAI chat format ({"choices": [{"message": {"content": "..."}}],
2427+
// "usage": {"completion_tokens": N, "prompt_tokens": N}}).
2416 2428
std::string completion_text;
2429+
int32_t tokens_predicted = 0;
2430+
int32_t tokens_evaluated = 0;
2431+
2417 2432
if (result_json.contains("choices")) {
2433+
// OAI chat format
2418 2434
const auto & choices = result_json.at("choices");
2419 2435
if (!choices.empty()) {
2420 2436
const auto & msg = choices[0].value("message", json::object());
2421 2437
if (msg.contains("content") && !msg.at("content").is_null()) {
2422 2438
completion_text = msg.at("content").get<std::string>();
2423 2439
}
2424 2440
}
2441+
if (result_json.contains("usage")) {
2442+
const auto & usage = result_json.at("usage");
2443+
tokens_predicted = usage.value("completion_tokens", 0);
2444+
tokens_evaluated = usage.value("prompt_tokens", 0);
2445+
}
2425 2446
} else {
2447+
// Native llama.cpp format
2426 2448
completion_text = result_json.value("content", "");
2449+
tokens_predicted = result_json.value("tokens_predicted", 0);
2450+
tokens_evaluated = result_json.value("tokens_evaluated", 0);
2427 2451
}
2428 2452
reply->set_message(completion_text);
2429-
2430-
int32_t tokens_predicted = result_json.value("tokens_predicted", 0);
2431 2453
reply->set_tokens(tokens_predicted);
2432-
int32_t tokens_evaluated = result_json.value("tokens_evaluated", 0);
2433 2454
reply->set_prompt_tokens(tokens_evaluated);
2434 2455

2456+
// Timings: present in both formats as a top-level "timings" object
2435 2457
if (result_json.contains("timings")) {
2436-
double timing_prompt_processing = result_json.at("timings").value("prompt_ms", 0.0);
2437-
reply->set_timing_prompt_processing(timing_prompt_processing);
2438-
double timing_token_generation = result_json.at("timings").value("predicted_ms", 0.0);
2439-
reply->set_timing_token_generation(timing_token_generation);
2440-
} else if (result_json.contains("usage")) {
2441-
// OAI chat format stores timings in usage
2442-
const auto & usage = result_json.at("usage");
2443-
if (usage.contains("prompt_ms")) {
2444-
reply->set_timing_prompt_processing(usage.value("prompt_ms", 0.0));
2445-
}
2446-
if (usage.contains("predicted_ms")) {
2447-
reply->set_timing_token_generation(usage.value("predicted_ms", 0.0));
2448-
}
2458+
reply->set_timing_prompt_processing(result_json.at("timings").value("prompt_ms", 0.0));
2459+
reply->set_timing_token_generation(result_json.at("timings").value("predicted_ms", 0.0));
2449 2460
}
2450 2461

2451-
// Extract and set logprobs if present
2462+
// Logprobs: extract_logprobs_from_json handles both formats
2452 2463
json logprobs_json = extract_logprobs_from_json(result_json);
2453 2464
if (!logprobs_json.empty() && !logprobs_json.is_null()) {
2454-
std::string logprobs_str = logprobs_json.dump();
2455-
reply->set_logprobs(logprobs_str);
2465+
reply->set_logprobs(logprobs_json.dump());
2456 2466
}
2457 2467

2458 2468
// Populate chat deltas from the autoparser's final parsed message
@@ -2468,7 +2478,20 @@ class BackendServiceImpl final : public backend::Backend::Service {
2468 2478
for (auto & res : all_results.results) {
2469 2479
GGML_ASSERT(dynamic_cast<server_task_result_cmpl_final*>(res.get()) != nullptr);
2470 2480
json res_json = res->to_json();
2471-
arr.push_back(res_json.value("content", ""));
2481+
// Handle both native and OAI chat formats
2482+
std::string result_content;
2483+
if (res_json.contains("choices")) {
2484+
const auto & choices = res_json.at("choices");
2485+
if (!choices.empty()) {
2486+
const auto & msg = choices[0].value("message", json::object());
2487+
if (msg.contains("content") && !msg.at("content").is_null()) {
2488+
result_content = msg.at("content").get<std::string>();
2489+
}
2490+
}
2491+
} else {
2492+
result_content = res_json.value("content", "");
2493+
}
2494+
arr.push_back(result_content);
2472 2495

2473 2496
// Extract logprobs for each result
2474 2497
json logprobs_json = extract_logprobs_from_json(res_json);

0 commit comments

Comments (0)