@@ -1667,14 +1667,25 @@ class BackendServiceImpl final : public backend::Backend::Service {
16671667 }
16681668
16691669 reply.set_message (completion_text);
1670- reply.set_tokens (res_json.value (" tokens_predicted" , 0 ));
1671- reply.set_prompt_tokens (res_json.value (" tokens_evaluated" , 0 ));
16721670
1671+ // Token counts: native format has top-level fields,
1672+ // OAI format has them in "usage" (final chunk only)
1673+ if (res_json.contains (" usage" )) {
1674+ const auto & usage = res_json.at (" usage" );
1675+ reply.set_tokens (usage.value (" completion_tokens" , 0 ));
1676+ reply.set_prompt_tokens (usage.value (" prompt_tokens" , 0 ));
1677+ } else {
1678+ reply.set_tokens (res_json.value (" tokens_predicted" , 0 ));
1679+ reply.set_prompt_tokens (res_json.value (" tokens_evaluated" , 0 ));
1680+ }
1681+
1682+ // Timings: present as top-level "timings" in both formats
16731683 if (res_json.contains (" timings" )) {
16741684 reply.set_timing_prompt_processing (res_json.at (" timings" ).value (" prompt_ms" , 0.0 ));
16751685 reply.set_timing_token_generation (res_json.at (" timings" ).value (" predicted_ms" , 0.0 ));
16761686 }
16771687
1688+ // Logprobs: extract_logprobs_from_json handles both formats
16781689 json logprobs_json = extract_logprobs_from_json (res_json);
16791690 if (!logprobs_json.empty () && !logprobs_json.is_null ()) {
16801691 reply.set_logprobs (logprobs_json.dump ());
@@ -2411,48 +2422,47 @@ class BackendServiceImpl final : public backend::Backend::Service {
24112422 GGML_ASSERT (final_res != nullptr );
24122423 json result_json = all_results.results [0 ]->to_json ();
24132424
2414- // Handle both native format ({"content": "..."}) and OAI chat
2415- // format ({"choices": [{"message": {"content": "..."}}]}).
2425+ // Handle both native format ({"content": "...", "tokens_predicted": N})
2426+ // and OAI chat format ({"choices": [{"message": {"content": "..."}}],
2427+ // "usage": {"completion_tokens": N, "prompt_tokens": N}}).
24162428 std::string completion_text;
2429+ int32_t tokens_predicted = 0 ;
2430+ int32_t tokens_evaluated = 0 ;
2431+
24172432 if (result_json.contains (" choices" )) {
2433+ // OAI chat format
24182434 const auto & choices = result_json.at (" choices" );
24192435 if (!choices.empty ()) {
24202436 const auto & msg = choices[0 ].value (" message" , json::object ());
24212437 if (msg.contains (" content" ) && !msg.at (" content" ).is_null ()) {
24222438 completion_text = msg.at (" content" ).get <std::string>();
24232439 }
24242440 }
2441+ if (result_json.contains (" usage" )) {
2442+ const auto & usage = result_json.at (" usage" );
2443+ tokens_predicted = usage.value (" completion_tokens" , 0 );
2444+ tokens_evaluated = usage.value (" prompt_tokens" , 0 );
2445+ }
24252446 } else {
2447+ // Native llama.cpp format
24262448 completion_text = result_json.value (" content" , " " );
2449+ tokens_predicted = result_json.value (" tokens_predicted" , 0 );
2450+ tokens_evaluated = result_json.value (" tokens_evaluated" , 0 );
24272451 }
24282452 reply->set_message (completion_text);
2429-
2430- int32_t tokens_predicted = result_json.value (" tokens_predicted" , 0 );
24312453 reply->set_tokens (tokens_predicted);
2432- int32_t tokens_evaluated = result_json.value (" tokens_evaluated" , 0 );
24332454 reply->set_prompt_tokens (tokens_evaluated);
24342455
2456+ // Timings: present in both formats as a top-level "timings" object
24352457 if (result_json.contains (" timings" )) {
2436- double timing_prompt_processing = result_json.at (" timings" ).value (" prompt_ms" , 0.0 );
2437- reply->set_timing_prompt_processing (timing_prompt_processing);
2438- double timing_token_generation = result_json.at (" timings" ).value (" predicted_ms" , 0.0 );
2439- reply->set_timing_token_generation (timing_token_generation);
2440- } else if (result_json.contains (" usage" )) {
2441- // OAI chat format stores timings in usage
2442- const auto & usage = result_json.at (" usage" );
2443- if (usage.contains (" prompt_ms" )) {
2444- reply->set_timing_prompt_processing (usage.value (" prompt_ms" , 0.0 ));
2445- }
2446- if (usage.contains (" predicted_ms" )) {
2447- reply->set_timing_token_generation (usage.value (" predicted_ms" , 0.0 ));
2448- }
2458+ reply->set_timing_prompt_processing (result_json.at (" timings" ).value (" prompt_ms" , 0.0 ));
2459+ reply->set_timing_token_generation (result_json.at (" timings" ).value (" predicted_ms" , 0.0 ));
24492460 }
24502461
2451- // Extract and set logprobs if present
2462+ // Logprobs: extract_logprobs_from_json handles both formats
24522463 json logprobs_json = extract_logprobs_from_json (result_json);
24532464 if (!logprobs_json.empty () && !logprobs_json.is_null ()) {
2454- std::string logprobs_str = logprobs_json.dump ();
2455- reply->set_logprobs (logprobs_str);
2465+ reply->set_logprobs (logprobs_json.dump ());
24562466 }
24572467
24582468 // Populate chat deltas from the autoparser's final parsed message
@@ -2468,7 +2478,20 @@ class BackendServiceImpl final : public backend::Backend::Service {
24682478 for (auto & res : all_results.results ) {
24692479 GGML_ASSERT (dynamic_cast <server_task_result_cmpl_final*>(res.get ()) != nullptr );
24702480 json res_json = res->to_json ();
2471- arr.push_back (res_json.value (" content" , " " ));
2481+ // Handle both native and OAI chat formats
2482+ std::string result_content;
2483+ if (res_json.contains (" choices" )) {
2484+ const auto & choices = res_json.at (" choices" );
2485+ if (!choices.empty ()) {
2486+ const auto & msg = choices[0 ].value (" message" , json::object ());
2487+ if (msg.contains (" content" ) && !msg.at (" content" ).is_null ()) {
2488+ result_content = msg.at (" content" ).get <std::string>();
2489+ }
2490+ }
2491+ } else {
2492+ result_content = res_json.value (" content" , " " );
2493+ }
2494+ arr.push_back (result_content);
24722495
24732496 // Extract logprobs for each result
24742497 json logprobs_json = extract_logprobs_from_json (res_json);