@@ -1667,14 +1667,25 @@ class BackendServiceImpl final : public backend::Backend::Service {
16671667 }
16681668
16691669 reply.set_message (completion_text);
1670- reply.set_tokens (res_json.value (" tokens_predicted" , 0 ));
1671- reply.set_prompt_tokens (res_json.value (" tokens_evaluated" , 0 ));
16721670
1671+ // Token counts: native format has top-level fields,
1672+ // OAI format has them in "usage" (final chunk only)
1673+ if (res_json.contains (" usage" )) {
1674+ const auto & usage = res_json.at (" usage" );
1675+ reply.set_tokens (usage.value (" completion_tokens" , 0 ));
1676+ reply.set_prompt_tokens (usage.value (" prompt_tokens" , 0 ));
1677+ } else {
1678+ reply.set_tokens (res_json.value (" tokens_predicted" , 0 ));
1679+ reply.set_prompt_tokens (res_json.value (" tokens_evaluated" , 0 ));
1680+ }
1681+
1682+ // Timings: present as top-level "timings" in both formats
16731683 if (res_json.contains (" timings" )) {
16741684 reply.set_timing_prompt_processing (res_json.at (" timings" ).value (" prompt_ms" , 0.0 ));
16751685 reply.set_timing_token_generation (res_json.at (" timings" ).value (" predicted_ms" , 0.0 ));
16761686 }
16771687
1688+ // Logprobs: extract_logprobs_from_json handles both formats
16781689 json logprobs_json = extract_logprobs_from_json (res_json);
16791690 if (!logprobs_json.empty () && !logprobs_json.is_null ()) {
16801691 reply.set_logprobs (logprobs_json.dump ());
@@ -2373,10 +2384,6 @@ class BackendServiceImpl final : public backend::Backend::Service {
23732384 data);
23742385 task.id_slot = json_value (data, " id_slot" , -1 );
23752386
2376- // OAI-compat: enable autoparser (PEG-based chat parsing) so that
2377- // reasoning, tool calls, and content are classified into ChatDeltas.
2378- // Without this, the PEG parser never produces diffs and the Go side
2379- // cannot detect tool calls or separate reasoning from content.
23802387 // OAI-compat: enable autoparser (PEG-based chat parsing) so that
23812388 // reasoning, tool calls, and content are classified into ChatDeltas.
23822389 task.params .res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
@@ -2411,48 +2418,47 @@ class BackendServiceImpl final : public backend::Backend::Service {
24112418 GGML_ASSERT (final_res != nullptr );
24122419 json result_json = all_results.results [0 ]->to_json ();
24132420
2414- // Handle both native format ({"content": "..."}) and OAI chat
2415- // format ({"choices": [{"message": {"content": "..."}}]}).
2421+ // Handle both native format ({"content": "...", "tokens_predicted": N})
2422+ // and OAI chat format ({"choices": [{"message": {"content": "..."}}],
2423+ // "usage": {"completion_tokens": N, "prompt_tokens": N}}).
24162424 std::string completion_text;
2425+ int32_t tokens_predicted = 0 ;
2426+ int32_t tokens_evaluated = 0 ;
2427+
24172428 if (result_json.contains (" choices" )) {
2429+ // OAI chat format
24182430 const auto & choices = result_json.at (" choices" );
24192431 if (!choices.empty ()) {
24202432 const auto & msg = choices[0 ].value (" message" , json::object ());
24212433 if (msg.contains (" content" ) && !msg.at (" content" ).is_null ()) {
24222434 completion_text = msg.at (" content" ).get <std::string>();
24232435 }
24242436 }
2437+ if (result_json.contains (" usage" )) {
2438+ const auto & usage = result_json.at (" usage" );
2439+ tokens_predicted = usage.value (" completion_tokens" , 0 );
2440+ tokens_evaluated = usage.value (" prompt_tokens" , 0 );
2441+ }
24252442 } else {
2443+ // Native llama.cpp format
24262444 completion_text = result_json.value (" content" , " " );
2445+ tokens_predicted = result_json.value (" tokens_predicted" , 0 );
2446+ tokens_evaluated = result_json.value (" tokens_evaluated" , 0 );
24272447 }
24282448 reply->set_message (completion_text);
2429-
2430- int32_t tokens_predicted = result_json.value (" tokens_predicted" , 0 );
24312449 reply->set_tokens (tokens_predicted);
2432- int32_t tokens_evaluated = result_json.value (" tokens_evaluated" , 0 );
24332450 reply->set_prompt_tokens (tokens_evaluated);
24342451
2452+ // Timings: present in both formats as a top-level "timings" object
24352453 if (result_json.contains (" timings" )) {
2436- double timing_prompt_processing = result_json.at (" timings" ).value (" prompt_ms" , 0.0 );
2437- reply->set_timing_prompt_processing (timing_prompt_processing);
2438- double timing_token_generation = result_json.at (" timings" ).value (" predicted_ms" , 0.0 );
2439- reply->set_timing_token_generation (timing_token_generation);
2440- } else if (result_json.contains (" usage" )) {
2441- // OAI chat format stores timings in usage
2442- const auto & usage = result_json.at (" usage" );
2443- if (usage.contains (" prompt_ms" )) {
2444- reply->set_timing_prompt_processing (usage.value (" prompt_ms" , 0.0 ));
2445- }
2446- if (usage.contains (" predicted_ms" )) {
2447- reply->set_timing_token_generation (usage.value (" predicted_ms" , 0.0 ));
2448- }
2454+ reply->set_timing_prompt_processing (result_json.at (" timings" ).value (" prompt_ms" , 0.0 ));
2455+ reply->set_timing_token_generation (result_json.at (" timings" ).value (" predicted_ms" , 0.0 ));
24492456 }
24502457
2451- // Extract and set logprobs if present
2458+ // Logprobs: extract_logprobs_from_json handles both formats
24522459 json logprobs_json = extract_logprobs_from_json (result_json);
24532460 if (!logprobs_json.empty () && !logprobs_json.is_null ()) {
2454- std::string logprobs_str = logprobs_json.dump ();
2455- reply->set_logprobs (logprobs_str);
2461+ reply->set_logprobs (logprobs_json.dump ());
24562462 }
24572463
24582464 // Populate chat deltas from the autoparser's final parsed message
@@ -2468,7 +2474,20 @@ class BackendServiceImpl final : public backend::Backend::Service {
24682474 for (auto & res : all_results.results ) {
24692475 GGML_ASSERT (dynamic_cast <server_task_result_cmpl_final*>(res.get ()) != nullptr );
24702476 json res_json = res->to_json ();
2471- arr.push_back (res_json.value (" content" , " " ));
2477+ // Handle both native and OAI chat formats
2478+ std::string result_content;
2479+ if (res_json.contains (" choices" )) {
2480+ const auto & choices = res_json.at (" choices" );
2481+ if (!choices.empty ()) {
2482+ const auto & msg = choices[0 ].value (" message" , json::object ());
2483+ if (msg.contains (" content" ) && !msg.at (" content" ).is_null ()) {
2484+ result_content = msg.at (" content" ).get <std::string>();
2485+ }
2486+ }
2487+ } else {
2488+ result_content = res_json.value (" content" , " " );
2489+ }
2490+ arr.push_back (result_content);
24722491
24732492 // Extract logprobs for each result
24742493 json logprobs_json = extract_logprobs_from_json (res_json);
0 commit comments