@@ -2377,6 +2377,8 @@ class BackendServiceImpl final : public backend::Backend::Service {
23772377 // reasoning, tool calls, and content are classified into ChatDeltas.
23782378 // Without this, the PEG parser never produces diffs and the Go side
23792379 // cannot detect tool calls or separate reasoning from content.
2380+ // (See note above: autoparser must be enabled before res_type is set.)
23802382 task.params .res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
23812383 task.params .oaicompat_cmpl_id = completion_id;
23822384 // oaicompat_model is already populated by params_from_json_cmpl
@@ -2408,7 +2410,22 @@ class BackendServiceImpl final : public backend::Backend::Service {
24082410 auto * final_res = dynamic_cast <server_task_result_cmpl_final*>(all_results.results [0 ].get ());
24092411 GGML_ASSERT (final_res != nullptr );
24102412 json result_json = all_results.results [0 ]->to_json ();
2411- reply->set_message (result_json.value (" content" , " " ));
2413+
2414+ // Handle both native format ({"content": "..."}) and OAI chat
2415+ // format ({"choices": [{"message": {"content": "..."}}]}).
2416+ std::string completion_text;
2417+ if (result_json.contains (" choices" )) {
2418+ const auto & choices = result_json.at (" choices" );
2419+ if (!choices.empty ()) {
2420+ const auto & msg = choices[0 ].value (" message" , json::object ());
2421+ if (msg.contains (" content" ) && !msg.at (" content" ).is_null ()) {
2422+ completion_text = msg.at (" content" ).get <std::string>();
2423+ }
2424+ }
2425+ } else {
2426+ completion_text = result_json.value (" content" , " " );
2427+ }
2428+ reply->set_message (completion_text);
24122429
24132430 int32_t tokens_predicted = result_json.value (" tokens_predicted" , 0 );
24142431 reply->set_tokens (tokens_predicted);
@@ -2420,6 +2437,15 @@ class BackendServiceImpl final : public backend::Backend::Service {
24202437 reply->set_timing_prompt_processing (timing_prompt_processing);
24212438 double timing_token_generation = result_json.at (" timings" ).value (" predicted_ms" , 0.0 );
24222439 reply->set_timing_token_generation (timing_token_generation);
2440+ } else if (result_json.contains (" usage" )) {
2441+ // OAI chat format stores timings in usage
2442+ const auto & usage = result_json.at (" usage" );
2443+ if (usage.contains (" prompt_ms" )) {
2444+ reply->set_timing_prompt_processing (usage.value (" prompt_ms" , 0.0 ));
2445+ }
2446+ if (usage.contains (" predicted_ms" )) {
2447+ reply->set_timing_token_generation (usage.value (" predicted_ms" , 0.0 ));
2448+ }
24232449 }
24242450
24252451 // Extract and set logprobs if present