@@ -1616,8 +1616,11 @@ class BackendServiceImpl final : public backend::Backend::Service {
16161616 data);
16171617 task.id_slot = json_value (data, " id_slot" , -1 );
16181618
1619- // OAI-compat
1620- task.params .res_type = TASK_RESPONSE_TYPE_NONE;
1619+ // OAI-compat: enable autoparser (PEG-based chat parsing) so that
1620+ // reasoning, tool calls, and content are classified into ChatDeltas.
1621+ // Without this, the PEG parser never produces diffs and the Go side
1622+ // cannot detect tool calls or separate reasoning from content.
1623+ task.params .res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
16211624 task.params .oaicompat_cmpl_id = completion_id;
16221625 // oaicompat_model is already populated by params_from_json_cmpl
16231626
@@ -1642,10 +1645,27 @@ class BackendServiceImpl final : public backend::Backend::Service {
16421645 return grpc::Status (grpc::StatusCode::INTERNAL, error_json.value (" message" , " Error occurred" ));
16431646 }
16441647
1645- // Lambda to build a Reply from JSON + attach chat deltas from a result
1648+ // Lambda to build a Reply from JSON + attach chat deltas from a result.
1649+ // Handles both native format ({"content": "..."}) and OAI chat format
1650+ // ({"choices": [{"delta": {"content": "...", "reasoning": "..."}}]}).
16461651 auto build_reply_from_json = [](const json & res_json, server_task_result * raw_result) -> backend::Reply {
16471652 backend::Reply reply;
1648- std::string completion_text = res_json.value (" content" , " " );
1653+ std::string completion_text;
1654+
1655+ if (res_json.contains (" choices" )) {
1656+ // OAI chat format — extract content from choices[0].delta
1657+ const auto & choices = res_json.at (" choices" );
1658+ if (!choices.empty ()) {
1659+ const auto & delta = choices[0 ].value (" delta" , json::object ());
1660+ if (delta.contains (" content" ) && !delta.at (" content" ).is_null ()) {
1661+ completion_text = delta.at (" content" ).get <std::string>();
1662+ }
1663+ }
1664+ } else {
1665+ // Native llama.cpp format
1666+ completion_text = res_json.value (" content" , " " );
1667+ }
1668+
16491669 reply.set_message (completion_text);
16501670 reply.set_tokens (res_json.value (" tokens_predicted" , 0 ));
16511671 reply.set_prompt_tokens (res_json.value (" tokens_evaluated" , 0 ));
@@ -1663,21 +1683,17 @@ class BackendServiceImpl final : public backend::Backend::Service {
16631683 return reply;
16641684 };
16651685
1686+ // Attach chat deltas from the autoparser to a Reply.
1687+ // When diffs are available, populate ChatDeltas on the reply.
1688+ // The raw message is always preserved so the Go side can use it
1689+ // for reasoning extraction and tool call parsing as a fallback
1690+ // (important in distributed mode where ChatDeltas may not be
1691+ // the primary parsing path).
16661692 auto attach_chat_deltas = [](backend::Reply & reply, server_task_result * raw_result) {
16671693 // Try streaming partial result first
16681694 auto * partial = dynamic_cast <server_task_result_cmpl_partial*>(raw_result);
1669- if (partial) {
1670- if (!partial->oaicompat_msg_diffs .empty ()) {
1671- populate_chat_deltas_from_diffs (reply, partial->oaicompat_msg_diffs );
1672- } else if (partial->is_updated ) {
1673- // Autoparser is active but hasn't classified this chunk yet
1674- // (PEG parser warming up). Clear the raw message so the Go
1675- // side doesn't try to parse partial tag tokens (e.g. "<|channel>"
1676- // before the full "<|channel>thought\n" is received).
1677- // This matches llama.cpp server behavior which only emits SSE
1678- // chunks when the parser produces diffs.
1679- reply.set_message (" " );
1680- }
1695+ if (partial && !partial->oaicompat_msg_diffs .empty ()) {
1696+ populate_chat_deltas_from_diffs (reply, partial->oaicompat_msg_diffs );
16811697 return ;
16821698 }
16831699 // Try final result
@@ -2357,8 +2373,11 @@ class BackendServiceImpl final : public backend::Backend::Service {
23572373 data);
23582374 task.id_slot = json_value (data, " id_slot" , -1 );
23592375
2360- // OAI-compat
2361- task.params .res_type = TASK_RESPONSE_TYPE_NONE;
2376+ // OAI-compat: enable autoparser (PEG-based chat parsing) so that
2377+ // reasoning, tool calls, and content are classified into ChatDeltas.
2378+ // Without this, the PEG parser never produces diffs and the Go side
2379+ // cannot detect tool calls or separate reasoning from content.
2380+ task.params .res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
23622381 task.params .oaicompat_cmpl_id = completion_id;
23632382 // oaicompat_model is already populated by params_from_json_cmpl
23642383