
Commit 3f0f799

fix: use oai compat for llama.cpp
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
1 parent cbb315c

2 files changed: 45 additions & 18 deletions

backend/cpp/llama-cpp/grpc-server.cpp

Lines changed: 37 additions & 18 deletions
@@ -1616,8 +1616,11 @@ class BackendServiceImpl final : public backend::Backend::Service {
             data);
         task.id_slot = json_value(data, "id_slot", -1);

-        // OAI-compat
-        task.params.res_type = TASK_RESPONSE_TYPE_NONE;
+        // OAI-compat: enable autoparser (PEG-based chat parsing) so that
+        // reasoning, tool calls, and content are classified into ChatDeltas.
+        // Without this, the PEG parser never produces diffs and the Go side
+        // cannot detect tool calls or separate reasoning from content.
+        task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
         task.params.oaicompat_cmpl_id = completion_id;
         // oaicompat_model is already populated by params_from_json_cmpl
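(For reference: with TASK_RESPONSE_TYPE_OAI_CHAT the task results carry OpenAI-style chunk JSON, {"choices": [{"delta": ...}]}, rather than the native {"content": ...} shape, which is why the reply builder in the next hunk learns to handle both.)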

@@ -1642,10 +1645,27 @@ class BackendServiceImpl final : public backend::Backend::Service {
             return grpc::Status(grpc::StatusCode::INTERNAL, error_json.value("message", "Error occurred"));
         }

-        // Lambda to build a Reply from JSON + attach chat deltas from a result
+        // Lambda to build a Reply from JSON + attach chat deltas from a result.
+        // Handles both native format ({"content": "..."}) and OAI chat format
+        // ({"choices": [{"delta": {"content": "...", "reasoning": "..."}}]}).
         auto build_reply_from_json = [](const json & res_json, server_task_result * raw_result) -> backend::Reply {
             backend::Reply reply;
-            std::string completion_text = res_json.value("content", "");
+            std::string completion_text;
+
+            if (res_json.contains("choices")) {
+                // OAI chat format — extract content from choices[0].delta
+                const auto & choices = res_json.at("choices");
+                if (!choices.empty()) {
+                    const auto & delta = choices[0].value("delta", json::object());
+                    if (delta.contains("content") && !delta.at("content").is_null()) {
+                        completion_text = delta.at("content").get<std::string>();
+                    }
+                }
+            } else {
+                // Native llama.cpp format
+                completion_text = res_json.value("content", "");
+            }
+
             reply.set_message(completion_text);
             reply.set_tokens(res_json.value("tokens_predicted", 0));
             reply.set_prompt_tokens(res_json.value("tokens_evaluated", 0));
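
For illustration, here is a self-contained sketch of the dual-format extraction that build_reply_from_json now performs. It assumes nlohmann::json (the API style used in the diff above), and the sample payloads are hand-written stand-ins, not captured server output:

#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Mirrors the extraction in build_reply_from_json: prefer the OAI chat
// shape (choices[0].delta.content), otherwise fall back to the native
// llama.cpp shape ({"content": ...}).
static std::string extract_content(const json & res_json) {
    if (res_json.contains("choices")) {
        const auto & choices = res_json.at("choices");
        if (!choices.empty()) {
            const auto delta = choices[0].value("delta", json::object());
            if (delta.contains("content") && !delta.at("content").is_null()) {
                return delta.at("content").get<std::string>();
            }
        }
        return "";
    }
    return res_json.value("content", "");
}

int main() {
    // Hand-written sample payloads (not captured server output).
    const json oai = json::parse(
        R"({"choices":[{"delta":{"content":"Hello","reasoning":"..."}}]})");
    const json native = json::parse(R"({"content":"Hello","tokens_predicted":1})");

    std::cout << extract_content(oai) << "\n";    // prints: Hello
    std::cout << extract_content(native) << "\n"; // prints: Hello
}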
@@ -1663,21 +1683,17 @@ class BackendServiceImpl final : public backend::Backend::Service {
             return reply;
         };

+        // Attach chat deltas from the autoparser to a Reply.
+        // When diffs are available, populate ChatDeltas on the reply.
+        // The raw message is always preserved so the Go side can use it
+        // for reasoning extraction and tool call parsing as a fallback
+        // (important in distributed mode where ChatDeltas may not be
+        // the primary parsing path).
         auto attach_chat_deltas = [](backend::Reply & reply, server_task_result * raw_result) {
             // Try streaming partial result first
             auto* partial = dynamic_cast<server_task_result_cmpl_partial*>(raw_result);
-            if (partial) {
-                if (!partial->oaicompat_msg_diffs.empty()) {
-                    populate_chat_deltas_from_diffs(reply, partial->oaicompat_msg_diffs);
-                } else if (partial->is_updated) {
-                    // Autoparser is active but hasn't classified this chunk yet
-                    // (PEG parser warming up). Clear the raw message so the Go
-                    // side doesn't try to parse partial tag tokens (e.g. "<|channel>"
-                    // before the full "<|channel>thought\n" is received).
-                    // This matches llama.cpp server behavior which only emits SSE
-                    // chunks when the parser produces diffs.
-                    reply.set_message("");
-                }
+            if (partial && !partial->oaicompat_msg_diffs.empty()) {
+                populate_chat_deltas_from_diffs(reply, partial->oaicompat_msg_diffs);
                 return;
             }
             // Try final result
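
The simplification above also changes behavior: a warming-up PEG parser no longer blanks the raw message, so raw text now always travels alongside any ChatDeltas. A rough sketch of the consumer-side contract this implies, using mock types since the real generated backend.Reply accessors are not shown in this diff:

#include <string>
#include <vector>

// Mocks of the generated message types, for sketching only; the real
// backend.pb.h accessors may differ.
struct ChatDelta { std::string reasoning, content; };
struct Reply {
    std::vector<ChatDelta> chat_deltas;
    std::string message;
};

// The consumer-side contract implied by attach_chat_deltas: prefer the
// structured deltas, fall back to the preserved raw message.
void consume_reply(const Reply & reply) {
    if (!reply.chat_deltas.empty()) {
        for (const auto & d : reply.chat_deltas) {
            // ... forward reasoning / content / tool-call deltas ...
        }
    } else if (!reply.message.empty()) {
        // Raw text is always preserved, so the consumer can still run its
        // own reasoning extraction and tool-call parsing (the
        // distributed-mode fallback mentioned in the comment above).
    }
}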
@@ -2357,8 +2373,11 @@ class BackendServiceImpl final : public backend::Backend::Service {
             data);
         task.id_slot = json_value(data, "id_slot", -1);

-        // OAI-compat
-        task.params.res_type = TASK_RESPONSE_TYPE_NONE;
+        // OAI-compat: enable autoparser (PEG-based chat parsing) so that
+        // reasoning, tool calls, and content are classified into ChatDeltas.
+        // Without this, the PEG parser never produces diffs and the Go side
+        // cannot detect tool calls or separate reasoning from content.
+        task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
         task.params.oaicompat_cmpl_id = completion_id;
         // oaicompat_model is already populated by params_from_json_cmpl

core/http/endpoints/openai/chat.go

Lines changed: 8 additions & 0 deletions
@@ -134,6 +134,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 		return err
 	}
 	processTools := func(noAction string, prompt string, req *schema.OpenAIRequest, config *config.ModelConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse, extraUsage bool, id string, created int, textContentToReturn *string) error {
+		xlog.Warn("[StreamDebug] processTools ENTERED", "model", req.Model, "useTokenizerTemplate", config.TemplateConfig.UseTokenizerTemplate)
 		// Detect if thinking token is already in prompt or template
 		var template string
 		if config.TemplateConfig.UseTokenizerTemplate {
@@ -158,10 +159,17 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 		for _, d := range usage.ChatDeltas {
 			if len(d.ToolCalls) > 0 {
 				hasChatDeltaToolCalls = true
+				xlog.Debug("[StreamDebug] ChatDelta with tool calls detected", "tool_count", len(d.ToolCalls))
 			}
 			if d.Content != "" {
 				hasChatDeltaContent = true
 			}
+			if d.ReasoningContent != "" {
+				xlog.Debug("[StreamDebug] ChatDelta reasoning chunk", "len", len(d.ReasoningContent))
+			}
+		}
+		if len(usage.ChatDeltas) == 0 {
+			xlog.Warn("[StreamDebug] No ChatDeltas in chunk", "raw_len", len(s), "raw_empty", s == "")
 		}

 		var reasoningDelta, contentDelta string
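
Taken together, the [StreamDebug] lines distinguish the three streaming states the C++ change can produce: ChatDeltas carrying tool calls, deltas carrying only reasoning, and chunks with no ChatDeltas at all, where the raw string s is the only thing left to parse.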
