Skip to content

Commit 8c43fd6

Browse files
committed
map also other fields
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
1 parent 230cfd9 commit 8c43fd6

File tree

2 files changed

+47
-36
lines changed

2 files changed

+47
-36
lines changed

backend/cpp/llama-cpp/grpc-server.cpp

Lines changed: 47 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1667,14 +1667,25 @@ class BackendServiceImpl final : public backend::Backend::Service {
16671667
}
16681668

16691669
reply.set_message(completion_text);
1670-
reply.set_tokens(res_json.value("tokens_predicted", 0));
1671-
reply.set_prompt_tokens(res_json.value("tokens_evaluated", 0));
16721670

1671+
// Token counts: native format has top-level fields,
1672+
// OAI format has them in "usage" (final chunk only)
1673+
if (res_json.contains("usage")) {
1674+
const auto & usage = res_json.at("usage");
1675+
reply.set_tokens(usage.value("completion_tokens", 0));
1676+
reply.set_prompt_tokens(usage.value("prompt_tokens", 0));
1677+
} else {
1678+
reply.set_tokens(res_json.value("tokens_predicted", 0));
1679+
reply.set_prompt_tokens(res_json.value("tokens_evaluated", 0));
1680+
}
1681+
1682+
// Timings: present as top-level "timings" in both formats
16731683
if (res_json.contains("timings")) {
16741684
reply.set_timing_prompt_processing(res_json.at("timings").value("prompt_ms", 0.0));
16751685
reply.set_timing_token_generation(res_json.at("timings").value("predicted_ms", 0.0));
16761686
}
16771687

1688+
// Logprobs: extract_logprobs_from_json handles both formats
16781689
json logprobs_json = extract_logprobs_from_json(res_json);
16791690
if (!logprobs_json.empty() && !logprobs_json.is_null()) {
16801691
reply.set_logprobs(logprobs_json.dump());
@@ -2373,10 +2384,6 @@ class BackendServiceImpl final : public backend::Backend::Service {
23732384
data);
23742385
task.id_slot = json_value(data, "id_slot", -1);
23752386

2376-
// OAI-compat: enable autoparser (PEG-based chat parsing) so that
2377-
// reasoning, tool calls, and content are classified into ChatDeltas.
2378-
// Without this, the PEG parser never produces diffs and the Go side
2379-
// cannot detect tool calls or separate reasoning from content.
23802387
// OAI-compat: enable autoparser (PEG-based chat parsing) so that
23812388
// reasoning, tool calls, and content are classified into ChatDeltas.
23822389
task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
@@ -2411,48 +2418,47 @@ class BackendServiceImpl final : public backend::Backend::Service {
24112418
GGML_ASSERT(final_res != nullptr);
24122419
json result_json = all_results.results[0]->to_json();
24132420

2414-
// Handle both native format ({"content": "..."}) and OAI chat
2415-
// format ({"choices": [{"message": {"content": "..."}}]}).
2421+
// Handle both native format ({"content": "...", "tokens_predicted": N})
2422+
// and OAI chat format ({"choices": [{"message": {"content": "..."}}],
2423+
// "usage": {"completion_tokens": N, "prompt_tokens": N}}).
24162424
std::string completion_text;
2425+
int32_t tokens_predicted = 0;
2426+
int32_t tokens_evaluated = 0;
2427+
24172428
if (result_json.contains("choices")) {
2429+
// OAI chat format
24182430
const auto & choices = result_json.at("choices");
24192431
if (!choices.empty()) {
24202432
const auto & msg = choices[0].value("message", json::object());
24212433
if (msg.contains("content") && !msg.at("content").is_null()) {
24222434
completion_text = msg.at("content").get<std::string>();
24232435
}
24242436
}
2437+
if (result_json.contains("usage")) {
2438+
const auto & usage = result_json.at("usage");
2439+
tokens_predicted = usage.value("completion_tokens", 0);
2440+
tokens_evaluated = usage.value("prompt_tokens", 0);
2441+
}
24252442
} else {
2443+
// Native llama.cpp format
24262444
completion_text = result_json.value("content", "");
2445+
tokens_predicted = result_json.value("tokens_predicted", 0);
2446+
tokens_evaluated = result_json.value("tokens_evaluated", 0);
24272447
}
24282448
reply->set_message(completion_text);
2429-
2430-
int32_t tokens_predicted = result_json.value("tokens_predicted", 0);
24312449
reply->set_tokens(tokens_predicted);
2432-
int32_t tokens_evaluated = result_json.value("tokens_evaluated", 0);
24332450
reply->set_prompt_tokens(tokens_evaluated);
24342451

2452+
// Timings: present in both formats as a top-level "timings" object
24352453
if (result_json.contains("timings")) {
2436-
double timing_prompt_processing = result_json.at("timings").value("prompt_ms", 0.0);
2437-
reply->set_timing_prompt_processing(timing_prompt_processing);
2438-
double timing_token_generation = result_json.at("timings").value("predicted_ms", 0.0);
2439-
reply->set_timing_token_generation(timing_token_generation);
2440-
} else if (result_json.contains("usage")) {
2441-
// OAI chat format stores timings in usage
2442-
const auto & usage = result_json.at("usage");
2443-
if (usage.contains("prompt_ms")) {
2444-
reply->set_timing_prompt_processing(usage.value("prompt_ms", 0.0));
2445-
}
2446-
if (usage.contains("predicted_ms")) {
2447-
reply->set_timing_token_generation(usage.value("predicted_ms", 0.0));
2448-
}
2454+
reply->set_timing_prompt_processing(result_json.at("timings").value("prompt_ms", 0.0));
2455+
reply->set_timing_token_generation(result_json.at("timings").value("predicted_ms", 0.0));
24492456
}
24502457

2451-
// Extract and set logprobs if present
2458+
// Logprobs: extract_logprobs_from_json handles both formats
24522459
json logprobs_json = extract_logprobs_from_json(result_json);
24532460
if (!logprobs_json.empty() && !logprobs_json.is_null()) {
2454-
std::string logprobs_str = logprobs_json.dump();
2455-
reply->set_logprobs(logprobs_str);
2461+
reply->set_logprobs(logprobs_json.dump());
24562462
}
24572463

24582464
// Populate chat deltas from the autoparser's final parsed message
@@ -2468,7 +2474,20 @@ class BackendServiceImpl final : public backend::Backend::Service {
24682474
for (auto & res : all_results.results) {
24692475
GGML_ASSERT(dynamic_cast<server_task_result_cmpl_final*>(res.get()) != nullptr);
24702476
json res_json = res->to_json();
2471-
arr.push_back(res_json.value("content", ""));
2477+
// Handle both native and OAI chat formats
2478+
std::string result_content;
2479+
if (res_json.contains("choices")) {
2480+
const auto & choices = res_json.at("choices");
2481+
if (!choices.empty()) {
2482+
const auto & msg = choices[0].value("message", json::object());
2483+
if (msg.contains("content") && !msg.at("content").is_null()) {
2484+
result_content = msg.at("content").get<std::string>();
2485+
}
2486+
}
2487+
} else {
2488+
result_content = res_json.value("content", "");
2489+
}
2490+
arr.push_back(result_content);
24722491

24732492
// Extract logprobs for each result
24742493
json logprobs_json = extract_logprobs_from_json(res_json);

core/http/endpoints/openai/chat.go

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
134134
return err
135135
}
136136
processTools := func(noAction string, prompt string, req *schema.OpenAIRequest, config *config.ModelConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse, extraUsage bool, id string, created int, textContentToReturn *string) error {
137-
xlog.Warn("[StreamDebug] processTools ENTERED", "model", req.Model, "useTokenizerTemplate", config.TemplateConfig.UseTokenizerTemplate)
138137
// Detect if thinking token is already in prompt or template
139138
var template string
140139
if config.TemplateConfig.UseTokenizerTemplate {
@@ -159,17 +158,10 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
159158
for _, d := range usage.ChatDeltas {
160159
if len(d.ToolCalls) > 0 {
161160
hasChatDeltaToolCalls = true
162-
xlog.Debug("[StreamDebug] ChatDelta with tool calls detected", "tool_count", len(d.ToolCalls))
163161
}
164162
if d.Content != "" {
165163
hasChatDeltaContent = true
166164
}
167-
if d.ReasoningContent != "" {
168-
xlog.Debug("[StreamDebug] ChatDelta reasoning chunk", "len", len(d.ReasoningContent))
169-
}
170-
}
171-
if len(usage.ChatDeltas) == 0 {
172-
xlog.Warn("[StreamDebug] No ChatDeltas in chunk", "raw_len", len(s), "raw_empty", s == "")
173165
}
174166

175167
var reasoningDelta, contentDelta string

0 commit comments

Comments (0)