map also other fields

mudler · mudler · commit 8c43fd6eb321 · 2026-04-06T07:12:01.000Z
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -1667,14 +1667,25 @@ class BackendServiceImpl final : public backend::Backend::Service {
             }
 
             reply.set_message(completion_text);
-            reply.set_tokens(res_json.value("tokens_predicted", 0));
-            reply.set_prompt_tokens(res_json.value("tokens_evaluated", 0));
 
+            // Token counts: native format has top-level fields,
+            // OAI format has them in "usage" (final chunk only)
+            if (res_json.contains("usage")) {
+                const auto & usage = res_json.at("usage");
+                reply.set_tokens(usage.value("completion_tokens", 0));
+                reply.set_prompt_tokens(usage.value("prompt_tokens", 0));
+            } else {
+                reply.set_tokens(res_json.value("tokens_predicted", 0));
+                reply.set_prompt_tokens(res_json.value("tokens_evaluated", 0));
+            }
+
+            // Timings: present as top-level "timings" in both formats
             if (res_json.contains("timings")) {
                 reply.set_timing_prompt_processing(res_json.at("timings").value("prompt_ms", 0.0));
                 reply.set_timing_token_generation(res_json.at("timings").value("predicted_ms", 0.0));
             }
 
+            // Logprobs: extract_logprobs_from_json handles both formats
             json logprobs_json = extract_logprobs_from_json(res_json);
             if (!logprobs_json.empty() && !logprobs_json.is_null()) {
                 reply.set_logprobs(logprobs_json.dump());
@@ -2373,10 +2384,6 @@ class BackendServiceImpl final : public backend::Backend::Service {
                         data);
                 task.id_slot = json_value(data, "id_slot", -1);
 
-                // OAI-compat: enable autoparser (PEG-based chat parsing) so that
-                // reasoning, tool calls, and content are classified into ChatDeltas.
-                // Without this, the PEG parser never produces diffs and the Go side
-                // cannot detect tool calls or separate reasoning from content.
                 // OAI-compat: enable autoparser (PEG-based chat parsing) so that
                 // reasoning, tool calls, and content are classified into ChatDeltas.
                 task.params.res_type                 = TASK_RESPONSE_TYPE_OAI_CHAT;
@@ -2411,48 +2418,47 @@ class BackendServiceImpl final : public backend::Backend::Service {
                 GGML_ASSERT(final_res != nullptr);
                 json result_json = all_results.results[0]->to_json();
 
-                // Handle both native format ({"content": "..."}) and OAI chat
-                // format ({"choices": [{"message": {"content": "..."}}]}).
+                // Handle both native format ({"content": "...", "tokens_predicted": N})
+                // and OAI chat format ({"choices": [{"message": {"content": "..."}}],
+                // "usage": {"completion_tokens": N, "prompt_tokens": N}}).
                 std::string completion_text;
+                int32_t tokens_predicted = 0;
+                int32_t tokens_evaluated = 0;
+
                 if (result_json.contains("choices")) {
+                    // OAI chat format
                     const auto & choices = result_json.at("choices");
                     if (!choices.empty()) {
                         const auto & msg = choices[0].value("message", json::object());
                         if (msg.contains("content") && !msg.at("content").is_null()) {
                             completion_text = msg.at("content").get<std::string>();
                         }
                     }
+                    if (result_json.contains("usage")) {
+                        const auto & usage = result_json.at("usage");
+                        tokens_predicted = usage.value("completion_tokens", 0);
+                        tokens_evaluated = usage.value("prompt_tokens", 0);
+                    }
                 } else {
+                    // Native llama.cpp format
                     completion_text = result_json.value("content", "");
+                    tokens_predicted = result_json.value("tokens_predicted", 0);
+                    tokens_evaluated = result_json.value("tokens_evaluated", 0);
                 }
                 reply->set_message(completion_text);
-
-                int32_t tokens_predicted = result_json.value("tokens_predicted", 0);
                 reply->set_tokens(tokens_predicted);
-                int32_t tokens_evaluated = result_json.value("tokens_evaluated", 0);
                 reply->set_prompt_tokens(tokens_evaluated);
 
+                // Timings: present in both formats as a top-level "timings" object
                 if (result_json.contains("timings")) {
-                    double timing_prompt_processing = result_json.at("timings").value("prompt_ms", 0.0);
-                    reply->set_timing_prompt_processing(timing_prompt_processing);
-                    double timing_token_generation = result_json.at("timings").value("predicted_ms", 0.0);
-                    reply->set_timing_token_generation(timing_token_generation);
-                } else if (result_json.contains("usage")) {
-                    // OAI chat format stores timings in usage
-                    const auto & usage = result_json.at("usage");
-                    if (usage.contains("prompt_ms")) {
-                        reply->set_timing_prompt_processing(usage.value("prompt_ms", 0.0));
-                    }
-                    if (usage.contains("predicted_ms")) {
-                        reply->set_timing_token_generation(usage.value("predicted_ms", 0.0));
-                    }
+                    reply->set_timing_prompt_processing(result_json.at("timings").value("prompt_ms", 0.0));
+                    reply->set_timing_token_generation(result_json.at("timings").value("predicted_ms", 0.0));
                 }
 
-                // Extract and set logprobs if present
+                // Logprobs: extract_logprobs_from_json handles both formats
                 json logprobs_json = extract_logprobs_from_json(result_json);
                 if (!logprobs_json.empty() && !logprobs_json.is_null()) {
-                    std::string logprobs_str = logprobs_json.dump();
-                    reply->set_logprobs(logprobs_str);
+                    reply->set_logprobs(logprobs_json.dump());
                 }
 
                 // Populate chat deltas from the autoparser's final parsed message
@@ -2468,7 +2474,20 @@ class BackendServiceImpl final : public backend::Backend::Service {
                 for (auto & res : all_results.results) {
                     GGML_ASSERT(dynamic_cast<server_task_result_cmpl_final*>(res.get()) != nullptr);
                     json res_json = res->to_json();
-                    arr.push_back(res_json.value("content", ""));
+                    // Handle both native and OAI chat formats
+                    std::string result_content;
+                    if (res_json.contains("choices")) {
+                        const auto & choices = res_json.at("choices");
+                        if (!choices.empty()) {
+                            const auto & msg = choices[0].value("message", json::object());
+                            if (msg.contains("content") && !msg.at("content").is_null()) {
+                                result_content = msg.at("content").get<std::string>();
+                            }
+                        }
+                    } else {
+                        result_content = res_json.value("content", "");
+                    }
+                    arr.push_back(result_content);
 
                     // Extract logprobs for each result
                     json logprobs_json = extract_logprobs_from_json(res_json);
diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
@@ -134,7 +134,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 		return err
 	}
 	processTools := func(noAction string, prompt string, req *schema.OpenAIRequest, config *config.ModelConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse, extraUsage bool, id string, created int, textContentToReturn *string) error {
-		xlog.Warn("[StreamDebug] processTools ENTERED", "model", req.Model, "useTokenizerTemplate", config.TemplateConfig.UseTokenizerTemplate)
 		// Detect if thinking token is already in prompt or template
 		var template string
 		if config.TemplateConfig.UseTokenizerTemplate {
@@ -159,17 +158,10 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 			for _, d := range usage.ChatDeltas {
 				if len(d.ToolCalls) > 0 {
 					hasChatDeltaToolCalls = true
-					xlog.Debug("[StreamDebug] ChatDelta with tool calls detected", "tool_count", len(d.ToolCalls))
 				}
 				if d.Content != "" {
 					hasChatDeltaContent = true
 				}
-				if d.ReasoningContent != "" {
-					xlog.Debug("[StreamDebug] ChatDelta reasoning chunk", "len", len(d.ReasoningContent))
-				}
-			}
-			if len(usage.ChatDeltas) == 0 {
-				xlog.Warn("[StreamDebug] No ChatDeltas in chunk", "raw_len", len(s), "raw_empty", s == "")
 			}
 
 			var reasoningDelta, contentDelta string

Original file line number	Diff line number	Diff line change
`@@ -134,7 +134,6 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator`
`134`	`134`	`return err`
`135`	`135`	`}`
`136`	`136`	`processTools := func(noAction string, prompt string, req *schema.OpenAIRequest, config *config.ModelConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse, extraUsage bool, id string, created int, textContentToReturn *string) error {`
`137`		`- xlog.Warn("[StreamDebug] processTools ENTERED", "model", req.Model, "useTokenizerTemplate", config.TemplateConfig.UseTokenizerTemplate)`
`138`	`137`	`// Detect if thinking token is already in prompt or template`
`139`	`138`	`var template string`
`140`	`139`	`if config.TemplateConfig.UseTokenizerTemplate {`
`@@ -159,17 +158,10 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator`
`159`	`158`	`for _, d := range usage.ChatDeltas {`
`160`	`159`	`if len(d.ToolCalls) > 0 {`
`161`	`160`	`hasChatDeltaToolCalls = true`
`162`		`- xlog.Debug("[StreamDebug] ChatDelta with tool calls detected", "tool_count", len(d.ToolCalls))`
`163`	`161`	`}`
`164`	`162`	`if d.Content != "" {`
`165`	`163`	`hasChatDeltaContent = true`
`166`	`164`	`}`
`167`		`- if d.ReasoningContent != "" {`
`168`		`- xlog.Debug("[StreamDebug] ChatDelta reasoning chunk", "len", len(d.ReasoningContent))`
`169`		`- }`
`170`		`- }`
`171`		`- if len(usage.ChatDeltas) == 0 {`
`172`		`- xlog.Warn("[StreamDebug] No ChatDeltas in chunk", "raw_len", len(s), "raw_empty", s == "")`
`173`	`165`	`}`
`174`	`166`
`175`	`167`	`var reasoningDelta, contentDelta string`