feat: support token usage extraction for llama.cpp (#7358)

Soulter · web-flow · commit dc9c17c1956b · 2026-04-04T23:49:18.000+08:00
* feat: support token usage extraction for llama.cpp

* chore: ruff format
diff --git a/astrbot/core/provider/sources/openai_source.py b/astrbot/core/provider/sources/openai_source.py
@@ -532,19 +532,18 @@ async def _query_stream(
             **payloads,
             stream=True,
             extra_body=extra_body,
+            stream_options={"include_usage": True},
         )
 
         llm_response = LLMResponse("assistant", is_chunk=True)
 
         state = ChatCompletionStreamState()
 
         async for chunk in stream:
-            if not chunk.choices:
-                continue
-            choice = chunk.choices[0]
-            delta = choice.delta
+            choice = chunk.choices[0] if chunk.choices else None
+            delta = choice.delta if choice else None
 
-            if dtcs := delta.tool_calls:
+            if delta and (dtcs := delta.tool_calls):
                 for idx, tc in enumerate(dtcs):
                     # siliconflow workaround
                     if tc.function and tc.function.arguments:
@@ -574,7 +573,7 @@ async def _query_stream(
                 _y = True
             if chunk.usage:
                 llm_response.usage = self._extract_usage(chunk.usage)
-            elif choice_usage := getattr(choice, "usage", None):
+            elif choice and (choice_usage := getattr(choice, "usage", None)):
                 # Workaround for some providers that only return usage in choices[].usage, e.g. MoonshotAI
                 # See https://github.com/AstrBotDevs/AstrBot/issues/6614
                 llm_response.usage = self._extract_usage(choice_usage)
diff --git a/tests/test_openai_source.py b/tests/test_openai_source.py
@@ -2,6 +2,7 @@
 
 import pytest
 from openai.types.chat.chat_completion import ChatCompletion
+from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
 from PIL import Image as PILImage
 
 from astrbot.core.exceptions import EmptyModelOutputError
@@ -1175,6 +1176,93 @@ async def test_parse_openai_completion_raises_empty_model_output_error():
         await provider.terminate()
 
 
+@pytest.mark.asyncio
+async def test_query_stream_extracts_usage_from_empty_choices_chunk(monkeypatch):
+    provider = _make_provider()
+    try:
+        chunks = [  # fake stream that ends with a usage-only, empty-choices chunk
+            ChatCompletionChunk.model_validate(  # content delta: "ok"
+                {
+                    "id": "chatcmpl-stream",
+                    "object": "chat.completion.chunk",
+                    "created": 0,
+                    "model": "gpt-4o-mini",
+                    "choices": [
+                        {
+                            "index": 0,
+                            "delta": {
+                                "role": "assistant",
+                                "content": "ok",
+                            },
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+            ),
+            ChatCompletionChunk.model_validate(  # empty delta, finish_reason "stop"
+                {
+                    "id": "chatcmpl-stream",
+                    "object": "chat.completion.chunk",
+                    "created": 0,
+                    "model": "gpt-4o-mini",
+                    "choices": [
+                        {
+                            "index": 0,
+                            "delta": {},
+                            "finish_reason": "stop",
+                        }
+                    ],
+                }
+            ),
+            ChatCompletionChunk.model_validate(  # usage only, no choices
+                {
+                    "id": "chatcmpl-stream",
+                    "object": "chat.completion.chunk",
+                    "created": 0,
+                    "model": "gpt-4o-mini",
+                    "choices": [],  # the case under test: nothing to index into
+                    "usage": {
+                        "prompt_tokens": 2550,
+                        "completion_tokens": 125,
+                        "total_tokens": 2675,
+                        "prompt_tokens_details": {
+                            "cached_tokens": 2488,
+                        },
+                    },
+                }
+            ),
+        ]
+
+        async def fake_stream():  # async generator replaying the chunks in order
+            for chunk in chunks:
+                yield chunk
+
+        async def fake_create(**kwargs):  # stand-in for chat.completions.create
+            return fake_stream()
+
+        monkeypatch.setattr(provider.client.chat.completions, "create", fake_create)
+
+        responses = [  # drain everything _query_stream yields
+            response
+            async for response in provider._query_stream(
+                payloads={
+                    "model": "gpt-4o-mini",
+                    "messages": [{"role": "user", "content": "hello"}],
+                },
+                tools=None,
+            )
+        ]
+
+        final_response = responses[-1]  # assertions target the last yielded item
+        assert final_response.completion_text == "ok"
+        assert final_response.usage is not None
+        assert final_response.usage.input_other == 62  # 2550 prompt - 2488 cached
+        assert final_response.usage.input_cached == 2488  # from cached_tokens
+        assert final_response.usage.output == 125  # from completion_tokens
+    finally:
+        await provider.terminate()
+
+
 @pytest.mark.asyncio
 async def test_query_filters_empty_assistant_message_without_tool_calls(monkeypatch):
     """Test that empty assistant messages without tool_calls are filtered out."""