From bda0490b19510be8589173ea6916d70db4f374ac Mon Sep 17 00:00:00 2001 From: Mr-Dark-debug Date: Mon, 22 Jun 2026 12:59:08 +0200 Subject: [PATCH 1/3] fix(langchain): read chat token usage from response metadata --- .../instrumentation/langchain/span_utils.py | 65 ++++++++++++++----- .../tests/test_token_usage.py | 65 +++++++++++++++++++ 2 files changed, 115 insertions(+), 15 deletions(-) create mode 100644 packages/opentelemetry-instrumentation-langchain/tests/test_token_usage.py diff --git a/packages/opentelemetry-instrumentation-langchain/opentelemetry/instrumentation/langchain/span_utils.py b/packages/opentelemetry-instrumentation-langchain/opentelemetry/instrumentation/langchain/span_utils.py index 09d365daaf..0fbdfde953 100644 --- a/packages/opentelemetry-instrumentation-langchain/opentelemetry/instrumentation/langchain/span_utils.py +++ b/packages/opentelemetry-instrumentation-langchain/opentelemetry/instrumentation/langchain/span_utils.py @@ -86,6 +86,42 @@ def _set_span_attribute(span: Span, key: str, value: Any) -> None: span.set_attribute(key, "") +def _get_token_count(usage: dict[str, Any], *keys: str) -> int: + for key in keys: + value = usage.get(key) + if isinstance(value, (int, float)): + return int(value) + return 0 + + +def _extract_message_usage(message: BaseMessage) -> dict[str, Any] | None: + usage_metadata = getattr(message, "usage_metadata", None) + if usage_metadata is not None: + return usage_metadata + + response_metadata = getattr(message, "response_metadata", None) + if not isinstance(response_metadata, dict): + return None + + response_usage = response_metadata.get("usage") + if isinstance(response_usage, dict): + return response_usage + + if any( + key in response_metadata + for key in ( + "prompt_tokens", + "completion_tokens", + "total_tokens", + "input_tokens", + "output_tokens", + ) + ): + return response_metadata + + return None + + def _content_to_parts(content) -> list[dict]: """Convert LangChain message content (str or list-of-blocks) into OTel parts.""" if isinstance(content, str): @@ -405,25 +441,24 @@ def set_chat_response_usage( for generation in generations: if ( hasattr(generation, "message") - and hasattr(generation.message, "usage_metadata") - and generation.message.usage_metadata is not None + and (usage := _extract_message_usage(generation.message)) is not None ): - input_tokens += ( - generation.message.usage_metadata.get("input_tokens") - or generation.message.usage_metadata.get("prompt_tokens") - or 0 + generation_input_tokens = _get_token_count( + usage, "input_tokens", "prompt_tokens", "input_token_count" + ) + generation_output_tokens = _get_token_count( + usage, "output_tokens", "completion_tokens", "generated_token_count" ) - output_tokens += ( - generation.message.usage_metadata.get("output_tokens") - or generation.message.usage_metadata.get("completion_tokens") - or 0 + generation_total_tokens = _get_token_count(usage, "total_tokens") + + input_tokens += generation_input_tokens + output_tokens += generation_output_tokens + total_tokens += generation_total_tokens or ( + generation_input_tokens + generation_output_tokens ) - total_tokens = input_tokens + output_tokens - if generation.message.usage_metadata.get("input_token_details"): - input_token_details = generation.message.usage_metadata.get( - "input_token_details", {} - ) + if usage.get("input_token_details"): + input_token_details = usage.get("input_token_details", {}) raw_cache_read = input_token_details.get("cache_read") if isinstance(raw_cache_read, (int, float)): cache_read_tokens = (cache_read_tokens or 0) + raw_cache_read diff --git a/packages/opentelemetry-instrumentation-langchain/tests/test_token_usage.py b/packages/opentelemetry-instrumentation-langchain/tests/test_token_usage.py new file mode 100644 index 0000000000..5c2867f414 --- /dev/null +++ b/packages/opentelemetry-instrumentation-langchain/tests/test_token_usage.py @@ -0,0 +1,65 @@ +import pytest +from unittest.mock import Mock + +from langchain_core.messages import AIMessage +from langchain_core.outputs import ChatGeneration, LLMResult +from opentelemetry.instrumentation.langchain.span_utils import set_chat_response_usage +from opentelemetry.semconv._incubating.attributes import gen_ai_attributes as GenAIAttributes +from opentelemetry.semconv_ai import SpanAttributes + + +def _mock_span(): + span = Mock() + span.is_recording.return_value = True + span.attributes = {} + + def set_attribute(key, value): + span.attributes[key] = value + + span.set_attribute = set_attribute + return span + + +@pytest.mark.parametrize( + "response_metadata", + [ + { + "usage": { + "prompt_tokens": 10, + "completion_tokens": 16, + "total_tokens": 26, + } + }, + { + "prompt_tokens": 10, + "completion_tokens": 16, + "total_tokens": 26, + }, + ], +) +def test_chat_response_usage_reads_databricks_response_metadata(response_metadata): + span = _mock_span() + response = LLMResult( + generations=[ + [ + ChatGeneration( + message=AIMessage( + content="Hello!", + response_metadata=response_metadata, + ) + ) + ] + ] + ) + + set_chat_response_usage( + span, + response, + token_histogram=Mock(), + record_token_usage=False, + model_name="databricks-claude-sonnet", + ) + + assert span.attributes[GenAIAttributes.GEN_AI_USAGE_INPUT_TOKENS] == 10 + assert span.attributes[GenAIAttributes.GEN_AI_USAGE_OUTPUT_TOKENS] == 16 + assert span.attributes[SpanAttributes.GEN_AI_USAGE_TOTAL_TOKENS] == 26 From 40e3c11c0bd092256e17ea2b1727b1eb1535d2b4 Mon Sep 17 00:00:00 2001 From: Mr-Dark-debug Date: Mon, 22 Jun 2026 13:10:29 +0200 Subject: [PATCH 2/3] fix(llamaindex): read VertexAI token usage metadata --- .../llamaindex/_response_utils.py | 40 ++++++++++++++++--- .../tests/test_response_utils.py | 30 ++++++++++++++ 2 files changed, 64 insertions(+), 6 deletions(-) diff --git a/packages/opentelemetry-instrumentation-llamaindex/opentelemetry/instrumentation/llamaindex/_response_utils.py b/packages/opentelemetry-instrumentation-llamaindex/opentelemetry/instrumentation/llamaindex/_response_utils.py index c39ccb0212..04e90c4774 100644 --- a/packages/opentelemetry-instrumentation-llamaindex/opentelemetry/instrumentation/llamaindex/_response_utils.py +++ b/packages/opentelemetry-instrumentation-llamaindex/opentelemetry/instrumentation/llamaindex/_response_utils.py @@ -65,7 +65,7 @@ def extract_response_id(raw: Any) -> Optional[str]: def extract_token_usage(raw: Any) -> TokenUsage: - """Extract token usage from raw response. Handles OpenAI, Anthropic, Cohere, and dict formats.""" + """Extract token usage from raw response. Handles OpenAI, Anthropic, Cohere, VertexAI, and dict formats.""" usage = _get_nested(raw, "usage") if usage: result = _extract_openai_usage(usage) @@ -75,6 +75,12 @@ def extract_token_usage(raw: Any) -> TokenUsage: if result.input_tokens is not None: return result + usage_metadata = _get_nested(raw, "usage_metadata") or _get_nested(raw, "usageMetadata") + if usage_metadata: + result = _extract_google_usage_metadata(usage_metadata) + if result.input_tokens is not None: + return result + meta = _get_nested(raw, "meta") if meta: return _extract_cohere_usage(meta) @@ -146,6 +152,25 @@ def _extract_anthropic_usage(usage: Any) -> TokenUsage: return TokenUsage() +def _extract_google_usage_metadata(usage_metadata: Any) -> TokenUsage: + """Extract tokens from Google Gemini / VertexAI usage_metadata.""" + input_tokens = _get_int( + usage_metadata, "prompt_token_count", "promptTokenCount" + ) + output_tokens = _get_int( + usage_metadata, "candidates_token_count", "candidatesTokenCount" + ) + total_tokens = _get_int( + usage_metadata, "total_token_count", "totalTokenCount" + ) + + return TokenUsage( + input_tokens=input_tokens, + output_tokens=output_tokens, + total_tokens=total_tokens or _safe_sum(input_tokens, output_tokens), + ) + + def _extract_cohere_usage(meta: Any) -> TokenUsage: """Extract tokens from Cohere-style meta.tokens or meta.billed_units.""" tokens = _get_nested(meta, "tokens") @@ -165,12 +190,15 @@ def _extract_cohere_usage(meta: Any) -> TokenUsage: return TokenUsage() -def _get_int(obj: Any, key: str) -> Optional[int]: +def _get_int(obj: Any, *keys: str) -> Optional[int]: """Get an integer attribute or dict key from obj.""" - val = getattr(obj, key, None) - if val is None and isinstance(obj, dict): - val = obj.get(key) - return int(val) if val is not None else None + for key in keys: + val = getattr(obj, key, None) + if val is None and isinstance(obj, dict): + val = obj.get(key) + if val is not None: + return int(val) + return None def _safe_sum(a: Optional[int], b: Optional[int]) -> Optional[int]: diff --git a/packages/opentelemetry-instrumentation-llamaindex/tests/test_response_utils.py b/packages/opentelemetry-instrumentation-llamaindex/tests/test_response_utils.py index b0e113c3d7..6b5d8e57ef 100644 --- a/packages/opentelemetry-instrumentation-llamaindex/tests/test_response_utils.py +++ b/packages/opentelemetry-instrumentation-llamaindex/tests/test_response_utils.py @@ -124,6 +124,36 @@ def test_openai_format_dict(self): result = extract_token_usage(raw) assert result == TokenUsage(input_tokens=10, output_tokens=20, total_tokens=30) + @pytest.mark.parametrize( + "raw", + [ + { + "usage_metadata": { + "prompt_token_count": 10, + "candidates_token_count": 20, + "total_token_count": 30, + } + }, + SimpleNamespace( + usage_metadata=SimpleNamespace( + prompt_token_count=10, + candidates_token_count=20, + total_token_count=30, + ) + ), + { + "usageMetadata": { + "promptTokenCount": 10, + "candidatesTokenCount": 20, + "totalTokenCount": 30, + } + }, + ], + ) + def test_google_vertexai_usage_metadata(self, raw): + result = extract_token_usage(raw) + assert result == TokenUsage(input_tokens=10, output_tokens=20, total_tokens=30) + def test_cohere_meta_tokens_format(self): raw = SimpleNamespace( meta=SimpleNamespace(tokens=SimpleNamespace(input_tokens=5, output_tokens=15)) From 83b66e546c010848f5aab2d67e24739b3537ccc6 Mon Sep 17 00:00:00 2001 From: Mr-Dark-debug Date: Mon, 22 Jun 2026 13:30:00 +0200 Subject: [PATCH 3/3] fix(instrumentation): address token usage review feedback --- .../instrumentation/langchain/span_utils.py | 4 +- .../tests/test_token_usage.py | 13 ++ .../llamaindex/_response_utils.py | 6 +- .../tests/test_response_utils.py | 116 ++++++++++++++---- 4 files changed, 114 insertions(+), 25 deletions(-) diff --git a/packages/opentelemetry-instrumentation-langchain/opentelemetry/instrumentation/langchain/span_utils.py b/packages/opentelemetry-instrumentation-langchain/opentelemetry/instrumentation/langchain/span_utils.py index 0fbdfde953..3d635dff53 100644 --- a/packages/opentelemetry-instrumentation-langchain/opentelemetry/instrumentation/langchain/span_utils.py +++ b/packages/opentelemetry-instrumentation-langchain/opentelemetry/instrumentation/langchain/span_utils.py @@ -449,7 +449,9 @@ def set_chat_response_usage( generation_output_tokens = _get_token_count( usage, "output_tokens", "completion_tokens", "generated_token_count" ) - generation_total_tokens = _get_token_count(usage, "total_tokens") + generation_total_tokens = _get_token_count( + usage, "total_tokens", "total_token_count" + ) input_tokens += generation_input_tokens output_tokens += generation_output_tokens diff --git a/packages/opentelemetry-instrumentation-langchain/tests/test_token_usage.py b/packages/opentelemetry-instrumentation-langchain/tests/test_token_usage.py index 5c2867f414..f187d46dd6 100644 --- a/packages/opentelemetry-instrumentation-langchain/tests/test_token_usage.py +++ b/packages/opentelemetry-instrumentation-langchain/tests/test_token_usage.py @@ -35,6 +35,19 @@ def set_attribute(key, value): "completion_tokens": 16, "total_tokens": 26, }, + { + "usage": { + "prompt_tokens": 10, + "completion_tokens": 16, + } + }, + { + "usage": { + "prompt_tokens": 10, + "completion_tokens": 16, + "total_token_count": 26, + } + }, ], ) def test_chat_response_usage_reads_databricks_response_metadata(response_metadata): diff --git a/packages/opentelemetry-instrumentation-llamaindex/opentelemetry/instrumentation/llamaindex/_response_utils.py b/packages/opentelemetry-instrumentation-llamaindex/opentelemetry/instrumentation/llamaindex/_response_utils.py index 04e90c4774..ae868b4a19 100644 --- a/packages/opentelemetry-instrumentation-llamaindex/opentelemetry/instrumentation/llamaindex/_response_utils.py +++ b/packages/opentelemetry-instrumentation-llamaindex/opentelemetry/instrumentation/llamaindex/_response_utils.py @@ -167,7 +167,11 @@ def _extract_google_usage_metadata(usage_metadata: Any) -> TokenUsage: return TokenUsage( input_tokens=input_tokens, output_tokens=output_tokens, - total_tokens=total_tokens or _safe_sum(input_tokens, output_tokens), + total_tokens=( + total_tokens + if total_tokens is not None + else _safe_sum(input_tokens, output_tokens) + ), ) diff --git a/packages/opentelemetry-instrumentation-llamaindex/tests/test_response_utils.py b/packages/opentelemetry-instrumentation-llamaindex/tests/test_response_utils.py index 6b5d8e57ef..e95265ce8b 100644 --- a/packages/opentelemetry-instrumentation-llamaindex/tests/test_response_utils.py +++ b/packages/opentelemetry-instrumentation-llamaindex/tests/test_response_utils.py @@ -125,34 +125,104 @@ def test_openai_format_dict(self): assert result == TokenUsage(input_tokens=10, output_tokens=20, total_tokens=30) @pytest.mark.parametrize( - "raw", + "raw, expected_total_tokens", [ - { - "usage_metadata": { - "prompt_token_count": 10, - "candidates_token_count": 20, - "total_token_count": 30, - } - }, - SimpleNamespace( - usage_metadata=SimpleNamespace( - prompt_token_count=10, - candidates_token_count=20, - total_token_count=30, - ) + ( + { + "usage_metadata": { + "prompt_token_count": 10, + "candidates_token_count": 20, + "total_token_count": 30, + } + }, + 30, + ), + ( + SimpleNamespace( + usage_metadata=SimpleNamespace( + prompt_token_count=10, + candidates_token_count=20, + total_token_count=30, + ) + ), + 30, + ), + ( + { + "usageMetadata": { + "promptTokenCount": 10, + "candidatesTokenCount": 20, + "totalTokenCount": 30, + } + }, + 30, + ), + ( + { + "usage_metadata": { + "prompt_token_count": 10, + "candidates_token_count": 20, + "total_token_count": 0, + } + }, + 0, + ), + ( + SimpleNamespace( + usage_metadata=SimpleNamespace( + prompt_token_count=10, + candidates_token_count=20, + total_token_count=0, + ) + ), + 0, + ), + ( + { + "usageMetadata": { + "promptTokenCount": 10, + "candidatesTokenCount": 20, + "totalTokenCount": 0, + } + }, + 0, + ), + ( + { + "usage_metadata": { + "prompt_token_count": 10, + "candidates_token_count": 20, + } + }, + 30, + ), + ( + SimpleNamespace( + usage_metadata=SimpleNamespace( + prompt_token_count=10, + candidates_token_count=20, + ) + ), + 30, + ), + ( + { + "usageMetadata": { + "promptTokenCount": 10, + "candidatesTokenCount": 20, + } + }, + 30, ), - { - "usageMetadata": { - "promptTokenCount": 10, - "candidatesTokenCount": 20, - "totalTokenCount": 30, - } - }, ], ) - def test_google_vertexai_usage_metadata(self, raw): + def test_google_vertexai_usage_metadata(self, raw, expected_total_tokens): result = extract_token_usage(raw) - assert result == TokenUsage(input_tokens=10, output_tokens=20, total_tokens=30) + assert result == TokenUsage( + input_tokens=10, + output_tokens=20, + total_tokens=expected_total_tokens, + ) def test_cohere_meta_tokens_format(self): raw = SimpleNamespace(