google
diff --git a/src/google/adk/flows/llm_flows/base_llm_flow.py b/src/google/adk/flows/llm_flows/base_llm_flow.py
Lines changed: 8 additions & 11 deletions
diff --git a/src/google/adk/telemetry/_instrumentation.py b/src/google/adk/telemetry/_instrumentation.py
Lines changed: 67 additions & 3 deletions
diff --git a/src/google/adk/telemetry/_metrics.py b/src/google/adk/telemetry/_metrics.py
Lines changed: 100 additions & 5 deletions
diff --git a/src/google/adk/telemetry/tracing.py b/src/google/adk/telemetry/tracing.py
Lines changed: 25 additions & 20 deletions
@@ -43,6 +43,7 @@
 from ...models.google_llm import GoogleLLMVariant
 from ...models.llm_request import LlmRequest
 from ...models.llm_response import LlmResponse
+from ...telemetry import _instrumentation
 from ...telemetry import tracing
 from ...telemetry.tracing import trace_call_llm
 from ...telemetry.tracing import trace_send_data
@@ -376,18 +377,14 @@ async def _run_on_model_error_callbacks(
     return None
 
   try:
-    async with Aclosing(response_generator) as agen:
-      async with tracing.use_inference_span(
-          llm_request,
-          invocation_context,
-          model_response_event,
-      ) as gc_span:
+    async with _instrumentation.record_inference_telemetry(
+        llm_request,
+        invocation_context,
+        model_response_event,
+    ) as tel_ctx:
+      async with Aclosing(response_generator) as agen:
         async for llm_response in agen:
-          if gc_span:
-            tracing.trace_inference_result(
-                gc_span,
-                llm_response,
-            )
+          tel_ctx.record_llm_response(llm_response)
           yield llm_response
   except Exception as model_error:
     callback_context = CallbackContext(
 
@@ -17,6 +17,7 @@
 import contextlib
 import dataclasses
 import logging
+import sys
 import time
 from typing import Any
 from typing import AsyncIterator
@@ -32,18 +33,24 @@
 if TYPE_CHECKING:
   from ..agents.base_agent import BaseAgent
   from ..agents.invocation_context import InvocationContext
+  from ..models.llm_request import LlmRequest
+  from ..models.llm_response import LlmResponse
   from ..tools.base_tool import BaseTool
 
 logger = logging.getLogger("google_adk." + __name__)
 
 
-def _get_elapsed_ms(span: trace.Span | None, fallback_start: float) -> float:
+def _get_elapsed_ms(
+    span: trace.Span | tracing.GenerateContentSpan | None,
+    fallback_start: float,
+) -> float:
   """Guarantees consistent time source for duration calculation.
 
   Note: This must be called with an ended span.
 
   Args:
-    span (trace.Span | None): The ended span to extract duration from.
+    span (trace.Span | tracing.GenerateContentSpan | None): The ended span to
+      extract duration from.
     fallback_start (float): Fallback start time in seconds (monotonic).
 
   Returns:
@@ -52,6 +59,7 @@ def _get_elapsed_ms(span: trace.Span | None, fallback_start: float) -> float:
   if span is None:
     return (time.monotonic() - fallback_start) * 1000
 
+  span = span.span if hasattr(span, "span") else span
   start_ns = getattr(span, "start_time", None)
   end_ns = getattr(span, "end_time", None)
 
@@ -66,9 +74,19 @@ def _get_elapsed_ms(span: trace.Span | None, fallback_start: float) -> float:
 class TelemetryContext:
   """Stores all telemetry related state."""
 
-  otel_context: context_api.Context
+  otel_context: context_api.Context | None = None
   function_response_event: event_lib.Event | None = None
   error_type: str | None = None
+  span: tracing.GenerateContentSpan | trace.Span | None = None
+  _llm_responses: list[LlmResponse] = dataclasses.field(default_factory=list)
+
+  @property
+  def llm_responses(self) -> list[LlmResponse]:
+    """Returns the responses recorded so far, in the order they were added."""
+    return self._llm_responses
+
+  def record_llm_response(self, response: LlmResponse) -> None:
+    """Stores a model response and traces it on the attached span, if any.
+
+    Args:
+      response: The response received from the model.
+    """
+    self._llm_responses.append(response)
+    # `span` is optional and defaults to None; skip tracing when no span is
+    # attached instead of handing None to the tracing helper.
+    if self.span is not None:
+      tracing.trace_inference_result(self.span, response)
 
 
 def _record_agent_metrics(
@@ -163,3 +181,49 @@ async def record_tool_execution(
       logger.exception(
           "Failed to record tool execution duration for tool %s", tool.name
       )
+
+
+@contextlib.asynccontextmanager
+async def record_inference_telemetry(
+    llm_request: LlmRequest,
+    invocation_context: InvocationContext,
+    model_response_event: event_lib.Event,
+) -> AsyncIterator[TelemetryContext]:
+  """Wraps one model call in an inference span and records its metrics.
+
+  Opens `tracing.use_inference_span` and yields a `TelemetryContext` bound to
+  the resulting span. When the block exits, the operation duration and the
+  token usage are recorded, provided the invocation has an agent and
+  `tracing._should_emit_native_telemetry` returns True for it. Failures while
+  recording metrics are logged and never propagate to the caller.
+
+  Args:
+    llm_request: The request sent to the model.
+    invocation_context: The invocation context; its agent supplies the agent
+      name used in the metric attributes.
+    model_response_event: The event passed through to the inference span.
+
+  Yields:
+    The telemetry context on which the caller records each model response.
+  """
+  start_time = time.monotonic()
+  tel_ctx: TelemetryContext = TelemetryContext()
+  inference_error: BaseException | None = None
+  try:
+    async with tracing.use_inference_span(
+        llm_request,
+        invocation_context,
+        model_response_event,
+    ) as gc_span:
+      tel_ctx.span = gc_span
+      yield tel_ctx
+  except BaseException as exc:  # pylint: disable=broad-exception-caught
+    # Capture only the exception propagating through this block. Reading
+    # `sys.exc_info()` in the `finally` clause would also pick up an exception
+    # that an enclosing `except` handler further up the stack is still
+    # handling, mislabeling a successful model call as failed.
+    inference_error = exc
+    raise
+  finally:
+    # The span context has exited by now, so the span has ended and its own
+    # timestamps can be used; `start_time` is only the fallback.
+    elapsed_ms = _get_elapsed_ms(tel_ctx.span, start_time)
+    agent = invocation_context.agent
+    try:
+      if agent is not None and tracing._should_emit_native_telemetry(agent):
+        _metrics.record_client_operation_duration(
+            agent_name=agent.name,
+            elapsed_ms=elapsed_ms,
+            llm_request=llm_request,
+            responses=tel_ctx.llm_responses,
+            # Only `Exception` subclasses are reported as errors; other
+            # `BaseException`s (e.g. generator close) are not.
+            error=(
+                inference_error
+                if isinstance(inference_error, Exception)
+                else None
+            ),
+        )
+        _metrics.record_client_token_usage(
+            agent_name=agent.name,
+            llm_request=llm_request,
+            responses=tel_ctx.llm_responses,
+        )
+    except Exception:  # pylint: disable=broad-exception-caught
+      logger.exception(
+          "Failed to record inference metrics for agent %s",
+          agent.name if agent is not None else "<unknown>",
+      )
@@ -15,28 +15,31 @@
 from __future__ import annotations
 
 import logging
+from typing import TYPE_CHECKING
 
 from google.adk import version
-from google.adk.events.event import Event
+from google.adk.telemetry import tracing
 from google.genai import types
 from opentelemetry import metrics
 from opentelemetry.semconv._incubating.attributes import gen_ai_attributes
+from opentelemetry.semconv._incubating.metrics import gen_ai_metrics
 from opentelemetry.semconv.attributes import error_attributes
 
+if TYPE_CHECKING:
+  from google.adk.events.event import Event
+  from google.adk.models.llm_request import LlmRequest
+  from google.adk.models.llm_response import LlmResponse
+
 logger = logging.getLogger("google_adk." + __name__)
 
-# TODO(b/477553411): add these attributes to Otel semconv.
 GEN_AI_AGENT_VERSION = "gen_ai.agent.version"
 GEN_AI_TOOL_VERSION = "gen_ai.tool.version"
 
-# Initialize meter
 meter = metrics.get_meter(
     name="gcp.vertex.agent",
     version=version.__version__,
-    # TODO(b/477553411): set schema version after OTel semconv updates.
 )
 
-# Define histograms
 _agent_invocation_duration = meter.create_histogram(
     "gen_ai.agent.invocation.duration",
     unit="ms",
@@ -62,6 +65,10 @@
     unit="1",
     description="Length of agentic workflow (# of events).",
 )
+_client_operation_duration = (
+    gen_ai_metrics.create_gen_ai_client_operation_duration(meter)
+)
+_client_token_usage = gen_ai_metrics.create_gen_ai_client_token_usage(meter)
 
 
 def record_agent_invocation_duration(
@@ -121,6 +128,90 @@ def record_tool_execution_duration(
   _tool_execution_duration.record(elapsed_ms, attributes=attrs)
 
 
+def record_client_operation_duration(
+    agent_name: str,
+    elapsed_ms: float,
+    llm_request: LlmRequest,
+    responses: list[LlmResponse],
+    error: Exception | None = None,
+):
+  """Records one data point on the gen_ai client operation duration histogram.
+
+  Args:
+    agent_name: Name of the agent that issued the model call.
+    elapsed_ms: Duration of the call in milliseconds; it is converted to
+      seconds before being recorded.
+    llm_request: The request; its model, when set, is reported as the request
+      model and is the fallback for the response model.
+    responses: Responses received for the call; only the last one is read.
+    error: The exception raised by the call, if any; its class name is
+      reported as the error type.
+  """
+  attributes = {
+      gen_ai_attributes.GEN_AI_AGENT_NAME: agent_name,
+      gen_ai_attributes.GEN_AI_OPERATION_NAME: "generate_content",
+      gen_ai_attributes.GEN_AI_PROVIDER_NAME: _get_provider_name(),
+  }
+
+  request_model = llm_request.model
+  if request_model:
+    attributes[gen_ai_attributes.GEN_AI_REQUEST_MODEL] = request_model
+
+  # Without any response there is no response model to report.
+  resolved_model = (
+      (responses[-1].model_version or llm_request.model) if responses else None
+  )
+  if resolved_model:
+    attributes[gen_ai_attributes.GEN_AI_RESPONSE_MODEL] = resolved_model
+
+  if error is not None:
+    attributes[error_attributes.ERROR_TYPE] = type(error).__name__
+
+  elapsed_seconds = elapsed_ms / 1000.0
+  _client_operation_duration.record(elapsed_seconds, attributes=attributes)
+
+
+def record_client_token_usage(
+    agent_name: str,
+    llm_request: LlmRequest,
+    responses: list[LlmResponse],
+):
+  """Records input and output token counts for a gen_ai client call.
+
+  Nothing is recorded when `responses` is empty. When the last response has
+  no usage metadata, a warning is logged and nothing is recorded.
+
+  Args:
+    agent_name: Name of the agent that issued the model call.
+    llm_request: The request; its model, when set, is reported as the request
+      model and is the fallback for the response model.
+    responses: Responses received for the call; only the last one is read.
+  """
+  if not responses:
+    return
+
+  # Streaming usage is assumed to be cumulative: the final chunk carries the
+  # totals for the whole request, so summing every chunk would overcount.
+  final_response = responses[-1]
+  usage = final_response.usage_metadata
+  if not usage:
+    logger.warning(
+        "Skipping missing token usage metadata for agent %s and model %s",
+        agent_name,
+        llm_request.model,
+    )
+    return
+
+  # OTel semconv for `gen_ai.client.token.usage` splits counts by
+  # `gen_ai.token.type` into "input" and "output":
+  #   input  = prompt tokens + tool-use prompt tokens
+  #   output = candidates tokens + thoughts tokens
+  # `cached_content_token_count` is left out because prompt tokens already
+  # include it; `total_token_count` is left out because semconv expects the
+  # input/output breakdown.
+  counts_by_type = {
+      "input": (usage.prompt_token_count or 0)
+      + (usage.tool_use_prompt_token_count or 0),
+      "output": (usage.candidates_token_count or 0)
+      + (usage.thoughts_token_count or 0),
+  }
+
+  response_model = final_response.model_version or llm_request.model
+  shared_attrs = {
+      gen_ai_attributes.GEN_AI_AGENT_NAME: agent_name,
+      gen_ai_attributes.GEN_AI_OPERATION_NAME: "generate_content",
+      gen_ai_attributes.GEN_AI_PROVIDER_NAME: _get_provider_name(),
+  }
+  if llm_request.model:
+    shared_attrs[gen_ai_attributes.GEN_AI_REQUEST_MODEL] = llm_request.model
+  if response_model:
+    shared_attrs[gen_ai_attributes.GEN_AI_RESPONSE_MODEL] = response_model
+
+  # One data point per token type, input first; zero counts are not recorded.
+  for token_type, token_count in counts_by_type.items():
+    if token_count > 0:
+      _client_token_usage.record(
+          token_count,
+          attributes={
+              **shared_attrs,
+              gen_ai_attributes.GEN_AI_TOKEN_TYPE: token_type,
+          },
+      )
+
+
 def _get_content_size(
     content: types.Content | None,
 ) -> int:
@@ -133,3 +224,7 @@ def _get_content_size(
     if part.inline_data and part.inline_data.data:
       size += len(part.inline_data.data)
   return size
+
+
+def _get_provider_name() -> str:
+  """Returns the provider name, as guessed by the `tracing` module."""
+  return tracing._guess_gemini_system_name()
@@ -570,22 +570,19 @@ def use_generate_content_span(
   log_only_common_attributes = {}
   if invocation_context.session.user_id is not None:
     log_only_common_attributes[USER_ID] = invocation_context.session.user_id
-  if (
-      _is_gemini_agent(invocation_context.agent)
-      and _instrumented_with_opentelemetry_instrumentation_google_genai()
-  ):
-    with _use_extra_generate_content_attributes(
-        common_attributes,
-        log_only_extra_attributes=log_only_common_attributes,
-    ):
-      yield
-  else:
+  if _should_emit_native_telemetry(invocation_context.agent):
     with _use_native_generate_content_span_stable_semconv(
         llm_request=llm_request,
         common_attributes=common_attributes,
         log_only_common_attributes=log_only_common_attributes,
     ) as span:
       yield span.span
+  else:
+    with _use_extra_generate_content_attributes(
+        common_attributes,
+        log_only_extra_attributes=log_only_common_attributes,
+    ):
+      yield
 
 
 @asynccontextmanager
@@ -610,16 +607,7 @@ async def use_inference_span(
   log_only_common_attributes = {}
   if invocation_context.session.user_id is not None:
     log_only_common_attributes[USER_ID] = invocation_context.session.user_id
-  if (
-      _is_gemini_agent(invocation_context.agent)
-      and _instrumented_with_opentelemetry_instrumentation_google_genai()
-  ):
-    with _use_extra_generate_content_attributes(
-        common_attributes,
-        log_only_extra_attributes=log_only_common_attributes,
-    ):
-      yield
-  else:
+  if _should_emit_native_telemetry(invocation_context.agent):
     async with _use_native_generate_content_span(
         llm_request=llm_request,
         common_attributes=common_attributes,
@@ -640,6 +628,12 @@ async def use_inference_span(
             gc_span.operation_details_attributes,
             gc_span.operation_details_common_attributes,
         )
+  else:
+    with _use_extra_generate_content_attributes(
+        common_attributes,
+        log_only_extra_attributes=log_only_common_attributes,
+    ):
+      yield
 
 
 def _should_log_prompt_response_content() -> bool:
@@ -683,6 +677,17 @@ def _instrumented_with_opentelemetry_instrumentation_google_genai() -> bool:
   return False
 
 
+def _should_emit_native_telemetry(agent: BaseAgent) -> bool:
+  """Returns whether ADK should emit its own inference telemetry for `agent`.
+
+  Returns False only when the google-genai instrumentation lib is active AND
+  `agent` is a Gemini agent, since in that case the lib already emits
+  inference metrics. Returns True otherwise.
+  """
+  handled_by_genai_instrumentation = (
+      _instrumented_with_opentelemetry_instrumentation_google_genai()
+      and _is_gemini_agent(agent)
+  )
+  return not handled_by_genai_instrumentation
+
+
 @contextmanager
 def _use_extra_generate_content_attributes(
     extra_attributes: Mapping[str, AttributeValue],