Commit 6a8fdf3

feat(iorails): Telemetry - token-level metrics (#1846)
1 parent: a2b7880

8 files changed: 1,380 additions & 77 deletions

nemoguardrails/guardrails/engine_registry.py

Lines changed: 103 additions & 7 deletions

@@ -20,16 +20,27 @@
 """

 import logging
+import time

 from collections.abc import AsyncGenerator
+from contextlib import nullcontext
 from typing import TYPE_CHECKING, Any, Optional, TypeVar

 from nemoguardrails.guardrails.api_engine import APIEngine
 from nemoguardrails.guardrails.base_engine import BaseEngine
 from nemoguardrails.guardrails.guardrails_types import get_request_id, truncate
 from nemoguardrails.guardrails.model_engine import ModelEngine
-from nemoguardrails.guardrails.telemetry import api_call_span, llm_call_span
+from nemoguardrails.guardrails.telemetry import (
+    api_call_span,
+    llm_call_span,
+)
 from nemoguardrails.rails.llm.config import Model, RailsConfigData
-from nemoguardrails.types import LLMResponse, LLMResponseChunk
+from nemoguardrails.tracing.constants import (
+    llm_operation_duration,
+    record_time_per_output_chunk,
+    record_time_to_first_chunk,
+    record_token_usage,
+)
+from nemoguardrails.types import LLMResponse, LLMResponseChunk, UsageInfo

 if TYPE_CHECKING:
     from opentelemetry.trace import Tracer

@@ -51,15 +62,24 @@ def __init__(
         models: list[Model],
         rails_config_data: RailsConfigData,
         tracer: Optional["Tracer"] = None,
+        metrics_enabled: bool = False,
     ) -> None:
         """Build one engine per configured model and API service.

         When *tracer* is provided, LLM and API calls produce OTEL spans; when
         ``None`` the span helpers become no-ops.
+
+        When *metrics_enabled* is True, LLM calls emit the OTEL GenAI
+        client-side metrics (``gen_ai.client.token.usage``,
+        ``gen_ai.client.operation.duration``, plus the streaming
+        chunk-timing metrics). Defaults to False so callers that don't
+        opt in get no metric emissions even if a MeterProvider is
+        configured globally.
         """
         self._engines: dict[str, BaseEngine] = {}
         self._running = False
         self._tracer = tracer
+        self._metrics_enabled = metrics_enabled

         for model_config in models:
             engine = ModelEngine(model_config)

@@ -147,6 +167,11 @@ async def model_call(self, model_type: str, messages: list[dict], **kwargs: Any)
         reasoning (when the provider exposes it), usage, finish reason.
         Callers that only want the assistant text should access ``.content``.

+        When metrics are enabled, emits ``gen_ai.client.operation.duration``
+        (with ``error.type`` on exception) and ``gen_ai.client.token.usage``
+        (one observation each for ``input`` and ``output`` token types,
+        only when ``LLMResponse.usage`` is populated).
+
         Raises:
             KeyError: If no engine is registered with the given name.
             TypeError: If the named engine is not a ModelEngine.

@@ -155,8 +180,26 @@ async def model_call(self, model_type: str, messages: list[dict], **kwargs: Any)
         log.debug("[%s] Model engine '%s' messages: %s", req_id, model_type, truncate(messages))

         engine = self._get_engine(model_type, ModelEngine)
-        with llm_call_span(self._tracer, engine.model_name, engine.model_config.engine or "unknown"):
-            result = await engine.chat_completion(messages, **kwargs)
+        # TODO: Replace with LLMModel.provider_name after refactoring
+        provider_name = engine.model_config.engine or "unknown"
+        operation_name = "chat"
+
+        # Compose: span (always created — no-op when tracer is None) and
+        # duration metric (only when metrics enabled). Token usage is
+        # emitted after the call returns since it depends on
+        # ``result.usage`` — exception path skips it because control
+        # never reaches the line below.
+        duration_ctx = (
+            llm_operation_duration(engine.model_name, provider_name, operation_name)
+            if self._metrics_enabled
+            else nullcontext()
+        )
+        with llm_call_span(self._tracer, engine.model_name, provider_name, operation_name):
+            with duration_ctx:
+                result = await engine.chat_completion(messages, **kwargs)
+
+        if self._metrics_enabled:
+            record_token_usage(engine.model_name, provider_name, operation_name, result.usage)

         log.debug("[%s] Model engine '%s' response: %s", req_id, model_type, truncate(result))
         return result

@@ -171,6 +214,15 @@ async def stream_model_call(
         before the first chunk and closes when the generator exhausts or
         raises.

+        When metrics are enabled, emits ``gen_ai.client.operation.duration``
+        for the full stream lifetime (with ``error.type`` on exception)
+        and ``gen_ai.client.token.usage`` after stream completion using
+        the ``UsageInfo`` carried on the terminal SSE chunk (when the
+        provider returns one — controlled by ``include_usage_in_stream``,
+        defaults to True for OpenAI-compatible engines). No token
+        observation is emitted on early consumer cancellation or on
+        provider error mid-stream.
+
         Raises:
             KeyError: If no engine is registered with the given name.
             TypeError: If the named engine is not a ModelEngine.

@@ -179,9 +231,53 @@ async def stream_model_call(
         log.debug("[%s] Model engine '%s' stream messages: %s", req_id, model_type, truncate(messages))

         engine = self._get_engine(model_type, ModelEngine)
-        with llm_call_span(self._tracer, engine.model_name, engine.model_config.engine or "unknown"):
-            async for chunk in engine.stream_chat_completion(messages, **kwargs):
-                yield chunk
+        # TODO: Change to LLMModel.provider_name after refactor
+        provider_name = engine.model_config.engine or "unknown"
+        operation_name = "chat"
+
+        # Capture the most recent chunk's ``usage`` field so we can emit
+        # token metrics after the stream completes — providers (e.g.
+        # OpenAI-compatible) only populate ``usage`` on the terminal
+        # chunk when ``stream_options.include_usage=true``.
+        captured_usage: Optional["UsageInfo"] = None
+        duration_ctx = (
+            llm_operation_duration(engine.model_name, provider_name, operation_name)
+            if self._metrics_enabled
+            else nullcontext()
+        )
+        with llm_call_span(self._tracer, engine.model_name, provider_name, operation_name):
+            with duration_ctx:
+                # Gate timing-state setup on ``_metrics_enabled`` so the
+                # cold path skips ``time.monotonic()`` and the per-chunk
+                # bookkeeping entirely. ``t0`` defaults to ``0.0`` in
+                # the disabled path so the type stays a plain ``float``
+                # — it's never read in that branch.
+                t0 = time.monotonic() if self._metrics_enabled else 0.0
+                last_chunk_time: Optional[float] = None
+                async for chunk in engine.stream_chat_completion(messages, **kwargs):
+                    if self._metrics_enabled:
+                        # Per OTEL semconv, "first chunk" / "output chunk"
+                        # mean content-bearing chunks — gate on
+                        # ``delta_content`` / ``delta_reasoning`` to skip
+                        # the terminal usage frame and any other cosmetic
+                        # SSE events that the parser leaves in place.
+                        if chunk.delta_content or chunk.delta_reasoning:
+                            now = time.monotonic()
+                            if last_chunk_time is None:
+                                record_time_to_first_chunk(engine.model_name, provider_name, operation_name, now - t0)
+                            else:
+                                record_time_per_output_chunk(
+                                    engine.model_name, provider_name, operation_name, now - last_chunk_time
+                                )
+                            last_chunk_time = now
+                        if chunk.usage is not None:
+                            captured_usage = chunk.usage
+                    yield chunk
+
+        # Reached only on natural exhaustion (not on consumer cancellation
+        # or provider error — those raise out of the ``with`` blocks above).
+        if self._metrics_enabled:
+            record_token_usage(engine.model_name, provider_name, operation_name, captured_usage)

     async def api_call(self, api_name: str, message: dict[str, Any], **kwargs: Any) -> dict[str, Any]:
         """Route an API request to the named API engine.

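The metric helpers imported from nemoguardrails.tracing.constants are not part of this diff. Below is a minimal sketch of what they plausibly look like under the OTel GenAI semantic conventions, assuming a module-level meter, semconv attribute keys, and UsageInfo fields named prompt_tokens/completion_tokens; only the metric names, ``error.type``, and the input/output token types are confirmed by the hunks above.

    # Sketch only — not the commit's actual tracing/constants.py.
    import time
    from contextlib import contextmanager

    from opentelemetry import metrics

    _meter = metrics.get_meter("nemoguardrails")
    _duration = _meter.create_histogram("gen_ai.client.operation.duration", unit="s")
    _tokens = _meter.create_histogram("gen_ai.client.token.usage", unit="{token}")


    def _attrs(model: str, provider: str, operation: str) -> dict:
        # Attribute keys assumed from the OTel GenAI semconv; not shown in the diff.
        return {
            "gen_ai.request.model": model,
            "gen_ai.provider.name": provider,
            "gen_ai.operation.name": operation,
        }


    @contextmanager
    def llm_operation_duration(model: str, provider: str, operation: str):
        attrs = _attrs(model, provider, operation)
        start = time.monotonic()
        try:
            yield
        except Exception as exc:
            attrs["error.type"] = type(exc).__qualname__  # recorded on failure
            raise
        finally:
            # Runs on success and on exception alike: exactly one duration
            # observation per call, matching "error.type on exception" above.
            _duration.record(time.monotonic() - start, attributes=attrs)


    def record_token_usage(model: str, provider: str, operation: str, usage) -> None:
        if usage is None:  # provider returned no usage payload
            return
        base = _attrs(model, provider, operation)
        _tokens.record(usage.prompt_tokens, attributes={**base, "gen_ai.token.type": "input"})
        _tokens.record(usage.completion_tokens, attributes={**base, "gen_ai.token.type": "output"})

record_time_to_first_chunk and record_time_per_output_chunk would follow the same pattern: one histogram each, recording the seconds value computed in stream_model_call.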
nemoguardrails/guardrails/iorails.py

Lines changed: 6 additions & 1 deletion

@@ -93,7 +93,12 @@ def __init__(self, config: RailsConfig) -> None:
         self._tracer = get_tracer() if self._tracing_enabled else None
         self._metrics_enabled = are_metrics_enabled(config.metrics)

-        self.engine_registry = EngineRegistry(config.models, config.rails.config, tracer=self._tracer)
+        self.engine_registry = EngineRegistry(
+            config.models,
+            config.rails.config,
+            tracer=self._tracer,
+            metrics_enabled=self._metrics_enabled,
+        )
         self.rails_manager = RailsManager(
             engine_registry=self.engine_registry,
             task_manager=LLMTaskManager(config),

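Given the EngineRegistry signature above, the two toggles are independent; a caller could, for instance, opt in to metrics with tracing off. A sketch, assuming config is a loaded RailsConfig:

    registry = EngineRegistry(
        config.models,
        config.rails.config,
        tracer=None,           # spans become no-ops
        metrics_enabled=True,  # GenAI client metrics are still emitted
    )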
nemoguardrails/guardrails/model_engine.py

Lines changed: 42 additions & 19 deletions

@@ -116,28 +116,38 @@ def _parse_chat_completion(response: dict) -> LLMResponse:
 def _parse_chat_completion_chunk(chunk: dict) -> Optional[LLMResponseChunk]:
     """Build an LLMResponseChunk from an SSE chunk dict.

-    Returns None for chunks that carry no content or reasoning delta —
-    role-only first events, finish-only events, or empty-choices events
-    are skipped, preserving current stream_call behavior.
+    Returns None for chunks without one of: content delta, reasoning
+    delta, or a usage payload. Role-only first events and finish-only
+    events with empty deltas map to None.
+
+    The last chunk from OpenAI-compatible providers carries a ``usage``
+    field when ``stream_options.include_usage=true``; it is passed
+    through to capture the token usage metadata.
     """
     choices = chunk.get("choices") or []
-    if not choices:
-        return None
-
-    choice = choices[0]
-    delta = choice.get("delta") or {}
-    delta_content = delta.get("content")
-    delta_reasoning = delta.get("reasoning_content") or None
-
-    if not delta_content and not delta_reasoning:
+    usage_dict = chunk.get("usage")
+
+    delta_content: Optional[str] = None
+    delta_reasoning: Optional[str] = None
+    finish_reason = None
+    if choices:
+        choice = choices[0]
+        delta = choice.get("delta") or {}
+        delta_content = delta.get("content")
+        delta_reasoning = delta.get("reasoning_content") or None
+        finish_reason = choice.get("finish_reason")
+
+    if not delta_content and not delta_reasoning and not usage_dict:
         return None

     return LLMResponseChunk(
         delta_content=delta_content,
         delta_reasoning=delta_reasoning,
         model=chunk.get("model"),
-        finish_reason=choice.get("finish_reason"),
+        finish_reason=finish_reason,
         request_id=chunk.get("id"),
+        usage=_parse_usage(usage_dict) if usage_dict else None,
     )

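To make the new contract concrete, here is how the reworked parser treats the three chunk shapes. The chunk payloads are invented for illustration, shaped like OpenAI-compatible SSE events; field values and the _parse_usage field names are assumptions.

    content_chunk = {
        "id": "chatcmpl-1",
        "model": "my-model",
        "choices": [{"delta": {"content": "Hello"}, "finish_reason": None}],
    }
    role_only_chunk = {
        "choices": [{"delta": {"role": "assistant"}, "finish_reason": None}],
    }
    terminal_usage_chunk = {
        "id": "chatcmpl-1",
        "choices": [],  # empty on the usage frame
        "usage": {"prompt_tokens": 12, "completion_tokens": 5, "total_tokens": 17},
    }

    assert _parse_chat_completion_chunk(content_chunk).delta_content == "Hello"
    assert _parse_chat_completion_chunk(role_only_chunk) is None  # still skipped
    terminal = _parse_chat_completion_chunk(terminal_usage_chunk)
    assert terminal is not None and terminal.delta_content is None
    assert terminal.usage is not None  # passed through for token metrics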
@@ -315,17 +325,28 @@ async def stream_call(
         """Make a streaming POST request to the /v1/chat/completions endpoint.

         Sends ``stream=True`` and yields one ``LLMResponseChunk`` per SSE
-        event that carries a content or reasoning delta. Role-only,
-        finish-only, and empty-choices events are skipped. Retries are
-        handled by the RetryClient (same as ``call()``).
+        event that carries a content delta, reasoning delta, or a
+        ``usage`` payload. Role-only, finish-only, and empty-choices
+        events without usage are skipped. Retries are handled by the
+        RetryClient (same as ``call()``).
+
+        Note: when the upstream payload includes
+        ``stream_options.include_usage=true`` (default for the
+        OpenAI-compatible client), the provider sends a final
+        usage-only chunk with empty ``choices`` after the last content
+        chunk. That terminal chunk is yielded as
+        ``LLMResponseChunk(usage=...)`` with both ``delta_content``
+        and ``delta_reasoning`` unset — callers that only care about
+        content should gate on ``chunk.delta_content`` rather than
+        assuming every yielded chunk carries one.

         Args:
             messages: List of message dicts in OpenAI format.
             **kwargs: Additional parameters for the request body (temperature, max_tokens, etc.)

         Yields:
-            ``LLMResponseChunk`` objects with ``delta_content`` and/or
-            ``delta_reasoning`` populated.
+            ``LLMResponseChunk`` objects with ``delta_content``,
+            ``delta_reasoning``, and/or ``usage`` populated.

         Raises:
             ModelEngineError: If the request fails after all retries.
@@ -413,7 +434,9 @@ async def stream_chat_completion(
     ) -> AsyncGenerator[LLMResponseChunk, None]:
         """Stream a chat completion and yield ``LLMResponseChunk`` objects.

-        Thin pass-through over ``stream_call``.
+        Thin pass-through over ``stream_call`` — see that method's
+        docstring for the contract, including the terminal usage-only
+        chunk emitted when ``stream_options.include_usage`` is on.

         Raises:
             ModelEngineError: If the request fails after all retries.