feat: add RLS API inference metrics

major · major · commit 2cb984092589 · 2026-04-29T18:42:20.000-05:00
Signed-off-by: Major Hayden <major@redhat.com>
diff --git a/src/app/endpoints/rlsapi_v1.py b/src/app/endpoints/rlsapi_v1.py
@@ -455,6 +455,9 @@ def _record_inference_failure(  # pylint: disable=too-many-arguments,too-many-po
     """
     inference_time = time.monotonic() - start_time
     recording.record_llm_failure(provider, model, endpoint_path)
+    recording.record_llm_inference_duration(
+        provider, model, endpoint_path, "failure", inference_time
+    )
     _queue_splunk_event(
         background_tasks,
         infer_request,
@@ -669,13 +672,14 @@ async def infer_endpoint(  # pylint: disable=R0914
     """
     # Authentication enforced by get_auth_dependency(), authorization by @authorize decorator.
     check_configuration_loaded(configuration)
-
     # Quota enforcement: resolve subject and check availability before any work.
     # No-op when quota_subject is not configured or no quota limiters exist.
     quota_id = _resolve_quota_subject(request, auth)
     if quota_id is not None:
         check_tokens_available(configuration.quota_limiters, quota_id)
 
+    endpoint_path = "/v1/infer"
+
     request_id = get_suid()
 
     logger.info("Processing rlsapi v1 /infer request %s", request_id)
@@ -685,8 +689,6 @@ async def infer_endpoint(  # pylint: disable=R0914
         "Request %s: Combined input source length: %d", request_id, len(input_source)
     )
 
-    endpoint_path = "/v1/infer"
-
     # Run shield moderation on user input before inference.
     # Uses all configured shields; no-op when no shields are registered.
     # Runs before model/tool discovery so blocked requests short-circuit
@@ -721,6 +723,9 @@ async def infer_endpoint(  # pylint: disable=R0914
         response_text = extract_text_from_response_items(response.output)
         token_usage = extract_token_usage(response.usage, model_id, endpoint_path)
         inference_time = time.monotonic() - start_time
+        recording.record_llm_inference_duration(
+            provider, model, endpoint_path, "success", inference_time
+        )
     except _INFER_HANDLED_EXCEPTIONS as error:
         if response is not None:
             extract_token_usage(response.usage, model_id, endpoint_path)  # type: ignore[arg-type]
diff --git a/src/metrics/__init__.py b/src/metrics/__init__.py
@@ -1,11 +1,27 @@
 """Metrics module for Lightspeed Core Stack."""
 
+from typing import Final
+
 from prometheus_client import (
     Counter,
     Gauge,
     Histogram,
 )
 
+LLM_INFERENCE_DURATION_BUCKETS: Final[tuple[float, ...]] = (  # bucket bounds (seconds)
+    0.1,
+    0.5,
+    1.0,
+    2.5,
+    5.0,
+    10.0,
+    20.0,
+    30.0,
+    60.0,
+    120.0,
+    float("inf"),  # catch-all for calls slower than 120 s
+)
+
 # Counter to track REST API calls
 # This will be used to count how many times each API endpoint is called
 # and the status code of the response
@@ -55,3 +71,11 @@
     "LLM tokens received",
     ["provider", "model", "endpoint"],
 )
+
+# Histogram to measure the latency of direct LLM inference backend calls.
+llm_inference_duration_seconds = Histogram(
+    "ls_llm_inference_duration_seconds",
+    "LLM inference call duration",
+    ["provider", "model", "endpoint", "result"],  # result: "success" or "failure"
+    buckets=LLM_INFERENCE_DURATION_BUCKETS,
+)
diff --git a/src/metrics/recording.py b/src/metrics/recording.py
@@ -109,3 +109,23 @@ def record_llm_token_usage(
         )
     except (AttributeError, TypeError, ValueError):
         logger.warning("Failed to update token metrics", exc_info=True)
+
+
+def record_llm_inference_duration(
+    provider: str, model: str, endpoint_path: str, result: str, duration: float
+) -> None:
+    """Record the latency of a direct LLM inference backend call.
+
+    Args:
+        provider: LLM provider identifier.
+        model: LLM model identifier without the provider prefix.
+        endpoint_path: API endpoint path for metric labeling.
+        result: Bounded result label, such as ``success`` or ``failure``.
+        duration: Inference call duration in seconds.
+    """
+    try:
+        metrics.llm_inference_duration_seconds.labels(  # positional, in label order
+            provider, model, endpoint_path, result
+        ).observe(duration)
+    except (AttributeError, TypeError, ValueError):  # best-effort: log, don't raise
+        logger.warning("Failed to update LLM inference duration metric", exc_info=True)
diff --git a/tests/unit/metrics/test_recording.py b/tests/unit/metrics/test_recording.py
@@ -1,10 +1,26 @@
 """Unit tests for Prometheus metric recording helpers."""
 
-from pytest_mock import MockerFixture
+from collections.abc import Callable
+from dataclasses import dataclass
+
+import pytest
+from pytest_mock import MockerFixture, MockType
 
 from metrics import recording
 
 
+@dataclass(frozen=True)
+class HistogramRecorderCase:
+    """Expected behavior for a histogram-style metric recorder."""
+
+    metric_path: str  # target string handed to mocker.patch
+    recorder: Callable[..., None]  # helper under test, called as recorder(*args)
+    args: tuple[object, ...]  # positional arguments passed to recorder
+    labels: tuple[object, ...]  # values expected in the metric's labels(...) call
+    duration: float  # value expected in the observe(...) call
+    warning_message: str  # log message expected when observe(...) raises
+
+
 def test_measure_response_duration_records_timer(mocker: MockerFixture) -> None:
     """Test that response duration measurement uses the path label timer."""
     mock_timer = mocker.MagicMock()
@@ -159,3 +175,44 @@ def test_record_llm_token_usage_logs_metric_errors(mocker: MockerFixture) -> Non
     mock_logger.warning.assert_called_once_with(
         "Failed to update token metrics", exc_info=True
     )
+
+
+@pytest.fixture(name="recording_logger")
+def recording_logger_fixture(mocker: MockerFixture) -> MockType:
+    """Patch the metric recording logger for failure assertions."""
+    return mocker.patch("metrics.recording.logger")  # hands the mock to the test
+
+
+@pytest.mark.parametrize(
+    "case",
+    [
+        HistogramRecorderCase(
+            metric_path="metrics.recording.metrics.llm_inference_duration_seconds",
+            recorder=recording.record_llm_inference_duration,
+            args=("vertexai", "gemini", "/v1/responses", "success", 1.5),
+            labels=("vertexai", "gemini", "/v1/responses", "success"),
+            duration=1.5,
+            warning_message="Failed to update LLM inference duration metric",
+        ),
+    ],
+)
+def test_histogram_recorders_observe_metrics_and_log_errors(
+    mocker: MockerFixture,
+    recording_logger: MockType,
+    case: HistogramRecorderCase,
+) -> None:
+    """Test new histogram helpers with shared success and failure coverage."""
+    mock_metric = mocker.patch(case.metric_path)  # replace the real metric with a mock
+
+    case.recorder(*case.args)  # success pass
+
+    mock_metric.labels.assert_called_once_with(*case.labels)
+    mock_metric.labels.return_value.observe.assert_called_once_with(case.duration)
+
+    mock_metric.reset_mock()  # drop call history from the success pass
+    mock_metric.labels.return_value.observe.side_effect = TypeError("bad")  # force failure
+    case.recorder(*case.args)  # failure pass: the helper must swallow the error
+
+    recording_logger.warning.assert_called_once_with(
+        case.warning_message, exc_info=True
+    )