feat: add quota monitoring metrics

major · major · commit f71930bf19f5 · 2026-05-08T11:20:21.000-05:00
Signed-off-by: Major Hayden <major@redhat.com>
diff --git a/src/app/endpoints/responses.py b/src/app/endpoints/responses.py
@@ -138,6 +138,37 @@ def _get_user_agent(request: Request) -> Optional[str]:
     return sanitized or None
 
 
+def _check_response_quota(user_id: str, endpoint_path: str) -> None:
+    """Run the quota availability check for a user and record a quota metric."""
+    started = time.monotonic()
+    try:
+        check_tokens_available(configuration.quota_limiters, user_id)
+    except HTTPException:
+        recording.record_quota_check(
+            endpoint_path,
+            recording.QUOTA_TYPE_USER_ID,
+            recording.QUOTA_RESULT_FAILURE,
+            time.monotonic() - started,
+        )
+        raise
+    except Exception:  # pylint: disable=broad-exception-caught
+        # Unexpected failures are still counted (as errors) before re-raising.
+        recording.record_quota_check(
+            endpoint_path,
+            recording.QUOTA_TYPE_USER_ID,
+            recording.QUOTA_RESULT_ERROR,
+            time.monotonic() - started,
+        )
+        raise
+    else:
+        recording.record_quota_check(
+            endpoint_path,
+            recording.QUOTA_TYPE_USER_ID,
+            recording.QUOTA_RESULT_SUCCESS,
+            time.monotonic() - started,
+        )
+
+
 responses_response: dict[int | str, dict[str, Any]] = {
     200: ResponsesResponse.openapi_response(),
     401: UnauthorizedResponse.openapi_response(
@@ -358,11 +389,12 @@ async def responses_endpoint_handler(
     started_at = datetime.now(UTC)
     rh_identity_context = get_rh_identity_context(request)
     user_id, _, _, token = auth
+    endpoint_path = "/v1/responses"
 
     await check_mcp_auth(configuration, mcp_headers, token, request.headers)
 
     # Check token availability
-    check_tokens_available(configuration.quota_limiters, user_id)
+    _check_response_quota(user_id, endpoint_path)
 
     # Enforce RBAC: optionally disallow overriding model in requests
     validate_model_provider_override(
diff --git a/src/app/endpoints/rlsapi_v1.py b/src/app/endpoints/rlsapi_v1.py
@@ -575,6 +575,46 @@ def _resolve_quota_subject(request: Request, auth: AuthTuple) -> Optional[str]:
     return system_id
 
 
+def _check_infer_quota(
+    request: Request, auth: AuthTuple, endpoint_path: str
+) -> Optional[str]:
+    """Check infer quota availability and record bounded quota metrics.
+
+    Returns the resolved quota subject id, or None when no subject is resolved;
+    that case is recorded as ``skipped`` with zero duration. A failing check is
+    recorded (``failure`` for HTTPException, ``error`` for any other Exception)
+    and the exception is re-raised unchanged.
+    """
+    quota_id = _resolve_quota_subject(request, auth)
+    quota_type = configuration.rlsapi_v1.quota_subject or recording.QUOTA_TYPE_DISABLED
+    if quota_id is None:
+        recording.record_quota_check(
+            endpoint_path, quota_type, recording.QUOTA_RESULT_SKIPPED, 0.0
+        )
+        return None
+
+    quota_start_time = time.monotonic()
+    try:
+        check_tokens_available(configuration.quota_limiters, quota_id)
+    except HTTPException:
+        elapsed = time.monotonic() - quota_start_time
+        recording.record_quota_check(
+            endpoint_path, quota_type, recording.QUOTA_RESULT_FAILURE, elapsed
+        )
+        raise
+    except Exception:  # pylint: disable=broad-exception-caught
+        elapsed = time.monotonic() - quota_start_time
+        recording.record_quota_check(
+            endpoint_path, quota_type, recording.QUOTA_RESULT_ERROR, elapsed
+        )
+        raise
+    elapsed = time.monotonic() - quota_start_time
+    recording.record_quota_check(
+        endpoint_path, quota_type, recording.QUOTA_RESULT_SUCCESS, elapsed
+    )
+    return quota_id
+
+
 def _build_infer_response(
     response_text: str,
     request_id: str,
@@ -733,16 +773,17 @@ async def infer_endpoint(  # pylint: disable=R0914,R0915
 
     logger.info("Processing rlsapi v1 /infer request %s", request_id)
 
-    # Quota enforcement: resolve subject and check availability before any work.
-    # No-op when quota_subject is not configured or no quota limiters exist.
-    quota_id = _resolve_quota_subject(request, auth)
-    if quota_id is not None:
+    # Quota enforcement: check availability before any work and record metrics for
+    # both enforced and disabled quota paths.
+    quota_subject = configuration.rlsapi_v1.quota_subject
+    if quota_subject is not None:
         logger.info(
             "Checking quota availability for rlsapi v1 request %s using subject type %s",
             request_id,
-            configuration.rlsapi_v1.quota_subject,
+            quota_subject,
         )
-        check_tokens_available(configuration.quota_limiters, quota_id)
+    quota_id = _check_infer_quota(request, auth, endpoint_path)
+    if quota_id is not None:
         logger.info(
             "Quota availability check passed for rlsapi v1 request %s", request_id
         )
diff --git a/src/metrics/__init__.py b/src/metrics/__init__.py
@@ -51,6 +51,21 @@
     5.0,
     float("inf"),
 )
+
+QUOTA_CHECK_DURATION_BUCKETS: Final[tuple[float, ...]] = (  # bounds in seconds
+    0.001,
+    0.005,
+    0.01,
+    0.025,
+    0.05,
+    0.1,
+    0.25,
+    0.5,
+    1.0,
+    2.5,
+    5.0,
+    float("inf"),  # catch-all for checks slower than 5 s
+)
 # Counter to track REST API calls
 # This will be used to count how many times each API endpoint is called
 # and the status code of the response
@@ -144,3 +159,21 @@
     ["action", "result"],
     buckets=AUTHORIZATION_DURATION_BUCKETS,
 )
+
+# Counter of pre-request quota checks. All three labels must stay bounded:
+# endpoint is a static route pattern, quota_type is a quota subject type or
+# ``disabled`` (never a subject id), and result is one fixed terminal state.
+quota_checks_total: Final[Counter] = Counter(
+    "ls_quota_checks_total",
+    "Quota availability checks",
+    ["endpoint", "quota_type", "result"],
+)
+
+# Histogram of quota availability check latency, bucketed from 1 ms up to 5 s.
+# It uses the same bounded endpoint/quota_type/result labels as the counter.
+quota_check_duration_seconds: Final[Histogram] = Histogram(
+    "ls_quota_check_duration_seconds",
+    "Quota availability check duration",
+    ["endpoint", "quota_type", "result"],
+    buckets=QUOTA_CHECK_DURATION_BUCKETS,
+)
diff --git a/src/metrics/recording.py b/src/metrics/recording.py
@@ -129,6 +129,48 @@ def normalize_authorization_result(result: str) -> str:
         return result
     return AUTHORIZATION_RESULT_ERROR
 
+
+QUOTA_TYPE_USER_ID: Final[str] = "user_id"
+QUOTA_TYPE_ORG_ID: Final[str] = "org_id"
+QUOTA_TYPE_SYSTEM_ID: Final[str] = "system_id"
+QUOTA_TYPE_DISABLED: Final[str] = "disabled"  # no quota subject configured
+QUOTA_RESULT_SUCCESS: Final[str] = "success"  # check returned without raising
+QUOTA_RESULT_FAILURE: Final[str] = "failure"  # check raised HTTPException
+QUOTA_RESULT_SKIPPED: Final[str] = "skipped"  # no subject resolved; check not run
+QUOTA_RESULT_ERROR: Final[str] = "error"  # check raised any other Exception
+
+ALLOWED_QUOTA_TYPES: Final[frozenset[str]] = frozenset(
+    {
+        QUOTA_TYPE_USER_ID,
+        QUOTA_TYPE_ORG_ID,
+        QUOTA_TYPE_SYSTEM_ID,
+        QUOTA_TYPE_DISABLED,
+    }
+)
+ALLOWED_QUOTA_RESULTS: Final[frozenset[str]] = frozenset(
+    {
+        QUOTA_RESULT_SUCCESS,
+        QUOTA_RESULT_FAILURE,
+        QUOTA_RESULT_SKIPPED,
+        QUOTA_RESULT_ERROR,
+    }
+)
+
+
+def normalize_quota_type(quota_type: str) -> str:
+    """Map a quota type onto the bounded label set used by the quota metrics."""
+    # Unrecognized types collapse to ``user_id`` so label cardinality stays fixed.
+    recognized = quota_type in ALLOWED_QUOTA_TYPES
+    return quota_type if recognized else QUOTA_TYPE_USER_ID
+
+
+def normalize_quota_result(result: str) -> str:
+    """Map a quota result onto the bounded label set used by the quota metrics."""
+    # Unrecognized results collapse to ``error`` so label cardinality stays fixed.
+    recognized = result in ALLOWED_QUOTA_RESULTS
+    return result if recognized else QUOTA_RESULT_ERROR
+
+
 @contextmanager
 def measure_response_duration(path: str) -> Iterator[None]:
     """Measure REST API response duration for a route path.
@@ -273,7 +315,6 @@ def record_llm_inference_duration(
         logger.warning("Failed to update LLM inference duration metric", exc_info=True)
 
 
-
 def record_auth_attempt(auth_module: str, result: str, reason: str) -> None:
     """Record one authentication attempt.
 
@@ -349,3 +390,28 @@ def record_authorization_duration(action: str, result: str, duration: float) ->
         ).observe(duration)
     except (AttributeError, TypeError, ValueError):
         logger.warning("Failed to update authorization duration metric", exc_info=True)
+
+
+def record_quota_check(
+    endpoint_path: str, quota_type: str, result: str, duration: float
+) -> None:
+    """Record the outcome and latency of one quota availability check.
+
+    Args:
+        endpoint_path: API endpoint path used as the ``endpoint`` label.
+        quota_type: Quota subject type (never the subject identifier itself).
+            Values outside the allowed set are recorded as ``user_id``.
+        result: Outcome label. Values outside the allowed set become ``error``.
+        duration: Time spent on the quota check, in seconds.
+    """
+    labels = (
+        endpoint_path,
+        normalize_quota_type(quota_type),
+        normalize_quota_result(result),
+    )
+    try:
+        metrics.quota_checks_total.labels(*labels).inc()
+        metrics.quota_check_duration_seconds.labels(*labels).observe(duration)
+    except (AttributeError, TypeError, ValueError):
+        # Metric updates are best-effort: log and continue instead of propagating.
+        logger.warning("Failed to update quota check metrics", exc_info=True)
diff --git a/tests/unit/app/endpoints/test_responses.py b/tests/unit/app/endpoints/test_responses.py
@@ -18,6 +18,7 @@
 from pytest_mock import MockerFixture
 
 from app.endpoints.responses import (
+    _check_response_quota,
     _is_server_mcp_output_item,
     _sanitize_response_dict,
     _should_filter_mcp_chunk,
@@ -278,6 +279,29 @@ def _request_with_previous_response_id(
     return request
 
 
+def test_check_response_quota_records_unexpected_errors(
+    minimal_config: AppConfig,
+    mocker: MockerFixture,
+) -> None:
+    """Test a non-HTTP quota check error is recorded once and then re-raised."""
+    # Make the quota check fail with something other than HTTPException.
+    backend_error = RuntimeError("quota backend unavailable")
+    mocker.patch(f"{MODULE}.configuration", minimal_config)
+    mocker.patch(f"{MODULE}.check_tokens_available", side_effect=backend_error)
+    mock_record = mocker.patch(f"{MODULE}.recording.record_quota_check")
+
+    with pytest.raises(RuntimeError, match="quota backend unavailable"):
+        _check_response_quota("user-123", "/v1/responses")
+
+    # Unpacking a one-element list asserts that exactly one metric was recorded.
+    (recorded_call,) = mock_record.call_args_list
+    endpoint_path, quota_type, result, duration = recorded_call.args
+    assert endpoint_path == "/v1/responses"
+    assert quota_type == "user_id"
+    assert result == "error"
+    assert duration >= 0
+
+
 class TestResponsesEndpointHandler:
     """Unit tests for responses_endpoint_handler."""
 
diff --git a/tests/unit/app/endpoints/test_rlsapi_v1.py b/tests/unit/app/endpoints/test_rlsapi_v1.py
@@ -1185,6 +1185,7 @@ async def test_infer_quota_skipped_when_not_configured(
     """Test /infer skips quota calls when quota_subject is None (default)."""
     mock_check = mocker.patch("app.endpoints.rlsapi_v1.check_tokens_available")
     mock_consume = mocker.patch("app.endpoints.rlsapi_v1.consume_query_tokens")
+    mock_record = mocker.patch("app.endpoints.rlsapi_v1.recording.record_quota_check")
 
     await infer_endpoint(
         infer_request=RlsapiV1InferRequest(question="How do I list files?"),
@@ -1195,6 +1196,12 @@ async def test_infer_quota_skipped_when_not_configured(
 
     mock_check.assert_not_called()
     mock_consume.assert_not_called()
+    mock_record.assert_called_once_with(
+        rlsapi_v1.ENDPOINT_PATH_INFER,
+        rlsapi_v1.recording.QUOTA_TYPE_DISABLED,
+        rlsapi_v1.recording.QUOTA_RESULT_SKIPPED,
+        0.0,
+    )
 
 
 @pytest.mark.asyncio
@@ -1224,6 +1231,39 @@ async def test_infer_quota_exceeded_returns_429(
     assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
 
 
+@pytest.mark.asyncio
+async def test_infer_quota_records_unexpected_errors(
+    mocker: MockerFixture,
+    mock_quota_config: Callable[[str], None],
+    mock_llm_response: None,
+    mock_auth_resolvers: None,
+    mock_request_factory: Callable[..., Any],
+    mock_background_tasks: Any,
+) -> None:
+    """Test a non-HTTP quota check error is recorded once and then re-raised."""
+    mock_quota_config("user_id")
+    backend_error = RuntimeError("quota backend unavailable")
+    mocker.patch(
+        "app.endpoints.rlsapi_v1.check_tokens_available", side_effect=backend_error
+    )
+    mock_record = mocker.patch("app.endpoints.rlsapi_v1.recording.record_quota_check")
+
+    with pytest.raises(RuntimeError, match="quota backend unavailable"):
+        await infer_endpoint(
+            infer_request=RlsapiV1InferRequest(question="How do I list files?"),
+            request=mock_request_factory(),
+            background_tasks=mock_background_tasks,
+            auth=MOCK_AUTH,
+        )
+
+    (recorded_call,) = mock_record.call_args_list  # exactly one metric recorded
+    endpoint_path, quota_type, result, duration = recorded_call.args
+    assert endpoint_path == rlsapi_v1.ENDPOINT_PATH_INFER
+    assert quota_type == "user_id"
+    assert result == rlsapi_v1.recording.QUOTA_RESULT_ERROR
+    assert duration >= 0
+
+
 @pytest.mark.parametrize(
     ("quota_subject", "rh_identity_setup", "expected_subject"),
     [
diff --git a/tests/unit/metrics/test_recording.py b/tests/unit/metrics/test_recording.py