Skip to content

Commit 0eccec0

Browse files
committed
feat: add quota monitoring metrics
Signed-off-by: Major Hayden <major@redhat.com>
1 parent 1ed8411 commit 0eccec0

7 files changed

Lines changed: 320 additions & 9 deletions

File tree

src/app/endpoints/responses.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,37 @@ def _get_user_agent(request: Request) -> Optional[str]:
138138
return sanitized or None
139139

140140

141+
def _check_response_quota(user_id: str, endpoint_path: str) -> None:
    """Verify the user has quota available and record the outcome as a metric.

    The check is timed with a monotonic clock and exactly one quota metric is
    recorded per call, labelled with the ``user_id`` quota type.

    Args:
        user_id: Identifier passed to the quota limiters as the quota subject.
        endpoint_path: API endpoint path used as the metric label.

    Raises:
        HTTPException: Re-raised from ``check_tokens_available`` after the
            check is recorded as a failure.
        Exception: Any other error from the quota check is recorded with the
            ``error`` result and re-raised unchanged.
    """
    started = time.monotonic()

    def _emit(outcome: str) -> None:
        # Elapsed time is sampled here so it covers the whole check attempt.
        recording.record_quota_check(
            endpoint_path,
            recording.QUOTA_TYPE_USER_ID,
            outcome,
            time.monotonic() - started,
        )

    try:
        check_tokens_available(configuration.quota_limiters, user_id)
    except HTTPException:
        _emit(recording.QUOTA_RESULT_FAILURE)
        raise
    except Exception:  # pylint: disable=broad-exception-caught
        # Unexpected quota backend failures still need a bounded metric before
        # they propagate to the endpoint error handling layer.
        _emit(recording.QUOTA_RESULT_ERROR)
        raise
    else:
        _emit(recording.QUOTA_RESULT_SUCCESS)
170+
171+
141172
responses_response: dict[int | str, dict[str, Any]] = {
142173
200: ResponsesResponse.openapi_response(),
143174
401: UnauthorizedResponse.openapi_response(
@@ -358,11 +389,12 @@ async def responses_endpoint_handler(
358389
started_at = datetime.now(UTC)
359390
rh_identity_context = get_rh_identity_context(request)
360391
user_id, _, _, token = auth
392+
endpoint_path = ENDPOINT_PATH_RESPONSES
361393

362394
await check_mcp_auth(configuration, mcp_headers, token, request.headers)
363395

364396
# Check token availability
365-
check_tokens_available(configuration.quota_limiters, user_id)
397+
_check_response_quota(user_id, endpoint_path)
366398

367399
# Enforce RBAC: optionally disallow overriding model in requests
368400
validate_model_provider_override(

src/app/endpoints/rlsapi_v1.py

Lines changed: 64 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,63 @@ def _resolve_quota_subject(request: Request, auth: AuthTuple) -> Optional[str]:
575575
return system_id
576576

577577

578+
def _check_infer_quota(
    request: Request, auth: AuthTuple, endpoint_path: str
) -> Optional[str]:
    """Check infer quota availability and record bounded quota metrics.

    Resolves the quota subject from the request and auth context, then
    verifies that the subject has tokens available. Every outcome (success,
    failure, error, skipped) is passed to ``recording.record_quota_check``.

    Args:
        request: The incoming FastAPI request used to resolve the quota subject.
        auth: Authentication tuple ``(user_id, username, skip_userid_check, token)``.
        endpoint_path: API endpoint path for metric labeling.

    Returns:
        The resolved quota subject identifier, or ``None`` when
        ``_resolve_quota_subject`` yields no subject and the check is skipped.

    Raises:
        HTTPException: Re-raised from the quota limiter when the subject has
            exhausted its token allowance (HTTP 429).
        Exception: Any other error raised by the quota check is recorded with
            the ``error`` result and re-raised unchanged.
    """
    quota_id = _resolve_quota_subject(request, auth)
    # Label with the configured subject type, never the subject identifier;
    # fall back to the shared "disabled" label when no subject is configured.
    quota_type = configuration.rlsapi_v1.quota_subject or recording.QUOTA_TYPE_DISABLED
    if quota_id is None:
        # Nothing was checked, so the skipped outcome carries a zero duration.
        recording.record_quota_check(
            endpoint_path, quota_type, recording.QUOTA_RESULT_SKIPPED, 0.0
        )
        return None

    quota_start_time = time.monotonic()
    try:
        check_tokens_available(configuration.quota_limiters, quota_id)
    except HTTPException:
        recording.record_quota_check(
            endpoint_path,
            quota_type,
            recording.QUOTA_RESULT_FAILURE,
            time.monotonic() - quota_start_time,
        )
        raise
    except Exception:  # pylint: disable=broad-exception-caught
        # Unexpected quota backend failures are still recorded before they
        # propagate to the caller.
        recording.record_quota_check(
            endpoint_path,
            quota_type,
            recording.QUOTA_RESULT_ERROR,
            time.monotonic() - quota_start_time,
        )
        raise
    recording.record_quota_check(
        endpoint_path,
        quota_type,
        recording.QUOTA_RESULT_SUCCESS,
        time.monotonic() - quota_start_time,
    )
    return quota_id
633+
634+
578635
def _build_infer_response(
579636
response_text: str,
580637
request_id: str,
@@ -733,16 +790,17 @@ async def infer_endpoint( # pylint: disable=R0914,R0915
733790

734791
logger.info("Processing rlsapi v1 /infer request %s", request_id)
735792

736-
# Quota enforcement: resolve subject and check availability before any work.
737-
# No-op when quota_subject is not configured or no quota limiters exist.
738-
quota_id = _resolve_quota_subject(request, auth)
739-
if quota_id is not None:
793+
# Quota enforcement: check availability before any work and record metrics for
794+
# both enforced and disabled quota paths.
795+
quota_subject = configuration.rlsapi_v1.quota_subject
796+
if quota_subject is not None:
740797
logger.info(
741798
"Checking quota availability for rlsapi v1 request %s using subject type %s",
742799
request_id,
743-
configuration.rlsapi_v1.quota_subject,
800+
quota_subject,
744801
)
745-
check_tokens_available(configuration.quota_limiters, quota_id)
802+
quota_id = _check_infer_quota(request, auth, endpoint_path)
803+
if quota_id is not None:
746804
logger.info(
747805
"Quota availability check passed for rlsapi v1 request %s", request_id
748806
)

src/metrics/__init__.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,21 @@
5151
5.0,
5252
float("inf"),
5353
)
54+
55+
# Upper bounds, in seconds, for the quota check latency histogram. The finite
# buckets span 1 ms to 5 s; the explicit +Inf bucket catches anything slower.
QUOTA_CHECK_DURATION_BUCKETS: Final[tuple[float, ...]] = (
    0.001,  # 1 ms
    0.005,  # 5 ms
    0.01,  # 10 ms
    0.025,  # 25 ms
    0.05,  # 50 ms
    0.1,  # 100 ms
    0.25,  # 250 ms
    0.5,  # 500 ms
    1.0,  # 1 s
    2.5,  # 2.5 s
    5.0,  # 5 s
    float("inf"),  # overflow bucket
)
5469
# Counter to track REST API calls
5570
# This will be used to count how many times each API endpoint is called
5671
# and the status code of the response
@@ -144,3 +159,21 @@
144159
["action", "result"],
145160
buckets=AUTHORIZATION_DURATION_BUCKETS,
146161
)
162+
163+
# Quota availability checks performed before a request is served, counted per
# outcome. Cardinality stays bounded because every label comes from a fixed
# set: ``endpoint`` is a static route pattern, ``quota_type`` is a configured
# quota subject, and ``result`` is one terminal state chosen by the recording
# helper.
quota_checks_total: Final[Counter] = Counter(
    "ls_quota_checks_total",
    "Quota availability checks",
    labelnames=["endpoint", "quota_type", "result"],
)

# Latency of quota availability checks, observed with sub-second buckets. It
# carries the same bounded endpoint/quota_type/result labels as the counter.
quota_check_duration_seconds: Final[Histogram] = Histogram(
    "ls_quota_check_duration_seconds",
    "Quota availability check duration",
    labelnames=["endpoint", "quota_type", "result"],
    buckets=QUOTA_CHECK_DURATION_BUCKETS,
)

src/metrics/recording.py

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,48 @@ def normalize_authorization_result(result: str) -> str:
134134
return result
135135
return AUTHORIZATION_RESULT_ERROR
136136

137+
138+
# Bounded label values for the quota metrics. ``quota_type`` names the kind of
# quota subject (never the subject identifier itself); ``result`` names the
# terminal state of one quota availability check.
QUOTA_TYPE_USER_ID: Final[str] = "user_id"
QUOTA_TYPE_ORG_ID: Final[str] = "org_id"
QUOTA_TYPE_SYSTEM_ID: Final[str] = "system_id"
QUOTA_TYPE_DISABLED: Final[str] = "disabled"
# Fallback label applied to any quota type outside ALLOWED_QUOTA_TYPES.
QUOTA_TYPE_UNKNOWN: Final[str] = "unknown"
QUOTA_RESULT_SUCCESS: Final[str] = "success"
QUOTA_RESULT_FAILURE: Final[str] = "failure"
QUOTA_RESULT_SKIPPED: Final[str] = "skipped"
QUOTA_RESULT_ERROR: Final[str] = "error"

ALLOWED_QUOTA_TYPES: Final[frozenset[str]] = frozenset(
    {
        QUOTA_TYPE_USER_ID,
        QUOTA_TYPE_ORG_ID,
        QUOTA_TYPE_SYSTEM_ID,
        QUOTA_TYPE_DISABLED,
    }
)
ALLOWED_QUOTA_RESULTS: Final[frozenset[str]] = frozenset(
    {
        QUOTA_RESULT_SUCCESS,
        QUOTA_RESULT_FAILURE,
        QUOTA_RESULT_SKIPPED,
        QUOTA_RESULT_ERROR,
    }
)


def normalize_quota_type(quota_type: str) -> str:
    """Return a bounded quota type label for Prometheus cardinality safety.

    Args:
        quota_type: Candidate quota type label.

    Returns:
        ``quota_type`` unchanged when it is one of ``ALLOWED_QUOTA_TYPES``,
        otherwise ``QUOTA_TYPE_UNKNOWN``.
    """
    if quota_type in ALLOWED_QUOTA_TYPES:
        return quota_type
    return QUOTA_TYPE_UNKNOWN


def normalize_quota_result(result: str) -> str:
    """Return a bounded quota result label for Prometheus cardinality safety.

    Args:
        result: Candidate quota result label.

    Returns:
        ``result`` unchanged when it is one of ``ALLOWED_QUOTA_RESULTS``,
        otherwise ``QUOTA_RESULT_ERROR``.
    """
    if result in ALLOWED_QUOTA_RESULTS:
        return result
    return QUOTA_RESULT_ERROR
177+
178+
137179
@contextmanager
138180
def measure_response_duration(path: str) -> Iterator[None]:
139181
"""Measure REST API response duration for a route path.
@@ -278,7 +320,6 @@ def record_llm_inference_duration(
278320
logger.warning("Failed to update LLM inference duration metric", exc_info=True)
279321

280322

281-
282323
def record_auth_attempt(auth_module: str, result: str, reason: str) -> None:
283324
"""Record one authentication attempt.
284325
@@ -354,3 +395,31 @@ def record_authorization_duration(action: str, result: str, duration: float) ->
354395
).observe(duration)
355396
except (AttributeError, TypeError, ValueError):
356397
logger.warning("Failed to update authorization duration metric", exc_info=True)
398+
399+
400+
def record_quota_check(
    endpoint_path: str, quota_type: str, result: str, duration: float
) -> None:
    """Record one quota availability check in the counter and the histogram.

    Both metrics receive the same label values. A failure to update either
    metric is logged as a warning and does not propagate to the caller.

    Args:
        endpoint_path: API endpoint path for metric labeling.
        quota_type: Bounded quota subject type, not the subject identifier.
            Out-of-set values are recorded as ``unknown``.
        result: Bounded result label. Out-of-set values are recorded as
            ``error``.
        duration: Quota check duration in seconds.
    """
    # Normalize once so the counter and the histogram share identical labels.
    label_values = (
        endpoint_path,
        normalize_quota_type(quota_type),
        normalize_quota_result(result),
    )
    try:
        metrics.quota_checks_total.labels(*label_values).inc()
    except (AttributeError, TypeError, ValueError):
        logger.warning("Failed to update quota check counter", exc_info=True)
    try:
        metrics.quota_check_duration_seconds.labels(*label_values).observe(duration)
    except (AttributeError, TypeError, ValueError):
        logger.warning("Failed to update quota check duration metric", exc_info=True)

tests/unit/app/endpoints/test_responses.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from pytest_mock import MockerFixture
1919

2020
from app.endpoints.responses import (
21+
_check_response_quota,
2122
_is_server_mcp_output_item,
2223
_sanitize_response_dict,
2324
_should_filter_mcp_chunk,
@@ -278,6 +279,29 @@ def _request_with_previous_response_id(
278279
return request
279280

280281

282+
def test_check_response_quota_records_unexpected_errors(
    minimal_config: AppConfig,
    mocker: MockerFixture,
) -> None:
    """Verify an unexpected quota error is recorded as ``error`` and re-raised."""
    mocker.patch(f"{MODULE}.configuration", minimal_config)
    backend_failure = RuntimeError("quota backend unavailable")
    mocker.patch(f"{MODULE}.check_tokens_available", side_effect=backend_failure)
    recorder = mocker.patch(f"{MODULE}.recording.record_quota_check")

    with pytest.raises(RuntimeError, match="quota backend unavailable"):
        _check_response_quota("user-123", "/v1/responses")

    # Exactly one metric is recorded, with four positional arguments.
    recorder.assert_called_once()
    recorded_path, recorded_type, recorded_result, elapsed = recorder.call_args.args
    assert (recorded_path, recorded_type, recorded_result) == (
        "/v1/responses",
        "user_id",
        "error",
    )
    assert elapsed >= 0
303+
304+
281305
class TestResponsesEndpointHandler:
282306
"""Unit tests for responses_endpoint_handler."""
283307

tests/unit/app/endpoints/test_rlsapi_v1.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1185,6 +1185,7 @@ async def test_infer_quota_skipped_when_not_configured(
11851185
"""Test /infer skips quota calls when quota_subject is None (default)."""
11861186
mock_check = mocker.patch("app.endpoints.rlsapi_v1.check_tokens_available")
11871187
mock_consume = mocker.patch("app.endpoints.rlsapi_v1.consume_query_tokens")
1188+
mock_record = mocker.patch("app.endpoints.rlsapi_v1.recording.record_quota_check")
11881189

11891190
await infer_endpoint(
11901191
infer_request=RlsapiV1InferRequest(question="How do I list files?"),
@@ -1195,6 +1196,12 @@ async def test_infer_quota_skipped_when_not_configured(
11951196

11961197
mock_check.assert_not_called()
11971198
mock_consume.assert_not_called()
1199+
mock_record.assert_called_once_with(
1200+
rlsapi_v1.ENDPOINT_PATH_INFER,
1201+
rlsapi_v1.recording.QUOTA_TYPE_DISABLED,
1202+
rlsapi_v1.recording.QUOTA_RESULT_SKIPPED,
1203+
0.0,
1204+
)
11981205

11991206

12001207
@pytest.mark.asyncio
@@ -1224,6 +1231,39 @@ async def test_infer_quota_exceeded_returns_429(
12241231
assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
12251232

12261233

1234+
@pytest.mark.asyncio
async def test_infer_quota_records_unexpected_errors(
    mocker: MockerFixture,
    mock_quota_config: Callable[[str], None],
    mock_llm_response: None,
    mock_auth_resolvers: None,
    mock_request_factory: Callable[..., Any],
    mock_background_tasks: Any,
) -> None:
    """Verify /infer records an ``error`` quota metric, then re-raises."""
    mock_quota_config("user_id")
    backend_failure = RuntimeError("quota backend unavailable")
    mocker.patch(
        "app.endpoints.rlsapi_v1.check_tokens_available",
        side_effect=backend_failure,
    )
    recorder = mocker.patch("app.endpoints.rlsapi_v1.recording.record_quota_check")
    infer_request = RlsapiV1InferRequest(question="How do I list files?")

    with pytest.raises(RuntimeError, match="quota backend unavailable"):
        await infer_endpoint(
            infer_request=infer_request,
            request=mock_request_factory(),
            background_tasks=mock_background_tasks,
            auth=MOCK_AUTH,
        )

    # Exactly one metric is recorded, with four positional arguments.
    recorder.assert_called_once()
    recorded_path, recorded_type, recorded_result, elapsed = recorder.call_args.args
    assert (recorded_path, recorded_type, recorded_result) == (
        rlsapi_v1.ENDPOINT_PATH_INFER,
        "user_id",
        rlsapi_v1.recording.QUOTA_RESULT_ERROR,
    )
    assert elapsed >= 0
1265+
1266+
12271267
@pytest.mark.parametrize(
12281268
("quota_subject", "rh_identity_setup", "expected_subject"),
12291269
[

0 commit comments

Comments
 (0)