Skip to content

Commit f71930b

Browse files
committed
feat: add quota monitoring metrics
Signed-off-by: Major Hayden <major@redhat.com>
1 parent 6a6c51f commit f71930b

7 files changed

Lines changed: 294 additions & 9 deletions

File tree

src/app/endpoints/responses.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,37 @@ def _get_user_agent(request: Request) -> Optional[str]:
138138
return sanitized or None
139139

140140

141+
def _check_response_quota(user_id: str, endpoint_path: str) -> None:
    """Run the quota availability check for a user and record its outcome.

    Exactly one quota metric is recorded per call, labelled with the
    ``user_id`` quota type and the time elapsed since the check started.

    Args:
        user_id: Subject passed to ``check_tokens_available``.
        endpoint_path: Endpoint label attached to the recorded metric.

    Raises:
        HTTPException: Re-raised from ``check_tokens_available`` after a
            ``failure`` result has been recorded.
        Exception: Any other error from the check, re-raised after an
            ``error`` result has been recorded.
    """
    started = time.monotonic()

    def _emit(outcome: str) -> None:
        # Elapsed time is sampled here so every outcome is timed the same way.
        recording.record_quota_check(
            endpoint_path,
            recording.QUOTA_TYPE_USER_ID,
            outcome,
            time.monotonic() - started,
        )

    try:
        check_tokens_available(configuration.quota_limiters, user_id)
    except HTTPException:
        _emit(recording.QUOTA_RESULT_FAILURE)
        raise
    except Exception:  # pylint: disable=broad-exception-caught
        # Unexpected failures are still counted before they propagate to the
        # endpoint error handling layer.
        _emit(recording.QUOTA_RESULT_ERROR)
        raise
    else:
        _emit(recording.QUOTA_RESULT_SUCCESS)
170+
171+
141172
responses_response: dict[int | str, dict[str, Any]] = {
142173
200: ResponsesResponse.openapi_response(),
143174
401: UnauthorizedResponse.openapi_response(
@@ -358,11 +389,12 @@ async def responses_endpoint_handler(
358389
started_at = datetime.now(UTC)
359390
rh_identity_context = get_rh_identity_context(request)
360391
user_id, _, _, token = auth
392+
endpoint_path = "/v1/responses"
361393

362394
await check_mcp_auth(configuration, mcp_headers, token, request.headers)
363395

364396
# Check token availability
365-
check_tokens_available(configuration.quota_limiters, user_id)
397+
_check_response_quota(user_id, endpoint_path)
366398

367399
# Enforce RBAC: optionally disallow overriding model in requests
368400
validate_model_provider_override(

src/app/endpoints/rlsapi_v1.py

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,46 @@ def _resolve_quota_subject(request: Request, auth: AuthTuple) -> Optional[str]:
575575
return system_id
576576

577577

578+
def _check_infer_quota(
    request: Request, auth: AuthTuple, endpoint_path: str
) -> Optional[str]:
    """Check infer quota availability and record bounded quota metrics.

    Exactly one quota metric is recorded per call. The quota type label is the
    configured ``rlsapi_v1.quota_subject``, or the shared ``disabled`` label
    when no subject is configured.

    Args:
        request: Incoming request, forwarded to ``_resolve_quota_subject``.
        auth: Authentication tuple, forwarded to ``_resolve_quota_subject``.
        endpoint_path: Endpoint label attached to the recorded metric.

    Returns:
        The resolved quota subject identifier when the check passed, or
        ``None`` when no subject was resolved and the check was skipped.

    Raises:
        HTTPException: Re-raised from ``check_tokens_available`` after a
            ``failure`` result has been recorded.
        Exception: Any other error from the check, re-raised after an
            ``error`` result has been recorded.
    """
    quota_id = _resolve_quota_subject(request, auth)
    # Use the shared constant rather than a literal so this label cannot drift
    # from the allow-list in the recording module.
    quota_type = (
        configuration.rlsapi_v1.quota_subject or recording.QUOTA_TYPE_DISABLED
    )
    if quota_id is None:
        # Nothing to enforce: record the skip with a zero duration.
        recording.record_quota_check(
            endpoint_path, quota_type, recording.QUOTA_RESULT_SKIPPED, 0.0
        )
        return None

    quota_start_time = time.monotonic()
    try:
        check_tokens_available(configuration.quota_limiters, quota_id)
    except HTTPException:
        recording.record_quota_check(
            endpoint_path,
            quota_type,
            recording.QUOTA_RESULT_FAILURE,
            time.monotonic() - quota_start_time,
        )
        raise
    except Exception:  # pylint: disable=broad-exception-caught
        # Unexpected failures are still counted before they propagate.
        recording.record_quota_check(
            endpoint_path,
            quota_type,
            recording.QUOTA_RESULT_ERROR,
            time.monotonic() - quota_start_time,
        )
        raise
    else:
        recording.record_quota_check(
            endpoint_path,
            quota_type,
            recording.QUOTA_RESULT_SUCCESS,
            time.monotonic() - quota_start_time,
        )
    return quota_id
616+
617+
578618
def _build_infer_response(
579619
response_text: str,
580620
request_id: str,
@@ -733,16 +773,17 @@ async def infer_endpoint( # pylint: disable=R0914,R0915
733773

734774
logger.info("Processing rlsapi v1 /infer request %s", request_id)
735775

736-
# Quota enforcement: resolve subject and check availability before any work.
737-
# No-op when quota_subject is not configured or no quota limiters exist.
738-
quota_id = _resolve_quota_subject(request, auth)
739-
if quota_id is not None:
776+
# Quota enforcement: check availability before any work and record metrics for
777+
# both enforced and disabled quota paths.
778+
quota_subject = configuration.rlsapi_v1.quota_subject
779+
if quota_subject is not None:
740780
logger.info(
741781
"Checking quota availability for rlsapi v1 request %s using subject type %s",
742782
request_id,
743-
configuration.rlsapi_v1.quota_subject,
783+
quota_subject,
744784
)
745-
check_tokens_available(configuration.quota_limiters, quota_id)
785+
quota_id = _check_infer_quota(request, auth, endpoint_path)
786+
if quota_id is not None:
746787
logger.info(
747788
"Quota availability check passed for rlsapi v1 request %s", request_id
748789
)

src/metrics/__init__.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,21 @@
5151
5.0,
5252
float("inf"),
5353
)
54+
55+
# Upper bounds, in seconds, for the quota check duration histogram.
QUOTA_CHECK_DURATION_BUCKETS: Final[tuple[float, ...]] = (
    # millisecond range
    0.001, 0.005,
    # tens of milliseconds
    0.01, 0.025, 0.05,
    # tenths of a second
    0.1, 0.25, 0.5,
    # whole seconds
    1.0, 2.5, 5.0,
    # catch-all bucket
    float("inf"),
)
5469
# Counter to track REST API calls
5570
# This will be used to count how many times each API endpoint is called
5671
# and the status code of the response
@@ -144,3 +159,21 @@
144159
["action", "result"],
145160
buckets=AUTHORIZATION_DURATION_BUCKETS,
146161
)
162+
163+
# Counts pre-request quota checks. Every label has to stay bounded:
#   endpoint   - a static route pattern
#   quota_type - the configured quota subject
#   result     - one terminal state chosen by the recording helper
quota_checks_total: Final[Counter] = Counter(
    "ls_quota_checks_total",
    "Quota availability checks",
    labelnames=["endpoint", "quota_type", "result"],
)

# Measures how long a quota availability check takes, using sub-second
# buckets and the same bounded endpoint/quota_type/result labels as the
# counter above.
quota_check_duration_seconds: Final[Histogram] = Histogram(
    "ls_quota_check_duration_seconds",
    "Quota availability check duration",
    labelnames=["endpoint", "quota_type", "result"],
    buckets=QUOTA_CHECK_DURATION_BUCKETS,
)

src/metrics/recording.py

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,48 @@ def normalize_authorization_result(result: str) -> str:
129129
return result
130130
return AUTHORIZATION_RESULT_ERROR
131131

132+
133+
# Quota subject type labels (the kind of subject, never its identifier).
QUOTA_TYPE_USER_ID: Final[str] = "user_id"
QUOTA_TYPE_ORG_ID: Final[str] = "org_id"
QUOTA_TYPE_SYSTEM_ID: Final[str] = "system_id"
QUOTA_TYPE_DISABLED: Final[str] = "disabled"

# Terminal outcomes of a quota availability check.
QUOTA_RESULT_SUCCESS: Final[str] = "success"
QUOTA_RESULT_FAILURE: Final[str] = "failure"
QUOTA_RESULT_SKIPPED: Final[str] = "skipped"
QUOTA_RESULT_ERROR: Final[str] = "error"

# Allow-lists consulted by the normalize_* helpers to keep label values bounded.
ALLOWED_QUOTA_TYPES: Final[frozenset[str]] = frozenset(
    (
        QUOTA_TYPE_USER_ID,
        QUOTA_TYPE_ORG_ID,
        QUOTA_TYPE_SYSTEM_ID,
        QUOTA_TYPE_DISABLED,
    )
)
ALLOWED_QUOTA_RESULTS: Final[frozenset[str]] = frozenset(
    (
        QUOTA_RESULT_SUCCESS,
        QUOTA_RESULT_FAILURE,
        QUOTA_RESULT_SKIPPED,
        QUOTA_RESULT_ERROR,
    )
)
158+
159+
160+
def normalize_quota_type(quota_type: str) -> str:
    """Map a quota type onto the bounded set of allowed label values.

    Args:
        quota_type: Candidate quota type label.

    Returns:
        ``quota_type`` unchanged when it is in ``ALLOWED_QUOTA_TYPES``;
        otherwise ``QUOTA_TYPE_USER_ID``.
    """
    return quota_type if quota_type in ALLOWED_QUOTA_TYPES else QUOTA_TYPE_USER_ID
165+
166+
167+
def normalize_quota_result(result: str) -> str:
    """Map a quota result onto the bounded set of allowed label values.

    Args:
        result: Candidate quota result label.

    Returns:
        ``result`` unchanged when it is in ``ALLOWED_QUOTA_RESULTS``;
        otherwise ``QUOTA_RESULT_ERROR``.
    """
    return result if result in ALLOWED_QUOTA_RESULTS else QUOTA_RESULT_ERROR
172+
173+
132174
@contextmanager
133175
def measure_response_duration(path: str) -> Iterator[None]:
134176
"""Measure REST API response duration for a route path.
@@ -273,7 +315,6 @@ def record_llm_inference_duration(
273315
logger.warning("Failed to update LLM inference duration metric", exc_info=True)
274316

275317

276-
277318
def record_auth_attempt(auth_module: str, result: str, reason: str) -> None:
278319
"""Record one authentication attempt.
279320
@@ -349,3 +390,28 @@ def record_authorization_duration(action: str, result: str, duration: float) ->
349390
).observe(duration)
350391
except (AttributeError, TypeError, ValueError):
351392
logger.warning("Failed to update authorization duration metric", exc_info=True)
393+
394+
395+
def record_quota_check(
    endpoint_path: str, quota_type: str, result: str, duration: float
) -> None:
    """Count one quota availability check and observe its duration.

    Both the counter and the histogram receive the same three label values.
    ``AttributeError``, ``TypeError`` and ``ValueError`` raised while updating
    the metrics are logged as a warning and not propagated.

    Args:
        endpoint_path: API endpoint path for metric labeling.
        quota_type: Bounded quota subject type, not the subject identifier.
            Out-of-set values are recorded as ``user_id``.
        result: Bounded result label. Out-of-set values are recorded as
            ``error``.
        duration: Quota check duration in seconds.
    """
    # Normalize once so the counter and histogram can never disagree on labels.
    label_values = (
        endpoint_path,
        normalize_quota_type(quota_type),
        normalize_quota_result(result),
    )
    try:
        metrics.quota_checks_total.labels(*label_values).inc()
        metrics.quota_check_duration_seconds.labels(*label_values).observe(duration)
    except (AttributeError, TypeError, ValueError):
        logger.warning("Failed to update quota check metrics", exc_info=True)

tests/unit/app/endpoints/test_responses.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from pytest_mock import MockerFixture
1919

2020
from app.endpoints.responses import (
21+
_check_response_quota,
2122
_is_server_mcp_output_item,
2223
_sanitize_response_dict,
2324
_should_filter_mcp_chunk,
@@ -278,6 +279,29 @@ def _request_with_previous_response_id(
278279
return request
279280

280281

282+
def test_check_response_quota_records_unexpected_errors(
    minimal_config: AppConfig,
    mocker: MockerFixture,
) -> None:
    """Verify an unexpected quota error is recorded as ``error`` and re-raised."""
    mocker.patch(f"{MODULE}.configuration", minimal_config)
    backend_failure = RuntimeError("quota backend unavailable")
    mocker.patch(f"{MODULE}.check_tokens_available", side_effect=backend_failure)
    record_spy = mocker.patch(f"{MODULE}.recording.record_quota_check")

    with pytest.raises(RuntimeError, match="quota backend unavailable"):
        _check_response_quota("user-123", "/v1/responses")

    # Exactly one metric, carrying the bounded labels and a sane duration.
    record_spy.assert_called_once()
    seen_path, seen_type, seen_result, elapsed = record_spy.call_args.args
    assert seen_path == "/v1/responses"
    assert seen_type == "user_id"
    assert seen_result == "error"
    assert elapsed >= 0
303+
304+
281305
class TestResponsesEndpointHandler:
282306
"""Unit tests for responses_endpoint_handler."""
283307

tests/unit/app/endpoints/test_rlsapi_v1.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1185,6 +1185,7 @@ async def test_infer_quota_skipped_when_not_configured(
11851185
"""Test /infer skips quota calls when quota_subject is None (default)."""
11861186
mock_check = mocker.patch("app.endpoints.rlsapi_v1.check_tokens_available")
11871187
mock_consume = mocker.patch("app.endpoints.rlsapi_v1.consume_query_tokens")
1188+
mock_record = mocker.patch("app.endpoints.rlsapi_v1.recording.record_quota_check")
11881189

11891190
await infer_endpoint(
11901191
infer_request=RlsapiV1InferRequest(question="How do I list files?"),
@@ -1195,6 +1196,12 @@ async def test_infer_quota_skipped_when_not_configured(
11951196

11961197
mock_check.assert_not_called()
11971198
mock_consume.assert_not_called()
1199+
mock_record.assert_called_once_with(
1200+
rlsapi_v1.ENDPOINT_PATH_INFER,
1201+
rlsapi_v1.recording.QUOTA_TYPE_DISABLED,
1202+
rlsapi_v1.recording.QUOTA_RESULT_SKIPPED,
1203+
0.0,
1204+
)
11981205

11991206

12001207
@pytest.mark.asyncio
@@ -1224,6 +1231,39 @@ async def test_infer_quota_exceeded_returns_429(
12241231
assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
12251232

12261233

1234+
@pytest.mark.asyncio
async def test_infer_quota_records_unexpected_errors(
    mocker: MockerFixture,
    mock_quota_config: Callable[[str], None],
    mock_llm_response: None,
    mock_auth_resolvers: None,
    mock_request_factory: Callable[..., Any],
    mock_background_tasks: Any,
) -> None:
    """Verify /infer records an ``error`` quota metric and re-raises the failure."""
    mock_quota_config("user_id")
    backend_failure = RuntimeError("quota backend unavailable")
    mocker.patch(
        "app.endpoints.rlsapi_v1.check_tokens_available",
        side_effect=backend_failure,
    )
    record_spy = mocker.patch("app.endpoints.rlsapi_v1.recording.record_quota_check")

    with pytest.raises(RuntimeError, match="quota backend unavailable"):
        await infer_endpoint(
            infer_request=RlsapiV1InferRequest(question="How do I list files?"),
            request=mock_request_factory(),
            background_tasks=mock_background_tasks,
            auth=MOCK_AUTH,
        )

    # Exactly one metric, labelled with the configured subject type.
    record_spy.assert_called_once()
    seen_path, seen_type, seen_result, elapsed = record_spy.call_args.args
    assert seen_path == rlsapi_v1.ENDPOINT_PATH_INFER
    assert seen_type == "user_id"
    assert seen_result == rlsapi_v1.recording.QUOTA_RESULT_ERROR
    assert elapsed >= 0
1266+
12271267
@pytest.mark.parametrize(
12281268
("quota_subject", "rh_identity_setup", "expected_subject"),
12291269
[

0 commit comments

Comments
 (0)