Skip to content

Commit 6971abe

Browse files
committed
feat: add quota monitoring metrics
Signed-off-by: Major Hayden <major@redhat.com>
1 parent f7b927a commit 6971abe

7 files changed

Lines changed: 296 additions & 8 deletions

File tree

src/app/endpoints/responses.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import asyncio
66
import json
7+
import time
78
from collections.abc import AsyncIterator, Sequence
89
from datetime import UTC, datetime
910
from typing import Annotated, Any, Final, NoReturn, Optional, cast
@@ -39,6 +40,7 @@
3940
from configuration import configuration
4041
from constants import ENDPOINT_PATH_RESPONSES, SUBSTITUTED_INSTRUCTIONS_PLACEHOLDER
4142
from log import get_logger
43+
from metrics import recording
4244
from models.api.requests import ResponsesRequest
4345
from models.api.responses.constants import UNAUTHORIZED_OPENAPI_EXAMPLES_WITH_MCP_OAUTH
4446
from models.api.responses.error import (
@@ -133,6 +135,37 @@ def _get_user_agent(request: Request) -> Optional[str]:
133135
return sanitized or None
134136

135137

138+
def _check_response_quota(user_id: str, endpoint_path: str) -> None:
139+
"""Check response quota availability and record bounded quota metrics."""
140+
quota_start_time = time.monotonic()
141+
try:
142+
check_tokens_available(configuration.quota_limiters, user_id)
143+
except HTTPException:
144+
recording.record_quota_check(
145+
endpoint_path,
146+
recording.QUOTA_TYPE_USER_ID,
147+
recording.QUOTA_RESULT_FAILURE,
148+
time.monotonic() - quota_start_time,
149+
)
150+
raise
151+
except Exception: # pylint: disable=broad-exception-caught
152+
# Unexpected quota backend failures still need bounded metrics before
153+
# propagating to the endpoint error handling layer.
154+
recording.record_quota_check(
155+
endpoint_path,
156+
recording.QUOTA_TYPE_USER_ID,
157+
recording.QUOTA_RESULT_ERROR,
158+
time.monotonic() - quota_start_time,
159+
)
160+
raise
161+
recording.record_quota_check(
162+
endpoint_path,
163+
recording.QUOTA_TYPE_USER_ID,
164+
recording.QUOTA_RESULT_SUCCESS,
165+
time.monotonic() - quota_start_time,
166+
)
167+
168+
136169
responses_response: dict[int | str, dict[str, Any]] = {
137170
200: ResponsesResponse.openapi_response(),
138171
401: UnauthorizedResponse.openapi_response(
@@ -501,11 +534,12 @@ async def responses_endpoint_handler(
501534
started_at = datetime.now(UTC)
502535
rh_identity_context = get_rh_identity_context(request)
503536
user_id, _, _, token = auth
537+
endpoint_path = "/v1/responses"
504538

505539
await check_mcp_auth(configuration, mcp_headers, token, request.headers)
506540

507541
# Check token availability
508-
check_tokens_available(configuration.quota_limiters, user_id)
542+
_check_response_quota(user_id, endpoint_path)
509543

510544
# Enforce RBAC: optionally disallow overriding model in requests
511545
validate_model_provider_override(

src/app/endpoints/rlsapi_v1.py

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,46 @@ def _resolve_quota_subject(request: Request, auth: AuthTuple) -> Optional[str]:
575575
return system_id
576576

577577

578+
def _check_infer_quota(
579+
request: Request, auth: AuthTuple, endpoint_path: str
580+
) -> Optional[str]:
581+
"""Check infer quota availability and record bounded quota metrics."""
582+
quota_id = _resolve_quota_subject(request, auth)
583+
quota_type = configuration.rlsapi_v1.quota_subject or "disabled"
584+
if quota_id is None:
585+
recording.record_quota_check(
586+
endpoint_path, quota_type, recording.QUOTA_RESULT_SKIPPED, 0.0
587+
)
588+
return None
589+
590+
quota_start_time = time.monotonic()
591+
try:
592+
check_tokens_available(configuration.quota_limiters, quota_id)
593+
except HTTPException:
594+
recording.record_quota_check(
595+
endpoint_path,
596+
quota_type,
597+
recording.QUOTA_RESULT_FAILURE,
598+
time.monotonic() - quota_start_time,
599+
)
600+
raise
601+
except Exception: # pylint: disable=broad-exception-caught
602+
recording.record_quota_check(
603+
endpoint_path,
604+
quota_type,
605+
recording.QUOTA_RESULT_ERROR,
606+
time.monotonic() - quota_start_time,
607+
)
608+
raise
609+
recording.record_quota_check(
610+
endpoint_path,
611+
quota_type,
612+
recording.QUOTA_RESULT_SUCCESS,
613+
time.monotonic() - quota_start_time,
614+
)
615+
return quota_id
616+
617+
578618
def _build_infer_response(
579619
response_text: str,
580620
request_id: str,
@@ -733,16 +773,17 @@ async def infer_endpoint( # pylint: disable=R0914,R0915
733773

734774
logger.info("Processing rlsapi v1 /infer request %s", request_id)
735775

736-
# Quota enforcement: resolve subject and check availability before any work.
737-
# No-op when quota_subject is not configured or no quota limiters exist.
738-
quota_id = _resolve_quota_subject(request, auth)
739-
if quota_id is not None:
776+
# Quota enforcement: check availability before any work and record metrics for
777+
# both enforced and disabled quota paths.
778+
quota_subject = configuration.rlsapi_v1.quota_subject
779+
if quota_subject is not None:
740780
logger.info(
741781
"Checking quota availability for rlsapi v1 request %s using subject type %s",
742782
request_id,
743-
configuration.rlsapi_v1.quota_subject,
783+
quota_subject,
744784
)
745-
check_tokens_available(configuration.quota_limiters, quota_id)
785+
quota_id = _check_infer_quota(request, auth, endpoint_path)
786+
if quota_id is not None:
746787
logger.info(
747788
"Quota availability check passed for rlsapi v1 request %s", request_id
748789
)

src/metrics/__init__.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,20 @@
2222
float("inf"),
2323
)
2424

25+
QUOTA_CHECK_DURATION_BUCKETS: Final[tuple[float, ...]] = (
26+
0.001,
27+
0.005,
28+
0.01,
29+
0.025,
30+
0.05,
31+
0.1,
32+
0.25,
33+
0.5,
34+
1.0,
35+
2.5,
36+
5.0,
37+
float("inf"),
38+
)
2539
# Counter to track REST API calls
2640
# This will be used to count how many times each API endpoint is called
2741
# and the status code of the response
@@ -72,10 +86,29 @@
7286
["provider", "model", "endpoint"],
7387
)
7488

89+
7590
# Histogram to measure the latency of direct LLM inference backend calls.
76-
llm_inference_duration_seconds = Histogram(
91+
llm_inference_duration_seconds: Final[Histogram] = Histogram(
7792
"ls_llm_inference_duration_seconds",
7893
"LLM inference call duration",
7994
["provider", "model", "endpoint", "result"],
8095
buckets=LLM_INFERENCE_DURATION_BUCKETS,
8196
)
97+
98+
# Counter to track pre-request quota checks. Labels must stay bounded:
99+
# endpoint uses static route patterns, quota_type is a configured quota subject,
100+
# and result is one terminal state from the recording helper.
101+
quota_checks_total: Final[Counter] = Counter(
102+
"ls_quota_checks_total",
103+
"Quota availability checks",
104+
["endpoint", "quota_type", "result"],
105+
)
106+
107+
# Histogram to measure quota availability check latency with sub-second buckets.
108+
# It uses the same bounded endpoint/quota_type/result labels as the counter.
109+
quota_check_duration_seconds: Final[Histogram] = Histogram(
110+
"ls_quota_check_duration_seconds",
111+
"Quota availability check duration",
112+
["endpoint", "quota_type", "result"],
113+
buckets=QUOTA_CHECK_DURATION_BUCKETS,
114+
)

src/metrics/recording.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,53 @@
77

88
from collections.abc import Iterator
99
from contextlib import contextmanager
10+
from typing import Final
1011

1112
import metrics
1213
from log import get_logger
1314

1415
logger = get_logger(__name__)
1516

17+
QUOTA_TYPE_USER_ID: Final[str] = "user_id"
18+
QUOTA_TYPE_ORG_ID: Final[str] = "org_id"
19+
QUOTA_TYPE_SYSTEM_ID: Final[str] = "system_id"
20+
QUOTA_TYPE_DISABLED: Final[str] = "disabled"
21+
QUOTA_RESULT_SUCCESS: Final[str] = "success"
22+
QUOTA_RESULT_FAILURE: Final[str] = "failure"
23+
QUOTA_RESULT_SKIPPED: Final[str] = "skipped"
24+
QUOTA_RESULT_ERROR: Final[str] = "error"
25+
26+
ALLOWED_QUOTA_TYPES: Final[frozenset[str]] = frozenset(
27+
{
28+
QUOTA_TYPE_USER_ID,
29+
QUOTA_TYPE_ORG_ID,
30+
QUOTA_TYPE_SYSTEM_ID,
31+
QUOTA_TYPE_DISABLED,
32+
}
33+
)
34+
ALLOWED_QUOTA_RESULTS: Final[frozenset[str]] = frozenset(
35+
{
36+
QUOTA_RESULT_SUCCESS,
37+
QUOTA_RESULT_FAILURE,
38+
QUOTA_RESULT_SKIPPED,
39+
QUOTA_RESULT_ERROR,
40+
}
41+
)
42+
43+
44+
def normalize_quota_type(quota_type: str) -> str:
45+
"""Return a bounded quota type label for Prometheus cardinality safety."""
46+
if quota_type in ALLOWED_QUOTA_TYPES:
47+
return quota_type
48+
return QUOTA_TYPE_USER_ID
49+
50+
51+
def normalize_quota_result(result: str) -> str:
52+
"""Return a bounded quota result label for Prometheus cardinality safety."""
53+
if result in ALLOWED_QUOTA_RESULTS:
54+
return result
55+
return QUOTA_RESULT_ERROR
56+
1657

1758
@contextmanager
1859
def measure_response_duration(path: str) -> Iterator[None]:
@@ -129,3 +170,28 @@ def record_llm_inference_duration(
129170
).observe(duration)
130171
except (AttributeError, TypeError, ValueError):
131172
logger.warning("Failed to update LLM inference duration metric", exc_info=True)
173+
174+
175+
def record_quota_check(
176+
endpoint_path: str, quota_type: str, result: str, duration: float
177+
) -> None:
178+
"""Record a quota availability check.
179+
180+
Args:
181+
endpoint_path: API endpoint path for metric labeling.
182+
quota_type: Bounded quota subject type, not the subject identifier. Out-of-set
183+
values are recorded as ``user_id``.
184+
result: Bounded result label. Out-of-set values are recorded as ``error``.
185+
duration: Quota check duration in seconds.
186+
"""
187+
normalized_quota_type = normalize_quota_type(quota_type)
188+
normalized_result = normalize_quota_result(result)
189+
try:
190+
metrics.quota_checks_total.labels(
191+
endpoint_path, normalized_quota_type, normalized_result
192+
).inc()
193+
metrics.quota_check_duration_seconds.labels(
194+
endpoint_path, normalized_quota_type, normalized_result
195+
).observe(duration)
196+
except (AttributeError, TypeError, ValueError):
197+
logger.warning("Failed to update quota check metrics", exc_info=True)

tests/unit/app/endpoints/test_responses.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from pytest_mock import MockerFixture
1818

1919
from app.endpoints.responses import (
20+
_check_response_quota,
2021
_is_server_mcp_output_item,
2122
_sanitize_response_dict,
2223
_should_filter_mcp_chunk,
@@ -276,6 +277,29 @@ def _request_with_previous_response_id(
276277
return request
277278

278279

280+
def test_check_response_quota_records_unexpected_errors(
281+
minimal_config: AppConfig,
282+
mocker: MockerFixture,
283+
) -> None:
284+
"""Test unexpected quota failures are recorded before being re-raised."""
285+
mocker.patch(f"{MODULE}.configuration", minimal_config)
286+
mocker.patch(
287+
f"{MODULE}.check_tokens_available",
288+
side_effect=RuntimeError("quota backend unavailable"),
289+
)
290+
mock_record = mocker.patch(f"{MODULE}.recording.record_quota_check")
291+
292+
with pytest.raises(RuntimeError, match="quota backend unavailable"):
293+
_check_response_quota("user-123", "/v1/responses")
294+
295+
mock_record.assert_called_once()
296+
endpoint_path, quota_type, result, duration = mock_record.call_args.args
297+
assert endpoint_path == "/v1/responses"
298+
assert quota_type == "user_id"
299+
assert result == "error"
300+
assert duration >= 0
301+
302+
279303
class TestResponsesEndpointHandler:
280304
"""Unit tests for responses_endpoint_handler."""
281305

tests/unit/app/endpoints/test_rlsapi_v1.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1185,6 +1185,7 @@ async def test_infer_quota_skipped_when_not_configured(
11851185
"""Test /infer skips quota calls when quota_subject is None (default)."""
11861186
mock_check = mocker.patch("app.endpoints.rlsapi_v1.check_tokens_available")
11871187
mock_consume = mocker.patch("app.endpoints.rlsapi_v1.consume_query_tokens")
1188+
mock_record = mocker.patch("app.endpoints.rlsapi_v1.recording.record_quota_check")
11881189

11891190
await infer_endpoint(
11901191
infer_request=RlsapiV1InferRequest(question="How do I list files?"),
@@ -1195,6 +1196,12 @@ async def test_infer_quota_skipped_when_not_configured(
11951196

11961197
mock_check.assert_not_called()
11971198
mock_consume.assert_not_called()
1199+
mock_record.assert_called_once_with(
1200+
rlsapi_v1.ENDPOINT_PATH_INFER,
1201+
rlsapi_v1.recording.QUOTA_TYPE_DISABLED,
1202+
rlsapi_v1.recording.QUOTA_RESULT_SKIPPED,
1203+
0.0,
1204+
)
11981205

11991206

12001207
@pytest.mark.asyncio
@@ -1224,6 +1231,39 @@ async def test_infer_quota_exceeded_returns_429(
12241231
assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
12251232

12261233

1234+
@pytest.mark.asyncio
1235+
async def test_infer_quota_records_unexpected_errors(
1236+
mocker: MockerFixture,
1237+
mock_quota_config: Callable[[str], None],
1238+
mock_llm_response: None,
1239+
mock_auth_resolvers: None,
1240+
mock_request_factory: Callable[..., Any],
1241+
mock_background_tasks: Any,
1242+
) -> None:
1243+
"""Test unexpected quota failures are recorded before being re-raised."""
1244+
mock_quota_config("user_id")
1245+
mocker.patch(
1246+
"app.endpoints.rlsapi_v1.check_tokens_available",
1247+
side_effect=RuntimeError("quota backend unavailable"),
1248+
)
1249+
mock_record = mocker.patch("app.endpoints.rlsapi_v1.recording.record_quota_check")
1250+
1251+
with pytest.raises(RuntimeError, match="quota backend unavailable"):
1252+
await infer_endpoint(
1253+
infer_request=RlsapiV1InferRequest(question="How do I list files?"),
1254+
request=mock_request_factory(),
1255+
background_tasks=mock_background_tasks,
1256+
auth=MOCK_AUTH,
1257+
)
1258+
1259+
mock_record.assert_called_once()
1260+
endpoint_path, quota_type, result, duration = mock_record.call_args.args
1261+
assert endpoint_path == rlsapi_v1.ENDPOINT_PATH_INFER
1262+
assert quota_type == "user_id"
1263+
assert result == rlsapi_v1.recording.QUOTA_RESULT_ERROR
1264+
assert duration >= 0
1265+
1266+
12271267
@pytest.mark.parametrize(
12281268
("quota_subject", "rh_identity_setup", "expected_subject"),
12291269
[

0 commit comments

Comments
 (0)