Skip to content

Commit c64fde1

Browse files
committed
feat: add quota monitoring metrics
Signed-off-by: Major Hayden <major@redhat.com>
1 parent 43c8f4c commit c64fde1

11 files changed

Lines changed: 302 additions & 13 deletions

File tree

docs/demos/lcore/weak_points_for_ai/ex3.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@
99
):
1010
self._task_state = TaskState.auth_required
1111
self._task_status_message = event.status.message
12-
elif (
13-
event.status.state == TaskState.input_required
14-
and self._task_state not in (TaskState.failed, TaskState.auth_required)
12+
elif event.status.state == TaskState.input_required and self._task_state not in (
13+
TaskState.failed,
14+
TaskState.auth_required,
1515
):
1616
self._task_state = TaskState.input_required
1717
self._task_status_message = event.status.message

docs/demos/lcore/weak_points_for_ai/ex9.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Pydantic model utilization
22

3+
34
class ShieldModerationBlocked(BaseModel):
45
"""Shield moderation blocked the content; refusal details are present."""
56

docs/demos/lcore/weak_points_for_ai/exA.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Dynamic dispatch: functional style
22

3+
34
@singledispatch
45
def function(arg: Any) -> None:
56
print("Original function with argument", arg, "that has type", type(arg))
@@ -26,4 +27,3 @@ def _(arg: None) -> None:
2627
function(("foo", "bar", "baz"))
2728
function(1.4142)
2829
function(None)
29-

docs/demos/lcore/weak_points_for_ai/exB.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Pydantic model utilization
22

3+
34
class TranscriptMetadata(BaseModel):
45
"""Metadata for a transcript entry."""
56

@@ -31,4 +32,3 @@ def create_transcript_metadata(
3132
conversation_id=conversation_id,
3233
timestamp=datetime.now(UTC).isoformat(),
3334
)
34-

src/app/endpoints/responses.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import asyncio
66
import json
7+
import time
78
from collections.abc import AsyncIterator
89
from datetime import UTC, datetime
910
from typing import Annotated, Any, Final, Optional, cast
@@ -38,6 +39,7 @@
3839
from configuration import configuration
3940
from constants import ENDPOINT_PATH_RESPONSES, SUBSTITUTED_INSTRUCTIONS_PLACEHOLDER
4041
from log import get_logger
42+
from metrics import recording
4143
from models.api.responses import (
4244
UNAUTHORIZED_OPENAPI_EXAMPLES_WITH_MCP_OAUTH,
4345
ConflictResponse,
@@ -136,6 +138,37 @@ def _get_user_agent(request: Request) -> Optional[str]:
136138
return sanitized or None
137139

138140

141+
def _check_response_quota(user_id: str, endpoint_path: str) -> None:
142+
"""Check response quota availability and record bounded quota metrics."""
143+
quota_start_time = time.monotonic()
144+
try:
145+
check_tokens_available(configuration.quota_limiters, user_id)
146+
except HTTPException:
147+
recording.record_quota_check(
148+
endpoint_path,
149+
recording.QUOTA_TYPE_USER_ID,
150+
recording.QUOTA_RESULT_FAILURE,
151+
time.monotonic() - quota_start_time,
152+
)
153+
raise
154+
except Exception: # pylint: disable=broad-exception-caught
155+
# Unexpected quota backend failures still need bounded metrics before
156+
# propagating to the endpoint error handling layer.
157+
recording.record_quota_check(
158+
endpoint_path,
159+
recording.QUOTA_TYPE_USER_ID,
160+
recording.QUOTA_RESULT_ERROR,
161+
time.monotonic() - quota_start_time,
162+
)
163+
raise
164+
recording.record_quota_check(
165+
endpoint_path,
166+
recording.QUOTA_TYPE_USER_ID,
167+
recording.QUOTA_RESULT_SUCCESS,
168+
time.monotonic() - quota_start_time,
169+
)
170+
171+
139172
responses_response: dict[int | str, dict[str, Any]] = {
140173
200: ResponsesResponse.openapi_response(),
141174
401: UnauthorizedResponse.openapi_response(
@@ -275,11 +308,12 @@ async def responses_endpoint_handler(
275308
started_at = datetime.now(UTC)
276309
rh_identity_context = get_rh_identity_context(request)
277310
user_id, _, _, token = auth
311+
endpoint_path = "/v1/responses"
278312

279313
await check_mcp_auth(configuration, mcp_headers, token, request.headers)
280314

281315
# Check token availability
282-
check_tokens_available(configuration.quota_limiters, user_id)
316+
_check_response_quota(user_id, endpoint_path)
283317

284318
# Enforce RBAC: optionally disallow overriding model in requests
285319
validate_model_provider_override(

src/app/endpoints/rlsapi_v1.py

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,46 @@ def _resolve_quota_subject(request: Request, auth: AuthTuple) -> Optional[str]:
575575
return system_id
576576

577577

578+
def _check_infer_quota(
579+
request: Request, auth: AuthTuple, endpoint_path: str
580+
) -> Optional[str]:
581+
"""Check infer quota availability and record bounded quota metrics."""
582+
quota_id = _resolve_quota_subject(request, auth)
583+
quota_type = configuration.rlsapi_v1.quota_subject or "disabled"
584+
if quota_id is None:
585+
recording.record_quota_check(
586+
endpoint_path, quota_type, recording.QUOTA_RESULT_SKIPPED, 0.0
587+
)
588+
return None
589+
590+
quota_start_time = time.monotonic()
591+
try:
592+
check_tokens_available(configuration.quota_limiters, quota_id)
593+
except HTTPException:
594+
recording.record_quota_check(
595+
endpoint_path,
596+
quota_type,
597+
recording.QUOTA_RESULT_FAILURE,
598+
time.monotonic() - quota_start_time,
599+
)
600+
raise
601+
except Exception: # pylint: disable=broad-exception-caught
602+
recording.record_quota_check(
603+
endpoint_path,
604+
quota_type,
605+
recording.QUOTA_RESULT_ERROR,
606+
time.monotonic() - quota_start_time,
607+
)
608+
raise
609+
recording.record_quota_check(
610+
endpoint_path,
611+
quota_type,
612+
recording.QUOTA_RESULT_SUCCESS,
613+
time.monotonic() - quota_start_time,
614+
)
615+
return quota_id
616+
617+
578618
def _build_infer_response(
579619
response_text: str,
580620
request_id: str,
@@ -733,16 +773,17 @@ async def infer_endpoint( # pylint: disable=R0914,R0915
733773

734774
logger.info("Processing rlsapi v1 /infer request %s", request_id)
735775

736-
# Quota enforcement: resolve subject and check availability before any work.
737-
# No-op when quota_subject is not configured or no quota limiters exist.
738-
quota_id = _resolve_quota_subject(request, auth)
739-
if quota_id is not None:
776+
# Quota enforcement: check availability before any work and record metrics for
777+
# both enforced and disabled quota paths.
778+
quota_subject = configuration.rlsapi_v1.quota_subject
779+
if quota_subject is not None:
740780
logger.info(
741781
"Checking quota availability for rlsapi v1 request %s using subject type %s",
742782
request_id,
743-
configuration.rlsapi_v1.quota_subject,
783+
quota_subject,
744784
)
745-
check_tokens_available(configuration.quota_limiters, quota_id)
785+
quota_id = _check_infer_quota(request, auth, endpoint_path)
786+
if quota_id is not None:
746787
logger.info(
747788
"Quota availability check passed for rlsapi v1 request %s", request_id
748789
)

src/metrics/__init__.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,20 @@
2222
float("inf"),
2323
)
2424

25+
QUOTA_CHECK_DURATION_BUCKETS: Final[tuple[float, ...]] = (
26+
0.001,
27+
0.005,
28+
0.01,
29+
0.025,
30+
0.05,
31+
0.1,
32+
0.25,
33+
0.5,
34+
1.0,
35+
2.5,
36+
5.0,
37+
float("inf"),
38+
)
2539
# Counter to track REST API calls
2640
# This will be used to count how many times each API endpoint is called
2741
# and the status code of the response
@@ -72,10 +86,29 @@
7286
["provider", "model", "endpoint"],
7387
)
7488

89+
7590
# Histogram to measure the latency of direct LLM inference backend calls.
76-
llm_inference_duration_seconds = Histogram(
91+
llm_inference_duration_seconds: Final[Histogram] = Histogram(
7792
"ls_llm_inference_duration_seconds",
7893
"LLM inference call duration",
7994
["provider", "model", "endpoint", "result"],
8095
buckets=LLM_INFERENCE_DURATION_BUCKETS,
8196
)
97+
98+
# Counter to track pre-request quota checks. Labels must stay bounded:
99+
# endpoint uses static route patterns, quota_type is a configured quota subject,
100+
# and result is one terminal state from the recording helper.
101+
quota_checks_total: Final[Counter] = Counter(
102+
"ls_quota_checks_total",
103+
"Quota availability checks",
104+
["endpoint", "quota_type", "result"],
105+
)
106+
107+
# Histogram to measure quota availability check latency with sub-second buckets.
108+
# It uses the same bounded endpoint/quota_type/result labels as the counter.
109+
quota_check_duration_seconds: Final[Histogram] = Histogram(
110+
"ls_quota_check_duration_seconds",
111+
"Quota availability check duration",
112+
["endpoint", "quota_type", "result"],
113+
buckets=QUOTA_CHECK_DURATION_BUCKETS,
114+
)

src/metrics/recording.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,53 @@
77

88
from collections.abc import Iterator
99
from contextlib import contextmanager
10+
from typing import Final
1011

1112
import metrics
1213
from log import get_logger
1314

1415
logger = get_logger(__name__)
1516

17+
QUOTA_TYPE_USER_ID: Final[str] = "user_id"
18+
QUOTA_TYPE_ORG_ID: Final[str] = "org_id"
19+
QUOTA_TYPE_SYSTEM_ID: Final[str] = "system_id"
20+
QUOTA_TYPE_DISABLED: Final[str] = "disabled"
21+
QUOTA_RESULT_SUCCESS: Final[str] = "success"
22+
QUOTA_RESULT_FAILURE: Final[str] = "failure"
23+
QUOTA_RESULT_SKIPPED: Final[str] = "skipped"
24+
QUOTA_RESULT_ERROR: Final[str] = "error"
25+
26+
ALLOWED_QUOTA_TYPES: Final[frozenset[str]] = frozenset(
27+
{
28+
QUOTA_TYPE_USER_ID,
29+
QUOTA_TYPE_ORG_ID,
30+
QUOTA_TYPE_SYSTEM_ID,
31+
QUOTA_TYPE_DISABLED,
32+
}
33+
)
34+
ALLOWED_QUOTA_RESULTS: Final[frozenset[str]] = frozenset(
35+
{
36+
QUOTA_RESULT_SUCCESS,
37+
QUOTA_RESULT_FAILURE,
38+
QUOTA_RESULT_SKIPPED,
39+
QUOTA_RESULT_ERROR,
40+
}
41+
)
42+
43+
44+
def normalize_quota_type(quota_type: str) -> str:
45+
"""Return a bounded quota type label for Prometheus cardinality safety."""
46+
if quota_type in ALLOWED_QUOTA_TYPES:
47+
return quota_type
48+
return QUOTA_TYPE_USER_ID
49+
50+
51+
def normalize_quota_result(result: str) -> str:
52+
"""Return a bounded quota result label for Prometheus cardinality safety."""
53+
if result in ALLOWED_QUOTA_RESULTS:
54+
return result
55+
return QUOTA_RESULT_ERROR
56+
1657

1758
@contextmanager
1859
def measure_response_duration(path: str) -> Iterator[None]:
@@ -129,3 +170,28 @@ def record_llm_inference_duration(
129170
).observe(duration)
130171
except (AttributeError, TypeError, ValueError):
131172
logger.warning("Failed to update LLM inference duration metric", exc_info=True)
173+
174+
175+
def record_quota_check(
176+
endpoint_path: str, quota_type: str, result: str, duration: float
177+
) -> None:
178+
"""Record a quota availability check.
179+
180+
Args:
181+
endpoint_path: API endpoint path for metric labeling.
182+
quota_type: Bounded quota subject type, not the subject identifier. Out-of-set
183+
values are recorded as ``user_id``.
184+
result: Bounded result label. Out-of-set values are recorded as ``error``.
185+
duration: Quota check duration in seconds.
186+
"""
187+
normalized_quota_type = normalize_quota_type(quota_type)
188+
normalized_result = normalize_quota_result(result)
189+
try:
190+
metrics.quota_checks_total.labels(
191+
endpoint_path, normalized_quota_type, normalized_result
192+
).inc()
193+
metrics.quota_check_duration_seconds.labels(
194+
endpoint_path, normalized_quota_type, normalized_result
195+
).observe(duration)
196+
except (AttributeError, TypeError, ValueError):
197+
logger.warning("Failed to update quota check metrics", exc_info=True)

tests/unit/app/endpoints/test_responses.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from pytest_mock import MockerFixture
1818

1919
from app.endpoints.responses import (
20+
_check_response_quota,
2021
_is_server_mcp_output_item,
2122
_sanitize_response_dict,
2223
_should_filter_mcp_chunk,
@@ -277,6 +278,29 @@ def _request_with_previous_response_id(
277278
return request
278279

279280

281+
def test_check_response_quota_records_unexpected_errors(
282+
minimal_config: AppConfig,
283+
mocker: MockerFixture,
284+
) -> None:
285+
"""Test unexpected quota failures are recorded before being re-raised."""
286+
mocker.patch(f"{MODULE}.configuration", minimal_config)
287+
mocker.patch(
288+
f"{MODULE}.check_tokens_available",
289+
side_effect=RuntimeError("quota backend unavailable"),
290+
)
291+
mock_record = mocker.patch(f"{MODULE}.recording.record_quota_check")
292+
293+
with pytest.raises(RuntimeError, match="quota backend unavailable"):
294+
_check_response_quota("user-123", "/v1/responses")
295+
296+
mock_record.assert_called_once()
297+
endpoint_path, quota_type, result, duration = mock_record.call_args.args
298+
assert endpoint_path == "/v1/responses"
299+
assert quota_type == "user_id"
300+
assert result == "error"
301+
assert duration >= 0
302+
303+
280304
class TestResponsesEndpointHandler:
281305
"""Unit tests for responses_endpoint_handler."""
282306

0 commit comments

Comments
 (0)