Skip to content

Commit 7668c4b

Browse files
committed
feat: add quota monitoring metrics
Signed-off-by: Major Hayden <major@redhat.com>
1 parent ca125c4 commit 7668c4b

6 files changed

Lines changed: 250 additions & 8 deletions

File tree

src/app/endpoints/responses.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import asyncio
66
import json
7+
import time
78
from collections.abc import AsyncIterator
89
from datetime import UTC, datetime
910
from typing import Annotated, Any, Final, Optional, cast
@@ -39,6 +40,7 @@
3940
from configuration import configuration
4041
from constants import SUBSTITUTED_INSTRUCTIONS_PLACEHOLDER
4142
from log import get_logger
43+
from metrics import recording
4244
from models.config import Action
4345
from models.requests import ResponsesRequest
4446
from models.responses import (
@@ -136,6 +138,37 @@ def _get_user_agent(request: Request) -> Optional[str]:
136138
return sanitized or None
137139

138140

141+
def _check_response_quota(user_id: str, endpoint_path: str) -> None:
    """Run the user quota check and emit exactly one quota metric sample.

    Args:
        user_id: Quota subject passed to ``check_tokens_available``.
        endpoint_path: Route pattern used as the metric endpoint label.

    Raises:
        HTTPException: Re-raised from the quota check after a ``failure``
            sample has been recorded.
        Exception: Any other quota check error, re-raised after an ``error``
            sample has been recorded.
    """
    started = time.monotonic()

    def _record(outcome: str) -> None:
        # The elapsed time is read at recording time so that it covers the
        # whole quota check, whichever way it finished.
        recording.record_quota_check(
            endpoint_path,
            recording.QUOTA_TYPE_USER_ID,
            outcome,
            time.monotonic() - started,
        )

    try:
        check_tokens_available(configuration.quota_limiters, user_id)
    except HTTPException:
        _record(recording.QUOTA_RESULT_FAILURE)
        raise
    except Exception:  # pylint: disable=broad-exception-caught
        # Unexpected quota backend failures still need bounded metrics before
        # propagating to the endpoint error handling layer.
        _record(recording.QUOTA_RESULT_ERROR)
        raise
    else:
        _record(recording.QUOTA_RESULT_SUCCESS)
170+
171+
139172
responses_response: dict[int | str, dict[str, Any]] = {
140173
200: ResponsesResponse.openapi_response(),
141174
401: UnauthorizedResponse.openapi_response(
@@ -275,11 +308,12 @@ async def responses_endpoint_handler(
275308
started_at = datetime.now(UTC)
276309
rh_identity_context = get_rh_identity_context(request)
277310
user_id, _, _, token = auth
311+
endpoint_path = "/v1/responses"
278312

279313
await check_mcp_auth(configuration, mcp_headers, token, request.headers)
280314

281315
# Check token availability
282-
check_tokens_available(configuration.quota_limiters, user_id)
316+
_check_response_quota(user_id, endpoint_path)
283317

284318
# Enforce RBAC: optionally disallow overriding model in requests
285319
validate_model_provider_override(
@@ -331,7 +365,6 @@ async def responses_endpoint_handler(
331365
)
332366
attachments_text = extract_attachments_text(original_request.input)
333367

334-
endpoint_path = "/v1/responses"
335368
moderation_result = await run_shield_moderation(
336369
client,
337370
input_text + "\n\n" + attachments_text,

src/app/endpoints/rlsapi_v1.py

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,36 @@ def _resolve_quota_subject(request: Request, auth: AuthTuple) -> Optional[str]:
532532
return system_id
533533

534534

535+
def _check_infer_quota(
    request: Request, auth: AuthTuple, endpoint_path: str
) -> Optional[str]:
    """Check infer quota availability and record bounded quota metrics.

    Args:
        request: Incoming request, forwarded to ``_resolve_quota_subject``.
        auth: Authentication tuple, forwarded to ``_resolve_quota_subject``.
        endpoint_path: Route pattern used as the metric endpoint label.

    Returns:
        The resolved quota subject identifier, or ``None`` when no subject
        was resolved and the availability check was skipped.

    Raises:
        HTTPException: Re-raised from ``check_tokens_available`` after a
            ``failure`` sample has been recorded.
        Exception: Any other quota check error, re-raised after an ``error``
            sample has been recorded.
    """
    quota_id = _resolve_quota_subject(request, auth)
    quota_type = configuration.rlsapi_v1.quota_subject or recording.QUOTA_TYPE_DISABLED
    if quota_id is None:
        # Nothing was checked, so the sample carries a zero duration.
        recording.record_quota_check(
            endpoint_path, quota_type, recording.QUOTA_RESULT_SKIPPED, 0.0
        )
        return None

    quota_start_time = time.monotonic()
    try:
        check_tokens_available(configuration.quota_limiters, quota_id)
    except HTTPException:
        recording.record_quota_check(
            endpoint_path,
            quota_type,
            recording.QUOTA_RESULT_FAILURE,
            time.monotonic() - quota_start_time,
        )
        raise
    except Exception:  # pylint: disable=broad-exception-caught
        # Mirror the responses endpoint: unexpected quota backend failures
        # are counted as ``error`` before propagating to the caller.
        recording.record_quota_check(
            endpoint_path,
            quota_type,
            recording.QUOTA_RESULT_ERROR,
            time.monotonic() - quota_start_time,
        )
        raise
    recording.record_quota_check(
        endpoint_path,
        quota_type,
        recording.QUOTA_RESULT_SUCCESS,
        time.monotonic() - quota_start_time,
    )
    return quota_id
563+
564+
535565
def _build_infer_response(
536566
response_text: str,
537567
request_id: str,
@@ -669,12 +699,11 @@ async def infer_endpoint( # pylint: disable=R0914
669699
"""
670700
# Authentication enforced by get_auth_dependency(), authorization by @authorize decorator.
671701
check_configuration_loaded(configuration)
702+
endpoint_path = "/v1/infer"
672703

673704
# Quota enforcement: resolve subject and check availability before any work.
674705
# No-op when quota_subject is not configured or no quota limiters exist.
675-
quota_id = _resolve_quota_subject(request, auth)
676-
if quota_id is not None:
677-
check_tokens_available(configuration.quota_limiters, quota_id)
706+
quota_id = _check_infer_quota(request, auth, endpoint_path)
678707

679708
request_id = get_suid()
680709

@@ -685,8 +714,6 @@ async def infer_endpoint( # pylint: disable=R0914
685714
"Request %s: Combined input source length: %d", request_id, len(input_source)
686715
)
687716

688-
endpoint_path = "/v1/infer"
689-
690717
# Run shield moderation on user input before inference.
691718
# Uses all configured shields; no-op when no shields are registered.
692719
# Runs before model/tool discovery so blocked requests short-circuit

src/metrics/__init__.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,28 @@
11
"""Metrics module for Lightspeed Core Stack."""
22

3+
from typing import Final
4+
35
from prometheus_client import (
46
Counter,
57
Gauge,
68
Histogram,
79
)
810

11+
QUOTA_CHECK_DURATION_BUCKETS: Final[tuple[float, ...]] = (
12+
0.001,
13+
0.005,
14+
0.01,
15+
0.025,
16+
0.05,
17+
0.1,
18+
0.25,
19+
0.5,
20+
1.0,
21+
2.5,
22+
5.0,
23+
float("inf"),
24+
)
25+
926
# Counter to track REST API calls
1027
# This will be used to count how many times each API endpoint is called
1128
# and the status code of the response
@@ -55,3 +72,21 @@
5572
"LLM tokens received",
5673
["provider", "model", "endpoint"],
5774
)
75+
76+
# Counter to track pre-request quota checks. Labels must stay bounded:
77+
# endpoint uses static route patterns, quota_type is a configured quota subject,
78+
# and result is one terminal state from the recording helper.
79+
quota_checks_total = Counter(
80+
"ls_quota_checks_total",
81+
"Quota availability checks",
82+
["endpoint", "quota_type", "result"],
83+
)
84+
85+
# Histogram to measure quota availability check latency with sub-second buckets.
86+
# It uses the same bounded endpoint/quota_type/result labels as the counter.
87+
quota_check_duration_seconds = Histogram(
88+
"ls_quota_check_duration_seconds",
89+
"Quota availability check duration",
90+
["endpoint", "quota_type", "result"],
91+
buckets=QUOTA_CHECK_DURATION_BUCKETS,
92+
)

src/metrics/recording.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,53 @@
77

88
from collections.abc import Iterator
99
from contextlib import contextmanager
10+
from typing import Final
1011

1112
import metrics
1213
from log import get_logger
1314

1415
logger = get_logger(__name__)
1516

17+
QUOTA_TYPE_USER_ID: Final[str] = "user_id"
18+
QUOTA_TYPE_ORG_ID: Final[str] = "org_id"
19+
QUOTA_TYPE_SYSTEM_ID: Final[str] = "system_id"
20+
QUOTA_TYPE_DISABLED: Final[str] = "disabled"
21+
QUOTA_RESULT_SUCCESS: Final[str] = "success"
22+
QUOTA_RESULT_FAILURE: Final[str] = "failure"
23+
QUOTA_RESULT_SKIPPED: Final[str] = "skipped"
24+
QUOTA_RESULT_ERROR: Final[str] = "error"
25+
26+
ALLOWED_QUOTA_TYPES: Final[frozenset[str]] = frozenset(
27+
{
28+
QUOTA_TYPE_USER_ID,
29+
QUOTA_TYPE_ORG_ID,
30+
QUOTA_TYPE_SYSTEM_ID,
31+
QUOTA_TYPE_DISABLED,
32+
}
33+
)
34+
ALLOWED_QUOTA_RESULTS: Final[frozenset[str]] = frozenset(
35+
{
36+
QUOTA_RESULT_SUCCESS,
37+
QUOTA_RESULT_FAILURE,
38+
QUOTA_RESULT_SKIPPED,
39+
QUOTA_RESULT_ERROR,
40+
}
41+
)
42+
43+
44+
def normalize_quota_type(quota_type: str) -> str:
    """Clamp a quota type to the allowed label set.

    Values outside ``ALLOWED_QUOTA_TYPES`` are reported as ``user_id`` so the
    Prometheus label cardinality stays bounded.
    """
    return quota_type if quota_type in ALLOWED_QUOTA_TYPES else QUOTA_TYPE_USER_ID
49+
50+
51+
def normalize_quota_result(result: str) -> str:
    """Clamp a quota result to the allowed label set.

    Values outside ``ALLOWED_QUOTA_RESULTS`` are reported as ``error`` so the
    Prometheus label cardinality stays bounded.
    """
    return result if result in ALLOWED_QUOTA_RESULTS else QUOTA_RESULT_ERROR
56+
1657

1758
@contextmanager
1859
def measure_response_duration(path: str) -> Iterator[None]:
@@ -109,3 +150,28 @@ def record_llm_token_usage(
109150
)
110151
except (AttributeError, TypeError, ValueError):
111152
logger.warning("Failed to update token metrics", exc_info=True)
153+
154+
155+
def record_quota_check(
    endpoint_path: str, quota_type: str, result: str, duration: float
) -> None:
    """Count one quota availability check and observe its latency.

    ``AttributeError``, ``TypeError`` and ``ValueError`` raised while updating
    the metrics are logged as a warning and not propagated.

    Args:
        endpoint_path: API endpoint path for metric labeling.
        quota_type: Bounded quota subject type, not the subject identifier.
            Out-of-set values are recorded as ``user_id``.
        result: Bounded result label. Out-of-set values are recorded as
            ``error``.
        duration: Quota check duration in seconds.
    """
    # Both metrics share the same label values, so build them once.
    label_values = (
        endpoint_path,
        normalize_quota_type(quota_type),
        normalize_quota_result(result),
    )
    try:
        metrics.quota_checks_total.labels(*label_values).inc()
        metrics.quota_check_duration_seconds.labels(*label_values).observe(duration)
    except (AttributeError, TypeError, ValueError):
        logger.warning("Failed to update quota check metrics", exc_info=True)

tests/unit/app/endpoints/test_responses.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from pytest_mock import MockerFixture
1818

1919
from app.endpoints.responses import (
20+
_check_response_quota,
2021
_is_server_mcp_output_item,
2122
_sanitize_response_dict,
2223
_should_filter_mcp_chunk,
@@ -219,6 +220,29 @@ def _request_with_previous_response_id(
219220
return request
220221

221222

223+
def test_check_response_quota_records_unexpected_errors(
    minimal_config: AppConfig,
    mocker: MockerFixture,
) -> None:
    """Test an unexpected quota error is recorded as ``error`` and re-raised."""
    mocker.patch(f"{MODULE}.configuration", minimal_config)
    backend_failure = RuntimeError("quota backend unavailable")
    mocker.patch(f"{MODULE}.check_tokens_available", side_effect=backend_failure)
    mock_record = mocker.patch(f"{MODULE}.recording.record_quota_check")

    with pytest.raises(RuntimeError, match="quota backend unavailable"):
        _check_response_quota("user-123", "/v1/responses")

    mock_record.assert_called_once()
    recorded_args = mock_record.call_args.args
    # Exactly four positional arguments: endpoint, quota type, result, duration.
    assert len(recorded_args) == 4
    assert recorded_args[0] == "/v1/responses"
    assert recorded_args[1] == "user_id"
    assert recorded_args[2] == "error"
    assert recorded_args[3] >= 0
244+
245+
222246
class TestResponsesEndpointHandler:
223247
"""Unit tests for responses_endpoint_handler."""
224248

tests/unit/metrics/test_recording.py

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Unit tests for Prometheus metric recording helpers."""
22

3-
from pytest_mock import MockerFixture
3+
import pytest
4+
from pytest_mock import MockerFixture, MockType
45

56
from metrics import recording
67

@@ -159,3 +160,59 @@ def test_record_llm_token_usage_logs_metric_errors(mocker: MockerFixture) -> Non
159160
mock_logger.warning.assert_called_once_with(
160161
"Failed to update token metrics", exc_info=True
161162
)
163+
164+
165+
@pytest.fixture(name="recording_logger")
def recording_logger_fixture(mocker: MockerFixture) -> MockType:
    """Replace the ``metrics.recording`` logger with a mock and return it."""
    patched_logger = mocker.patch("metrics.recording.logger")
    return patched_logger
169+
170+
171+
@pytest.mark.parametrize("failing_metric", ["counter", "histogram"])
def test_record_quota_check_updates_metrics_and_logs_errors(
    mocker: MockerFixture,
    recording_logger: MockType,
    failing_metric: str,
) -> None:
    """Test the quota helper updates both metrics and logs either failure."""
    counter = mocker.patch("metrics.recording.metrics.quota_checks_total")
    histogram = mocker.patch(
        "metrics.recording.metrics.quota_check_duration_seconds"
    )
    success_labels = ("/v1/infer", "org_id", "success")

    # First call: both metric updates succeed and nothing is logged.
    recording.record_quota_check("/v1/infer", "org_id", "success", 0.75)

    counter.labels.assert_called_once_with(*success_labels)
    counter.labels.return_value.inc.assert_called_once()
    histogram.labels.assert_called_once_with(*success_labels)
    histogram.labels.return_value.observe.assert_called_once_with(0.75)
    recording_logger.warning.assert_not_called()

    # Second call: make the selected metric update raise.
    for patched in (counter, histogram, recording_logger):
        patched.reset_mock()
    failing_update = (
        counter.labels.return_value.inc
        if failing_metric == "counter"
        else histogram.labels.return_value.observe
    )
    failing_update.side_effect = TypeError("bad")

    recording.record_quota_check("/v1/infer", "org_id", "failure", 0.75)

    recording_logger.warning.assert_called_once_with(
        "Failed to update quota check metrics", exc_info=True
    )
204+
205+
206+
def test_record_quota_check_bounds_labels(mocker: MockerFixture) -> None:
    """Test out-of-set quota type and result labels use bounded fallbacks."""
    counter = mocker.patch("metrics.recording.metrics.quota_checks_total")
    histogram = mocker.patch(
        "metrics.recording.metrics.quota_check_duration_seconds"
    )
    bounded_labels = ("/v1/responses", "user_id", "error")

    recording.record_quota_check("/v1/responses", "customer-123", "timeout", 0.25)

    counter.labels.assert_called_once_with(*bounded_labels)
    counter.labels.return_value.inc.assert_called_once()
    histogram.labels.assert_called_once_with(*bounded_labels)
    histogram.labels.return_value.observe.assert_called_once_with(0.25)

0 commit comments

Comments
 (0)