|
4 | 4 |
|
5 | 5 | import asyncio |
6 | 6 | import json |
| 7 | +import time |
7 | 8 | from collections.abc import AsyncIterator, Sequence |
8 | 9 | from datetime import UTC, datetime |
9 | 10 | from typing import Annotated, Any, Final, NoReturn, Optional, cast |
|
39 | 40 | from configuration import configuration |
40 | 41 | from constants import ENDPOINT_PATH_RESPONSES, SUBSTITUTED_INSTRUCTIONS_PLACEHOLDER |
41 | 42 | from log import get_logger |
| 43 | +from metrics import recording |
42 | 44 | from models.api.requests import ResponsesRequest |
43 | 45 | from models.api.responses.constants import UNAUTHORIZED_OPENAPI_EXAMPLES_WITH_MCP_OAUTH |
44 | 46 | from models.api.responses.error import ( |
@@ -133,6 +135,37 @@ def _get_user_agent(request: Request) -> Optional[str]: |
133 | 135 | return sanitized or None |
134 | 136 |
|
135 | 137 |
|
def _check_response_quota(user_id: str, endpoint_path: str) -> None:
    """Run the user quota check and record its outcome and duration.

    Calls ``check_tokens_available`` with the configured quota limiters for
    ``user_id`` and reports exactly one quota-check metric per invocation,
    labelled with ``endpoint_path``, the user-id quota type and the result.

    Args:
        user_id: Identifier of the user whose quota is checked.
        endpoint_path: Endpoint label forwarded to the quota-check metric.

    Raises:
        HTTPException: Re-raised unchanged after a failure result has been
            recorded.
        Exception: Any other error raised by the quota check is re-raised
            unchanged after an error result has been recorded.
    """
    started = time.monotonic()

    def _record(result) -> None:
        # Elapsed time is taken at the moment of recording, so every outcome
        # reports the time spent inside the quota check itself.
        recording.record_quota_check(
            endpoint_path,
            recording.QUOTA_TYPE_USER_ID,
            result,
            time.monotonic() - started,
        )

    try:
        check_tokens_available(configuration.quota_limiters, user_id)
    except HTTPException:
        # The quota check rejected the request: count it as a failure and let
        # the original HTTP error reach the caller.
        _record(recording.QUOTA_RESULT_FAILURE)
        raise
    except Exception:  # pylint: disable=broad-exception-caught
        # Unexpected quota backend failures still need bounded metrics before
        # propagating to the endpoint error handling layer.
        _record(recording.QUOTA_RESULT_ERROR)
        raise
    else:
        _record(recording.QUOTA_RESULT_SUCCESS)
| 167 | + |
| 168 | + |
136 | 169 | responses_response: dict[int | str, dict[str, Any]] = { |
137 | 170 | 200: ResponsesResponse.openapi_response(), |
138 | 171 | 401: UnauthorizedResponse.openapi_response( |
@@ -501,11 +534,12 @@ async def responses_endpoint_handler( |
501 | 534 | started_at = datetime.now(UTC) |
502 | 535 | rh_identity_context = get_rh_identity_context(request) |
503 | 536 | user_id, _, _, token = auth |
| 537 | + endpoint_path = "/v1/responses" |
504 | 538 |
|
505 | 539 | await check_mcp_auth(configuration, mcp_headers, token, request.headers) |
506 | 540 |
|
507 | 541 | # Check token availability |
508 | | - check_tokens_available(configuration.quota_limiters, user_id) |
| 542 | + _check_response_quota(user_id, endpoint_path) |
509 | 543 |
|
510 | 544 | # Enforce RBAC: optionally disallow overriding model in requests |
511 | 545 | validate_model_provider_override( |
|
0 commit comments