Skip to content

Commit 8d54701

Browse files
committed
fix(rlsapi_v1): handle RuntimeError for context length in infer endpoint
- Add RuntimeError catch block matching query.py and streaming_query.py pattern
- Return 413 with PromptTooLongResponse when context_length error detected
- Re-raise non-context-length RuntimeErrors for proper middleware handling
- Add unit tests for both context_length and other RuntimeError scenarios

Signed-off-by: Major Hayden <major@redhat.com>
1 parent 81e303a commit 8d54701

2 files changed

Lines changed: 83 additions & 0 deletions

File tree

src/app/endpoints/rlsapi_v1.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from models.responses import (
2525
ForbiddenResponse,
2626
InternalServerErrorResponse,
27+
PromptTooLongResponse,
2728
QuotaExceededResponse,
2829
ServiceUnavailableResponse,
2930
UnauthorizedResponse,
@@ -270,6 +271,24 @@ async def infer_endpoint(
270271
input_source, instructions, tools=mcp_tools
271272
)
272273
inference_time = time.monotonic() - start_time
274+
except RuntimeError as e:
275+
# Library mode wraps HTTP 413 context length errors as RuntimeError
276+
if "context_length" in str(e).lower():
277+
inference_time = time.monotonic() - start_time
278+
metrics.llm_calls_failures_total.inc()
279+
logger.error("Prompt too long for request %s: %s", request_id, e)
280+
_queue_splunk_event(
281+
background_tasks,
282+
infer_request,
283+
request,
284+
request_id,
285+
str(e),
286+
inference_time,
287+
"infer_error",
288+
)
289+
error_response = PromptTooLongResponse(model=_get_default_model_id())
290+
raise HTTPException(**error_response.model_dump()) from e
291+
raise
273292
except APIConnectionError as e:
274293
inference_time = time.monotonic() - start_time
275294
metrics.llm_calls_failures_total.inc()

tests/unit/app/endpoints/test_rlsapi_v1.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,26 @@ def mock_api_connection_error_fixture(mocker: MockerFixture) -> None:
126126
)
127127

128128

129+
@pytest.fixture(name="mock_runtime_error_context_length")
def mock_runtime_error_context_length_fixture(mocker: MockerFixture) -> None:
    """Make the mocked responses.create() fail with a context_length RuntimeError.

    The error message contains the "context_length" marker, so tests using
    this fixture exercise the prompt-too-long path.
    """
    # Build the failing coroutine mock first, then install it via the helper.
    failing_create = mocker.AsyncMock(
        side_effect=RuntimeError("context_length exceeded maximum tokens")
    )
    _setup_responses_mock(mocker, failing_create)
138+
139+
140+
@pytest.fixture(name="mock_runtime_error_other")
def mock_runtime_error_other_fixture(mocker: MockerFixture) -> None:
    """Make the mocked responses.create() fail with an unrelated RuntimeError.

    The message deliberately lacks the "context_length" marker, so tests using
    this fixture exercise the generic RuntimeError path.
    """
    # Build the failing coroutine mock first, then install it via the helper.
    failing_create = mocker.AsyncMock(
        side_effect=RuntimeError("Some other runtime error")
    )
    _setup_responses_mock(mocker, failing_create)
147+
148+
129149
# --- Test _build_instructions ---
130150

131151

@@ -400,6 +420,50 @@ async def test_infer_api_connection_error_returns_503(
400420
assert exc_info.value.status_code == status.HTTP_503_SERVICE_UNAVAILABLE
401421

402422

423+
@pytest.mark.asyncio
async def test_infer_runtime_error_context_length_returns_413(
    mocker: MockerFixture,
    mock_configuration: AppConfig,
    mock_runtime_error_context_length: None,
    mock_auth_resolvers: None,
) -> None:
    """Verify /infer maps a context_length RuntimeError from the LLM to HTTP 413."""
    # Arrange: a minimal request plus mocked FastAPI collaborators.
    payload = RlsapiV1InferRequest(question="Test question")
    fake_request = _create_mock_request(mocker)
    fake_tasks = _create_mock_background_tasks(mocker)

    # Act: the fixture makes the LLM call raise, so the endpoint must raise too.
    with pytest.raises(HTTPException) as raised:
        await infer_endpoint(
            infer_request=payload,
            request=fake_request,
            background_tasks=fake_tasks,
            auth=MOCK_AUTH,
        )

    # Assert: the failure surfaces as "request entity too large".
    assert raised.value.status_code == status.HTTP_413_REQUEST_ENTITY_TOO_LARGE
445+
446+
@pytest.mark.asyncio
async def test_infer_runtime_error_other_reraises(
    mocker: MockerFixture,
    mock_configuration: AppConfig,
    mock_runtime_error_other: None,
    mock_auth_resolvers: None,
) -> None:
    """Verify /infer propagates a RuntimeError unrelated to context_length."""
    # Arrange: a minimal request plus mocked FastAPI collaborators.
    payload = RlsapiV1InferRequest(question="Test question")
    fake_request = _create_mock_request(mocker)
    fake_tasks = _create_mock_background_tasks(mocker)

    # Act / assert: the original RuntimeError escapes the endpoint unchanged
    # rather than being converted into an HTTPException.
    with pytest.raises(RuntimeError, match="Some other runtime error"):
        await infer_endpoint(
            infer_request=payload,
            request=fake_request,
            background_tasks=fake_tasks,
            auth=MOCK_AUTH,
        )
465+
466+
403467
@pytest.mark.asyncio
404468
async def test_infer_empty_llm_response_returns_fallback(
405469
mocker: MockerFixture,

0 commit comments

Comments
 (0)