fix(rlsapi_v1): handle RuntimeError for context length in infer endpoint

major · major · commit 8d5470145679 · 2026-02-10T15:14:49.000-06:00
- Add RuntimeError catch block matching query.py and streaming_query.py pattern
- Return 413 with PromptTooLongResponse when context_length error detected
- Re-raise non-context-length RuntimeErrors for proper middleware handling
- Add unit tests for both context_length and other RuntimeError scenarios

Signed-off-by: Major Hayden <major@redhat.com>
diff --git a/src/app/endpoints/rlsapi_v1.py b/src/app/endpoints/rlsapi_v1.py
@@ -24,6 +24,7 @@
 from models.responses import (
     ForbiddenResponse,
     InternalServerErrorResponse,
+    PromptTooLongResponse,
     QuotaExceededResponse,
     ServiceUnavailableResponse,
     UnauthorizedResponse,
@@ -270,6 +271,24 @@ async def infer_endpoint(
             input_source, instructions, tools=mcp_tools
         )
         inference_time = time.monotonic() - start_time
+    except RuntimeError as e:
+        # Library mode wraps HTTP 413 context length errors as RuntimeError
+        if "context_length" in str(e).lower():
+            inference_time = time.monotonic() - start_time
+            metrics.llm_calls_failures_total.inc()
+            logger.error("Prompt too long for request %s: %s", request_id, e)
+            _queue_splunk_event(
+                background_tasks,
+                infer_request,
+                request,
+                request_id,
+                str(e),
+                inference_time,
+                "infer_error",
+            )
+            error_response = PromptTooLongResponse(model=_get_default_model_id())
+            raise HTTPException(**error_response.model_dump()) from e
+        raise
     except APIConnectionError as e:
         inference_time = time.monotonic() - start_time
         metrics.llm_calls_failures_total.inc()
diff --git a/tests/unit/app/endpoints/test_rlsapi_v1.py b/tests/unit/app/endpoints/test_rlsapi_v1.py
@@ -126,6 +126,26 @@ def mock_api_connection_error_fixture(mocker: MockerFixture) -> None:
     )
 
 
+@pytest.fixture(name="mock_runtime_error_context_length")
+def mock_runtime_error_context_length_fixture(mocker: MockerFixture) -> None:
+    """Mock responses.create() to raise RuntimeError with context_length message."""
+    _setup_responses_mock(
+        mocker,
+        mocker.AsyncMock(
+            side_effect=RuntimeError("context_length exceeded maximum tokens")
+        ),
+    )
+
+
+@pytest.fixture(name="mock_runtime_error_other")
+def mock_runtime_error_other_fixture(mocker: MockerFixture) -> None:
+    """Mock responses.create() to raise RuntimeError with non-context_length message."""
+    _setup_responses_mock(
+        mocker,
+        mocker.AsyncMock(side_effect=RuntimeError("Some other runtime error")),
+    )
+
+
 # --- Test _build_instructions ---
 
 
@@ -400,6 +420,50 @@ async def test_infer_api_connection_error_returns_503(
     assert exc_info.value.status_code == status.HTTP_503_SERVICE_UNAVAILABLE
 
 
+@pytest.mark.asyncio
+async def test_infer_runtime_error_context_length_returns_413(
+    mocker: MockerFixture,
+    mock_configuration: AppConfig,
+    mock_runtime_error_context_length: None,
+    mock_auth_resolvers: None,
+) -> None:
+    """Test /infer returns 413 when LLM raises RuntimeError with context_length."""
+    infer_request = RlsapiV1InferRequest(question="Test question")
+    mock_request = _create_mock_request(mocker)
+    mock_background_tasks = _create_mock_background_tasks(mocker)
+
+    with pytest.raises(HTTPException) as exc_info:
+        await infer_endpoint(
+            infer_request=infer_request,
+            request=mock_request,
+            background_tasks=mock_background_tasks,
+            auth=MOCK_AUTH,
+        )
+
+    assert exc_info.value.status_code == status.HTTP_413_REQUEST_ENTITY_TOO_LARGE
+
+
+@pytest.mark.asyncio
+async def test_infer_runtime_error_other_reraises(
+    mocker: MockerFixture,
+    mock_configuration: AppConfig,
+    mock_runtime_error_other: None,
+    mock_auth_resolvers: None,
+) -> None:
+    """Test /infer re-raises RuntimeError when not context_length related."""
+    infer_request = RlsapiV1InferRequest(question="Test question")
+    mock_request = _create_mock_request(mocker)
+    mock_background_tasks = _create_mock_background_tasks(mocker)
+
+    with pytest.raises(RuntimeError, match="Some other runtime error"):
+        await infer_endpoint(
+            infer_request=infer_request,
+            request=mock_request,
+            background_tasks=mock_background_tasks,
+            auth=MOCK_AUTH,
+        )
+
+
 @pytest.mark.asyncio
 async def test_infer_empty_llm_response_returns_fallback(
     mocker: MockerFixture,