treat vLLM input validation 500s as 400 for circuit breaker (#24)

JannikSt · web-flow · commit bf5e83db2650 · 2026-04-15T13:50:27.000+02:00
* treat vLLM input validation 500s as 400 for circuit breaker

vLLM returns 500 for prompt-too-long errors instead of 400. This
causes the circuit breaker to penalise healthy workers for bad client
input. Rewrite the status to 400 when the response body matches known
input validation patterns.

* handle vLLM input validation 500s for streaming requests too

Move the 500 body inspection before the stream/non-stream branch so
both paths get the 400 rewrite. vLLM error responses are always
synchronous JSON even when the client requested streaming.

* handle 500 body read errors explicitly instead of unwrap_or_default

Log the transport error and return a diagnostic 500 to the caller
rather than silently forwarding an empty body with stale headers.

* fix double load decrement for 500 responses

Only decrement load in the early-return path for rewritten 400s
(input validation). Genuine 500s are retryable and the caller's
retry closure already handles their load cleanup. Also properly
handle body read errors without swallowing them.
diff --git a/src/routers/http/router.rs b/src/routers/http/router.rs
@@ -49,6 +49,25 @@ pub struct Router {
     _load_monitor_handle: Option<Arc<tokio::task::JoinHandle<()>>>,
 }
 
+/// Check if a 500 response body is actually a vLLM input validation error
+/// that should be treated as a 400 (client error) instead.
+///
+/// vLLM incorrectly returns 500 for several request validation failures
+/// (e.g. prompt too long); these should not count as backend failures for the
+/// circuit breaker. Returns `true` iff `body` is UTF-8 containing any pattern.
+fn is_vllm_input_validation_error(body: &[u8]) -> bool {
+    const PATTERNS: &[&str] = &[
+        "exceeds the model's maximum context length",
+        "Please reduce the length of the input prompt",
+        "This model's maximum context length is",
+    ];
+
+    if let Ok(text) = std::str::from_utf8(body) { // case-sensitive substring match
+        return PATTERNS.iter().any(|p| text.contains(p));
+    }
+    false // only reached when `body` is not valid UTF-8
+}
+
 impl Router {
     /// Create a new router with injected policy and client
     #[allow(clippy::too_many_arguments)]
@@ -890,6 +909,53 @@ impl Router {
         let status = StatusCode::from_u16(res.status().as_u16())
             .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR);
 
+        // vLLM returns 500 for input validation errors (e.g. prompt exceeding
+        // max context length) that are really client errors. These are always
+        // synchronous JSON bodies even when the client requested streaming.
+        // Every 500 body is read and returned here; validation matches are rewritten
+        // to 400 so the circuit breaker doesn't penalise the worker for bad input.
+        if status == StatusCode::INTERNAL_SERVER_ERROR {
+            let response_headers = header_utils::preserve_response_headers(res.headers());
+            match res.bytes().await {
+                Ok(body) => {
+                    let status = if is_vllm_input_validation_error(&body) {
+                        tracing::debug!(
+                            "Rewriting vLLM input validation 500 to 400 for worker_url={}",
+                            worker_url
+                        );
+                        StatusCode::BAD_REQUEST
+                    } else {
+                        StatusCode::INTERNAL_SERVER_ERROR
+                    };
+                    // For rewritten 400s (input validation), decrement load here
+                    // since the caller only decrements for retryable statuses
+                    // (400 is not retryable) and we bypass the normal non-streaming
+                    // cleanup. For genuine 500s, the caller's retry closure handles it.
+                    if status == StatusCode::BAD_REQUEST && load_incremented {
+                        if let Some(worker) = self.worker_registry.get_by_url(worker_url) {
+                            worker.decrement_load();
+                            RouterMetrics::set_running_requests(worker_url, worker.load());
+                        }
+                    }
+                    let mut response = Response::new(axum::body::Body::from(body));
+                    *response.status_mut() = status;
+                    *response.headers_mut() = response_headers;
+                    return response;
+                }
+                Err(e) => {
+                    tracing::error!(
+                        "Failed to read 500 response body from worker_url={}: {}",
+                        worker_url, e
+                    );
+                    return (
+                        StatusCode::INTERNAL_SERVER_ERROR,
+                        format!("Failed to read upstream response: {}", e),
+                    )
+                        .into_response();
+                }
+            }
+        }
+
         if !is_stream {
             // For non-streaming requests, preserve headers
             let response_headers = header_utils::preserve_response_headers(res.headers());
@@ -1887,6 +1953,23 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_is_vllm_input_validation_error() {
+        // Prompt-too-long error from vLLM: the message contains known patterns
+        let body = br#"{"error":{"message":"The prompt is 65537 tokens, which exceeds the model's maximum context length of 65536 tokens. Please reduce the length of the input prompt.","type":"Internal Server Error","param":null,"code":500}}"#;
+        assert!(is_vllm_input_validation_error(body));
+
+        // Genuine server error (no validation pattern in the message) must not match
+        let body = br#"{"error":{"message":"CUDA out of memory","type":"Internal Server Error","param":null,"code":500}}"#;
+        assert!(!is_vllm_input_validation_error(body));
+
+        // Empty body must not match
+        assert!(!is_vllm_input_validation_error(b""));
+
+        // Bytes that are not valid UTF-8 must not match
+        assert!(!is_vllm_input_validation_error(&[0xFF, 0xFE]));
+    }
+
     #[test]
     fn test_router_get_worker_urls_regular() {
         let router = create_test_regular_router();