Skip to content

Commit bf5e83d

Browse files
authored
treat vLLM input validation 500s as 400 for circuit breaker (#24)
* treat vLLM input validation 500s as 400 for circuit breaker

  vLLM returns 500 for prompt-too-long errors instead of 400. This causes the circuit breaker to penalise healthy workers for bad client input. Rewrite the status to 400 when the response body matches known input validation patterns.

* handle vLLM input validation 500s for streaming requests too

  Move the 500 body inspection before the stream/non-stream branch so both paths get the 400 rewrite. vLLM error responses are always synchronous JSON even when the client requested streaming.

* handle 500 body read errors explicitly instead of unwrap_or_default

  Log the transport error and return a diagnostic 500 to the caller rather than silently forwarding an empty body with stale headers.

* fix double load decrement for 500 responses

  Only decrement load in the early-return path for rewritten 400s (input validation). Genuine 500s are retryable and the caller retry closure already handles their load cleanup. Also properly handle body read errors without swallowing them.
1 parent c33849a commit bf5e83d

1 file changed

Lines changed: 83 additions & 0 deletions

File tree

src/routers/http/router.rs

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,25 @@ pub struct Router {
4949
_load_monitor_handle: Option<Arc<tokio::task::JoinHandle<()>>>,
5050
}
5151

52+
/// Reports whether a 500 response body carries a vLLM input validation
/// message and should therefore be treated as a 400 (client error).
///
/// vLLM answers some request validation failures (for example a prompt
/// longer than the model's maximum context length) with a 500. Those
/// responses should not count as backend failures for the circuit breaker.
///
/// Returns `true` when `body` is valid UTF-8 and contains at least one of
/// the known validation phrases. Returns `false` otherwise, including for
/// empty and non-UTF-8 bodies.
fn is_vllm_input_validation_error(body: &[u8]) -> bool {
    // Phrases identifying vLLM context-length validation errors. Matching is
    // a plain, case-sensitive substring search.
    const VALIDATION_PHRASES: [&str; 3] = [
        "exceeds the model's maximum context length",
        "Please reduce the length of the input prompt",
        "This model's maximum context length is",
    ];

    match std::str::from_utf8(body) {
        Ok(text) => VALIDATION_PHRASES
            .iter()
            .any(|phrase| text.contains(phrase)),
        // A body that is not valid UTF-8 is never treated as a validation error.
        Err(_) => false,
    }
}
70+
5271
impl Router {
5372
/// Create a new router with injected policy and client
5473
#[allow(clippy::too_many_arguments)]
@@ -890,6 +909,53 @@ impl Router {
890909
let status = StatusCode::from_u16(res.status().as_u16())
891910
.unwrap_or(StatusCode::INTERNAL_SERVER_ERROR);
892911

912+
// vLLM returns 500 for input validation errors (e.g. prompt exceeding
913+
// max context length) that are really client errors. These are always
914+
// synchronous JSON bodies even when the client requested streaming.
915+
// Read the body, rewrite to 400, and return so the circuit breaker
916+
// doesn't penalise the worker for bad input.
917+
if status == StatusCode::INTERNAL_SERVER_ERROR {
918+
let response_headers = header_utils::preserve_response_headers(res.headers());
919+
match res.bytes().await {
920+
Ok(body) => {
921+
let status = if is_vllm_input_validation_error(&body) {
922+
tracing::debug!(
923+
"Rewriting vLLM input validation 500 to 400 for worker_url={}",
924+
worker_url
925+
);
926+
StatusCode::BAD_REQUEST
927+
} else {
928+
StatusCode::INTERNAL_SERVER_ERROR
929+
};
930+
// For rewritten 400s (input validation), decrement load here
931+
// since the caller only decrements for retryable statuses
932+
// (400 is not retryable) and we bypass the normal non-streaming
933+
// cleanup. For genuine 500s, the caller's retry closure handles it.
934+
if status == StatusCode::BAD_REQUEST && load_incremented {
935+
if let Some(worker) = self.worker_registry.get_by_url(worker_url) {
936+
worker.decrement_load();
937+
RouterMetrics::set_running_requests(worker_url, worker.load());
938+
}
939+
}
940+
let mut response = Response::new(axum::body::Body::from(body));
941+
*response.status_mut() = status;
942+
*response.headers_mut() = response_headers;
943+
return response;
944+
}
945+
Err(e) => {
946+
tracing::error!(
947+
"Failed to read 500 response body from worker_url={}: {}",
948+
worker_url, e
949+
);
950+
return (
951+
StatusCode::INTERNAL_SERVER_ERROR,
952+
format!("Failed to read upstream response: {}", e),
953+
)
954+
.into_response();
955+
}
956+
}
957+
}
958+
893959
if !is_stream {
894960
// For non-streaming requests, preserve headers
895961
let response_headers = header_utils::preserve_response_headers(res.headers());
@@ -1887,6 +1953,23 @@ mod tests {
18871953
}
18881954
}
18891955

1956+
/// Exercises `is_vllm_input_validation_error` against matching and
/// non-matching 500 response bodies.
#[test]
fn test_is_vllm_input_validation_error() {
    // Prompt too long error from vLLM (this body contains two known phrases)
    let body = br#"{"error":{"message":"The prompt is 65537 tokens, which exceeds the model's maximum context length of 65536 tokens. Please reduce the length of the input prompt.","type":"Internal Server Error","param":null,"code":500}}"#;
    assert!(is_vllm_input_validation_error(body));

    // Each known phrase must be sufficient on its own, so that dropping or
    // mistyping any single pattern is caught by this test.
    let single_phrase_bodies: [&[u8]; 3] = [
        b"The prompt exceeds the model's maximum context length of 4096 tokens",
        b"Please reduce the length of the input prompt.",
        b"This model's maximum context length is 4096 tokens.",
    ];
    for body in &single_phrase_bodies {
        assert!(
            is_vllm_input_validation_error(body),
            "expected a match for body: {}",
            String::from_utf8_lossy(body)
        );
    }

    // Actual server error should not match
    let body = br#"{"error":{"message":"CUDA out of memory","type":"Internal Server Error","param":null,"code":500}}"#;
    assert!(!is_vllm_input_validation_error(body));

    // Empty body
    assert!(!is_vllm_input_validation_error(b""));

    // Non-UTF8
    assert!(!is_vllm_input_validation_error(&[0xFF, 0xFE]));
}
1972+
18901973
#[test]
18911974
fn test_router_get_worker_urls_regular() {
18921975
let router = create_test_regular_router();

0 commit comments

Comments
 (0)