ENG-3780: use model id in body for routing (#40)

eexwhyzee · web-flow · commit c97c7746ec7d · 2026-05-29T07:23:20.000+02:00
* use model id in body for routing

* Fix body-model routing fallback for LoRA-aware workers

Use the request body's model as a worker filter only when the router
registry has indexed that model; run-scoped requests always keep the
body model as a strict filter. Treat rerank's "default" sentinel as
unspecified so rerank requests that omit the model still route normally.

Also sync all advertised worker models during startup and add_worker so
workers serving multiple models or LoRA adapters are routable
immediately, before the next health refresh.

Adds regression coverage for indexed LoRA routing, rerank default
handling, and multi-model worker registration.
diff --git a/src/routers/http/router.rs b/src/routers/http/router.rs
@@ -9,7 +9,7 @@ use crate::otel_http::{self, ClientRequestOptions};
 use crate::policies::{LoadBalancingPolicy, PolicyRegistry};
 use crate::protocols::spec::{
     ChatCompletionRequest, CompletionRequest, EmbeddingRequest, GenerateRequest, GenerationRequest,
-    RerankRequest, RerankResponse, RerankResult, ResponsesRequest,
+    RerankRequest, RerankResponse, RerankResult, ResponsesRequest, DEFAULT_MODEL_NAME,
 };
 use crate::routers::header_utils;
 use crate::routers::http::dp_utils;
@@ -154,6 +154,10 @@ impl Router {
                 )
             };
             ctx.worker_registry.register(worker_arc.clone());
+            if !models.is_empty() {
+                ctx.worker_registry
+                    .sync_worker_models(worker_arc.url(), &models);
+            }
 
             // Notify PolicyRegistry about the new worker
             let model_id = worker_arc.model_id();
@@ -538,6 +542,39 @@ impl Router {
         })
     }
 
+    fn normalize_model_id(model_id: Option<&str>) -> Option<&str> {
+        // Empty or whitespace-only ids are treated the same as a missing id.
+        model_id.map(str::trim).filter(|id| !id.is_empty())
+    }
+
+    fn body_model_for_route<'a>(route: &str, model_id: Option<&'a str>) -> Option<&'a str> {
+        // Start from the trimmed, non-empty id. On the rerank route the
+        // DEFAULT_MODEL_NAME sentinel is additionally reported as "no model",
+        // so it never becomes a routing filter.
+        Self::normalize_model_id(model_id)
+            .filter(|&id| !(route == "/v1/rerank" && id == DEFAULT_MODEL_NAME))
+    }
+
+    fn resolve_body_model_filter<'a>(
+        &self,
+        route: &str,
+        model_id: Option<&'a str>,
+        run_id: Option<&str>,
+    ) -> Option<&'a str> {
+        let candidate = Self::body_model_for_route(route, model_id)?;
+        // Requests with a run id keep the filter even for unindexed models.
+        let indexed = || !self.worker_registry.get_by_model_fast(candidate).is_empty();
+        if run_id.is_none() && !indexed() {
+            debug!(
+                model_id = %candidate,
+                route,
+                "body model is not indexed; routing without a model filter"
+            );
+            return None;
+        }
+        Some(candidate)
+    }
+
     /// Select worker for a specific model considering circuit breaker state
     fn select_worker_for_model(
         &self,
@@ -586,11 +623,24 @@ impl Router {
         let text = typed_req.extract_text_for_routing();
         let run_id = run_id.map(|s| s.to_string());
 
+        // Fall back to the body's `model` field when the caller doesn't pass one, but
+        // only use it as a routing filter when the registry has already indexed that
+        // model. This keeps compatibility for generic upstream model validation while
+        // still preventing known LoRA requests from being sent to workers that have not
+        // loaded the adapter. Run-scoped requests keep the body model as a hard filter.
+        let effective_model_id = Self::normalize_model_id(model_id).or_else(|| {
+            self.resolve_body_model_filter(route, typed_req.get_model(), run_id.as_deref())
+        });
+
         let response = RetryExecutor::execute_response_with_retry(
             &self.retry_config,
             // operation per attempt
             |_: u32| async {
-                let worker = match self.select_worker_for_model(model_id, Some(&text), headers) {
+                let worker = match self.select_worker_for_model(
+                    effective_model_id,
+                    Some(&text),
+                    headers,
+                ) {
                     Some(w) => w,
                     None => {
                         RouterMetrics::record_request_error(route, "no_available_workers");
@@ -604,7 +654,7 @@ impl Router {
 
                 // Optional load tracking for cache-aware policy
                 // Get the policy for this model to check if it's cache-aware
-                let policy = match model_id {
+                let policy = match effective_model_id {
                     Some(model) => self.policy_registry.get_policy_or_default(model),
                     None => self.policy_registry.get_default_policy(),
                 };
@@ -1209,6 +1259,10 @@ impl Router {
 
                                 let worker_arc: Arc<dyn Worker> = Arc::new(new_worker);
                                 self.worker_registry.register(worker_arc.clone());
+                                if !models.is_empty() {
+                                    self.worker_registry
+                                        .sync_worker_models(worker_arc.url(), &models);
+                                }
 
                                 // Notify PolicyRegistry about the new worker
                                 let model_id = worker_arc.model_id();
@@ -1246,6 +1300,10 @@ impl Router {
 
                             let worker_arc = Arc::new(new_worker);
                             self.worker_registry.register(worker_arc.clone());
+                            if !models.is_empty() {
+                                self.worker_registry
+                                    .sync_worker_models(worker_arc.url(), &models);
+                            }
 
                             // Notify PolicyRegistry about the new worker
                             let model_id = worker_arc.model_id();
@@ -2273,6 +2331,268 @@ mod tests {
         );
     }
 
+    /// Regression guard for the `effective_model_id` fallback in
+    /// route_typed_request: select_worker_for_model must only choose among
+    /// workers whose model index lists the requested model, so that a
+    /// request naming a LoRA adapter is never dispatched to a pod that has
+    /// not loaded it.
+    #[test]
+    fn test_select_worker_filters_to_pods_with_loaded_lora() {
+        const LORA_WORKER: &str = "http://worker1:8080";
+        const BASE_ONLY_WORKER: &str = "http://worker2:8080";
+
+        let router = create_test_regular_router();
+        let registry = &router.worker_registry;
+        let base = "base-model".to_string();
+
+        // Start with both workers advertising only the base model, which is
+        // the state of a freshly scaled-up vLLM pod before the orchestrator
+        // has pushed the LoRA adapter.
+        for url in [LORA_WORKER, BASE_ONLY_WORKER] {
+            registry.sync_worker_models(url, &[base.clone()]);
+        }
+
+        // Then simulate the orchestrator loading the adapter on one of the
+        // two workers only.
+        registry.sync_worker_models(LORA_WORKER, &[base, "rft-run-1".to_string()]);
+
+        // Every selection for the adapter has to resolve to that worker.
+        for _attempt in 0..20 {
+            let picked = router
+                .select_worker_for_model(Some("rft-run-1"), Some(r#"{"prompt":"x"}"#), None)
+                .expect("a worker is available");
+            assert_eq!(
+                picked.url(),
+                LORA_WORKER,
+                "LoRA request leaked to a worker without the adapter loaded"
+            );
+        }
+
+        // The base model must still be indexed for both workers.
+        let base_pool = registry.get_by_model_fast("base-model");
+        assert_eq!(base_pool.len(), 2);
+    }
+
+    /// Pins the contract of resolve_body_model_filter: a body model becomes a
+    /// routing filter only when the registry has indexed it or the request
+    /// carries a run id, and rerank's default sentinel never becomes one,
+    /// because it stands for an unspecified model.
+    #[test]
+    fn test_body_model_filter_only_uses_indexed_models() {
+        const CHAT_ROUTE: &str = "/v1/chat/completions";
+        const RERANK_ROUTE: &str = "/v1/rerank";
+
+        // Index a base model and a LoRA adapter for a single worker.
+        let router = create_test_regular_router();
+        let advertised = ["base-model".to_string(), "rft-run-1".to_string()];
+        router
+            .worker_registry
+            .sync_worker_models("http://worker1:8080", &advertised);
+
+        // An indexed body model is handed back unchanged as the routing
+        // filter.
+        let indexed = router.resolve_body_model_filter(CHAT_ROUTE, Some("rft-run-1"), None);
+        assert_eq!(indexed, Some("rft-run-1"));
+
+        // A model the registry has not indexed produces no filter as long as
+        // the request is not run-scoped.
+        let unindexed = router.resolve_body_model_filter(CHAT_ROUTE, Some("not-indexed"), None);
+        assert_eq!(unindexed, None);
+
+        // Once a run id is present, the same unindexed model is kept as the
+        // filter.
+        let run_scoped =
+            router.resolve_body_model_filter(CHAT_ROUTE, Some("not-indexed"), Some("run-123"));
+        assert_eq!(run_scoped, Some("not-indexed"));
+
+        // On the rerank route the default sentinel is treated as "no model",
+        // so no filter is produced for it either.
+        let rerank_default =
+            router.resolve_body_model_filter(RERANK_ROUTE, Some(DEFAULT_MODEL_NAME), None);
+        assert_eq!(rerank_default, None);
+    }
+
+    async fn start_counting_chat_worker(
+        request_count: Arc<std::sync::atomic::AtomicUsize>,
+    ) -> (String, tokio::task::JoinHandle<()>) {
+        use axum::{routing::post, Json, Router as AxumRouter};
+        use std::sync::atomic::Ordering;
+        use tokio::net::TcpListener;
+
+        // Port 0 asks the OS for a free port; read the real address back.
+        let socket = TcpListener::bind("127.0.0.1:0").await.unwrap();
+        let base_url = format!("http://{}", socket.local_addr().unwrap());
+
+        // Every chat request bumps the shared counter and gets a minimal
+        // completion back that echoes the model named in the request body.
+        let app = AxumRouter::new().route(
+            "/v1/chat/completions",
+            post(move |Json(payload): Json<serde_json::Value>| {
+                let hits = request_count.clone();
+                async move {
+                    hits.fetch_add(1, Ordering::SeqCst);
+                    let echoed = payload["model"].as_str().unwrap_or("unknown").to_string();
+                    Json(serde_json::json!({
+                        "id": "chatcmpl-test",
+                        "object": "chat.completion",
+                        "created": 0,
+                        "model": echoed,
+                        "choices": [{
+                            "index": 0,
+                            "message": {"role": "assistant", "content": "ok"},
+                            "finish_reason": "stop"
+                        }],
+                        "usage": {
+                            "prompt_tokens": 1,
+                            "completion_tokens": 1,
+                            "total_tokens": 2
+                        }
+                    }))
+                }
+            }),
+        );
+
+        // Serve in the background and hand the task handle to the caller.
+        let server_task = tokio::spawn(async move {
+            axum::serve(socket, app).await.unwrap();
+        });
+        tokio::time::sleep(std::time::Duration::from_millis(10)).await;
+        (base_url, server_task)
+    }
+
+    async fn start_model_listing_worker(
+        models: Vec<&'static str>,
+    ) -> (String, tokio::task::JoinHandle<()>) {
+        use axum::{routing::get, Json, Router as AxumRouter};
+        use tokio::net::TcpListener;
+
+        // The listing never changes, so the response body is built once here.
+        let entries: Vec<serde_json::Value> = models
+            .into_iter()
+            .map(|id| serde_json::json!({"id": id, "object": "model"}))
+            .collect();
+        let listing = serde_json::json!({"object": "list", "data": entries});
+
+        let socket = TcpListener::bind("127.0.0.1:0").await.unwrap();
+        let base_url = format!("http://{}", socket.local_addr().unwrap());
+        let app = AxumRouter::new()
+            .route("/health", get(|| async { StatusCode::OK }))
+            .route(
+                "/v1/models",
+                get(move || {
+                    let body = listing.clone();
+                    async move { Json(body) }
+                }),
+            );
+        let server_task = tokio::spawn(async move {
+            axum::serve(socket, app).await.unwrap();
+        });
+        tokio::time::sleep(std::time::Duration::from_millis(10)).await;
+        (base_url, server_task)
+    }
+
+    #[tokio::test]
+    async fn test_route_chat_with_body_model_filters_to_loaded_lora_worker() {
+        use std::sync::atomic::{AtomicUsize, Ordering};
+
+        // Two live workers, each counting the chat requests it receives.
+        let base_hits = Arc::new(AtomicUsize::new(0));
+        let lora_hits = Arc::new(AtomicUsize::new(0));
+        let (base_url, _base_task) = start_counting_chat_worker(base_hits.clone()).await;
+        let (lora_url, _lora_task) = start_counting_chat_worker(lora_hits.clone()).await;
+
+        // Register both workers, then index the LoRA adapter for only one of
+        // them; the base model is indexed for both.
+        let worker_registry = Arc::new(WorkerRegistry::new());
+        for url in [&base_url, &lora_url] {
+            worker_registry.register(Arc::new(BasicWorker::new(url.clone(), WorkerType::Regular)));
+        }
+        worker_registry.sync_worker_models(&base_url, &["base-model".to_string()]);
+        worker_registry.sync_worker_models(
+            &lora_url,
+            &["base-model".to_string(), "rft-run-1".to_string()],
+        );
+
+        // Assemble a router by hand around that registry, using the
+        // round-robin policy config.
+        let round_robin = crate::config::types::PolicyConfig::RoundRobin;
+        let policy_registry = Arc::new(PolicyRegistry::new(round_robin));
+        let (_, loads_rx) = tokio::sync::watch::channel(HashMap::new());
+        let router = Router {
+            worker_registry,
+            policy_registry,
+            worker_startup_timeout_secs: 5,
+            worker_startup_check_interval_secs: 1,
+            intra_node_data_parallel_size: 1,
+            api_key: None,
+            client: Client::new(),
+            retry_config: RetryConfig::default(),
+            circuit_breaker_config: CircuitBreakerConfig::default(),
+            _worker_loads: Arc::new(loads_rx),
+            _load_monitor_handle: None,
+        };
+
+        // The adapter is named only in the request body.
+        let chat_body = serde_json::json!({
+            "model": "rft-run-1",
+            "messages": [{"role": "user", "content": "hello"}]
+        });
+        let request: ChatCompletionRequest = serde_json::from_value(chat_body).unwrap();
+
+        for _attempt in 0..6 {
+            let response = router.route_chat(None, &request, None, None).await;
+            assert_eq!(response.status(), StatusCode::OK);
+        }
+
+        assert_eq!(
+            base_hits.load(Ordering::SeqCst),
+            0,
+            "body model was ignored and the request reached a worker without the LoRA"
+        );
+        assert_eq!(
+            lora_hits.load(Ordering::SeqCst),
+            6,
+            "all requests should be routed to the worker indexed for the requested LoRA"
+        );
+    }
+
+    #[tokio::test]
+    async fn test_router_new_indexes_all_discovered_models_immediately() {
+        // One worker whose /v1/models listing names a base model and a LoRA.
+        let (worker_url, _server_task) =
+            start_model_listing_worker(vec!["base-model", "rft-run-1"]).await;
+        let config = crate::config::types::RouterConfig {
+            mode: crate::config::types::RoutingMode::Regular {
+                worker_urls: vec![worker_url.clone()],
+            },
+            policy: crate::config::types::PolicyConfig::RoundRobin,
+            worker_startup_timeout_secs: 2,
+            worker_startup_check_interval_secs: 1,
+            ..Default::default()
+        };
+        let app_context = crate::server::AppContext::new(
+            config.clone(),
+            Client::new(),
+            config.max_concurrent_requests,
+            config.rate_limit_tokens_per_second,
+            config.api_key_validation_urls.clone(),
+        )
+        .unwrap();
+        let ctx = Arc::new(app_context);
+        let router = Router::new(vec![worker_url.clone()], &ctx).await.unwrap();
+
+        // Every advertised model is indexed as soon as construction returns...
+        for model in ["base-model", "rft-run-1"] {
+            assert_eq!(router.worker_registry.get_by_model_fast(model).len(), 1);
+        }
+
+        // ...and the LoRA is already selectable, resolving to that worker.
+        let selected = router
+            .select_worker_for_model(Some("rft-run-1"), Some(r#"{"prompt":"x"}"#), None)
+            .expect("LoRA model should be routable immediately after registration");
+        assert_eq!(selected.url(), worker_url);
+    }
+
     #[test]
     fn test_inline_header_conversion_matches_headers_to_request_headers() {
         // Verify that the inline header conversion pattern used in pd_router and