PrimeIntellect-ai
diff --git a/src/routers/http/pd_router.rs b/src/routers/http/pd_router.rs
Lines changed: 52 additions & 33 deletions
diff --git a/src/routers/http/router.rs b/src/routers/http/router.rs
Lines changed: 11 additions & 0 deletions
diff --git a/src/routers/http/vllm_pd_router.rs b/src/routers/http/vllm_pd_router.rs
Lines changed: 86 additions & 82 deletions
diff --git a/src/routers/mod.rs b/src/routers/mod.rs
Lines changed: 12 additions & 0 deletions
diff --git a/src/routers/router_manager.rs b/src/routers/router_manager.rs
Lines changed: 23 additions & 0 deletions
@@ -1847,6 +1847,46 @@ impl PDRouter {
         );
         Ok(bytes::Bytes::from(merged_str))
     }
+
+    /// Shared implementation behind the chat routes of this router.
+    ///
+    /// `route` is placed in the request context handed to `execute_dual_dispatch`.
+    async fn route_chat_with_path(
+        &self,
+        headers: Option<&HeaderMap>,
+        body: &ChatCompletionRequest,
+        model_id: Option<&str>,
+        run_id: Option<&str>,
+        route: &'static str,
+    ) -> Response {
+        // Only the first message is inspected, and only when a policy asks for request text.
+        // Multi-part user content and every other message kind leave the text unset.
+        let request_text = self
+            .policies_need_request_text()
+            .then(|| body.messages.first())
+            .flatten()
+            .and_then(|first| match first {
+                ChatMessage::User {
+                    content: UserMessageContent::Text(text),
+                    ..
+                } => Some(text.clone()),
+                ChatMessage::System { content, .. } => Some(content.clone()),
+                _ => None,
+            });
+
+        // Bundle the per-request values that the dispatch step receives.
+        let context = PDRequestContext {
+            route,
+            batch_size: Self::get_chat_batch_size(body),
+            is_stream: body.stream,
+            return_logprob: body.logprobs,
+            request_text,
+            model_id,
+            run_id,
+        };
+
+        self.execute_dual_dispatch(headers, body, context).await
+    }
 }
 
 // Helper functions
@@ -2114,40 +2154,19 @@ impl RouterTrait for PDRouter {
         model_id: Option<&str>,
         run_id: Option<&str>,
     ) -> Response {
-        // Extract parameters
-        let is_stream = body.stream;
-        let return_logprob = body.logprobs;
-
-        // Extract text for cache-aware routing
-        let request_text = if self.policies_need_request_text() {
-            body.messages.first().and_then(|msg| match msg {
-                ChatMessage::User { content, .. } => match content {
-                    UserMessageContent::Text(text) => Some(text.clone()),
-                    UserMessageContent::Parts(_) => None,
-                },
-                ChatMessage::System { content, .. } => Some(content.clone()),
-                _ => None,
-            })
-        } else {
-            None
-        };
-
-        // Calculate batch size
-        let batch_size = Self::get_chat_batch_size(body);
-
-        // Create context
-        let context = PDRequestContext {
-            route: "/v1/chat/completions",
-            batch_size,
-            is_stream,
-            return_logprob,
-            request_text,
-            model_id,
-            run_id,
-        };
+        self.route_chat_with_path(headers, body, model_id, run_id, "/v1/chat/completions")
+            .await
+    }
 
-        // Execute with retry and bootstrap injection
-        self.execute_dual_dispatch(headers, body, context).await
+    async fn route_chat_tokens(
+        &self,
+        headers: Option<&HeaderMap>,
+        body: &ChatCompletionRequest,
+        model_id: Option<&str>,
+        run_id: Option<&str>,
+    ) -> Response {
+        self.route_chat_with_path(headers, body, model_id, run_id, "/v1/chat/completions/tokens")
+            .await // same helper as route_chat; only the route string differs
     }
 
     async fn route_completion(
 
@@ -1535,6 +1535,17 @@ impl RouterTrait for Router {
             .await
     }
 
+    async fn route_chat_tokens(
+        &self,
+        headers: Option<&HeaderMap>,
+        body: &ChatCompletionRequest,
+        model_id: Option<&str>,
+        run_id: Option<&str>,
+    ) -> Response {
+        self.route_typed_request(headers, body, "/v1/chat/completions/tokens", model_id, run_id)
+            .await // goes through the typed-request helper with the tokens path
+    }
+
     async fn route_completion(
         &self,
         headers: Option<&HeaderMap>,
 
@@ -1270,108 +1270,48 @@ impl VllmPDRouter {
     pub fn worker_registry(&self) -> &crate::core::WorkerRegistry {
         &self.pd_router.worker_registry
     }
-}
-
-// Delegate most RouterTrait methods to the underlying PDRouter,
-// but override specific ones for vLLM behavior
-#[async_trait]
-impl RouterTrait for VllmPDRouter {
-    fn as_any(&self) -> &dyn std::any::Any {
-        self
-    }
-
-    async fn health(&self, req: Request<Body>) -> Response {
-        self.pd_router.health(req).await
-    }
-
-    async fn health_generate(&self, req: Request<Body>) -> Response {
-        self.pd_router.health_generate(req).await
-    }
-
-    async fn get_server_info(&self, req: Request<Body>) -> Response {
-        self.pd_router.get_server_info(req).await
-    }
-
-    async fn get_models(&self, req: Request<Body>) -> Response {
-        self.pd_router.get_models(req).await
-    }
-
-    async fn get_model_info(&self, req: Request<Body>) -> Response {
-        self.pd_router.get_model_info(req).await
-    }
 
-    async fn route_generate(
-        &self,
-        headers: Option<&HeaderMap>,
-        body: &crate::protocols::spec::GenerateRequest,
-        model_id: Option<&str>,
-        run_id: Option<&str>,
-    ) -> Response {
-        self.pd_router
-            .route_generate(headers, body, model_id, run_id)
-            .await
-    }
-
-    // Override OpenAI-compatible routes for vLLM two-stage processing
-    async fn route_chat(
+    /// Internal helper for routing chat requests with a configurable backend path.
+    async fn route_chat_with_path(
         &self,
         headers: Option<&HeaderMap>,
         body: &crate::protocols::spec::ChatCompletionRequest,
-        _model_id: Option<&str>,
         run_id: Option<&str>,
+        route: &str,
     ) -> Response {
         info!(
             "vLLM route_chat called, use_discovery={}",
             self.use_discovery
         );
 
+        let request_json = match serde_json::to_value(body) {
+            Ok(json) => {
+                debug!(
+                    "Serialized chat request: {}",
+                    serde_json::to_string_pretty(&json).unwrap_or_default()
+                );
+                json
+            }
+            Err(e) => {
+                return (
+                    axum::http::StatusCode::INTERNAL_SERVER_ERROR,
+                    format!("Serialization error: {}", e),
+                )
+                    .into_response()
+            }
+        };
+
         if self.use_discovery {
             // Discovery mode - use vLLM-specific two-stage processing
             info!("Using service discovery mode, processing vLLM two-stage request");
 
-            // Convert to generic request and use vLLM processing
-            let request_json = match serde_json::to_value(body) {
-                Ok(json) => {
-                    debug!(
-                        "Serialized chat request: {}",
-                        serde_json::to_string_pretty(&json).unwrap_or_default()
-                    );
-                    json
-                }
-                Err(e) => {
-                    return (
-                        axum::http::StatusCode::INTERNAL_SERVER_ERROR,
-                        format!("Serialization error: {}", e),
-                    )
-                        .into_response()
-                }
-            };
-
             // Process vLLM two-stage request with service discovery
-            self.process_vllm_request(request_json, "/v1/chat/completions", headers, run_id)
+            self.process_vllm_request(request_json, route, headers, run_id)
                 .await
         } else {
             // Direct URL mode - implement routing logic here (not delegating to PDRouter)
             info!("Using direct URL mode with VllmPDRouter's own routing logic");
 
-            // Convert request to JSON
-            let request_json = match serde_json::to_value(body) {
-                Ok(json) => {
-                    debug!(
-                        "Serialized chat request: {}",
-                        serde_json::to_string_pretty(&json).unwrap_or_default()
-                    );
-                    json
-                }
-                Err(e) => {
-                    return (
-                        axum::http::StatusCode::INTERNAL_SERVER_ERROR,
-                        format!("Serialization error: {}", e),
-                    )
-                        .into_response()
-                }
-            };
-
             // Get prefill and decode workers from worker_registry
             let prefill_workers = self.pd_router.worker_registry.get_prefill_workers();
             let decode_workers = self.pd_router.worker_registry.get_decode_workers();
@@ -1463,7 +1403,7 @@ impl RouterTrait for VllmPDRouter {
                     request_json,
                     prefill_worker.clone(),
                     decode_worker.clone(),
-                    "/v1/chat/completions",
+                    route,
                     headers,
                     run_id,
                 )
@@ -1485,6 +1425,70 @@ impl RouterTrait for VllmPDRouter {
             resp
         }
     }
+}
+
+// Delegate most RouterTrait methods to the underlying PDRouter,
+// but override specific ones for vLLM behavior
+#[async_trait]
+impl RouterTrait for VllmPDRouter {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self // presumably lets callers downcast to the concrete router type — confirm
+    }
+
+    async fn health(&self, req: Request<Body>) -> Response {
+        self.pd_router.health(req).await // no vLLM override: answered by the underlying PDRouter
+    }
+
+    async fn health_generate(&self, req: Request<Body>) -> Response {
+        self.pd_router.health_generate(req).await // delegated as-is to the underlying PDRouter
+    }
+
+    async fn get_server_info(&self, req: Request<Body>) -> Response {
+        self.pd_router.get_server_info(req).await // delegated as-is to the underlying PDRouter
+    }
+
+    async fn get_models(&self, req: Request<Body>) -> Response {
+        self.pd_router.get_models(req).await // delegated as-is to the underlying PDRouter
+    }
+
+    async fn get_model_info(&self, req: Request<Body>) -> Response {
+        self.pd_router.get_model_info(req).await // delegated as-is to the underlying PDRouter
+    }
+
+    async fn route_generate(
+        &self,
+        headers: Option<&HeaderMap>,
+        body: &crate::protocols::spec::GenerateRequest,
+        model_id: Option<&str>,
+        run_id: Option<&str>,
+    ) -> Response {
+        // Generate requests get no vLLM-specific handling; the underlying PDRouter serves them.
+        let inner = &self.pd_router;
+        inner.route_generate(headers, body, model_id, run_id).await
+    }
+
+    // Override OpenAI-compatible routes for vLLM two-stage processing
+    async fn route_chat(
+        &self,
+        headers: Option<&HeaderMap>,
+        body: &crate::protocols::spec::ChatCompletionRequest,
+        _model_id: Option<&str>,
+        run_id: Option<&str>,
+    ) -> Response {
+        self.route_chat_with_path(headers, body, run_id, "/v1/chat/completions")
+            .await // discovery vs. direct-URL handling lives in route_chat_with_path
+    }
+
+    async fn route_chat_tokens(
+        &self,
+        headers: Option<&HeaderMap>,
+        body: &crate::protocols::spec::ChatCompletionRequest,
+        _model_id: Option<&str>,
+        run_id: Option<&str>,
+    ) -> Response {
+        self.route_chat_with_path(headers, body, run_id, "/v1/chat/completions/tokens")
+            .await // `_model_id` stays unused: route_chat_with_path takes no model id
+    }
 
     async fn route_completion(
         &self,
 
@@ -82,6 +82,18 @@ pub trait RouterTrait: Send + Sync + Debug + WorkerManagement {
         run_id: Option<&str>,
     ) -> Response;
 
+    /// Route a chat completion tokens (TITO) request.
+    /// Default: same as [`Self::route_chat`]. Override to target `/v1/chat/completions/tokens`.
+    async fn route_chat_tokens(
+        &self,
+        headers: Option<&HeaderMap>,
+        body: &ChatCompletionRequest,
+        model_id: Option<&str>,
+        run_id: Option<&str>,
+    ) -> Response {
+        self.route_chat(headers, body, model_id, run_id).await
+    }
+
     /// Route a completion request
     async fn route_completion(
         &self,
 
@@ -612,6 +612,29 @@ impl RouterTrait for RouterManager {
         }
     }
 
+    /// Forwards a chat tokens request to the router selected for `body.model`; 404 if none.
+    async fn route_chat_tokens(
+        &self,
+        headers: Option<&HeaderMap>,
+        body: &ChatCompletionRequest,
+        _model_id: Option<&str>,
+        run_id: Option<&str>, // forwarded to the selected router, hence no `_` prefix
+    ) -> Response {
+        let model_id = body.model.as_deref(); // the body's model is used, not `_model_id`
+        let router = self.select_router_for_request(headers, model_id);
+        if let Some(router) = router {
+            router
+                .route_chat_tokens(headers, body, model_id, run_id)
+                .await
+        } else {
+            let msg = match model_id {
+                Some(m) => format!("Model '{}' not found or no router available", m),
+                None => "No routers registered to handle this request".to_string(),
+            };
+            (StatusCode::NOT_FOUND, msg).into_response() // no router could be selected
+        }
+    }
+
     /// Route a completion request
     async fn route_completion(
         &self,