@@ -1270,108 +1270,48 @@ impl VllmPDRouter {
12701270 pub fn worker_registry ( & self ) -> & crate :: core:: WorkerRegistry {
12711271 & self . pd_router . worker_registry
12721272 }
1273- }
1274-
1275- // Delegate most RouterTrait methods to the underlying PDRouter,
1276- // but override specific ones for vLLM behavior
1277- #[ async_trait]
1278- impl RouterTrait for VllmPDRouter {
1279- fn as_any ( & self ) -> & dyn std:: any:: Any {
1280- self
1281- }
1282-
1283- async fn health ( & self , req : Request < Body > ) -> Response {
1284- self . pd_router . health ( req) . await
1285- }
1286-
1287- async fn health_generate ( & self , req : Request < Body > ) -> Response {
1288- self . pd_router . health_generate ( req) . await
1289- }
1290-
1291- async fn get_server_info ( & self , req : Request < Body > ) -> Response {
1292- self . pd_router . get_server_info ( req) . await
1293- }
1294-
1295- async fn get_models ( & self , req : Request < Body > ) -> Response {
1296- self . pd_router . get_models ( req) . await
1297- }
1298-
1299- async fn get_model_info ( & self , req : Request < Body > ) -> Response {
1300- self . pd_router . get_model_info ( req) . await
1301- }
13021273
1303- async fn route_generate (
1304- & self ,
1305- headers : Option < & HeaderMap > ,
1306- body : & crate :: protocols:: spec:: GenerateRequest ,
1307- model_id : Option < & str > ,
1308- run_id : Option < & str > ,
1309- ) -> Response {
1310- self . pd_router
1311- . route_generate ( headers, body, model_id, run_id)
1312- . await
1313- }
1314-
1315- // Override OpenAI-compatible routes for vLLM two-stage processing
1316- async fn route_chat (
1274+ /// Internal helper for routing chat requests with a configurable backend path.
1275+ async fn route_chat_with_path (
13171276 & self ,
13181277 headers : Option < & HeaderMap > ,
13191278 body : & crate :: protocols:: spec:: ChatCompletionRequest ,
1320- _model_id : Option < & str > ,
13211279 run_id : Option < & str > ,
1280+ route : & str ,
13221281 ) -> Response {
13231282 info ! (
13241283 "vLLM route_chat called, use_discovery={}" ,
13251284 self . use_discovery
13261285 ) ;
13271286
1287+ let request_json = match serde_json:: to_value ( body) {
1288+ Ok ( json) => {
1289+ debug ! (
1290+ "Serialized chat request: {}" ,
1291+ serde_json:: to_string_pretty( & json) . unwrap_or_default( )
1292+ ) ;
1293+ json
1294+ }
1295+ Err ( e) => {
1296+ return (
1297+ axum:: http:: StatusCode :: INTERNAL_SERVER_ERROR ,
1298+ format ! ( "Serialization error: {}" , e) ,
1299+ )
1300+ . into_response ( )
1301+ }
1302+ } ;
1303+
13281304 if self . use_discovery {
13291305 // Discovery mode - use vLLM-specific two-stage processing
13301306 info ! ( "Using service discovery mode, processing vLLM two-stage request" ) ;
13311307
1332- // Convert to generic request and use vLLM processing
1333- let request_json = match serde_json:: to_value ( body) {
1334- Ok ( json) => {
1335- debug ! (
1336- "Serialized chat request: {}" ,
1337- serde_json:: to_string_pretty( & json) . unwrap_or_default( )
1338- ) ;
1339- json
1340- }
1341- Err ( e) => {
1342- return (
1343- axum:: http:: StatusCode :: INTERNAL_SERVER_ERROR ,
1344- format ! ( "Serialization error: {}" , e) ,
1345- )
1346- . into_response ( )
1347- }
1348- } ;
1349-
13501308 // Process vLLM two-stage request with service discovery
1351- self . process_vllm_request ( request_json, "/v1/chat/completions" , headers, run_id)
1309+ self . process_vllm_request ( request_json, route , headers, run_id)
13521310 . await
13531311 } else {
13541312 // Direct URL mode - implement routing logic here (not delegating to PDRouter)
13551313 info ! ( "Using direct URL mode with VllmPDRouter's own routing logic" ) ;
13561314
1357- // Convert request to JSON
1358- let request_json = match serde_json:: to_value ( body) {
1359- Ok ( json) => {
1360- debug ! (
1361- "Serialized chat request: {}" ,
1362- serde_json:: to_string_pretty( & json) . unwrap_or_default( )
1363- ) ;
1364- json
1365- }
1366- Err ( e) => {
1367- return (
1368- axum:: http:: StatusCode :: INTERNAL_SERVER_ERROR ,
1369- format ! ( "Serialization error: {}" , e) ,
1370- )
1371- . into_response ( )
1372- }
1373- } ;
1374-
13751315 // Get prefill and decode workers from worker_registry
13761316 let prefill_workers = self . pd_router . worker_registry . get_prefill_workers ( ) ;
13771317 let decode_workers = self . pd_router . worker_registry . get_decode_workers ( ) ;
@@ -1463,7 +1403,7 @@ impl RouterTrait for VllmPDRouter {
14631403 request_json,
14641404 prefill_worker. clone ( ) ,
14651405 decode_worker. clone ( ) ,
1466- "/v1/chat/completions" ,
1406+ route ,
14671407 headers,
14681408 run_id,
14691409 )
@@ -1485,6 +1425,70 @@ impl RouterTrait for VllmPDRouter {
14851425 resp
14861426 }
14871427 }
1428+ }
1429+
1430+ // Delegate most RouterTrait methods to the underlying PDRouter,
1431+ // but override specific ones for vLLM behavior
1432+ #[ async_trait]
1433+ impl RouterTrait for VllmPDRouter {
1434+ fn as_any ( & self ) -> & dyn std:: any:: Any {
1435+ self
1436+ }
1437+
1438+ async fn health ( & self , req : Request < Body > ) -> Response {
1439+ self . pd_router . health ( req) . await
1440+ }
1441+
1442+ async fn health_generate ( & self , req : Request < Body > ) -> Response {
1443+ self . pd_router . health_generate ( req) . await
1444+ }
1445+
1446+ async fn get_server_info ( & self , req : Request < Body > ) -> Response {
1447+ self . pd_router . get_server_info ( req) . await
1448+ }
1449+
1450+ async fn get_models ( & self , req : Request < Body > ) -> Response {
1451+ self . pd_router . get_models ( req) . await
1452+ }
1453+
1454+ async fn get_model_info ( & self , req : Request < Body > ) -> Response {
1455+ self . pd_router . get_model_info ( req) . await
1456+ }
1457+
1458+ async fn route_generate (
1459+ & self ,
1460+ headers : Option < & HeaderMap > ,
1461+ body : & crate :: protocols:: spec:: GenerateRequest ,
1462+ model_id : Option < & str > ,
1463+ run_id : Option < & str > ,
1464+ ) -> Response {
1465+ self . pd_router
1466+ . route_generate ( headers, body, model_id, run_id)
1467+ . await
1468+ }
1469+
1470+ // Override OpenAI-compatible routes for vLLM two-stage processing
1471+ async fn route_chat (
1472+ & self ,
1473+ headers : Option < & HeaderMap > ,
1474+ body : & crate :: protocols:: spec:: ChatCompletionRequest ,
1475+ _model_id : Option < & str > ,
1476+ run_id : Option < & str > ,
1477+ ) -> Response {
1478+ self . route_chat_with_path ( headers, body, run_id, "/v1/chat/completions" )
1479+ . await
1480+ }
1481+
1482+ async fn route_chat_tokens (
1483+ & self ,
1484+ headers : Option < & HeaderMap > ,
1485+ body : & crate :: protocols:: spec:: ChatCompletionRequest ,
1486+ _model_id : Option < & str > ,
1487+ run_id : Option < & str > ,
1488+ ) -> Response {
1489+ self . route_chat_with_path ( headers, body, run_id, "/v1/chat/completions/tokens" )
1490+ . await
1491+ }
14881492
14891493 async fn route_completion (
14901494 & self ,
0 commit comments