|
28 | 28 | logger = get_logger(__name__) |
29 | 29 |
|
30 | 30 |
|
31 | | -def _get_llm_deployment_names(llm_deployments: List[Application]) -> List[str]: |
32 | | - return [app._bound_deployment.name for app in llm_deployments] |
33 | | - |
34 | | - |
35 | | -def _validate_direct_streaming_llm_configs(llm_configs: List[LLMConfig]) -> None: |
36 | | - """Validate temporary direct-streaming app-shape constraints.""" |
37 | | - if len(llm_configs) > 1: |
38 | | - raise ValueError( |
39 | | - "RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING currently supports exactly one " |
40 | | - "LLM config. Multi-model direct streaming requires composing multiple " |
41 | | - "LLMServer deployments into the main application graph, which is not " |
42 | | - "supported yet." |
43 | | - ) |
44 | | - |
45 | | - |
46 | | -def _build_direct_streaming_llm_deployments( |
47 | | - llm_configs: List[LLMConfig], |
48 | | -) -> List[Application]: |
49 | | - """Build LLMServer deployments with late-bound ASGI ingress enabled.""" |
50 | | - deployments = [] |
51 | | - for llm_config in llm_configs: |
52 | | - server_cls = llm_config.server_cls or LLMServer |
53 | | - deployments.append( |
54 | | - build_llm_deployment( |
55 | | - llm_config, |
56 | | - deployment_cls=serve.ingress(FastAPI())(server_cls), |
57 | | - ) |
58 | | - ) |
59 | | - return deployments |
| 31 | +def _build_direct_streaming_llm_deployment(llm_config: LLMConfig) -> Application: |
| 32 | + """Build the LLMServer deployment with late-bound ASGI ingress enabled.""" |
| 33 | + server_cls = llm_config.server_cls or LLMServer |
| 34 | + return build_llm_deployment( |
| 35 | + llm_config, |
| 36 | + deployment_cls=serve.ingress(FastAPI())(server_cls), |
| 37 | + ) |
60 | 38 |
|
61 | 39 |
|
62 | | -def build_openai_ingress_request_router( |
63 | | - builder_config: dict, |
64 | | - *, |
65 | | - llm_deployment_names: List[str], |
66 | | -) -> Application: |
| 40 | +def _build_openai_ingress_request_router(*, llm_deployment_name: str) -> Application: |
67 | 41 | """Build the ingress request router peer for OpenAI compatible LLM apps. |
68 | 42 |
|
69 | | - The returned Application should be attached to the ingress application with |
| 43 | + The returned Application is attached to the ingress application with |
70 | 44 | ``Application._with_ingress_request_router``. |
71 | 45 | """ |
72 | | - builder_config = LLMServingArgs.model_validate(builder_config) |
73 | | - llm_configs = builder_config.llm_configs |
74 | | - _validate_direct_streaming_builder_config(builder_config) |
75 | | - |
76 | 46 | from ray.llm._internal.serve.core.ingress.router import LLMRouter |
77 | 47 |
|
78 | | - num_ingress_request_router_replicas = 1 |
79 | | - logger.info( |
80 | | - "Creating " |
81 | | - f"{num_ingress_request_router_replicas} ingress request router " |
82 | | - "replicas (LLMRouter)" |
83 | | - ) |
| 48 | + logger.info("Creating 1 ingress request router replica (LLMRouter)") |
84 | 49 |
|
85 | | - # Late-bind by deployment name to avoid pulling LLMServer Applications into |
86 | | - # the router's recursive build. |
| 50 | + # Late-bind by deployment name to avoid pulling the LLMServer Application |
| 51 | + # into the router's recursive build. |
87 | 52 | return serve.deployment( |
88 | 53 | LLMRouter, |
89 | | - num_replicas=num_ingress_request_router_replicas, |
| 54 | + num_replicas=1, |
90 | 55 | max_ongoing_requests=1000, |
91 | | - ).bind( |
92 | | - llm_deployment_names=llm_deployment_names, |
93 | | - llm_configs_pre=llm_configs, |
94 | | - ) |
| 56 | + ).bind(llm_deployment_name=llm_deployment_name) |
95 | 57 |
|
96 | 58 |
|
97 | 59 | class IngressClsConfig(BaseModelExtended): |
@@ -176,7 +138,13 @@ def _validate_model_ids(self): |
176 | 138 | def _validate_direct_streaming_builder_config( |
177 | 139 | builder_config: LLMServingArgs, |
178 | 140 | ) -> None: |
179 | | - _validate_direct_streaming_llm_configs(builder_config.llm_configs) |
| 141 | + if len(builder_config.llm_configs) > 1: |
| 142 | + raise ValueError( |
| 143 | + "RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING currently supports exactly one " |
| 144 | + "LLM config. Multi-model direct streaming requires composing multiple " |
| 145 | + "LLMServer deployments into the main application graph, which is not " |
| 146 | + "supported yet." |
| 147 | + ) |
180 | 148 |
|
181 | 149 | if builder_config.ingress_deployment_config: |
182 | 150 | raise ValueError( |
@@ -218,15 +186,14 @@ def build_openai_app(builder_config: dict) -> Application: |
218 | 186 | # before the regular OpenAiIngress wiring. |
219 | 187 | if RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING: |
220 | 188 | _validate_direct_streaming_builder_config(builder_config) |
221 | | - direct_deployments = _build_direct_streaming_llm_deployments(llm_configs) |
| 189 | + direct_deployment = _build_direct_streaming_llm_deployment(llm_configs[0]) |
222 | 190 | logger.info( |
223 | 191 | "Direct streaming enabled: " |
224 | 192 | "LLMServer=ingress, LLMRouter=ingress_request_router" |
225 | 193 | ) |
226 | | - return direct_deployments[0]._with_ingress_request_router( |
227 | | - build_openai_ingress_request_router( |
228 | | - builder_config, |
229 | | - llm_deployment_names=_get_llm_deployment_names(direct_deployments), |
| 194 | + return direct_deployment._with_ingress_request_router( |
| 195 | + _build_openai_ingress_request_router( |
| 196 | + llm_deployment_name=direct_deployment._bound_deployment.name, |
230 | 197 | ) |
231 | 198 | ) |
232 | 199 |
|
|
0 commit comments