chillCode404
diff --git a/‎python/ray/llm/_internal/serve/constants.py‎
Lines changed: 4 additions & 0 deletions b/‎python/ray/llm/_internal/serve/constants.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎python/ray/llm/_internal/serve/core/engine/protocol.py‎
Lines changed: 12 additions & 0 deletions b/‎python/ray/llm/_internal/serve/core/engine/protocol.py‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎python/ray/llm/_internal/serve/core/ingress/builder.py‎
Lines changed: 80 additions & 0 deletions b/‎python/ray/llm/_internal/serve/core/ingress/builder.py‎
Lines changed: 80 additions & 0 deletions
diff --git a/‎python/ray/llm/_internal/serve/core/ingress/router.py‎
Lines changed: 138 additions & 0 deletions b/‎python/ray/llm/_internal/serve/core/ingress/router.py‎
Lines changed: 138 additions & 0 deletions
diff --git a/‎python/ray/llm/_internal/serve/core/server/llm_server.py‎
Lines changed: 3 additions & 0 deletions b/‎python/ray/llm/_internal/serve/core/server/llm_server.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py‎
Lines changed: 17 additions & 0 deletions b/‎python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py‎
Lines changed: 17 additions & 0 deletions
@@ -62,6 +62,10 @@
     "RAYLLM_GUIDED_DECODING_BACKEND", "xgrammar"
 )
 
+# Opt-in flag for direct streaming: true only when the environment variable
+# is set to exactly "1"; unset or any other value leaves it disabled.
+RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING = (
+    os.getenv("RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING", "0") == "1"
+)
+
 MAX_NUM_STOPPING_SEQUENCES = int(os.getenv("RAYLLM_MAX_NUM_STOPPING_SEQUENCES", "8"))
 ENV_VARS_TO_PROPAGATE = {
     "HUGGING_FACE_HUB_TOKEN",
 
@@ -221,6 +221,18 @@ async def check_health(self) -> None:
         """
         return
 
+    async def build_asgi_app(self) -> Any:
+        """Return an ASGI app served directly by this engine's frontend.
+
+        Direct streaming serves traffic from the LLMServer replica's own
+        ASGI ingress rather than from the OpenAiIngress deployment, and it
+        obtains that app through this hook. Engines without direct serving
+        support should leave this default in place.
+
+        Raises:
+            NotImplementedError: Always, in this default implementation.
+        """
+        message = f"{type(self).__name__} does not support direct ASGI serving."
+        raise NotImplementedError(message)
+
+
     ##############################################################
     # Optional methods
     # These methods will be implemented in the future to allow
 
@@ -10,6 +10,7 @@
     maybe_apply_llm_deployment_config_defaults,
 )
 from ray.llm._internal.common.utils.import_utils import load_class
+from ray.llm._internal.serve.constants import RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING
 from ray.llm._internal.serve.core.configs.llm_config import LLMConfig
 from ray.llm._internal.serve.core.configs.openai_api_models import to_model_metadata
 from ray.llm._internal.serve.core.ingress.ingress import (
@@ -19,12 +20,46 @@
 from ray.llm._internal.serve.core.server.builder import (
     build_llm_deployment,
 )
+from ray.llm._internal.serve.core.server.llm_server import LLMServer
 from ray.llm._internal.serve.observability.logging import get_logger
 from ray.serve.deployment import Application
 
 logger = get_logger(__name__)
 
 
+def _build_direct_streaming_llm_deployment(llm_config: LLMConfig) -> Application:
+    """Build the LLMServer deployment that doubles as the ASGI ingress.
+
+    The server class (``llm_config.server_cls``, falling back to
+    ``LLMServer``) is wrapped with ``serve.ingress()`` without an app, so the
+    real ASGI app (vLLM FastAPI) is bound late: it is constructed inside
+    `LLMServer.__serve_build_asgi_app__` after the engine starts.
+
+    Args:
+        llm_config: Configuration of the single model to serve.
+
+    Returns:
+        The application produced by ``build_llm_deployment`` for the
+        ingress-wrapped server class.
+    """
+    ingress_server_cls = serve.ingress()(llm_config.server_cls or LLMServer)
+    return build_llm_deployment(llm_config, deployment_cls=ingress_server_cls)
+
+
+
+def _build_openai_ingress_request_router(*, server: Application) -> Application:
+    """Create the LLMRouter peer used as the ingress request router.
+
+    The result is meant to be attached to the ingress application through
+    ``Application._with_ingress_request_router``.
+
+    ``num_replicas`` is fixed at 1 because HAProxy's ingress request router
+    backend currently expects a single endpoint. TODO(eicherseiji): expose
+    these as a user-overridable IngressRequestRouterConfig once HAProxy
+    supports multiple router replicas.
+
+    Args:
+        server: The LLMServer application the router picks replicas from.
+
+    Returns:
+        The bound LLMRouter application.
+    """
+    # Imported lazily, at call time, as in the rest of this builder path.
+    from ray.llm._internal.serve.core.ingress.router import LLMRouter
+
+    router_deployment = serve.deployment(
+        LLMRouter,
+        num_replicas=1,
+        max_ongoing_requests=1000,
+    )
+    return router_deployment.bind(server=server)
+
+
+
 class IngressClsConfig(BaseModelExtended):
     ingress_cls: Union[str, Type[OpenAiIngress]] = Field(
         default=OpenAiIngress,
@@ -104,6 +139,37 @@ def _validate_model_ids(self):
         return self
 
 
+def _validate_direct_streaming_builder_config(
+    builder_config: LLMServingArgs,
+) -> None:
+    """Reject builder options that direct streaming cannot honor.
+
+    Direct streaming uses the LLMServer deployment itself as the ingress, so
+    it needs exactly one LLM config and no ingress-specific customization.
+
+    Args:
+        builder_config: The validated serving arguments to check.
+
+    Raises:
+        ValueError: If there is not exactly one LLM config, if
+            ``ingress_deployment_config`` is set, or if ``ingress_cls_config``
+            deviates from the default ``OpenAiIngress`` with no extra kwargs.
+    """
+    num_llm_configs = len(builder_config.llm_configs)
+    # The caller indexes ``llm_configs[0]`` right after validation, so an
+    # empty list must be rejected here rather than surfacing as IndexError.
+    if num_llm_configs == 0:
+        raise ValueError(
+            "RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING requires exactly one "
+            "LLM config, but none were provided."
+        )
+    if num_llm_configs > 1:
+        raise ValueError(
+            "RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING currently supports exactly one "
+            "LLM config. Multi-model direct streaming requires composing multiple "
+            "LLMServer deployments into the main application graph, which is not "
+            "supported yet."
+        )
+
+    if builder_config.ingress_deployment_config:
+        raise ValueError(
+            "RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING does not support "
+            "ingress_deployment_config because LLMServer is used directly as "
+            "the ingress deployment. Configure LLMServer through each "
+            "LLMConfig.deployment_config instead."
+        )
+
+    # Any non-default ingress class or extra kwargs would be silently ignored,
+    # because no OpenAiIngress deployment is created in this mode.
+    ingress_cls_config = builder_config.ingress_cls_config
+    if (
+        ingress_cls_config.ingress_cls != OpenAiIngress
+        or ingress_cls_config.ingress_extra_kwargs
+    ):
+        raise ValueError(
+            "RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING does not support "
+            "ingress_cls_config because LLMServer is used directly as the "
+            "ingress deployment."
+        )
+
+
+
 def build_openai_app(builder_config: dict) -> Application:
     """Build an OpenAI compatible app with the llm deployment setup from
     the given builder configuration.
@@ -119,6 +185,20 @@ def build_openai_app(builder_config: dict) -> Application:
     builder_config = LLMServingArgs.model_validate(builder_config)
     llm_configs = builder_config.llm_configs
 
+    # Direct streaming attaches LLMRouter as the ingress request router and
+    # uses the LLMServer deployment itself as the ingress app, so it returns
+    # before the regular OpenAiIngress wiring.
+    if RAY_SERVE_LLM_ENABLE_DIRECT_STREAMING:
+        _validate_direct_streaming_builder_config(builder_config)
+        direct_deployment = _build_direct_streaming_llm_deployment(llm_configs[0])
+        logger.info(
+            "Direct streaming enabled: "
+            "LLMServer=ingress, LLMRouter=ingress_request_router"
+        )
+        return direct_deployment._with_ingress_request_router(
+            _build_openai_ingress_request_router(server=direct_deployment)
+        )
+
     llm_deployments = {c.model_id: build_llm_deployment(c) for c in llm_configs}
     model_cards = {c.model_id: to_model_metadata(c.model_id, c) for c in llm_configs}
     lora_paths = {
 
@@ -0,0 +1,138 @@
+import random
+from typing import FrozenSet, List, Optional, Tuple
+
+from fastapi import FastAPI, HTTPException, Request
+
+from ray import serve
+from ray.serve._private.common import ReplicaID
+from ray.serve.handle import DeploymentHandle
+
+# Request header whose presence marks the forwarded body as a truncated prefix.
+_BODY_TRUNCATED_HEADER = "x-body-truncated"
+
+# Set of replica IDs used to tell whether the replica set actually changed.
+_ReplicaCacheSignature = FrozenSet[ReplicaID]
+
+# FastAPI app that hosts the LLMRouter HTTP routes.
+router_app = FastAPI()
+
+
+@serve.ingress(router_app)
+class LLMRouter:
+    """Ingress request router for direct streaming.
+
+    When direct streaming is enabled, HAProxy calls /internal/route on this
+    deployment to get a data plane replica, then forwards traffic directly
+    to the matching LLMServer replica's backend HTTP port.
+
+    /internal/route HTTP contract
+    -----------------------------
+    Request:
+        POST /internal/route
+        Content-Type: application/json
+        Body: the target ChatCompletions / Completions request payload.
+            Today the router uses round-robin and ignores the body, but it
+            is plumbed through so future routing policies (e.g. prefix
+            cache aware) can score replicas against ``messages`` /
+            ``prompt``. HAProxy should continue forwarding the payload
+            (subject to truncation below).
+
+    Truncated bodies:
+        HAProxy may forward only a prefix of the request body for routing.
+        When it does, it must set the ``x-body-truncated`` header. The
+        router forwards both the body bytes and this signal to
+        ``_pick_replica`` for future body-aware policies.
+
+    Responses:
+        200 ``{"host": str, "port": int, "replica_id": str}``: pick
+            succeeded.
+        4xx/5xx FastAPI ``{"detail": str}``: informational only; HAProxy
+            treats any non-200 as a routing failure.
+
+    Health:
+        ``GET /health`` is exposed as a human-operator convenience.
+        Serve uses ``check_health()`` for replica readiness, not HTTP.
+    """
+
+    async def __init__(self, server: DeploymentHandle):
+        """Bind to the LLMServer handle and eagerly build its request router.
+
+        Args:
+            server: Handle to the LLMServer deployment whose replicas are
+                returned by ``/internal/route``.
+
+        Raises:
+            RuntimeError: If the handle exposes no request router after
+                initialization.
+        """
+        # Randomized so multiple LLMRouter replicas don't lockstep on the
+        # same replica sequence.
+        self._round_robin_counter = random.randrange(2**31)
+        # Strong reference to the `curr_replicas` dict the cache was built
+        # from. Keeping the object itself (not its `id()`) pins it in memory,
+        # so a newer dict can never be mistaken for it after address reuse.
+        self._cached_replicas: Optional[dict] = None
+        self._cached_replica_signature: Optional[_ReplicaCacheSignature] = None
+        self._cached_endpoints: List[Tuple[str, int, str]] = []
+        self._handle: DeploymentHandle = server
+
+        # Force the handle's local router and request router to construct
+        # synchronously so /internal/route can read them in the hot path.
+        # `curr_replicas` is populated separately by controller broadcast;
+        # /internal/route returns 503 (HAProxy retries) until then, which
+        # decouples router liveness from LLMServer cold start.
+        self._handle._init()
+        self._request_router = self._handle._get_request_router()
+        if self._request_router is None:
+            raise RuntimeError(
+                "DeploymentHandle._get_request_router() returned None after "
+                "_init(); Serve internals may have changed."
+            )
+
+    async def check_health(self):
+        """Fail the health check when the handle has no request router.
+
+        Raises:
+            RuntimeError: If the request router is not initialized.
+        """
+        if self._handle._get_request_router() is None:
+            raise RuntimeError("request router not initialized")
+
+    @router_app.post("/internal/route")
+    async def route(self, request: Request):
+        """Pick a backend replica for the incoming routing request.
+
+        Returns:
+            ``{"host": ..., "port": ..., "replica_id": ...}`` for the pick.
+
+        Raises:
+            HTTPException: 503 when no replica can be picked.
+        """
+        body = await request.body()
+        # Only the header's presence matters; its value is not inspected.
+        body_truncated = _BODY_TRUNCATED_HEADER in request.headers
+        try:
+            host, port, replica_id = self._pick_replica(
+                request_body=body, body_truncated=body_truncated
+            )
+        except RuntimeError as e:
+            # Chain the cause so the original failure stays in the traceback.
+            raise HTTPException(status_code=503, detail=str(e)) from e
+        return {"host": host, "port": port, "replica_id": replica_id}
+
+    @router_app.get("/health")
+    async def health(self):
+        """Return a static OK payload for human operators."""
+        return {"status": "ok"}
+
+    def _ready_endpoints(self) -> List[Tuple[str, int, str]]:
+        """Backend (host, port, full_id) tuples, cached on replica-set change."""
+        curr_replicas = self._request_router.curr_replicas
+        # RequestRouter swaps the dict wholesale on every controller broadcast,
+        # so dict identity is a cheap "did anything change" check; the keyset
+        # check then filters out broadcasts that didn't actually change the
+        # replica set. Identity is tested with `is` against a retained
+        # reference: comparing `id()` values is unsafe because an id may be
+        # reused once the previously seen dict has been garbage collected.
+        if curr_replicas is self._cached_replicas:
+            return self._cached_endpoints
+        signature = frozenset(curr_replicas.keys())
+        if signature != self._cached_replica_signature:
+            self._cached_replica_signature = signature
+            # Sort by unique id so every rebuild yields a deterministic order
+            # for the round-robin index.
+            ready = sorted(
+                (r for r in curr_replicas.values() if r.backend_http_endpoint),
+                key=lambda r: r.replica_id.unique_id,
+            )
+            self._cached_endpoints = [
+                (*r.backend_http_endpoint, r.replica_id.to_full_id_str()) for r in ready
+            ]
+        self._cached_replicas = curr_replicas
+        return self._cached_endpoints
+
+    def _pick_replica(
+        self,
+        request_body: Optional[bytes] = None,
+        body_truncated: bool = False,
+    ) -> Tuple[str, int, str]:
+        """Pick a backend HTTP replica.
+
+        Today this is plain round-robin and ignores the payload. The
+        ``request_body`` (possibly a HAProxy-truncated prefix, indicated by
+        ``body_truncated``) is plumbed through so a future prefix cache aware
+        policy can score replicas against the request's prompt / messages
+        without changing the /internal/route contract or the call site.
+
+        Raises:
+            RuntimeError: If no replica with a backend HTTP endpoint is known.
+        """
+        del request_body, body_truncated
+        candidates = self._ready_endpoints()
+        if not candidates:
+            raise RuntimeError("no backend-http replicas")
+
+        index = self._round_robin_counter % len(candidates)
+        self._round_robin_counter += 1
+        return candidates[index]
@@ -195,6 +195,9 @@ async def start(self):
             self.engine = self._engine_cls(self._llm_config)
             await asyncio.wait_for(self._start_engine(), timeout=ENGINE_START_TIMEOUT_S)
 
+    async def __serve_build_asgi_app__(self):
+        """Return the ASGI app built by this server's engine.
+
+        Delegates to ``self.engine.build_asgi_app()`` and hands back whatever
+        app the engine produces.
+        """
+        asgi_app = await self.engine.build_asgi_app()
+        return asgi_app
+
     def _init_multiplex_loader(
         self, model_downloader_cls: Optional[Type[LoraModelLoader]] = None
     ):
 
@@ -267,6 +267,22 @@ def __init__(
         self._oai_serving_scores: Optional["ServingScores"] = None
         self._oai_serving_tokenization: Optional["OpenAIServingTokenization"] = None
 
+    async def build_asgi_app(self):
+        """Build the vLLM API server app backed by this engine's client.
+
+        Creates the app with vLLM's ``build_app`` and populates its state
+        with ``init_app_state``, using the argument namespace stored by
+        ``start()``.
+
+        Returns:
+            The app object returned by ``build_app``.
+
+        Raises:
+            RuntimeError: If called before ``start()`` has stored the vLLM
+                arguments.
+        """
+        # `_vllm_args` is assigned by `start()`; fail with a clear message
+        # instead of an opaque AttributeError when called too early.
+        vllm_args = getattr(self, "_vllm_args", None)
+        if vllm_args is None:
+            raise RuntimeError(
+                "build_asgi_app() requires the vLLM engine to be started; "
+                "call start() first."
+            )
+
+        from vllm.entrypoints.openai.api_server import build_app, init_app_state
+
+        # Fall back to "generate" only when the client cannot report tasks.
+        supported_tasks = ("generate",)
+        if hasattr(self._engine_client, "get_supported_tasks"):
+            supported_tasks = await self._engine_client.get_supported_tasks()
+
+        app = build_app(vllm_args, supported_tasks=supported_tasks)
+        await init_app_state(
+            self._engine_client,
+            app.state,
+            vllm_args,
+            supported_tasks=supported_tasks,
+        )
+        return app
+
+
     async def start(self) -> None:
         """Start the vLLM engine.
 
@@ -316,6 +332,7 @@ async def start(self) -> None:
         merged = _convert_config_dicts(merged)
 
         args = _dict_to_namespace(merged)
+        self._vllm_args = args
 
         # Query supported tasks from the engine so init_app_state initializes the correct serving objects.
         # Without this, vLLM falls back to 'generate' only.