ray-project · machichima · Feb 8, 2026 · Feb 8, 2026 · Feb 14, 2026 · Feb 14, 2026
diff --git a/ci/lint/pydoclint-baseline.txt b/ci/lint/pydoclint-baseline.txt
@@ -1453,10 +1453,6 @@ python/ray/serve/_private/proxy_response_generator.py
 python/ray/serve/_private/proxy_state.py
     DOC201: Method `ProxyStateManager.get_targets` does not have a return section in docstring
 --------------------
-python/ray/serve/_private/router.py
-    DOC101: Method `SingletonThreadRouter.assign_request`: Docstring contains fewer arguments than in function signature.
-    DOC103: Method `SingletonThreadRouter.assign_request`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [**request_kwargs: , *request_args: , request_meta: RequestMetadata].
---------------------
 python/ray/serve/_private/storage/kv_store.py
     DOC201: Method `RayInternalKVStore.put` does not have a return section in docstring
     DOC201: Method `RayInternalKVStore.delete` does not have a return section in docstring

@@ -171,6 +171,7 @@ See the [model composition guide](serve-model-composition) for how to update cod
    serve.exceptions.RequestCancelledError
    serve.exceptions.gRPCStatusError
    serve.exceptions.DeploymentUnavailableError
+   serve.exceptions.ReplicaUnavailableError
 ```
 
 
@@ -524,4 +525,4 @@ Content-Type: application/json
 
    serve.llm.LLMServer
    serve.llm.LLMRouter
-```
+```
@@ -820,6 +820,9 @@ class RequestMetadata:
     request_serialization: str = "cloudpickle"
     response_serialization: str = "cloudpickle"
 
+    # Token for a replica-side slot reserved by choose_replica().
+    _reserved_slot_token: Optional[str] = None
+
     @property
     def is_http_request(self) -> bool:
         return self._request_protocol == RequestProtocol.HTTP

@@ -4,8 +4,19 @@
 import logging
 import queue
 import time
+from contextlib import asynccontextmanager
 from functools import wraps
-from typing import Any, Callable, Coroutine, Dict, List, Optional, Tuple, Union
+from typing import (
+    Any,
+    AsyncIterator,
+    Callable,
+    Coroutine,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
 
 import ray
 from ray import cloudpickle
@@ -16,6 +27,7 @@
 )
 from ray.serve._private.replica import UserCallableWrapper
 from ray.serve._private.replica_result import ReplicaResult
+from ray.serve._private.request_router.replica_wrapper import ReplicaSelection
 from ray.serve._private.router import Router
 from ray.serve._private.utils import GENERATOR_COMPOSITION_NOT_SUPPORTED_ERROR
 from ray.serve.deployment import Deployment
@@ -341,6 +353,39 @@ def generator_result_callback(item: Any):
         )
         return noop_future
 
+    @asynccontextmanager
+    async def choose_replica(
+        self,
+        request_meta: RequestMetadata,
+        *request_args,
+        **request_kwargs,
+    ) -> AsyncIterator[ReplicaSelection]:
+        """Always raise NotImplementedError: unsupported in local testing mode.
+
+        Exists only to satisfy the Router ABC interface; nothing is ever yielded.
+        """
+        raise NotImplementedError(
+            "choose_replica is not supported in local testing mode. "
+            "Use assign_request instead."
+        )
+        yield  # Unreachable: makes this a generator for asynccontextmanager
+
+    def dispatch(
+        self,
+        selection: ReplicaSelection,
+        request_meta: RequestMetadata,
+        *request_args,
+        **request_kwargs,
+    ) -> concurrent.futures.Future[ReplicaResult]:
+        """Always raise NotImplementedError: unsupported in local testing mode.
+
+        Exists only to satisfy the Router ABC interface; use assign_request.
+        """
+        raise NotImplementedError(
+            "dispatch is not supported in local testing mode. "
+            "Use assign_request instead."
+        )
+
     async def broadcast(
         self,
         request_meta: RequestMetadata,

@@ -1182,13 +1182,36 @@ def __init__(
         self._direct_ingress_grpc_server_task: Optional[asyncio.Task] = None
 
         self._num_queued_requests = 0
+        self._reserved_slots: Set[str] = set()
 
     @property
     def max_ongoing_requests(self) -> int:
         return self._deployment_config.max_ongoing_requests
 
     def get_num_ongoing_requests(self) -> int:
-        return self._metrics_manager.get_num_ongoing_requests()
+        return self._metrics_manager.get_num_ongoing_requests() + len(
+            self._reserved_slots  # slots reserved but not yet consumed/released
+        )
+
+    async def reserve_slot(
+        self, request_metadata: RequestMetadata, slot_token: str
+    ) -> Tuple[bool, int]:
+        """Reserve capacity under slot_token; return (accepted, num ongoing)."""
+        if not self._can_accept_request(request_metadata):
+            return False, self.get_num_ongoing_requests()  # rejected; nothing held
+
+        await self._semaphore.acquire()  # held until consumed or released
+        self._reserved_slots.add(slot_token)
+        return True, self.get_num_ongoing_requests()
+
+    def release_slot(self, slot_token: str) -> Tuple[bool, int]:
+        """Free the slot held under slot_token; return (released, num ongoing)."""
+        if slot_token not in self._reserved_slots:
+            return False, self.get_num_ongoing_requests()  # token not held: no-op
+
+        self._reserved_slots.remove(slot_token)
+        self._semaphore.release()
+        return True, self.get_num_ongoing_requests()
 
     def get_metadata(self) -> ReplicaMetadata:
         current_rank = ray.serve.context._get_internal_replica_context().rank
@@ -1865,12 +1888,25 @@ def _on_request_failed(self, request_metadata: RequestMetadata, e: Exception):
 
     @asynccontextmanager
     async def _start_request(self, request_metadata: RequestMetadata):
-        async with self._semaphore:
+        reserved_slot_token = request_metadata._reserved_slot_token
+        if reserved_slot_token:
+            if reserved_slot_token not in self._reserved_slots:
+                raise RuntimeError(
+                    "Request tried to consume an unknown reserved slot "
+                    f"{reserved_slot_token}."
+                )
+            self._reserved_slots.remove(reserved_slot_token)  # consume reservation
+        else:
+            await self._semaphore.acquire()  # no reservation: take a permit now
+
+        try:
             try:
                 self._metrics_manager.inc_num_ongoing_requests(request_metadata)
                 yield
             finally:
                 self._metrics_manager.dec_num_ongoing_requests(request_metadata)
+        finally:
+            self._semaphore.release()  # frees the reserved or freshly taken permit
 
     async def _drain_ongoing_requests(self):
         """Wait for any ongoing requests to finish.
@@ -2759,6 +2795,16 @@ def get_num_ongoing_requests(self) -> int:
         """
         return self._replica_impl.get_num_ongoing_requests()
 
+    async def reserve_slot(
+        self, request_metadata: RequestMetadata, slot_token: str
+    ) -> Tuple[bool, int]:
+        """Delegate slot reservation to the replica impl; return its result as-is."""
+        return await self._replica_impl.reserve_slot(request_metadata, slot_token)
+
+    def release_slot(self, slot_token: str) -> Tuple[bool, int]:
+        """Delegate slot release to the replica impl; return its result as-is."""
+        return self._replica_impl.release_slot(slot_token)
+
     async def is_allocated(self) -> str:
         """poke the replica to check whether it's alive.
 

@@ -3,6 +3,7 @@
     PowerOfTwoChoicesRequestRouter,
 )
 from ray.serve._private.request_router.replica_wrapper import (  # noqa: F401
+    ReplicaSelection,
     RunningReplica,
 )
 from ray.serve._private.request_router.request_router import (  # noqa: F401

@@ -1,21 +1,22 @@
 import asyncio
-import logging
 import pickle
+import uuid
 from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
 from typing import Any, Dict, Optional, Set, Tuple
 
 import grpc
 
 import ray
 from ray.actor import ActorHandle
 from ray.serve._private.common import (
+    DeploymentID,
     ReplicaID,
+    ReplicaQueueLengthInfo,
+    RequestMetadata,
     RunningReplicaInfo,
 )
-from ray.serve._private.constants import (
-    RAY_SERVE_REPLICA_GRPC_MAX_MESSAGE_LENGTH,
-    SERVE_LOGGER_NAME,
-)
+from ray.serve._private.constants import RAY_SERVE_REPLICA_GRPC_MAX_MESSAGE_LENGTH
 from ray.serve._private.replica_result import (
     ActorReplicaResult,
     ReplicaResult,
@@ -35,8 +36,6 @@
     _is_tracing_enabled,
 )
 
-logger = logging.getLogger(SERVE_LOGGER_NAME)
-
 
 class ReplicaWrapper(ABC):
     """This is used to abstract away details of the transport layer
@@ -201,6 +200,10 @@ def __init__(self, replica_info: RunningReplicaInfo):
         self._actor_replica_wrapper = ActorReplicaWrapper(self._actor_handle)
         self._grpc_replica_wrapper = None
 
+        # Active local slot reservation tokens for Java replicas. Python replicas
+        # reserve capacity on the actor-side semaphore.
+        self._reserved_slots: Set[str] = set()
+
     def update_replica_info(self, replica_info: RunningReplicaInfo) -> None:
         """Update mutable fields from a new RunningReplicaInfo.
 
@@ -324,3 +327,129 @@ def try_send_request(
             return wrapper.send_request_java(pr)
 
         return wrapper.send_request_python(pr, with_rejection=with_rejection)
+
+    async def reserve_slot(
+        self, request_metadata: RequestMetadata
+    ) -> Tuple[str, ReplicaQueueLengthInfo]:
+        """Reserve a slot on this replica for an upcoming request.
+
+        Returns (slot_token, ReplicaQueueLengthInfo) for choose_replica/dispatch;
+        pass the token to release_slot() if the request is never dispatched.
+        Cross-language replicas are tracked locally and are always accepted.
+        """
+        if self._replica_info.is_cross_language:
+            slot_token = str(uuid.uuid4())
+            self._reserved_slots.add(slot_token)
+            return slot_token, ReplicaQueueLengthInfo(
+                accepted=True,
+                num_ongoing_requests=len(self._reserved_slots),  # local count only
+            )
+
+        slot_token = str(uuid.uuid4())
+        obj_ref = self._actor_handle.reserve_slot.remote(request_metadata, slot_token)
+        try:
+            accepted, num_ongoing_requests = await obj_ref
+        except asyncio.CancelledError:
+            ray.cancel(obj_ref)  # abandon the pending reservation call
+            self._actor_handle.release_slot.remote(slot_token)  # not awaited
+            raise
+
+        return slot_token, ReplicaQueueLengthInfo(
+            accepted=accepted,
+            num_ongoing_requests=num_ongoing_requests,
+        )
+
+    async def release_slot(self, slot_token: str) -> ReplicaQueueLengthInfo:
+        """Release a previously reserved slot.
+
+        Call this when a reserved request is never dispatched (e.g. an error or
+        cancellation). Always reports accepted=True, whether or not it was held.
+        """
+        if self._replica_info.is_cross_language:
+            self._reserved_slots.discard(slot_token)
+            return ReplicaQueueLengthInfo(
+                accepted=True,
+                num_ongoing_requests=len(self._reserved_slots),
+            )
+
+        _, num_ongoing_requests = await self._actor_handle.release_slot.remote(
+            slot_token
+        )
+        return ReplicaQueueLengthInfo(
+            accepted=True,
+            num_ongoing_requests=num_ongoing_requests,
+        )
+
+
+@dataclass
+class ReplicaSelection:
+    """Represents a selected replica, holding information for dispatch or coordination.
+
+    This class is returned by the choose_replica() context manager.
+    The slot reservation lifecycle is managed by the context manager.
+    """
+
+    # Public, user-accessible fields
+    replica_id: str
+    """Unique identifier for the selected replica."""
+
+    node_ip: str
+    """IP address of the node running this replica."""
+
+    port: Optional[int]
+    """Port number for direct communication (if configured)."""
+
+    node_id: str
+    """Ray node ID where the replica is running."""
+
+    availability_zone: Optional[str]
+    """Cloud availability zone of the replica's node."""
+
+    # Internal fields (not part of public API)
+    _replica: RunningReplica
+    _deployment_id: Optional[DeploymentID]
+    _request_metadata: RequestMetadata
+    _method_name: str
+    _slot_token: str  # Token handed to _replica.release_slot()
+    _dispatched: bool = field(
+        default=False, init=False
+    )  # Set by _mark_dispatched(); not an __init__ argument
+
+    @property
+    def address(self) -> str:
+        """Return `node_ip:port`, or node_ip alone when port is None or 0."""
+        if self.port:
+            return f"{self.node_ip}:{self.port}"
+        return self.node_ip
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize the public fields to a dict; `_`-prefixed fields are omitted."""
+        return {
+            "replica_id": self.replica_id,
+            "node_ip": self.node_ip,
+            "port": self.port,
+            "node_id": self.node_id,
+            "availability_zone": self.availability_zone,
+        }
+
+    def _mark_dispatched(self) -> None:
+        """Internal: Mark this selection as dispatched (slot consumed).
+
+        Raises:
+            RuntimeError: If the selection has already been dispatched.
+        """
+        if self._dispatched:
+            raise RuntimeError(
+                f"ReplicaSelection for {self.replica_id} has already been dispatched. "
+                "Each selection can only be dispatched once."
+            )
+        self._dispatched = True
+
+    async def _release_slot(
+        self, *, force: bool = False
+    ) -> Optional[ReplicaQueueLengthInfo]:
+        """Internal: release the slot; returns None if dispatched and not forced."""
+        if self._dispatched and not force:
+            return None
+
+        return await self._replica.release_slot(self._slot_token)
@@ -850,6 +850,25 @@ def on_send_request(self, replica_id: ReplicaID):
             self._replica_queue_len_cache.update(replica_id, new_queue_len)
             self._update_router_queue_len_gauge(replica_id, new_queue_len)
 
+    def on_replica_result_finished(self, replica_id: ReplicaID):
+        """Decrement queue length cache when a request finishes or is cancelled.
+
+        This is used when a reserved slot is released without being dispatched
+        (e.g., in choose_replica context manager cleanup).
+
+        We cannot rely on on_new_queue_len_info() to correct the cache in this
+        path. The queue length cache is incremented optimistically when a slot is
+        reserved, before dispatch happens. If dispatch is never called or fails
+        before the request reaches the replica, no queue_len_info response is
+        produced, so the cache would otherwise remain inflated.
+        """
+        if self._use_replica_queue_len_cache:  # no-op when the cache is disabled
+            num_ongoing_requests = self._replica_queue_len_cache.get(replica_id) or 0
+            if num_ongoing_requests > 0:  # never push the cached length below 0
+                new_queue_len = num_ongoing_requests - 1
+                self._replica_queue_len_cache.update(replica_id, new_queue_len)
+                self._update_router_queue_len_gauge(replica_id, new_queue_len)
+
     def decrement_queue_len_cache(self, replica_id: ReplicaID):
         """Decrement the queue length cache for a replica.