From 4dc1d4e2cb41ea86fb02aca1f16d7555de0dcbd2 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 8 Feb 2026 19:27:43 +0800 Subject: [PATCH 01/57] feat: add DeploymentHandle skeleton Signed-off-by: machichima --- .../serve/_private/request_router/common.py | 60 +++++++++++++++++++ python/ray/serve/handle.py | 46 ++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/python/ray/serve/_private/request_router/common.py b/python/ray/serve/_private/request_router/common.py index b373f47f1528..b15f5add39c3 100644 --- a/python/ray/serve/_private/request_router/common.py +++ b/python/ray/serve/_private/request_router/common.py @@ -10,6 +10,8 @@ SERVE_LOGGER_NAME, ) from ray.util.annotations import PublicAPI +from ray.serve._private.request_router.replica_wrapper import RunningReplica +from ray.serve.handle import DeploymentHandle logger = logging.getLogger(SERVE_LOGGER_NAME) @@ -63,6 +65,64 @@ class ReplicaQueueLengthCacheEntry: timestamp: float +@dataclass +class ReplicaSelection: + """Represents a selected replica, holding information for dispatch or coordination. + + This class is returned by the choose_replica() context manager. + The slot reservation lifecycle is managed by the context manager. + """ + + # Public, user-accessible fields + replica_id: str + """Unique identifier for the selected replica.""" + + node_ip: str + """IP address of the node running this replica.""" + + port: Optional[int] + """Port number for direct communication (if configured).""" + + node_id: str + """Ray node ID where the replica is running.""" + + availability_zone: Optional[str] + """Cloud availability zone of the replica's node.""" + + # Internal fields (not part of public API) + _replica: "RunningReplica" + _deployment_handle: "DeploymentHandle" + _method_name: str + _slot_token: str # Token for reserved slot + _dispatched: bool = field(default=False, init=False) # Tracks if dispatch was called + + @property + def address(self) -> str: + """Returns the replica address in host:port format.""" + if self.port: + return f"{self.node_ip}:{self.port}" + return self.node_ip + + def to_dict(self) -> Dict[str, Any]: + """Serialize public fields to a dictionary.""" + return { + "replica_id": self.replica_id, + "node_ip": self.node_ip, + "port": self.port, + "node_id": self.node_id, + "availability_zone": self.availability_zone, + } + + def _mark_dispatched(self) -> None: + """Internal: Mark this selection as dispatched (slot consumed).""" + self._dispatched = True + + def _release_slot(self) -> None: + """Internal: Release the reserved slot if not yet dispatched.""" + if not self._dispatched: + self._replica.release_slot(self._slot_token) + + class ReplicaQueueLengthCache: def __init__( self, diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index c2c72b0f1bfe..1ab7c6173f2a 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -2,6 +2,7 @@ import concurrent.futures import logging import time +from typing_extensions import AsyncContextManager import warnings from typing import ( Any, @@ -38,6 +39,7 @@ InitHandleOptionsBase, ) from ray.serve._private.replica_result import ReplicaResult +from ray.serve._private.request_router.common import ReplicaSelection from ray.serve._private.router import Router from ray.serve._private.usage import ServeUsageTag from ray.serve._private.utils import ( @@ -894,3 +896,47 @@ def remote( request_metadata, _is_router_running_in_separate_loop=self._is_router_running_in_separate_loop(), ) + def choose_replica( + self, + *args, + **kwargs, + ) -> AsyncContextManager[ReplicaSelection]: + """Execute the request router to select a replica without dispatching. + + This method runs the full routing logic (load balancing, locality awareness, + queue length probing, etc.) and returns an async context manager that yields + a ReplicaSelection. A request slot is reserved on the selected replica, + guaranteeing that dispatch will succeed. + + The context manager ensures proper cleanup: + - If dispatch() is called, the slot is consumed normally. + - If the context exits without dispatch (e.g., exception, early return), the slot is released. + + Args: + *args, **kwargs: Arguments that may influence routing decisions + (e.g., for multiplexed model routing). + + Returns: + AsyncContextManager[ReplicaSelection] - must be used with async with. + """ + pass + + async def dispatch( + self, + selection: ReplicaSelection, + *args, + **kwargs, + ) -> DeploymentResponse: + """Dispatch a request to a previously selected replica. + + Args: + selection: A ReplicaSelection from choose_replica() context manager. + *args, **kwargs: The actual request arguments. + + Returns: + DeploymentResponse or DeploymentResponseGenerator (if streaming). + + Raises: + ReplicaUnavailableError: If the selected replica is no longer available. + """ + pass From d85b8ebc0ccd9a61336166f873286c12d3fdc3e3 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 8 Feb 2026 20:15:35 +0800 Subject: [PATCH 02/57] feat: add AsyncioRouter skeleton Signed-off-by: machichima --- python/ray/serve/_private/router.py | 85 ++++++++++++++++++++++++++++- python/ray/serve/exceptions.py | 6 ++ 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 2205e322baf7..41d6c038edff 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -8,10 +8,11 @@ from asyncio import AbstractEventLoop, ensure_future, futures from collections import defaultdict from collections.abc import MutableMapping -from contextlib import contextmanager +from contextlib import asynccontextmanager, contextmanager from functools import lru_cache, partial from typing import ( Any, + AsyncIterator, Callable, Coroutine, DefaultDict, @@ -21,6 +22,7 @@ Union, ) +from python.ray.serve._private.request_router.common import ReplicaSelection import ray from ray.actor import ActorHandle from ray.exceptions import ActorDiedError, ActorUnavailableError, RayError @@ -62,7 +64,11 @@ resolve_deployment_response, ) from ray.serve.config import AutoscalingConfig -from ray.serve.exceptions import BackPressureError, DeploymentUnavailableError +from ray.serve.exceptions import ( + BackPressureError, + DeploymentUnavailableError, + ReplicaUnavailableError, +) from ray.util import metrics logger = logging.getLogger(SERVE_LOGGER_NAME) @@ -924,6 +930,81 @@ async def assign_request( raise + @asynccontextmanager + async def choose_replica( + self, + request_meta: RequestMetadata, + *request_args, + **request_kwargs, + ) -> AsyncIterator[ReplicaSelection]: + """Execute routing and reserve a slot, with automatic cleanup.""" + await self._request_router_initialized.wait() + + pr = PendingRequest( + args=list(request_args), + kwargs=request_kwargs, + metadata=request_meta, + ) + + # Resolve arguments (e.g., for multiplexed routing) + if not pr.resolved: + await self._resolve_request_arguments(pr) + + # Run the request router + replica = await self.request_router._choose_replica_for_request(pr) + + # TODO: Implement function to reserve a slot on the replica + slot_token = replica.reserve_slot() + + selection = ReplicaSelection( + replica_id=replica.replica_id.unique_id, + node_ip=replica._replica_info.node_ip, + port=replica._replica_info.port, + node_id=replica.node_id, + availability_zone=replica.availability_zone, + _replica=replica, + _deployment_handle=self._deployment_handle, + _method_name=request_meta.call_method, + _slot_token=slot_token, + ) + + try: + yield selection + finally: + # Release slot if dispatch was not called + selection._release_slot() + + async def dispatch( + self, + selection: ReplicaSelection, + request_meta: RequestMetadata, + *request_args, + **request_kwargs, + ) -> ReplicaResult: + """Dispatch to a specific replica, consuming the reserved slot.""" + # Verify replica is still available + if selection.replica_id not in self.request_router.curr_replicas: + raise ReplicaUnavailableError( + f"Replica {selection.replica_id} is no longer available" + ) + + pr = PendingRequest( + args=list(request_args), + kwargs=request_kwargs, + metadata=request_meta, + ) + + replica = selection._replica + + # Mark as dispatched so __aexit__ won't release the slot + selection._mark_dispatched() + + # Use the reserved slot + result = replica.send_request_with_slot(pr, selection._slot_token) + + self.request_router.on_send_request(replica.replica_id) + return result + async def shutdown(self): await self._metrics_manager.shutdown() diff --git a/python/ray/serve/exceptions.py b/python/ray/serve/exceptions.py index 6e3d201655e4..e72abe8f4ece 100644 --- a/python/ray/serve/exceptions.py +++ b/python/ray/serve/exceptions.py @@ -101,3 +101,9 @@ def __init__(self, deployment_id: DeploymentID): @property def message(self) -> str: return f"{self._deployment_id} is unavailable because it failed to deploy." + + +@PublicAPI(stability="alpha") +class ReplicaUnavailableError(RayServeException): + """Raised when the selected replica is no longer available.""" + pass From 81409425e87a8d7cfc5a3e0302cac6709b37dab4 Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 14 Feb 2026 20:30:04 +0800 Subject: [PATCH 03/57] fix: type and undefined err Signed-off-by: machichima --- python/ray/serve/_private/request_router/common.py | 8 +++++--- python/ray/serve/_private/router.py | 6 ++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/python/ray/serve/_private/request_router/common.py b/python/ray/serve/_private/request_router/common.py index b15f5add39c3..08727634f7fc 100644 --- a/python/ray/serve/_private/request_router/common.py +++ b/python/ray/serve/_private/request_router/common.py @@ -2,7 +2,7 @@ import logging import time from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional, Set +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set from ray.serve._private.common import ReplicaID, RequestMetadata from ray.serve._private.constants import ( @@ -10,8 +10,10 @@ SERVE_LOGGER_NAME, ) from ray.util.annotations import PublicAPI -from ray.serve._private.request_router.replica_wrapper import RunningReplica -from ray.serve.handle import DeploymentHandle + +if TYPE_CHECKING: + from ray.serve._private.request_router.replica_wrapper import RunningReplica + from ray.serve.handle import DeploymentHandle logger = logging.getLogger(SERVE_LOGGER_NAME) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 41d6c038edff..9fab93483f7f 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -22,7 +22,7 @@ Union, ) -from python.ray.serve._private.request_router.common import ReplicaSelection +from ray.serve._private.request_router.common import ReplicaSelection import ray from ray.actor import ActorHandle from ray.exceptions import ActorDiedError, ActorUnavailableError, RayError @@ -963,7 +963,7 @@ async def choose_replica( node_id=replica.node_id, availability_zone=replica.availability_zone, _replica=replica, - _deployment_handle=self._deployment_handle, + _deployment_handle=None, # TODO: use the actual DeploymentHandle _method_name=request_meta.call_method, _slot_token=slot_token, ) @@ -972,6 +972,7 @@ async def choose_replica( yield selection finally: # Release slot if dispatch was not called + # TODO: implement this function selection._release_slot() async def dispatch( @@ -1000,6 +1001,7 @@ async def dispatch( selection._mark_dispatched() # Use the reserved slot + # TODO: implement this function result = replica.send_request_with_slot(pr, selection._slot_token) self.request_router.on_send_request(replica.replica_id) From 1e909ca78def344d84d359020090bf6ba2a07c4a Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 14 Feb 2026 20:30:48 +0800 Subject: [PATCH 04/57] test: update FakeReplica and add TestChooseReplica Signed-off-by: machichima --- python/ray/serve/tests/unit/test_router.py | 230 +++++++++++++++++++++ 1 file changed, 230 insertions(+) diff --git a/python/ray/serve/tests/unit/test_router.py b/python/ray/serve/tests/unit/test_router.py index 3f68aeaa382f..18c1ebf1e0e9 100644 --- a/python/ray/serve/tests/unit/test_router.py +++ b/python/ray/serve/tests/unit/test_router.py @@ -96,16 +96,39 @@ def __init__( queue_len_info: Optional[ReplicaQueueLengthInfo] = None, is_cross_language: bool = False, error: Optional[Exception] = None, + node_id: str = "fake-node-id", + availability_zone: Optional[str] = None, + node_ip: str = "127.0.0.1", + port: Optional[int] = 8000, ): self._replica_id = replica_id self._is_cross_language = is_cross_language self._queue_len_info = queue_len_info self._error = error + self._reserved_slots: Dict[str, bool] = {} # token -> is_released + self._slot_counter = 0 + + # Create a minimal _replica_info object to satisfy router.py requirements + from unittest.mock import Mock + self._replica_info = Mock() + self._replica_info.node_id = node_id + self._replica_info.availability_zone = availability_zone + self._replica_info.node_ip = node_ip + self._replica_info.port = port + self._replica_info.replica_id = replica_id @property def replica_id(self) -> ReplicaID: return self._replica_id + @property + def node_id(self) -> str: + return self._replica_info.node_id + + @property + def availability_zone(self) -> Optional[str]: + return self._replica_info.availability_zone + @property def is_cross_language(self) -> bool: return self._is_cross_language @@ -113,6 +136,35 @@ def is_cross_language(self) -> bool: def get_queue_len(self, *, deadline_s: float) -> int: raise NotImplementedError + def reserve_slot(self) -> str: + """Reserve a slot and return a token.""" + self._slot_counter += 1 + token = f"slot-token-{self._slot_counter}" + self._reserved_slots[token] = False # Not released yet + return token + + def release_slot(self, slot_token: str) -> None: + """Release a reserved slot.""" + if slot_token in self._reserved_slots: + self._reserved_slots[slot_token] = True + + def send_request_with_slot( + self, pr: PendingRequest, slot_token: str + ) -> FakeReplicaResult: + """Send request using a reserved slot.""" + assert ( + slot_token in self._reserved_slots + ), f"Invalid slot token: {slot_token}" + assert not self._reserved_slots[ + slot_token + ], f"Slot already released: {slot_token}" + + # Create result same way as try_send_request + if pr.metadata.is_streaming: + return FakeReplicaResult(self._replica_id, is_generator_object=True) + else: + return FakeReplicaResult(self._replica_id, is_generator_object=False) + def try_send_request( self, pr: PendingRequest, with_rejection: bool ) -> FakeReplicaResult: @@ -760,6 +812,184 @@ async def test_on_request_routed( assert fake_request_router.on_request_routed_called is True +@pytest.mark.asyncio +class TestChooseReplica: + """Tests for choose_replica() and dispatch() flow.""" + + async def test_basic_choose_and_dispatch( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """Test basic choose_replica() and dispatch() workflow.""" + router, fake_request_router = setup_router + + r1_id = ReplicaID( + unique_id="test-replica-1", deployment_id=DeploymentID(name="test") + ) + replica = FakeReplica(r1_id) + fake_request_router.set_replica_to_return(replica) + + request_metadata = RequestMetadata( + request_id="test-request-1", + internal_request_id="test-internal-request-1", + call_method="test_method", + ) + + # Choose replica + async with router.choose_replica(request_metadata) as selection: + # Verify selection contains correct information + assert selection.replica_id == r1_id.unique_id + assert selection._replica == replica + assert selection._method_name == "test_method" + assert selection._slot_token in replica._reserved_slots + assert not replica._reserved_slots[selection._slot_token] # Not released + + # Dispatch request + replica_result = await router.dispatch(selection, request_metadata) + assert replica_result._replica_id == r1_id + assert not replica_result._is_generator_object + + # After context exit, slot should not be released (dispatch was called) + assert not replica._reserved_slots[selection._slot_token] + + async def test_choose_without_dispatch_releases_slot( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """Test that exiting context without dispatch releases the slot.""" + router, fake_request_router = setup_router + + r1_id = ReplicaID( + unique_id="test-replica-1", deployment_id=DeploymentID(name="test") + ) + replica = FakeReplica(r1_id) + fake_request_router.set_replica_to_return(replica) + + request_metadata = RequestMetadata( + request_id="test-request-1", + internal_request_id="test-internal-request-1", + ) + + slot_token = None + async with router.choose_replica(request_metadata) as selection: + slot_token = selection._slot_token + assert slot_token in replica._reserved_slots + assert not replica._reserved_slots[slot_token] # Not released + # Exit without calling dispatch + + # After context exit, slot should be released + assert replica._reserved_slots[slot_token] is True + + async def test_choose_with_exception_releases_slot( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """Test that exception in context releases the slot.""" + router, fake_request_router = setup_router + + r1_id = ReplicaID( + unique_id="test-replica-1", deployment_id=DeploymentID(name="test") + ) + replica = FakeReplica(r1_id) + fake_request_router.set_replica_to_return(replica) + + request_metadata = RequestMetadata( + request_id="test-request-1", + internal_request_id="test-internal-request-1", + ) + + slot_token = None + with pytest.raises(RuntimeError): + async with router.choose_replica(request_metadata) as selection: + slot_token = selection._slot_token + assert slot_token in replica._reserved_slots + assert not replica._reserved_slots[slot_token] + raise RuntimeError("Test exception") + + # After exception, slot should be released + assert replica._reserved_slots[slot_token] is True + + @pytest.mark.parametrize("is_streaming", [False, True]) + async def test_choose_and_dispatch_streaming( + self, + setup_router: Tuple[AsyncioRouter, FakeRequestRouter], + is_streaming: bool, + ): + """Test choose_replica() and dispatch() with streaming requests.""" + router, fake_request_router = setup_router + + r1_id = ReplicaID( + unique_id="test-replica-1", deployment_id=DeploymentID(name="test") + ) + replica = FakeReplica(r1_id) + fake_request_router.set_replica_to_return(replica) + + request_metadata = RequestMetadata( + request_id="test-request-1", + internal_request_id="test-internal-request-1", + is_streaming=is_streaming, + ) + + async with router.choose_replica(request_metadata) as selection: + replica_result = await router.dispatch(selection, request_metadata) + assert replica_result._replica_id == r1_id + if is_streaming: + assert replica_result._is_generator_object + else: + assert not replica_result._is_generator_object + + async def test_slot_reservation_mock_interaction( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """Test that reserve_slot and send_request_with_slot are called correctly.""" + router, fake_request_router = setup_router + + r1_id = ReplicaID( + unique_id="test-replica-1", deployment_id=DeploymentID(name="test") + ) + replica = FakeReplica(r1_id) + fake_request_router.set_replica_to_return(replica) + + request_metadata = RequestMetadata( + request_id="test-request-1", + internal_request_id="test-internal-request-1", + ) + + # Track initial state + initial_slot_count = replica._slot_counter + + async with router.choose_replica(request_metadata) as selection: + # Verify reserve_slot was called + assert replica._slot_counter == initial_slot_count + 1 + assert len(replica._reserved_slots) == 1 + + # Dispatch and verify send_request_with_slot works + replica_result = await router.dispatch(selection, request_metadata) + assert replica_result._replica_id == r1_id + + async def test_multiple_sequential_selections( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """Test multiple sequential choose_replica() and dispatch() calls.""" + router, fake_request_router = setup_router + + r1_id = ReplicaID( + unique_id="test-replica-1", deployment_id=DeploymentID(name="test") + ) + replica = FakeReplica(r1_id) + fake_request_router.set_replica_to_return(replica) + + for i in range(3): + request_metadata = RequestMetadata( + request_id=f"test-request-{i}", + internal_request_id=f"test-internal-request-{i}", + ) + + async with router.choose_replica(request_metadata) as selection: + replica_result = await router.dispatch(selection, request_metadata) + assert replica_result._replica_id == r1_id + + # All slots should have been created and used + assert replica._slot_counter == 3 + + def running_replica_info(replica_id: ReplicaID) -> RunningReplicaInfo: return RunningReplicaInfo( replica_id=replica_id, From 3df37957df2a0ca6100570e97b155bfe5dc0bdbd Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 16 Feb 2026 12:29:25 +0800 Subject: [PATCH 05/57] feat+test: choose_replica + ensure slot reserved Signed-off-by: machichima --- .../request_router/replica_wrapper.py | 33 ++++ .../_private/request_router/request_router.py | 13 ++ python/ray/serve/_private/router.py | 159 ++++++++++++++---- python/ray/serve/handle.py | 70 +++++++- python/ray/serve/tests/unit/test_router.py | 121 +++++++++++++ 5 files changed, 355 insertions(+), 41 deletions(-) diff --git a/python/ray/serve/_private/request_router/replica_wrapper.py b/python/ray/serve/_private/request_router/replica_wrapper.py index 2aa4f497e599..db65fbe03290 100644 --- a/python/ray/serve/_private/request_router/replica_wrapper.py +++ b/python/ray/serve/_private/request_router/replica_wrapper.py @@ -1,6 +1,7 @@ import asyncio import logging import pickle +import uuid from abc import ABC, abstractmethod from typing import Any, Dict, Optional, Set @@ -190,6 +191,10 @@ def __init__(self, replica_info: RunningReplicaInfo): self._actor_replica_wrapper = ActorReplicaWrapper(self._actor_handle) self._grpc_replica_wrapper = None + # Slot reservation tracking for choose_replica/dispatch pattern + # Maps slot_token -> True to track active reservations + self._reserved_slots: Set[str] = set() + @property def replica_id(self) -> ReplicaID: """ID of this replica.""" @@ -286,3 +291,31 @@ def try_send_request( return wrapper.send_request_java(pr) return wrapper.send_request_python(pr, with_rejection=with_rejection) + + def reserve_slot(self) -> str: + """Reserve a slot on this replica for an upcoming request. + + Returns a unique token that can be used to release the slot later. + This is used in the choose_replica/dispatch pattern to track + reservations that haven't been dispatched yet. + + Note: This only tracks the reservation locally. The actual queue + length management is handled by the replica actor itself. + """ + slot_token = str(uuid.uuid4()) + self._reserved_slots.add(slot_token) + return slot_token + + def release_slot(self, slot_token: str) -> None: + """Release a previously reserved slot. + + This should be called if a request is not dispatched after + reserving a slot (e.g., due to an error or cancellation). + """ + if slot_token in self._reserved_slots: + self._reserved_slots.discard(slot_token) + else: + logger.warning( + f"Attempted to release unknown slot token {slot_token} " + f"on replica {self.replica_id}" + ) diff --git a/python/ray/serve/_private/request_router/request_router.py b/python/ray/serve/_private/request_router/request_router.py index a518c6f4d609..7df621108628 100644 --- a/python/ray/serve/_private/request_router/request_router.py +++ b/python/ray/serve/_private/request_router/request_router.py @@ -792,6 +792,19 @@ def on_send_request(self, replica_id: ReplicaID): self._replica_queue_len_cache.update(replica_id, new_queue_len) self._update_router_queue_len_gauge(replica_id, new_queue_len) + def on_replica_result_finished(self, replica_id: ReplicaID): + """Decrement queue length cache when a request finishes or is cancelled. + + This is used when a reserved slot is released without being dispatched + (e.g., in choose_replica context manager cleanup). + """ + if self._use_replica_queue_len_cache: + num_ongoing_requests = self._replica_queue_len_cache.get(replica_id) or 0 + if num_ongoing_requests > 0: + new_queue_len = num_ongoing_requests - 1 + self._replica_queue_len_cache.update(replica_id, new_queue_len) + self._update_router_queue_len_gauge(replica_id, new_queue_len) + def update_replicas(self, replicas: List[RunningReplica]): """Update the set of available replicas to be considered for routing. diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 9fab93483f7f..7544c1ec23c4 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -490,6 +490,26 @@ def assign_request( ) -> concurrent.futures.Future[ReplicaResult]: pass + @abstractmethod + @asynccontextmanager + async def choose_replica( + self, + request_meta: RequestMetadata, + *request_args, + **request_kwargs, + ) -> AsyncIterator[ReplicaSelection]: + pass + + @abstractmethod + def dispatch( + self, + selection: ReplicaSelection, + request_meta: RequestMetadata, + *request_args, + **request_kwargs, + ) -> concurrent.futures.Future[ReplicaResult]: + pass + @abstractmethod def shutdown(self) -> concurrent.futures.Future: pass @@ -562,6 +582,9 @@ def __init__( # Initializing `self._metrics_manager` before `self.long_poll_client` is # necessary to avoid race condition where `self.update_deployment_config()` # might be called before `self._metrics_manager` instance is created. + # Track reserved slots for choose_replica + self._num_reserved_slots = 0 + self._metrics_manager = RouterMetricsManager( deployment_id, handle_id, @@ -592,6 +615,24 @@ def __init__( event_loop, ) + # Gauge for tracking reserved slots + self._reserved_slots_gauge = metrics.Gauge( + "serve_reserved_slots_active", + description=( + "The current number of reserved slots for choose_replica operations." + ), + tag_keys=("deployment", "application", "handle", "actor_id"), + ) + self._reserved_slots_gauge.set_default_tags( + { + "deployment": deployment_id.name, + "application": deployment_id.app_name, + "handle": handle_id, + "actor_id": self_actor_id, + } + ) + self._reserved_slots_gauge.set(0) + # The Router needs to stay informed about changes to the target deployment's # running replicas and deployment config. We do this via the long poll system. # However, for efficiency, we don't want to create a LongPollClient for every @@ -937,43 +978,89 @@ async def choose_replica( *request_args, **request_kwargs, ) -> AsyncIterator[ReplicaSelection]: - """Execute routing and reserve a slot, with automatic cleanup.""" - await self._request_router_initialized.wait() + """Execute routing and reserve a slot, with automatic cleanup. + + This method: + 1. Checks deployment availability + 2. Registers request in context for tracking + 3. Checks backpressure (max_queued_requests) + 4. Increments serve_num_router_requests metric + 5. Selects a replica and reserves a slot + 6. Increments serve_reserved_slots_active metric + 7. Yields the ReplicaSelection + 8. On exit, releases the slot if not dispatched + """ + if not self._deployment_available: + raise DeploymentUnavailableError(self.deployment_id) - pr = PendingRequest( - args=list(request_args), - kwargs=request_kwargs, - metadata=request_meta, + # TODO: think if we need those here? + response_id = generate_request_id() + assign_request_task = asyncio.current_task() + ray.serve.context._add_request_pending_assignment( + request_meta.internal_request_id, response_id, assign_request_task ) - - # Resolve arguments (e.g., for multiplexed routing) - if not pr.resolved: - await self._resolve_request_arguments(pr) - - # Run the request router - replica = await self.request_router._choose_replica_for_request(pr) - - # TODO: Implement function to reserve a slot on the replica - slot_token = replica.reserve_slot() - - selection = ReplicaSelection( - replica_id=replica.replica_id.unique_id, - node_ip=replica._replica_info.node_ip, - port=replica._replica_info.port, - node_id=replica.node_id, - availability_zone=replica.availability_zone, - _replica=replica, - _deployment_handle=None, # TODO: use the actual DeploymentHandle - _method_name=request_meta.call_method, - _slot_token=slot_token, + assign_request_task.add_done_callback( + lambda _: ray.serve.context._remove_request_pending_assignment( + request_meta.internal_request_id, response_id + ) ) - try: - yield selection - finally: - # Release slot if dispatch was not called - # TODO: implement this function - selection._release_slot() + # Wait for the router to be initialized before sending the request. + await self._request_router_initialized.wait() + + with self._metrics_manager.wrap_request_assignment(request_meta): + pr = PendingRequest( + args=list(request_args), + kwargs=request_kwargs, + metadata=request_meta, + ) + + # Resolve request arguments BEFORE incrementing queued requests. + # This ensures that queue metrics reflect actual pending work, + # not time spent waiting for upstream DeploymentResponse arguments. + # See: https://github.com/ray-project/ray/issues/60624 + if not pr.resolved: + await self._resolve_request_arguments(pr) + + replica = await self.request_router._choose_replica_for_request(pr) + + # Reserve a slot on the replica + slot_token = replica.reserve_slot() + + # Update queue length cache to reflect the reservation + # This ensures concurrent choose_replica calls see the correct load + self.request_router.on_send_request(replica.replica_id) + + # Increment reserved slots metric + self._num_reserved_slots += 1 + self._reserved_slots_gauge.set(self._num_reserved_slots) + + selection = ReplicaSelection( + replica_id=replica.replica_id.unique_id, + node_ip=replica._replica_info.node_ip, + port=replica._replica_info.port, + node_id=replica.node_id, + availability_zone=replica.availability_zone, + _replica=replica, + _deployment_handle=None, # Will be injected by DeploymentHandle.choose_replica + _method_name=request_meta.call_method, + _slot_token=slot_token, + ) + + try: + yield selection + finally: + # Always release the slot token (for tracking cleanup) + selection._release_slot() + + # Always decrement cache to reflect that this choose_replica operation is done. + # The actual request queue length will be updated by the replica when it + # reports back via on_new_queue_len_info(). + self.request_router.on_replica_result_finished(replica.replica_id) + + # Decrement reserved slots metric + self._num_reserved_slots -= 1 + self._reserved_slots_gauge.set(self._num_reserved_slots) async def dispatch( self, @@ -984,7 +1071,8 @@ async def dispatch( ) -> ReplicaResult: """Dispatch to a specific replica, consuming the reserved slot.""" # Verify replica is still available - if selection.replica_id not in self.request_router.curr_replicas: + replica = selection._replica + if replica.replica_id not in self.request_router.curr_replicas: raise ReplicaUnavailableError( f"Replica {selection.replica_id} is no longer available" ) @@ -995,8 +1083,6 @@ async def dispatch( metadata=request_meta, ) - replica = selection._replica - # Mark as dispatched so __aexit__ won't release the slot selection._mark_dispatched() @@ -1004,7 +1090,6 @@ async def dispatch( # TODO: implement this function result = replica.send_request_with_slot(pr, selection._slot_token) - self.request_router.on_send_request(replica.replica_id) return result async def shutdown(self): diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index 1ab7c6173f2a..410bb8240b14 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -2,6 +2,7 @@ import concurrent.futures import logging import time +from contextlib import asynccontextmanager from typing_extensions import AsyncContextManager import warnings from typing import ( @@ -234,6 +235,53 @@ def _remote( return self._router.assign_request(metadata, *args, **kwargs), metadata + @asynccontextmanager + async def _choose_replica( + self, + *args, + **kwargs, + ) -> AsyncIterator[ReplicaSelection]: + """Execute the request router to select a replica without dispatching.""" + if not self.is_initialized: + self._init() + + metadata = serve._private.default_impl.get_request_metadata( + self.init_options, self.handle_options + ) + + # Call the router's choose_replica and inject the deployment handle + async with self._router.choose_replica(metadata, *args, **kwargs) as selection: + # Inject the deployment handle for validation + selection._deployment_handle = self + yield selection + + def _dispatch( + self, + selection: ReplicaSelection, + *args, + **kwargs, + ) -> Tuple[concurrent.futures.Future, RequestMetadata]: + """Dispatch a request to a previously selected replica.""" + # Validate that the selection belongs to this handle + if selection._deployment_handle is not self: + raise ValueError( + f"Cannot dispatch a selection created by a different DeploymentHandle. " + f"This handle is for {self.deployment_id}, but the selection was created " + f"for a different deployment." + ) + + if not self.is_initialized: + self._init() + + metadata = serve._private.default_impl.get_request_metadata( + self.init_options, self.handle_options + ) + + if self._router is None: + raise RuntimeError("Router is not initialized") + + return self._router.dispatch(selection, metadata, *args, **kwargs), metadata + def options( self, *, @@ -896,6 +944,7 @@ def remote( request_metadata, _is_router_running_in_separate_loop=self._is_router_running_in_separate_loop(), ) + def choose_replica( self, *args, @@ -919,14 +968,14 @@ def choose_replica( Returns: AsyncContextManager[ReplicaSelection] - must be used with async with. """ - pass + return self._choose_replica(*args, **kwargs) - async def dispatch( + def dispatch( self, selection: ReplicaSelection, *args, **kwargs, - ) -> DeploymentResponse: + ) -> Union[DeploymentResponse[Any], DeploymentResponseGenerator[Any]]: """Dispatch a request to a previously selected replica. Args: @@ -938,5 +987,18 @@ async def dispatch( Raises: ReplicaUnavailableError: If the selected replica is no longer available. + ValueError: If selection was created by a different DeploymentHandle. """ - pass + future, request_metadata = self._dispatch(selection, args, kwargs) + if self.handle_options.stream: + return DeploymentResponseGenerator( + future, + request_metadata, + _is_router_running_in_separate_loop=self._is_router_running_in_separate_loop(), + ) + else: + return DeploymentResponse( + future, + request_metadata, + _is_router_running_in_separate_loop=self._is_router_running_in_separate_loop(), + ) diff --git a/python/ray/serve/tests/unit/test_router.py b/python/ray/serve/tests/unit/test_router.py index 18c1ebf1e0e9..2618191f9b6e 100644 --- a/python/ray/serve/tests/unit/test_router.py +++ b/python/ray/serve/tests/unit/test_router.py @@ -244,6 +244,13 @@ def on_send_request(self, replica_id: ReplicaID): num_ongoing_requests = self._replica_queue_len_cache.get(replica_id) or 0 self._replica_queue_len_cache.update(replica_id, num_ongoing_requests + 1) + def on_replica_result_finished(self, replica_id: ReplicaID): + """Decrement queue length cache when a request finishes or is cancelled.""" + if self._use_queue_len_cache: + num_ongoing_requests = self._replica_queue_len_cache.get(replica_id) or 0 + if num_ongoing_requests > 0: + self._replica_queue_len_cache.update(replica_id, num_ongoing_requests - 1) + def on_replica_actor_unavailable(self, replica_id: ReplicaID): self._replica_queue_len_cache.invalidate_key(replica_id) @@ -989,6 +996,120 @@ async def test_multiple_sequential_selections( # All slots should have been created and used assert replica._slot_counter == 3 + @pytest.mark.parametrize( + "setup_router", + [{"enable_queue_len_cache": True}], + indirect=True, + ) + async def test_cache_updated_on_choose_replica( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """Test that queue length cache is updated when choosing a replica.""" + router, fake_request_router = setup_router + + r1_id = ReplicaID( + unique_id="test-replica-1", deployment_id=DeploymentID(name="test") + ) + replica = FakeReplica(r1_id) + fake_request_router.set_replica_to_return(replica) + + request_metadata = RequestMetadata( + request_id="test-request-1", + internal_request_id="test-internal-request-1", + ) + + # Initially cache should be empty + assert fake_request_router.replica_queue_len_cache.get(r1_id) is None + + # Choose replica should update cache to 1 + async with router.choose_replica(request_metadata) as selection: + assert fake_request_router.replica_queue_len_cache.get(r1_id) == 1 + + # Dispatch should NOT increment again (already counted in choose) + await router.dispatch(selection, request_metadata) + assert fake_request_router.replica_queue_len_cache.get(r1_id) == 1 + + # After context exit, cache is decremented back to 0 + # (Real replica will update cache via on_new_queue_len_info when it processes) + assert fake_request_router.replica_queue_len_cache.get(r1_id) == 0 + + @pytest.mark.parametrize( + "setup_router", + [{"enable_queue_len_cache": True}], + indirect=True, + ) + async def test_cache_decremented_on_choose_without_dispatch( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """Test that cache is decremented when choose exits without dispatch.""" + router, fake_request_router = setup_router + + r1_id = ReplicaID( + unique_id="test-replica-1", deployment_id=DeploymentID(name="test") + ) + replica = FakeReplica(r1_id) + fake_request_router.set_replica_to_return(replica) + + request_metadata = RequestMetadata( + request_id="test-request-1", + internal_request_id="test-internal-request-1", + ) + + # Choose replica without dispatch + async with router.choose_replica(request_metadata) as selection: + # Cache should be 1 (reservation) + assert fake_request_router.replica_queue_len_cache.get(r1_id) == 1 + # Exit without calling dispatch + + # After context exit without dispatch, cache should be decremented to 0 + assert fake_request_router.replica_queue_len_cache.get(r1_id) == 0 + + @pytest.mark.parametrize( + "setup_router", + [{"enable_queue_len_cache": True}], + indirect=True, + ) + async def test_concurrent_choose_replica_updates_cache( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """Test that concurrent choose_replica calls correctly update cache.""" + router, fake_request_router = setup_router + + r1_id = ReplicaID( + unique_id="test-replica-1", deployment_id=DeploymentID(name="test") + ) + replica = FakeReplica(r1_id) + fake_request_router.set_replica_to_return(replica) + + # Start 3 concurrent choose_replica operations + async def choose_and_hold(request_id: str): + metadata = RequestMetadata( + request_id=request_id, + internal_request_id=request_id, + ) + async with router.choose_replica(metadata) as selection: + # Hold the selection + await asyncio.sleep(0.01) + return selection + + # Create tasks + tasks = [ + asyncio.create_task(choose_and_hold(f"request-{i}")) + for i in range(3) + ] + + # Wait a bit for all to enter context + await asyncio.sleep(0.005) + + # Cache should reflect all 3 reservations + assert fake_request_router.replica_queue_len_cache.get(r1_id) == 3 + + # Let them all exit + await asyncio.gather(*tasks) + + # After all exit without dispatch, cache should be 0 + assert fake_request_router.replica_queue_len_cache.get(r1_id) == 0 + def running_replica_info(replica_id: ReplicaID) -> RunningReplicaInfo: return RunningReplicaInfo( From 07a896219295608a21123d985c7f62ed812346a4 Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 16 Feb 2026 18:36:36 +0800 Subject: [PATCH 06/57] feat: enable calling from handle layer Signed-off-by: machichima --- python/ray/serve/_private/router.py | 111 +++++++++++++++++++++++----- python/ray/serve/handle.py | 9 ++- 2 files changed, 99 insertions(+), 21 deletions(-) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 7544c1ec23c4..9e795ef97e94 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -492,7 +492,7 @@ def assign_request( @abstractmethod @asynccontextmanager - async def choose_replica( + def choose_replica( self, request_meta: RequestMetadata, *request_args, @@ -1083,6 +1083,10 @@ async def dispatch( metadata=request_meta, ) + # Resolve request arguments + if not pr.resolved: + await self._resolve_request_arguments(pr) + # Mark as dispatched so __aexit__ won't release the slot selection._mark_dispatched() @@ -1181,19 +1185,63 @@ def assign_request( A concurrent.futures.Future resolving to the ReplicaResult representing the assigned request. """ + return self._wrap_asyncio_call_in_future( + self._asyncio_router.assign_request( + request_meta, *request_args, **request_kwargs + ) + ) + + @asynccontextmanager + async def choose_replica( + self, + request_meta: RequestMetadata, + *request_args, + **request_kwargs, + ) -> AsyncIterator[ReplicaSelection]: + """Delegate to AsyncioRouter's choose_replica.""" + async with self._asyncio_router.choose_replica( + request_meta, *request_args, **request_kwargs + ) as selection: + yield selection + + def dispatch( + self, + selection: ReplicaSelection, + request_meta: RequestMetadata, + *request_args, + **request_kwargs, + ) -> concurrent.futures.Future[ReplicaResult]: + """Dispatch request to a previously selected replica. + + Similar to assign_request, wraps the async dispatch in a concurrent.futures.Future. + """ + return self._wrap_asyncio_call_in_future( + self._asyncio_router.dispatch( + selection, request_meta, *request_args, **request_kwargs + ) + ) + + def _wrap_asyncio_call_in_future( + self, + coro: Coroutine, + ) -> concurrent.futures.Future[ReplicaResult]: + """Wrap an async call in a concurrent.futures.Future for cross-thread execution. + + This is a helper method to execute AsyncioRouter's async methods on the dedicated asyncio event loop thread. + + Args: + coro: The coroutine to execute (e.g., _asyncio_router.assign_request(...)) + + Returns: + A concurrent.futures.Future that resolves to the ReplicaResult. + """ + # Extract operation name from coroutine for logging + operation_name = coro.__name__ if hasattr(coro, "__name__") else "operation" def asyncio_future_callback( asyncio_future: asyncio.Future, concurrent_future: concurrent.futures.Future ): - """Callback attached to the asyncio Task running assign_request. - - This runs when the asyncio Task finishes (completes, fails, or is cancelled). - Its primary goal is to propagate cancellation initiated via the - `concurrent_future` back to the `ReplicaResult` in situations where - asyncio_future didn't see the cancellation event in time. Think of it - like a second line of defense for cancellation of replica results. - """ - # Check if the cancellation originated from the concurrent.futures.Future + """Callback to propagate cancellation from concurrent.futures.Future to ReplicaResult.""" if ( concurrent_future.cancelled() and not asyncio_future.cancelled() @@ -1201,27 +1249,21 @@ def asyncio_future_callback( ): result: ReplicaResult = asyncio_future.result() logger.info( - "Asyncio task completed despite cancellation attempt. " - "Attempting to cancel the request that was assigned to a replica." + f"Asyncio task completed despite cancellation attempt during {operation_name}. " + "Attempting to cancel the request." ) result.cancel() concurrent_future = concurrent.futures.Future() def create_task_and_setup(): - task = self._asyncio_loop.create_task( - self._asyncio_router.assign_request( - request_meta, *request_args, **request_kwargs - ) - ) + task = self._asyncio_loop.create_task(coro) - # Set up your cancellation callback task.add_done_callback( lambda _: asyncio_future_callback(_, concurrent_future) ) try: - # chain the two futures to handle direction channel of cancellation futures._chain_future( ensure_future(task, loop=self._asyncio_loop), concurrent_future ) @@ -1232,7 +1274,6 @@ def create_task_and_setup(): concurrent_future.set_exception(exc) raise - # Schedule on the event loop thread self._asyncio_loop.call_soon_threadsafe(create_task_and_setup) return concurrent_future @@ -1354,5 +1395,35 @@ def assign_request( ), ) + @asynccontextmanager + async def choose_replica( + self, + request_meta: RequestMetadata, + *request_args, + **request_kwargs, + ) -> AsyncIterator[ReplicaSelection]: + """Delegate to AsyncioRouter's choose_replica.""" + async with self._asyncio_router.choose_replica( + request_meta, *request_args, **request_kwargs + ) as selection: + yield selection + + def dispatch( + self, + selection: ReplicaSelection, + request_meta: RequestMetadata, + *request_args, + **request_kwargs, + ) -> asyncio.Future[ReplicaResult]: + """Dispatch request to a previously selected replica. + + Returns an asyncio.Future wrapping the async dispatch call. + """ + return self._asyncio_loop.create_task( + self._asyncio_router.dispatch( + selection, request_meta, *request_args, **request_kwargs + ) + ) + def shutdown(self) -> asyncio.Future: return self._asyncio_loop.create_task(self._asyncio_router.shutdown()) diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index 410bb8240b14..972d9202fc8a 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -978,9 +978,16 @@ def dispatch( ) -> Union[DeploymentResponse[Any], DeploymentResponseGenerator[Any]]: """Dispatch a request to a previously selected replica. + By default, the result is a `DeploymentResponse` that can be awaited to fetch + the result of the call. Like `.remote()`, `DeploymentResponse` objects can be + passed as arguments for deployment composition. + + If `handle.options(stream=True)` is set and a generator method is called, this + returns a `DeploymentResponseGenerator` instead. + Args: selection: A ReplicaSelection from choose_replica() context manager. - *args, **kwargs: The actual request arguments. + *args, **kwargs: The request arguments to send to the replica Returns: DeploymentResponse or DeploymentResponseGenerator (if streaming). From 3faa9a8a2d2c3ad3a7cf976b640df4237eaf1a08 Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 16 Feb 2026 19:05:03 +0800 Subject: [PATCH 07/57] feat: dispatch try_send_request Signed-off-by: machichima --- python/ray/serve/_private/router.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 9e795ef97e94..f1560f02e154 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -1090,9 +1090,9 @@ async def dispatch( # Mark as dispatched so __aexit__ won't release the slot selection._mark_dispatched() - # Use the reserved slot - # TODO: implement this function - result = replica.send_request_with_slot(pr, selection._slot_token) + # Send the request without rejection since we already reserved a slot + # The slot reservation guarantees that the replica will accept this request + result = replica.try_send_request(pr, with_rejection=False) return result From 470958cee5ae220de1217d597c6bcda10a4aa863 Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 16 Feb 2026 19:05:38 +0800 Subject: [PATCH 08/57] test: ensure dispatch send request to the correct replica Signed-off-by: machichima --- python/ray/serve/tests/unit/test_router.py | 64 ++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/python/ray/serve/tests/unit/test_router.py b/python/ray/serve/tests/unit/test_router.py index 2618191f9b6e..597933b208aa 100644 --- a/python/ray/serve/tests/unit/test_router.py +++ b/python/ray/serve/tests/unit/test_router.py @@ -107,6 +107,7 @@ def __init__( self._error = error self._reserved_slots: Dict[str, bool] = {} # token -> is_released self._slot_counter = 0 + self._requests_sent = [] # Track all requests sent to this replica # Create a minimal _replica_info object to satisfy router.py requirements from unittest.mock import Mock @@ -168,6 +169,12 @@ def send_request_with_slot( def try_send_request( self, pr: PendingRequest, with_rejection: bool ) -> FakeReplicaResult: + # Track the request + self._requests_sent.append({ + "request_id": pr.metadata.request_id, + "with_rejection": with_rejection, + }) + if with_rejection: if self._error: raise self._error @@ -1110,6 +1117,63 @@ async def choose_and_hold(request_id: str): # After all exit without dispatch, cache should be 0 assert fake_request_router.replica_queue_len_cache.get(r1_id) == 0 + async def test_dispatch_replica_unavailable( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """Test dispatch() raises error when replica becomes unavailable.""" + router, fake_request_router = setup_router + + r1_id = ReplicaID( + unique_id="test-replica-1", deployment_id=DeploymentID(name="test") + ) + replica = FakeReplica(r1_id) + fake_request_router.set_replica_to_return(replica) + + request_metadata = RequestMetadata( + request_id="test-request-1", + internal_request_id="test-internal-request-1", + ) + + from ray.serve.exceptions import ReplicaUnavailableError + + async with router.choose_replica(request_metadata) as selection: + # Simulate replica becoming unavailable + fake_request_router._replica_to_return = None + + # dispatch should raise ReplicaUnavailableError + with pytest.raises(ReplicaUnavailableError): + await router.dispatch(selection, request_metadata) + + async def test_dispatch_uses_reserved_slot( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """Test that dispatch() sends request using reserved slot without rejection.""" + router, fake_request_router = setup_router + + r1_id = ReplicaID( + unique_id="test-replica-1", deployment_id=DeploymentID(name="test") + ) + replica = FakeReplica(r1_id) + fake_request_router.set_replica_to_return(replica) + + request_metadata = RequestMetadata( + request_id="test-request-1", + internal_request_id="test-internal-request-1", + ) + + async with router.choose_replica(request_metadata) as selection: + # Verify no requests sent yet + assert len(replica._requests_sent) == 0 + + # Dispatch the request + replica_result = await router.dispatch(selection, request_metadata) + assert replica_result._replica_id == r1_id + + # Verify request was sent with with_rejection=False + assert len(replica._requests_sent) == 1 + assert replica._requests_sent[0]["request_id"] == "test-request-1" + assert replica._requests_sent[0]["with_rejection"] is False + def running_replica_info(replica_id: ReplicaID) -> RunningReplicaInfo: return RunningReplicaInfo( From b0c792ccb4ce83b24e9383b41d332327cbe4de56 Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 16 Feb 2026 22:12:05 +0800 Subject: [PATCH 09/57] fix: args/kwargs type + verify same deployment id Signed-off-by: machichima --- python/ray/serve/handle.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index 972d9202fc8a..eb694727d98b 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -238,8 +238,8 @@ def _remote( @asynccontextmanager async def _choose_replica( self, - *args, - **kwargs, + args: Tuple[Any], + kwargs: Dict[str, Any], ) -> AsyncIterator[ReplicaSelection]: """Execute the request router to select a replica without dispatching.""" if not self.is_initialized: @@ -258,16 +258,19 @@ async def _choose_replica( def _dispatch( self, selection: ReplicaSelection, - *args, - **kwargs, + args: Tuple[Any], + kwargs: Dict[str, Any], ) -> Tuple[concurrent.futures.Future, RequestMetadata]: """Dispatch a request to a previously selected replica.""" - # Validate that the selection belongs to this handle - if selection._deployment_handle is not self: + # Validate that the selection belongs to the same deployment + if ( + selection._deployment_handle is not None + and selection._deployment_handle.deployment_id != self.deployment_id + ): raise ValueError( - f"Cannot dispatch a selection created by a different DeploymentHandle. " + f"Cannot dispatch a selection created for a different deployment. " f"This handle is for {self.deployment_id}, but the selection was created " - f"for a different deployment." + f"for {selection._deployment_handle.deployment_id}." ) if not self.is_initialized: @@ -968,7 +971,7 @@ def choose_replica( Returns: AsyncContextManager[ReplicaSelection] - must be used with async with. """ - return self._choose_replica(*args, **kwargs) + return self._choose_replica(args, kwargs) def dispatch( self, From 008b190bee0be241e75656d112825ded3a97a7f7 Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 16 Feb 2026 22:12:58 +0800 Subject: [PATCH 10/57] test: ensure single/stack pattern works Signed-off-by: machichima --- python/ray/serve/tests/test_handle_1.py | 112 ++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/python/ray/serve/tests/test_handle_1.py b/python/ray/serve/tests/test_handle_1.py index 20cbc7442976..89a6d28e28de 100644 --- a/python/ray/serve/tests/test_handle_1.py +++ b/python/ray/serve/tests/test_handle_1.py @@ -339,5 +339,117 @@ async def __call__(self): assert h.remote().result(timeout_s=10) == ("((r1))", "((r1))") +@pytest.mark.asyncio +async def test_choose_replica_and_dispatch_single(serve_instance): + """Test choose_replica + dispatch for simple single selection pattern.""" + + @serve.deployment(num_replicas=2) + class Backend: + def process(self, msg: str): + from ray.serve.api import get_replica_context + + replica_id = get_replica_context().replica_id.unique_id + return {"actual_replica_id": replica_id, "response": msg} + + @serve.deployment + class SimpleProxy: + def __init__(self, backend: DeploymentHandle): + self.backend = backend + + async def handle_request(self, request: str): + # Context manager ensures slot is released if dispatch fails or is skipped + async with self.backend.process.choose_replica(request) as selection: + assert selection.replica_id is not None + assert selection.node_ip is not None + + # Dispatch to the selected replica with same arguments + response = await self.backend.process.dispatch(selection, request) + + # Return both the selection and the response for verification + return {"selected_replica_id": selection.replica_id, **response} + + h = serve.run(SimpleProxy.bind(Backend.bind())) + result = await h.handle_request.remote("test_message") + + # Verify the result contains a replica_id and the message + assert result["response"] == "test_message" + + # Verify that dispatch sent the request to the replica we selected + assert result["actual_replica_id"] == result["selected_replica_id"], ( + f"dispatch sent request to wrong replica: " + f"selected {result['selected_replica_id']}, but got response from {result['actual_replica_id']}" + ) + + +@pytest.mark.asyncio +async def test_choose_replica_and_dispatch_parallel(serve_instance): + """Test parallel selection pattern (e.g., PD proxy) using AsyncExitStack.""" + from contextlib import AsyncExitStack + + @serve.deployment(num_replicas=2) + class PrefillServer: + def chat(self, msg: str): + from ray.serve.api import get_replica_context + + replica_id = get_replica_context().replica_id.unique_id + return {"actual_replica_id": replica_id, "response": msg} + + @serve.deployment(num_replicas=2) + class DecodeServer: + def chat(self, msg: str): + from ray.serve.api import get_replica_context + + replica_id = get_replica_context().replica_id.unique_id + return {"actual_replica_id": replica_id, "response": msg} + + @serve.deployment + class PDProxy: + def __init__( + self, + prefill_server: DeploymentHandle, + decode_server: DeploymentHandle, + ): + self.prefill = prefill_server + self.decode = decode_server + + async def handle_request(self, request: str): + # Use AsyncExitStack to manage multiple context managers in parallel + async with AsyncExitStack() as stack: + # Step 1: Select and RESERVE replicas from BOTH deployments in parallel + p_selection, d_selection = await asyncio.gather( + stack.enter_async_context(self.prefill.chat.choose_replica()), + stack.enter_async_context(self.decode.chat.choose_replica()), + ) + + # Step 2: Prepare requests with cross-deployment info + p_msg = f"prefill:{request}" + d_msg = f"decode:{request}" + + # Step 3: Dispatch to both with the exchanged info + p_result, d_result = await asyncio.gather( + self.prefill.chat.dispatch(p_selection, p_msg), + self.decode.chat.dispatch(d_selection, d_msg), + ) + return {"prefill": {"selected_replica_id": p_selection.replica_id, **p_result}, + "decode": {"selected_replica_id": d_selection.replica_id, **d_result}} + + h = serve.run(PDProxy.bind(PrefillServer.bind(), DecodeServer.bind())) + result = await h.handle_request.remote("test_parallel") + + + assert result["prefill"]["response"] == "prefill:test_parallel" + assert result["decode"]["response"] == "decode:test_parallel" + + # Verify that dispatch sent the request to the replica we selected + assert result["prefill"]["actual_replica_id"] == result["prefill"]["selected_replica_id"], ( + f"dispatch sent request to wrong replica for prefill: " + f"selected {result['prefill']['selected_replica_id']}, but got response from {result['prefill']['actual_replica_id']}" + ) + assert result["decode"]["actual_replica_id"] == result["decode"]["selected_replica_id"], ( + f"dispatch sent request to wrong replica for prefill: " + f"selected {result['decode']['selected_replica_id']}, but got response from {result['decode']['actual_replica_id']}" + ) + + if __name__ == "__main__": sys.exit(pytest.main(["-v", "-s", __file__])) From 1deff342da3edee2aa81f17bb5e65b7fdccc1641 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 17 Feb 2026 19:10:35 +0800 Subject: [PATCH 11/57] fix: account for reserved slots when checking avail Signed-off-by: machichima --- .../serve/_private/request_router/request_router.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/python/ray/serve/_private/request_router/request_router.py b/python/ray/serve/_private/request_router/request_router.py index 7df621108628..3b1af9692068 100644 --- a/python/ray/serve/_private/request_router/request_router.py +++ b/python/ray/serve/_private/request_router/request_router.py @@ -987,7 +987,10 @@ async def _select_from_candidate_replicas( # Include replicas whose queues are full as not in the cache so we will # actively probe them. Otherwise we may end up in "deadlock" until their # cache entries expire. - if queue_len is None or queue_len >= r.max_ongoing_requests: + # Account for reserved slots to ensure choose_replica guarantees + # that subsequent dispatch will not be rejected. + total_load = (queue_len or 0) + len(r._reserved_slots) + if queue_len is None or total_load >= r.max_ongoing_requests: not_in_cache.append(r) elif queue_len < lowest_queue_len: lowest_queue_len = queue_len @@ -1006,7 +1009,9 @@ async def _select_from_candidate_replicas( # None is returned if we failed to get the queue len. continue - if queue_len < r.max_ongoing_requests and queue_len < lowest_queue_len: + # Account for reserved slots when checking availability + total_load = queue_len + len(r._reserved_slots) + if total_load < r.max_ongoing_requests and queue_len < lowest_queue_len: lowest_queue_len = queue_len chosen_replica_id = r.replica_id elif len(not_in_cache) > 0: @@ -1310,7 +1315,9 @@ def select_available_replicas( available_replicas = [] for r in candidates: queue_len = self._replica_queue_len_cache.get(r.replica_id) - if queue_len is None or queue_len < r.max_ongoing_requests: + # Account for reserved slots when determining availability + total_load = (queue_len or 0) + len(r._reserved_slots) + if queue_len is None or total_load < r.max_ongoing_requests: available_replicas.append(r) return available_replicas From 88db9dd2c4842643dced1274135dd6db9fb30f0d Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 17 Feb 2026 19:16:10 +0800 Subject: [PATCH 12/57] refactor: lint and format Signed-off-by: machichima --- .../serve/_private/request_router/common.py | 4 +++- python/ray/serve/_private/router.py | 2 +- python/ray/serve/exceptions.py | 1 + python/ray/serve/handle.py | 7 ++++-- python/ray/serve/tests/test_handle_1.py | 22 +++++++++++++---- python/ray/serve/tests/unit/test_router.py | 24 +++++++++---------- 6 files changed, 39 insertions(+), 21 deletions(-) diff --git a/python/ray/serve/_private/request_router/common.py b/python/ray/serve/_private/request_router/common.py index 08727634f7fc..59a8f9b2cd30 100644 --- a/python/ray/serve/_private/request_router/common.py +++ b/python/ray/serve/_private/request_router/common.py @@ -96,7 +96,9 @@ class ReplicaSelection: _deployment_handle: "DeploymentHandle" _method_name: str _slot_token: str # Token for reserved slot - _dispatched: bool = field(default=False, init=False) # Tracks if dispatch was called + _dispatched: bool = field( + default=False, init=False + ) # Tracks if dispatch was called @property def address(self) -> str: diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index f1560f02e154..6a3ad15141fd 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -22,7 +22,6 @@ Union, ) -from ray.serve._private.request_router.common import ReplicaSelection import ray from ray.actor import ActorHandle from ray.exceptions import ActorDiedError, ActorUnavailableError, RayError @@ -54,6 +53,7 @@ ) from ray.serve._private.replica_result import ReplicaResult from ray.serve._private.request_router import PendingRequest, RequestRouter +from ray.serve._private.request_router.common import ReplicaSelection from ray.serve._private.request_router.pow_2_router import ( PowerOfTwoChoicesRequestRouter, ) diff --git a/python/ray/serve/exceptions.py b/python/ray/serve/exceptions.py index e72abe8f4ece..51671c7fbd14 100644 --- a/python/ray/serve/exceptions.py +++ b/python/ray/serve/exceptions.py @@ -106,4 +106,5 @@ def message(self) -> str: @PublicAPI(stability="alpha") class ReplicaUnavailableError(RayServeException): """Raised when the selected replica is no longer available.""" + pass diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index eb694727d98b..e1bbcf075bbf 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -2,9 +2,8 @@ import concurrent.futures import logging import time -from contextlib import asynccontextmanager -from typing_extensions import AsyncContextManager import warnings +from contextlib import asynccontextmanager from typing import ( Any, AsyncIterator, @@ -19,6 +18,8 @@ cast, ) +from typing_extensions import AsyncContextManager + import ray from ray import serve from ray._raylet import ObjectRefGenerator # type: ignore[attr-defined] @@ -248,6 +249,8 @@ async def _choose_replica( metadata = serve._private.default_impl.get_request_metadata( self.init_options, self.handle_options ) + if self._router is None: + raise RuntimeError("Router is not initialized") # Call the router's choose_replica and inject the deployment handle async with self._router.choose_replica(metadata, *args, **kwargs) as selection: diff --git a/python/ray/serve/tests/test_handle_1.py b/python/ray/serve/tests/test_handle_1.py index 89a6d28e28de..598cbbcd60fb 100644 --- a/python/ray/serve/tests/test_handle_1.py +++ b/python/ray/serve/tests/test_handle_1.py @@ -430,22 +430,34 @@ async def handle_request(self, request: str): self.prefill.chat.dispatch(p_selection, p_msg), self.decode.chat.dispatch(d_selection, d_msg), ) - return {"prefill": {"selected_replica_id": p_selection.replica_id, **p_result}, - "decode": {"selected_replica_id": d_selection.replica_id, **d_result}} + return { + "prefill": { + "selected_replica_id": p_selection.replica_id, + **p_result, + }, + "decode": { + "selected_replica_id": d_selection.replica_id, + **d_result, + }, + } h = serve.run(PDProxy.bind(PrefillServer.bind(), DecodeServer.bind())) result = await h.handle_request.remote("test_parallel") - assert result["prefill"]["response"] == "prefill:test_parallel" assert result["decode"]["response"] == "decode:test_parallel" # Verify that dispatch sent the request to the replica we selected - assert result["prefill"]["actual_replica_id"] == result["prefill"]["selected_replica_id"], ( + assert ( + result["prefill"]["actual_replica_id"] + == result["prefill"]["selected_replica_id"] + ), ( f"dispatch sent request to wrong replica for prefill: " f"selected {result['prefill']['selected_replica_id']}, but got response from {result['prefill']['actual_replica_id']}" ) - assert result["decode"]["actual_replica_id"] == result["decode"]["selected_replica_id"], ( + assert ( + result["decode"]["actual_replica_id"] == result["decode"]["selected_replica_id"] + ), ( f"dispatch sent request to wrong replica for prefill: " f"selected {result['decode']['selected_replica_id']}, but got response from {result['decode']['actual_replica_id']}" ) diff --git a/python/ray/serve/tests/unit/test_router.py b/python/ray/serve/tests/unit/test_router.py index 597933b208aa..74bda4900a9d 100644 --- a/python/ray/serve/tests/unit/test_router.py +++ b/python/ray/serve/tests/unit/test_router.py @@ -111,6 +111,7 @@ def __init__( # Create a minimal _replica_info object to satisfy router.py requirements from unittest.mock import Mock + self._replica_info = Mock() self._replica_info.node_id = node_id self._replica_info.availability_zone = availability_zone @@ -153,9 +154,7 @@ def send_request_with_slot( self, pr: PendingRequest, slot_token: str ) -> FakeReplicaResult: """Send request using a reserved slot.""" - assert ( - slot_token in self._reserved_slots - ), f"Invalid slot token: {slot_token}" + assert slot_token in self._reserved_slots, f"Invalid slot token: {slot_token}" assert not self._reserved_slots[ slot_token ], f"Slot already released: {slot_token}" @@ -170,10 +169,12 @@ def try_send_request( self, pr: PendingRequest, with_rejection: bool ) -> FakeReplicaResult: # Track the request - self._requests_sent.append({ - "request_id": pr.metadata.request_id, - "with_rejection": with_rejection, - }) + self._requests_sent.append( + { + "request_id": pr.metadata.request_id, + "with_rejection": with_rejection, + } + ) if with_rejection: if self._error: @@ -256,7 +257,9 @@ def on_replica_result_finished(self, replica_id: ReplicaID): if self._use_queue_len_cache: num_ongoing_requests = self._replica_queue_len_cache.get(replica_id) or 0 if num_ongoing_requests > 0: - self._replica_queue_len_cache.update(replica_id, num_ongoing_requests - 1) + self._replica_queue_len_cache.update( + replica_id, num_ongoing_requests - 1 + ) def on_replica_actor_unavailable(self, replica_id: ReplicaID): self._replica_queue_len_cache.invalidate_key(replica_id) @@ -1100,10 +1103,7 @@ async def choose_and_hold(request_id: str): return selection # Create tasks - tasks = [ - asyncio.create_task(choose_and_hold(f"request-{i}")) - for i in range(3) - ] + tasks = [asyncio.create_task(choose_and_hold(f"request-{i}")) for i in range(3)] # Wait a bit for all to enter context await asyncio.sleep(0.005) From c6f9b45f25e4a63644e183d828b9f62cb9a3de57 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 17 Feb 2026 19:21:20 +0800 Subject: [PATCH 13/57] refactor: move reserved slots metrics to RouterMetricsManager Signed-off-by: machichima --- python/ray/serve/_private/router.py | 54 ++++++++++++++++------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 6a3ad15141fd..38cda4a5ff50 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -140,6 +140,25 @@ def __init__( # so non-atomic read and write operations need to be guarded by # this thread-safe lock. self._queries_lock = threading.Lock() + + # Track reserved slots for choose_replica operations + self._num_reserved_slots = 0 + self._reserved_slots_gauge = metrics.Gauge( + "serve_reserved_slots_active", + description=( + "The current number of reserved slots for choose_replica operations." + ), + tag_keys=("deployment", "application", "handle", "actor_id"), + ) + self._reserved_slots_gauge.set_default_tags( + { + "deployment": deployment_id.name, + "application": deployment_id.app_name, + "handle": self._handle_id, + "actor_id": self._self_actor_id, + } + ) + self._reserved_slots_gauge.set(0) # Regularly aggregate and push autoscaling metrics to controller self.metrics_pusher = MetricsPusher() self.metrics_store = InMemoryMetricsStore() @@ -334,6 +353,14 @@ def dec_num_queued_requests(self): if not self._cached_metrics_enabled: self.num_queued_requests_gauge.set(self.num_queued_requests) + def inc_reserved_slots(self): + self._num_reserved_slots += 1 + self._reserved_slots_gauge.set(self._num_reserved_slots) + + def dec_reserved_slots(self): + self._num_reserved_slots -= 1 + self._reserved_slots_gauge.set(self._num_reserved_slots) + def inc_num_running_requests_for_replica(self, replica_id: ReplicaID): with self._queries_lock: self.num_requests_sent_to_replicas[replica_id] += 1 @@ -582,9 +609,6 @@ def __init__( # Initializing `self._metrics_manager` before `self.long_poll_client` is # necessary to avoid race condition where `self.update_deployment_config()` # might be called before `self._metrics_manager` instance is created. - # Track reserved slots for choose_replica - self._num_reserved_slots = 0 - self._metrics_manager = RouterMetricsManager( deployment_id, handle_id, @@ -615,24 +639,6 @@ def __init__( event_loop, ) - # Gauge for tracking reserved slots - self._reserved_slots_gauge = metrics.Gauge( - "serve_reserved_slots_active", - description=( - "The current number of reserved slots for choose_replica operations." - ), - tag_keys=("deployment", "application", "handle", "actor_id"), - ) - self._reserved_slots_gauge.set_default_tags( - { - "deployment": deployment_id.name, - "application": deployment_id.app_name, - "handle": handle_id, - "actor_id": self_actor_id, - } - ) - self._reserved_slots_gauge.set(0) - # The Router needs to stay informed about changes to the target deployment's # running replicas and deployment config. We do this via the long poll system. # However, for efficiency, we don't want to create a LongPollClient for every @@ -1032,8 +1038,7 @@ async def choose_replica( self.request_router.on_send_request(replica.replica_id) # Increment reserved slots metric - self._num_reserved_slots += 1 - self._reserved_slots_gauge.set(self._num_reserved_slots) + self._metrics_manager.inc_reserved_slots() selection = ReplicaSelection( replica_id=replica.replica_id.unique_id, @@ -1059,8 +1064,7 @@ async def choose_replica( self.request_router.on_replica_result_finished(replica.replica_id) # Decrement reserved slots metric - self._num_reserved_slots -= 1 - self._reserved_slots_gauge.set(self._num_reserved_slots) + self._metrics_manager.dec_reserved_slots() async def dispatch( self, From f7b30bb4d55cab884b907860ccdfad9c160713c4 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 17 Feb 2026 19:24:20 +0800 Subject: [PATCH 14/57] refactor: lint and docstring Signed-off-by: machichima --- python/ray/serve/handle.py | 15 ++++++++------- python/ray/serve/tests/unit/test_router.py | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index e1bbcf075bbf..7e4011ec993d 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -953,8 +953,8 @@ def remote( def choose_replica( self, - *args, - **kwargs, + *args: Any, + **kwargs: Any, ) -> AsyncContextManager[ReplicaSelection]: """Execute the request router to select a replica without dispatching. @@ -968,8 +968,8 @@ def choose_replica( - If the context exits without dispatch (e.g., exception, early return), the slot is released. Args: - *args, **kwargs: Arguments that may influence routing decisions - (e.g., for multiplexed model routing). + *args: Arguments that may influence routing decisions + **kwargs: Keyword arguments that may influence routing decisions. Returns: AsyncContextManager[ReplicaSelection] - must be used with async with. @@ -979,8 +979,8 @@ def choose_replica( def dispatch( self, selection: ReplicaSelection, - *args, - **kwargs, + *args: Any, + **kwargs: Any, ) -> Union[DeploymentResponse[Any], DeploymentResponseGenerator[Any]]: """Dispatch a request to a previously selected replica. @@ -993,7 +993,8 @@ def dispatch( Args: selection: A ReplicaSelection from choose_replica() context manager. - *args, **kwargs: The request arguments to send to the replica + *args: The request arguments to send to the replica. + **kwargs: The request keyword arguments to send to the replica. Returns: DeploymentResponse or DeploymentResponseGenerator (if streaming). diff --git a/python/ray/serve/tests/unit/test_router.py b/python/ray/serve/tests/unit/test_router.py index 74bda4900a9d..8097e625b6fa 100644 --- a/python/ray/serve/tests/unit/test_router.py +++ b/python/ray/serve/tests/unit/test_router.py @@ -1066,7 +1066,7 @@ async def test_cache_decremented_on_choose_without_dispatch( ) # Choose replica without dispatch - async with router.choose_replica(request_metadata) as selection: + async with router.choose_replica(request_metadata): # Cache should be 1 (reservation) assert fake_request_router.replica_queue_len_cache.get(r1_id) == 1 # Exit without calling dispatch From 0090c5c551de93ccee2278c26cda8a63929f1c76 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 17 Feb 2026 19:51:17 +0800 Subject: [PATCH 15/57] refactor: remove unused code Signed-off-by: machichima --- python/ray/serve/_private/router.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 38cda4a5ff50..c9f89e720d8f 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -988,29 +988,16 @@ async def choose_replica( This method: 1. Checks deployment availability - 2. Registers request in context for tracking - 3. Checks backpressure (max_queued_requests) - 4. Increments serve_num_router_requests metric - 5. Selects a replica and reserves a slot - 6. Increments serve_reserved_slots_active metric - 7. Yields the ReplicaSelection - 8. On exit, releases the slot if not dispatched + 2. Checks backpressure (max_queued_requests) + 3. Increments serve_num_router_requests metric + 4. Selects a replica and reserves a slot + 5. Increments serve_reserved_slots_active metric + 6. Yields the ReplicaSelection + 7. On exit, releases the slot if not dispatched """ if not self._deployment_available: raise DeploymentUnavailableError(self.deployment_id) - # TODO: think if we need those here? - response_id = generate_request_id() - assign_request_task = asyncio.current_task() - ray.serve.context._add_request_pending_assignment( - request_meta.internal_request_id, response_id, assign_request_task - ) - assign_request_task.add_done_callback( - lambda _: ray.serve.context._remove_request_pending_assignment( - request_meta.internal_request_id, response_id - ) - ) - # Wait for the router to be initialized before sending the request. await self._request_router_initialized.wait() From 504bb8a39386ec079913a90ef4e3ac7a7577ee2d Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 17 Feb 2026 19:51:26 +0800 Subject: [PATCH 16/57] refactor: clean-up test comments Signed-off-by: machichima --- python/ray/serve/tests/test_handle_1.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/ray/serve/tests/test_handle_1.py b/python/ray/serve/tests/test_handle_1.py index 598cbbcd60fb..65553eef2404 100644 --- a/python/ray/serve/tests/test_handle_1.py +++ b/python/ray/serve/tests/test_handle_1.py @@ -362,7 +362,7 @@ async def handle_request(self, request: str): assert selection.replica_id is not None assert selection.node_ip is not None - # Dispatch to the selected replica with same arguments + # Dispatch to the selected replica response = await self.backend.process.dispatch(selection, request) # Return both the selection and the response for verification @@ -371,7 +371,7 @@ async def handle_request(self, request: str): h = serve.run(SimpleProxy.bind(Backend.bind())) result = await h.handle_request.remote("test_message") - # Verify the result contains a replica_id and the message + # Verify the result contains the message assert result["response"] == "test_message" # Verify that dispatch sent the request to the replica we selected @@ -415,17 +415,16 @@ def __init__( async def handle_request(self, request: str): # Use AsyncExitStack to manage multiple context managers in parallel async with AsyncExitStack() as stack: - # Step 1: Select and RESERVE replicas from BOTH deployments in parallel + # Select and RESERVE replicas from BOTH deployments in parallel p_selection, d_selection = await asyncio.gather( stack.enter_async_context(self.prefill.chat.choose_replica()), stack.enter_async_context(self.decode.chat.choose_replica()), ) - # Step 2: Prepare requests with cross-deployment info p_msg = f"prefill:{request}" d_msg = f"decode:{request}" - # Step 3: Dispatch to both with the exchanged info + # Dispatch to both selected replicas p_result, d_result = await asyncio.gather( self.prefill.chat.dispatch(p_selection, p_msg), self.decode.chat.dispatch(d_selection, d_msg), From f529d6897be15302affe21e0b614d2ea2d3e6af3 Mon Sep 17 00:00:00 2001 From: machichima Date: Wed, 18 Feb 2026 18:54:04 +0800 Subject: [PATCH 17/57] test: add placeholder method to LocalRouter Signed-off-by: machichima --- .../ray/serve/_private/local_testing_mode.py | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/python/ray/serve/_private/local_testing_mode.py b/python/ray/serve/_private/local_testing_mode.py index 303a55c04fbc..1ce8bf7b304f 100644 --- a/python/ray/serve/_private/local_testing_mode.py +++ b/python/ray/serve/_private/local_testing_mode.py @@ -4,8 +4,9 @@ import logging import queue import time +from contextlib import asynccontextmanager from functools import wraps -from typing import Any, Callable, Coroutine, Dict, Optional, Tuple, Union +from typing import Any, AsyncIterator, Callable, Coroutine, Dict, Optional, Tuple, Union import ray from ray import cloudpickle @@ -16,6 +17,7 @@ ) from ray.serve._private.replica import UserCallableWrapper from ray.serve._private.replica_result import ReplicaResult +from ray.serve._private.request_router.common import ReplicaSelection from ray.serve._private.router import Router from ray.serve._private.utils import GENERATOR_COMPOSITION_NOT_SUPPORTED_ERROR from ray.serve.deployment import Deployment @@ -341,6 +343,39 @@ def generator_result_callback(item: Any): ) return noop_future + @asynccontextmanager + async def choose_replica( + self, + request_meta: RequestMetadata, + *request_args, + **request_kwargs, + ) -> AsyncIterator[ReplicaSelection]: + """Choose replica is not supported in local testing mode. + + This is a stub implementation to satisfy the Router ABC interface. + """ + raise NotImplementedError( + "choose_replica is not supported in local testing mode. " + "Use assign_request instead." + ) + yield # Make this a generator for asynccontextmanager + + def dispatch( + self, + selection: ReplicaSelection, + request_meta: RequestMetadata, + *request_args, + **request_kwargs, + ) -> concurrent.futures.Future[ReplicaResult]: + """Dispatch is not supported in local testing mode. + + This is a stub implementation to satisfy the Router ABC interface. + """ + raise NotImplementedError( + "dispatch is not supported in local testing mode. " + "Use assign_request instead." + ) + def shutdown(self): noop_future = concurrent.futures.Future() noop_future.set_result(None) From c460e573436123d49e4473cfdb5f083074074f56 Mon Sep 17 00:00:00 2001 From: machichima Date: Wed, 18 Feb 2026 20:38:53 +0800 Subject: [PATCH 18/57] fix: also release slot after dispatch Signed-off-by: machichima --- python/ray/serve/_private/request_router/common.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/ray/serve/_private/request_router/common.py b/python/ray/serve/_private/request_router/common.py index 59a8f9b2cd30..64ba7918b30c 100644 --- a/python/ray/serve/_private/request_router/common.py +++ b/python/ray/serve/_private/request_router/common.py @@ -122,9 +122,8 @@ def _mark_dispatched(self) -> None: self._dispatched = True def _release_slot(self) -> None: - """Internal: Release the reserved slot if not yet dispatched.""" - if not self._dispatched: - self._replica.release_slot(self._slot_token) + """Internal: Release the reserved slot.""" + self._replica.release_slot(self._slot_token) class ReplicaQueueLengthCache: From a3048a4a2a7dcb89342683f6d19eb69bfb1af123 Mon Sep 17 00:00:00 2001 From: machichima Date: Wed, 18 Feb 2026 20:53:39 +0800 Subject: [PATCH 19/57] test: fix param type + release slot after dispatch Signed-off-by: machichima --- python/ray/serve/tests/unit/test_router.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/python/ray/serve/tests/unit/test_router.py b/python/ray/serve/tests/unit/test_router.py index 7c11b2017a0a..56f8c8b9087e 100644 --- a/python/ray/serve/tests/unit/test_router.py +++ b/python/ray/serve/tests/unit/test_router.py @@ -112,7 +112,7 @@ def __init__( self._is_cross_language = is_cross_language self._queue_len_info = queue_len_info self._error = error - self._reserved_slots: Dict[str, bool] = {} # token -> is_released + self._reserved_slots: Set[str] = set() self._slot_counter = 0 self._requests_sent = [] # Track all requests sent to this replica @@ -149,22 +149,18 @@ def reserve_slot(self) -> str: """Reserve a slot and return a token.""" self._slot_counter += 1 token = f"slot-token-{self._slot_counter}" - self._reserved_slots[token] = False # Not released yet + self._reserved_slots.add(token) return token def release_slot(self, slot_token: str) -> None: """Release a reserved slot.""" - if slot_token in self._reserved_slots: - self._reserved_slots[slot_token] = True + self._reserved_slots.discard(slot_token) def send_request_with_slot( self, pr: PendingRequest, slot_token: str ) -> FakeReplicaResult: """Send request using a reserved slot.""" assert slot_token in self._reserved_slots, f"Invalid slot token: {slot_token}" - assert not self._reserved_slots[ - slot_token - ], f"Slot already released: {slot_token}" # Create result same way as try_send_request if pr.metadata.is_streaming: @@ -873,15 +869,14 @@ async def test_basic_choose_and_dispatch( assert selection._replica == replica assert selection._method_name == "test_method" assert selection._slot_token in replica._reserved_slots - assert not replica._reserved_slots[selection._slot_token] # Not released # Dispatch request replica_result = await router.dispatch(selection, request_metadata) assert replica_result._replica_id == r1_id assert not replica_result._is_generator_object - # After context exit, slot should not be released (dispatch was called) - assert not replica._reserved_slots[selection._slot_token] + # After context exit, slot should be released + assert selection._slot_token not in replica._reserved_slots async def test_choose_without_dispatch_releases_slot( self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] @@ -904,11 +899,10 @@ async def test_choose_without_dispatch_releases_slot( async with router.choose_replica(request_metadata) as selection: slot_token = selection._slot_token assert slot_token in replica._reserved_slots - assert not replica._reserved_slots[slot_token] # Not released # Exit without calling dispatch # After context exit, slot should be released - assert replica._reserved_slots[slot_token] is True + assert slot_token not in replica._reserved_slots async def test_choose_with_exception_releases_slot( self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] @@ -932,11 +926,10 @@ async def test_choose_with_exception_releases_slot( async with router.choose_replica(request_metadata) as selection: slot_token = selection._slot_token assert slot_token in replica._reserved_slots - assert not replica._reserved_slots[slot_token] raise RuntimeError("Test exception") # After exception, slot should be released - assert replica._reserved_slots[slot_token] is True + assert slot_token not in replica._reserved_slots @pytest.mark.parametrize("is_streaming", [False, True]) async def test_choose_and_dispatch_streaming( From 56a6a89d58752bcef402824a0c448b92c30ddb97 Mon Sep 17 00:00:00 2001 From: machichima Date: Wed, 18 Feb 2026 21:17:13 +0800 Subject: [PATCH 20/57] fix: choose_replica run on router thread loop Signed-off-by: machichima --- python/ray/serve/_private/router.py | 35 +++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index b480684eb124..ab4292a990dc 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -1201,11 +1201,38 @@ async def choose_replica( *request_args, **request_kwargs, ) -> AsyncIterator[ReplicaSelection]: - """Delegate to AsyncioRouter's choose_replica.""" - async with self._asyncio_router.choose_replica( - request_meta, *request_args, **request_kwargs - ) as selection: + """Bridge async context manager to router event loop. + + This ensures choose_replica runs on the singleton router loop, + maintaining thread safety for all state modifications. + """ + import sys + + # Enter context on router loop + async def enter_context(): + cm = self._asyncio_router.choose_replica( + request_meta, *request_args, **request_kwargs + ) + selection = await cm.__aenter__() + return selection, cm + + future = asyncio.run_coroutine_threadsafe( + enter_context(), self._asyncio_loop + ) + selection, context_manager = await asyncio.wrap_future(future) + + try: yield selection + finally: + # Exit context on router loop + async def exit_context(exc_type, exc_val, exc_tb): + return await context_manager.__aexit__(exc_type, exc_val, exc_tb) + + exc_info = sys.exc_info() + future = asyncio.run_coroutine_threadsafe( + exit_context(*exc_info), self._asyncio_loop + ) + await asyncio.wrap_future(future) def dispatch( self, From 732215f5249626f36fa128f933f99937cd3ba4f6 Mon Sep 17 00:00:00 2001 From: machichima Date: Wed, 18 Feb 2026 21:58:46 +0800 Subject: [PATCH 21/57] fix: check if already dispatched Signed-off-by: machichima --- .../serve/_private/request_router/common.py | 11 +++++++++- python/ray/serve/_private/router.py | 22 +++++++++++++++---- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/python/ray/serve/_private/request_router/common.py b/python/ray/serve/_private/request_router/common.py index 64ba7918b30c..b8bd98b2f5da 100644 --- a/python/ray/serve/_private/request_router/common.py +++ b/python/ray/serve/_private/request_router/common.py @@ -118,7 +118,16 @@ def to_dict(self) -> Dict[str, Any]: } def _mark_dispatched(self) -> None: - """Internal: Mark this selection as dispatched (slot consumed).""" + """Internal: Mark this selection as dispatched (slot consumed). + + Raises: + RuntimeError: If the selection has already been dispatched. + """ + if self._dispatched: + raise RuntimeError( + f"ReplicaSelection for {self.replica_id} has already been dispatched. " + "Each selection can only be dispatched once." + ) self._dispatched = True def _release_slot(self) -> None: diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index ab4292a990dc..d9ee55e96bd8 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -1072,7 +1072,24 @@ async def dispatch( *request_args, **request_kwargs, ) -> ReplicaResult: - """Dispatch to a specific replica, consuming the reserved slot.""" + """Dispatch to a specific replica, consuming the reserved slot. + + Args: + selection: The replica selection from choose_replica(). + request_meta: Request metadata. + *request_args: Request positional arguments. + **request_kwargs: Request keyword arguments. + + Returns: + ReplicaResult for the dispatched request. + + Raises: + RuntimeError: If the selection has already been dispatched. + ReplicaUnavailableError: If the replica is no longer available. + """ + # Mark as dispatched (this will raise if already dispatched) + selection._mark_dispatched() + # Verify replica is still available replica = selection._replica if replica.replica_id not in self.request_router.curr_replicas: @@ -1090,9 +1107,6 @@ async def dispatch( if not pr.resolved: await self._resolve_request_arguments(pr) - # Mark as dispatched so __aexit__ won't release the slot - selection._mark_dispatched() - # Send the request without rejection since we already reserved a slot # The slot reservation guarantees that the replica will accept this request result = replica.try_send_request(pr, with_rejection=False) From 251f9e7a9c9325cde5006aaf9e2324dd44623fd4 Mon Sep 17 00:00:00 2001 From: machichima Date: Wed, 18 Feb 2026 22:03:11 +0800 Subject: [PATCH 22/57] fix: dispatch inc metrics + add completion callback Signed-off-by: machichima --- python/ray/serve/_private/router.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index d9ee55e96bd8..2b5a11f23c5e 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -1,6 +1,7 @@ import asyncio import concurrent.futures import logging +import sys import threading import time import weakref @@ -1111,6 +1112,21 @@ async def dispatch( # The slot reservation guarantees that the replica will accept this request result = replica.try_send_request(pr, with_rejection=False) + # Keep track of requests that have been sent out to replicas + if RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE: + self._metrics_manager.inc_num_running_requests_for_replica( + replica.replica_id + ) + + # Always register callback to notify router when request completes + # (needed for token release in queue-based routing, metrics tracking, etc.) + callback = partial( + self._process_finished_request, + replica.replica_id, + pr.metadata.internal_request_id, + ) + result.add_done_callback(callback) + return result async def shutdown(self): @@ -1220,8 +1236,6 @@ async def choose_replica( This ensures choose_replica runs on the singleton router loop, maintaining thread safety for all state modifications. """ - import sys - # Enter context on router loop async def enter_context(): cm = self._asyncio_router.choose_replica( From e1ef1facef4456e01715ae1228eac22648db9b54 Mon Sep 17 00:00:00 2001 From: machichima Date: Wed, 18 Feb 2026 22:10:49 +0800 Subject: [PATCH 23/57] fix: add wrap_queued_request in choose_replica Signed-off-by: machichima --- python/ray/serve/_private/router.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 2b5a11f23c5e..2c0d1195d1c0 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -1028,16 +1028,21 @@ async def choose_replica( if not pr.resolved: await self._resolve_request_arguments(pr) - replica = await self.request_router._choose_replica_for_request(pr) + # Increment queued requests while choosing replica and reserving slot + num_curr_replicas = len(self.request_router.curr_replicas) + with self._metrics_manager.wrap_queued_request( + is_retry=False, num_curr_replicas=num_curr_replicas + ): + replica = await self.request_router._choose_replica_for_request(pr) - # Reserve a slot on the replica - slot_token = replica.reserve_slot() + # Reserve a slot on the replica + slot_token = replica.reserve_slot() - # Update queue length cache to reflect the reservation - # This ensures concurrent choose_replica calls see the correct load - self.request_router.on_send_request(replica.replica_id) + # Update queue length cache to reflect the reservation + # This ensures concurrent choose_replica calls see the correct load + self.request_router.on_send_request(replica.replica_id) - # Increment reserved slots metric + # Increment reserved slots metric (after queue metric is decremented) self._metrics_manager.inc_reserved_slots() selection = ReplicaSelection( From 4fe9ebba2d6d37daee70c8315e7c4c859fe70758 Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 6 Mar 2026 18:43:41 +0800 Subject: [PATCH 24/57] fix: skip choose_replica and dispatch test in local mode Signed-off-by: machichima --- python/ray/serve/tests/test_handle_1.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/ray/serve/tests/test_handle_1.py b/python/ray/serve/tests/test_handle_1.py index 65553eef2404..ebc4dd4ce76d 100644 --- a/python/ray/serve/tests/test_handle_1.py +++ b/python/ray/serve/tests/test_handle_1.py @@ -340,6 +340,10 @@ async def __call__(self): @pytest.mark.asyncio +@pytest.mark.skipif( + RAY_SERVE_FORCE_LOCAL_TESTING_MODE, + reason="local_testing_mode doesn't support choose_replica/dispatch", +) async def test_choose_replica_and_dispatch_single(serve_instance): """Test choose_replica + dispatch for simple single selection pattern.""" @@ -382,6 +386,10 @@ async def handle_request(self, request: str): @pytest.mark.asyncio +@pytest.mark.skipif( + RAY_SERVE_FORCE_LOCAL_TESTING_MODE, + reason="local_testing_mode doesn't support choose_replica/dispatch", +) async def test_choose_replica_and_dispatch_parallel(serve_instance): """Test parallel selection pattern (e.g., PD proxy) using AsyncExitStack.""" from contextlib import AsyncExitStack From 400930679263e2ef44d9695c21b2c2a1bc03e479 Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 6 Mar 2026 19:28:45 +0800 Subject: [PATCH 25/57] fix: manual decrease cache only when not dispatched Signed-off-by: machichima --- python/ray/serve/_private/router.py | 31 ++++++++++++++++++----------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 2c0d1195d1c0..f5c1a3992627 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -1057,19 +1057,21 @@ async def choose_replica( _slot_token=slot_token, ) - try: - yield selection - finally: - # Always release the slot token (for tracking cleanup) - selection._release_slot() - - # Always decrement cache to reflect that this choose_replica operation is done. - # The actual request queue length will be updated by the replica when it - # reports back via on_new_queue_len_info(). + try: + yield selection + finally: + # Always release the slot token (for tracking cleanup) + selection._release_slot() + + # Decrement cache only if this selection was never dispatched. + # Once dispatch is called, the request remains in-flight and should be + # accounted for until request completion is reflected via queue length + # callbacks/probes. + if not selection._dispatched: self.request_router.on_replica_result_finished(replica.replica_id) - # Decrement reserved slots metric - self._metrics_manager.dec_reserved_slots() + # Decrement reserved slots metric + self._metrics_manager.dec_reserved_slots() async def dispatch( self, @@ -1115,7 +1117,12 @@ async def dispatch( # Send the request without rejection since we already reserved a slot # The slot reservation guarantees that the replica will accept this request - result = replica.try_send_request(pr, with_rejection=False) + try: + result = replica.try_send_request(pr, with_rejection=False) + except Exception: + # Release the queue-length increment performed during choose_replica when dispatch fails. + self.request_router.on_replica_result_finished(replica.replica_id) + raise # Keep track of requests that have been sent out to replicas if RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE: From 642f05f8111e1fb128bb74a4c77a69484f8c4141 Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 6 Mar 2026 19:32:32 +0800 Subject: [PATCH 26/57] fix: call request_counter.inc in DeploymentHandle._choose_replica Signed-off-by: machichima --- python/ray/serve/handle.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index 7e4011ec993d..241f64df95bc 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -249,6 +249,12 @@ async def _choose_replica( metadata = serve._private.default_impl.get_request_metadata( self.init_options, self.handle_options ) + self.request_counter.inc( + tags={ + "route": metadata.route, + "application": metadata.app_name, + } + ) if self._router is None: raise RuntimeError("Router is not initialized") From a074356b792293654b2bed0f8c4ee6044ecfa544 Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 6 Mar 2026 19:34:49 +0800 Subject: [PATCH 27/57] refactor: precommit Signed-off-by: machichima --- python/ray/serve/_private/router.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index f5c1a3992627..e08eb4a69fbe 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -1256,9 +1256,7 @@ async def enter_context(): selection = await cm.__aenter__() return selection, cm - future = asyncio.run_coroutine_threadsafe( - enter_context(), self._asyncio_loop - ) + future = asyncio.run_coroutine_threadsafe(enter_context(), self._asyncio_loop) selection, context_manager = await asyncio.wrap_future(future) try: From 1d4d7358d6cf53f81e10628acb3b082a6f81b7bc Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 6 Mar 2026 20:02:26 +0800 Subject: [PATCH 28/57] fix: mark dispatch after check replica available Signed-off-by: machichima --- python/ray/serve/_private/router.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 08a64618c75c..10537cf4ae05 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -1147,9 +1147,6 @@ async def dispatch( RuntimeError: If the selection has already been dispatched. ReplicaUnavailableError: If the replica is no longer available. """ - # Mark as dispatched (this will raise if already dispatched) - selection._mark_dispatched() - # Verify replica is still available replica = selection._replica if replica.replica_id not in self.request_router.curr_replicas: @@ -1157,6 +1154,9 @@ async def dispatch( f"Replica {selection.replica_id} is no longer available" ) + # Mark as dispatched (this will raise if already dispatched) + selection._mark_dispatched() + pr = PendingRequest( args=list(request_args), kwargs=request_kwargs, From 3ee929af4c09f061a31fb2fbbcfc6e0522d782d3 Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 6 Mar 2026 20:46:22 +0800 Subject: [PATCH 29/57] fix: prevent double count in cache mode Signed-off-by: machichima --- .../_private/request_router/request_router.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/python/ray/serve/_private/request_router/request_router.py b/python/ray/serve/_private/request_router/request_router.py index bb6c40fd9f27..bf88725ebe8c 100644 --- a/python/ray/serve/_private/request_router/request_router.py +++ b/python/ray/serve/_private/request_router/request_router.py @@ -1025,9 +1025,7 @@ async def _select_from_candidate_replicas( # Include replicas whose queues are full as not in the cache so we will # actively probe them. Otherwise we may end up in "deadlock" until their # cache entries expire. - # Account for reserved slots to ensure choose_replica guarantees - # that subsequent dispatch will not be rejected. - total_load = (queue_len or 0) + len(r._reserved_slots) + total_load = queue_len or 0 if queue_len is None or total_load >= r.max_ongoing_requests: not_in_cache.append(r) elif queue_len < lowest_queue_len: @@ -1047,8 +1045,12 @@ async def _select_from_candidate_replicas( # None is returned if we failed to get the queue len. continue - # Account for reserved slots when checking availability - total_load = queue_len + len(r._reserved_slots) + # In non-cache mode, `queue_len` does not include reserved slots. + # Add slot reservation count in that case. + if self._use_replica_queue_len_cache: + total_load = queue_len + else: + total_load = queue_len + len(r._reserved_slots) if total_load < r.max_ongoing_requests and queue_len < lowest_queue_len: lowest_queue_len = queue_len chosen_replica_id = r.replica_id @@ -1349,8 +1351,13 @@ def select_available_replicas( available_replicas = [] for r in candidates: queue_len = self._replica_queue_len_cache.get(r.replica_id) - # Account for reserved slots when determining availability - total_load = (queue_len or 0) + len(r._reserved_slots) + # In cache mode, queue_len already includes reservations via + # on_send_request/on_replica_result_finished callbacks. + # In non-cache mode, include reserved slots explicitly. + if self._use_replica_queue_len_cache: + total_load = queue_len or 0 + else: + total_load = (queue_len or 0) + len(r._reserved_slots) if queue_len is None or total_load < r.max_ongoing_requests: available_replicas.append(r) From 332798636099df3953ab912e26332423d571278c Mon Sep 17 00:00:00 2001 From: machichima Date: Sat, 7 Mar 2026 16:47:58 +0800 Subject: [PATCH 30/57] fix: router test error Signed-off-by: machichima --- python/ray/serve/tests/unit/test_pow_2_request_router.py | 1 + python/ray/serve/tests/unit/test_router.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/python/ray/serve/tests/unit/test_pow_2_request_router.py b/python/ray/serve/tests/unit/test_pow_2_request_router.py index f46e8c38cfb4..acdb2d0d854f 100644 --- a/python/ray/serve/tests/unit/test_pow_2_request_router.py +++ b/python/ray/serve/tests/unit/test_pow_2_request_router.py @@ -57,6 +57,7 @@ def __init__( self._availability_zone = availability_zone self._queue_len = 0 self._max_ongoing_requests = max_ongoing_requests + self._reserved_slots: Set[str] = set() self._has_queue_len_response = asyncio.Event() self._reset_after_response = reset_after_response self._model_ids = model_ids or set() diff --git a/python/ray/serve/tests/unit/test_router.py b/python/ray/serve/tests/unit/test_router.py index 56f8c8b9087e..f5a6a8e65338 100644 --- a/python/ray/serve/tests/unit/test_router.py +++ b/python/ray/serve/tests/unit/test_router.py @@ -1047,9 +1047,9 @@ async def test_cache_updated_on_choose_replica( await router.dispatch(selection, request_metadata) assert fake_request_router.replica_queue_len_cache.get(r1_id) == 1 - # After context exit, cache is decremented back to 0 - # (Real replica will update cache via on_new_queue_len_info when it processes) - assert fake_request_router.replica_queue_len_cache.get(r1_id) == 0 + # After dispatch, cache remains incremented while the request is in flight. + # It is decremented when request completion is observed. + assert fake_request_router.replica_queue_len_cache.get(r1_id) == 1 @pytest.mark.parametrize( "setup_router", From f7fc4c0e88df951a59d3d675da074db70aa04cf4 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 8 Mar 2026 17:31:45 +0800 Subject: [PATCH 31/57] fix: api docs ReplicaUnavailableError issue Signed-off-by: machichima --- python/ray/serve/handle.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index 241f64df95bc..55b9a31024a0 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -996,6 +996,8 @@ def dispatch( If `handle.options(stream=True)` is set and a generator method is called, this returns a `DeploymentResponseGenerator` instead. + If the selected replica becomes unavailable before dispatch executes, + ``ReplicaUnavailableError`` is propagated from the router dispatch path. Args: selection: A ReplicaSelection from choose_replica() context manager. @@ -1006,7 +1008,6 @@ def dispatch( DeploymentResponse or DeploymentResponseGenerator (if streaming). Raises: - ReplicaUnavailableError: If the selected replica is no longer available. ValueError: If selection was created by a different DeploymentHandle. """ future, request_metadata = self._dispatch(selection, args, kwargs) From 1683ff44a2b51bf328e943d1ca7b178027ace519 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 8 Mar 2026 17:48:07 +0800 Subject: [PATCH 32/57] fix: decrease queue len when finished or no callback Signed-off-by: machichima --- python/ray/serve/_private/router.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 10537cf4ae05..c9291917395f 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -807,6 +807,9 @@ def _process_finished_request( # Notify request router that request completed (for cleanup, e.g., token release) if self.request_router: self.request_router.on_request_completed(replica_id, internal_request_id) + # Decrement queue length cache for successfully dispatched requests. + # The increment happens in on_send_request during dispatch/assignment. + self.request_router.on_replica_result_finished(replica_id) if isinstance(result, ActorDiedError): # Replica has died but controller hasn't notified the router yet. @@ -922,6 +925,7 @@ async def _route_and_send_request_once( self.request_router.on_request_completed( replica.replica_id, pr.metadata.internal_request_id ) + self.request_router.on_replica_result_finished(replica.replica_id) return None From edd7ccf2dda76411375e3e3695aeb8ad585de35b Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 8 Mar 2026 18:07:19 +0800 Subject: [PATCH 33/57] refactor: _deployment_handle to optional Signed-off-by: machichima --- python/ray/serve/_private/request_router/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/serve/_private/request_router/common.py b/python/ray/serve/_private/request_router/common.py index b8bd98b2f5da..56ec46b24f48 100644 --- a/python/ray/serve/_private/request_router/common.py +++ b/python/ray/serve/_private/request_router/common.py @@ -93,7 +93,7 @@ class ReplicaSelection: # Internal fields (not part of public API) _replica: "RunningReplica" - _deployment_handle: "DeploymentHandle" + _deployment_handle: Optional["DeploymentHandle"] _method_name: str _slot_token: str # Token for reserved slot _dispatched: bool = field( From afef7b34444efcb2d9528403a486bfaf8be65fc6 Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 9 Mar 2026 21:19:42 +0800 Subject: [PATCH 34/57] refactor: add ReplicaUnavailableError to serve public API Signed-off-by: machichima --- doc/source/serve/api/index.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/serve/api/index.md b/doc/source/serve/api/index.md index 23d802be0aef..372aa3c67b06 100644 --- a/doc/source/serve/api/index.md +++ b/doc/source/serve/api/index.md @@ -168,6 +168,7 @@ See the [model composition guide](serve-model-composition) for how to update cod serve.exceptions.RequestCancelledError serve.exceptions.gRPCStatusError serve.exceptions.DeploymentUnavailableError + serve.exceptions.ReplicaUnavailableError ``` @@ -521,4 +522,4 @@ Content-Type: application/json serve.llm.LLMServer serve.llm.LLMRouter -``` \ No newline at end of file +``` From 28524682c031f2fea40bd9dd3c5d17dba6ff595a Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 23 Mar 2026 20:35:46 +0800 Subject: [PATCH 35/57] fix: fix circular deps by setting deployment_id instead Signed-off-by: machichima --- .../serve/_private/request_router/common.py | 5 ++-- python/ray/serve/_private/router.py | 2 +- python/ray/serve/handle.py | 10 ++++---- python/ray/serve/tests/test_handle.py | 24 ++++++++++++++++++- 4 files changed, 31 insertions(+), 10 deletions(-) diff --git a/python/ray/serve/_private/request_router/common.py b/python/ray/serve/_private/request_router/common.py index 56ec46b24f48..2f3a45fbb555 100644 --- a/python/ray/serve/_private/request_router/common.py +++ b/python/ray/serve/_private/request_router/common.py @@ -4,7 +4,7 @@ from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set -from ray.serve._private.common import ReplicaID, RequestMetadata +from ray.serve._private.common import DeploymentID, ReplicaID, RequestMetadata from ray.serve._private.constants import ( RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S, SERVE_LOGGER_NAME, @@ -13,7 +13,6 @@ if TYPE_CHECKING: from ray.serve._private.request_router.replica_wrapper import RunningReplica - from ray.serve.handle import DeploymentHandle logger = logging.getLogger(SERVE_LOGGER_NAME) @@ -93,7 +92,7 @@ class ReplicaSelection: # Internal fields (not part of public API) _replica: "RunningReplica" - _deployment_handle: Optional["DeploymentHandle"] + _deployment_id: Optional[DeploymentID] _method_name: str _slot_token: str # Token for reserved slot _dispatched: bool = field( diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 49a10270201b..6b840e4e96c9 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -1124,7 +1124,7 @@ async def choose_replica( node_id=replica.node_id, availability_zone=replica.availability_zone, _replica=replica, - _deployment_handle=None, # Will be injected by DeploymentHandle.choose_replica + _deployment_id=None, # Injected by DeploymentHandle for dispatch-time validation. _method_name=request_meta.call_method, _slot_token=slot_token, ) diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index 55b9a31024a0..db2d37538f9c 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -260,8 +260,8 @@ async def _choose_replica( # Call the router's choose_replica and inject the deployment handle async with self._router.choose_replica(metadata, *args, **kwargs) as selection: - # Inject the deployment handle for validation - selection._deployment_handle = self + # Record the owning deployment for dispatch-time validation. + selection._deployment_id = self.deployment_id yield selection def _dispatch( @@ -273,13 +273,13 @@ def _dispatch( """Dispatch a request to a previously selected replica.""" # Validate that the selection belongs to the same deployment if ( - selection._deployment_handle is not None - and selection._deployment_handle.deployment_id != self.deployment_id + selection._deployment_id is not None + and selection._deployment_id != self.deployment_id ): raise ValueError( f"Cannot dispatch a selection created for a different deployment. " f"This handle is for {self.deployment_id}, but the selection was created " - f"for {selection._deployment_handle.deployment_id}." + f"for {selection._deployment_id}." ) if not self.is_initialized: diff --git a/python/ray/serve/tests/test_handle.py b/python/ray/serve/tests/test_handle.py index 153d4e4d64cd..cdfa5bb9a979 100644 --- a/python/ray/serve/tests/test_handle.py +++ b/python/ray/serve/tests/test_handle.py @@ -4,12 +4,16 @@ from ray import serve from ray._common.test_utils import SignalActor -from ray.serve._private.common import OBJ_REF_NOT_SUPPORTED_ERROR +from ray.serve._private.common import ( + OBJ_REF_NOT_SUPPORTED_ERROR, + DeploymentID, +) from ray.serve._private.replica_result import ( ActorReplicaResult, ReplicaResult, gRPCReplicaResult, ) +from ray.serve._private.request_router.common import ReplicaSelection from ray.serve.handle import DeploymentHandle from ray.serve.tests.conftest import * # noqa from ray.serve.tests.conftest import _shared_serve_instance # noqa @@ -121,6 +125,24 @@ def __call__(self, inp1: str, inp2: str): ) +def test_dispatch_rejects_selection_from_different_deployment(): + handle = DeploymentHandle("deployment-a", "app") + selection = ReplicaSelection( + replica_id="replica-1", + node_ip="127.0.0.1", + port=None, + node_id="node-1", + availability_zone=None, + _replica=object(), + _deployment_id=DeploymentID(name="deployment-b", app_name="app"), + _method_name="__call__", + _slot_token="slot-1", + ) + + with pytest.raises(ValueError, match="different deployment"): + handle.dispatch(selection) + + if __name__ == "__main__": import sys From bc58182600386a69cbacf28fca7248e5c379ce97 Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 23 Mar 2026 20:44:49 +0800 Subject: [PATCH 36/57] refactor: extract common logic to utility function Signed-off-by: machichima --- python/ray/serve/handle.py | 49 +++++++++++++++----------------------- 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index db2d37538f9c..836808a40b36 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -184,6 +184,19 @@ def _is_router_running_in_separate_loop(self) -> bool: return False return self.init_options._run_router_in_separate_loop + def _init_router_and_get_metadata(self) -> Tuple[Router, RequestMetadata]: + if not self.is_initialized: + self._init() + + metadata = serve._private.default_impl.get_request_metadata( + self.init_options, self.handle_options + ) + + if self._router is None: + raise RuntimeError("Router is not initialized") + + return self._router, metadata + def _options( self, _prefer_local_routing=DEFAULT.VALUE, **kwargs ) -> "DeploymentHandle[T]": @@ -218,12 +231,7 @@ def _remote( args: Tuple[Any], kwargs: Dict[str, Any], ) -> Tuple[concurrent.futures.Future, RequestMetadata]: - if not self.is_initialized: - self._init() - - metadata = serve._private.default_impl.get_request_metadata( - self.init_options, self.handle_options - ) + router, metadata = self._init_router_and_get_metadata() self.request_counter.inc( tags={ @@ -231,10 +239,7 @@ def _remote( "application": metadata.app_name, } ) - if self._router is None: - raise RuntimeError("Router is not initialized") - - return self._router.assign_request(metadata, *args, **kwargs), metadata + return router.assign_request(metadata, *args, **kwargs), metadata @asynccontextmanager async def _choose_replica( @@ -243,23 +248,16 @@ async def _choose_replica( kwargs: Dict[str, Any], ) -> AsyncIterator[ReplicaSelection]: """Execute the request router to select a replica without dispatching.""" - if not self.is_initialized: - self._init() - - metadata = serve._private.default_impl.get_request_metadata( - self.init_options, self.handle_options - ) + router, metadata = self._init_router_and_get_metadata() self.request_counter.inc( tags={ "route": metadata.route, "application": metadata.app_name, } ) - if self._router is None: - raise RuntimeError("Router is not initialized") # Call the router's choose_replica and inject the deployment handle - async with self._router.choose_replica(metadata, *args, **kwargs) as selection: + async with router.choose_replica(metadata, *args, **kwargs) as selection: # Record the owning deployment for dispatch-time validation. selection._deployment_id = self.deployment_id yield selection @@ -282,17 +280,8 @@ def _dispatch( f"for {selection._deployment_id}." ) - if not self.is_initialized: - self._init() - - metadata = serve._private.default_impl.get_request_metadata( - self.init_options, self.handle_options - ) - - if self._router is None: - raise RuntimeError("Router is not initialized") - - return self._router.dispatch(selection, metadata, *args, **kwargs), metadata + router, metadata = self._init_router_and_get_metadata() + return router.dispatch(selection, metadata, *args, **kwargs), metadata def options( self, From de7e6e36dcf4ee9bc3844d19ecb744c0f9afa56d Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 23 Mar 2026 20:48:15 +0800 Subject: [PATCH 37/57] fix: move import to the top of the file Signed-off-by: machichima --- python/ray/serve/tests/test_handle_1.py | 9 ++------- python/ray/serve/tests/unit/test_router.py | 6 +----- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/python/ray/serve/tests/test_handle_1.py b/python/ray/serve/tests/test_handle_1.py index ebc4dd4ce76d..3b2ea4aa1778 100644 --- a/python/ray/serve/tests/test_handle_1.py +++ b/python/ray/serve/tests/test_handle_1.py @@ -2,6 +2,7 @@ import concurrent.futures import sys import threading +from contextlib import AsyncExitStack from typing import Any import pytest @@ -13,6 +14,7 @@ RAY_SERVE_FORCE_LOCAL_TESTING_MODE, SERVE_DEFAULT_APP_NAME, ) +from ray.serve.api import get_replica_context from ray.serve.exceptions import RayServeException from ray.serve.handle import DeploymentHandle @@ -350,8 +352,6 @@ async def test_choose_replica_and_dispatch_single(serve_instance): @serve.deployment(num_replicas=2) class Backend: def process(self, msg: str): - from ray.serve.api import get_replica_context - replica_id = get_replica_context().replica_id.unique_id return {"actual_replica_id": replica_id, "response": msg} @@ -392,21 +392,16 @@ async def handle_request(self, request: str): ) async def test_choose_replica_and_dispatch_parallel(serve_instance): """Test parallel selection pattern (e.g., PD proxy) using AsyncExitStack.""" - from contextlib import AsyncExitStack @serve.deployment(num_replicas=2) class PrefillServer: def chat(self, msg: str): - from ray.serve.api import get_replica_context - replica_id = get_replica_context().replica_id.unique_id return {"actual_replica_id": replica_id, "response": msg} @serve.deployment(num_replicas=2) class DecodeServer: def chat(self, msg: str): - from ray.serve.api import get_replica_context - replica_id = get_replica_context().replica_id.unique_id return {"actual_replica_id": replica_id, "response": msg} diff --git a/python/ray/serve/tests/unit/test_router.py b/python/ray/serve/tests/unit/test_router.py index b5942fbafe2a..3d12cc4bb8ff 100644 --- a/python/ray/serve/tests/unit/test_router.py +++ b/python/ray/serve/tests/unit/test_router.py @@ -42,7 +42,7 @@ from ray.serve._private.test_utils import FakeCounter, FakeGauge, MockTimer from ray.serve._private.utils import decompress_metric_report, get_random_string from ray.serve.config import AutoscalingConfig -from ray.serve.exceptions import BackPressureError +from ray.serve.exceptions import BackPressureError, ReplicaUnavailableError class FakeReplicaResult(ReplicaResult): @@ -117,8 +117,6 @@ def __init__( self._requests_sent = [] # Track all requests sent to this replica # Create a minimal _replica_info object to satisfy router.py requirements - from unittest.mock import Mock - self._replica_info = Mock() self._replica_info.node_id = node_id self._replica_info.availability_zone = availability_zone @@ -1142,8 +1140,6 @@ async def test_dispatch_replica_unavailable( internal_request_id="test-internal-request-1", ) - from ray.serve.exceptions import ReplicaUnavailableError - async with router.choose_replica(request_metadata) as selection: # Simulate replica becoming unavailable fake_request_router._replica_to_return = None From 58f42f7c6c33b30294fcb8335539d06c6f4bb3ea Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 23 Mar 2026 20:57:31 +0800 Subject: [PATCH 38/57] refactor: choose_replica to async Signed-off-by: machichima --- python/ray/serve/_private/router.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 6b840e4e96c9..59220c29089b 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -546,7 +546,7 @@ def assign_request( @abstractmethod @asynccontextmanager - def choose_replica( + async def choose_replica( self, request_meta: RequestMetadata, *request_args, From 174f7bd56f1ba0da239f8741a889ad04e404f30c Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 23 Mar 2026 21:00:53 +0800 Subject: [PATCH 39/57] refactor: fix docstrings Signed-off-by: machichima --- python/ray/serve/_private/router.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 59220c29089b..8fddbe79f586 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -1351,10 +1351,7 @@ def dispatch( *request_args, **request_kwargs, ) -> concurrent.futures.Future[ReplicaResult]: - """Dispatch request to a previously selected replica. - - Similar to assign_request, wraps the async dispatch in a concurrent.futures.Future. - """ + """Dispatch request to a previously selected replica.""" return self._wrap_asyncio_call_in_future( self._asyncio_router.dispatch( selection, request_meta, *request_args, **request_kwargs @@ -1381,7 +1378,14 @@ def _wrap_asyncio_call_in_future( def asyncio_future_callback( asyncio_future: asyncio.Future, concurrent_future: concurrent.futures.Future ): - """Callback to propagate cancellation from concurrent.futures.Future to ReplicaResult.""" + """Callback attached to the asyncio Task running assign_request. + + This runs when the asyncio Task finishes (completes, fails, or is cancelled). + Its primary goal is to propagate cancellation initiated via the + `concurrent_future` back to the `ReplicaResult` in situations where + asyncio_future didn't see the cancellation event in time. Think of it + like a second line of defense for cancellation of replica results. + """ if ( concurrent_future.cancelled() and not asyncio_future.cancelled() From fe842d083d7eb0319e2519101b581468ef154edc Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 23 Mar 2026 21:25:42 +0800 Subject: [PATCH 40/57] refactor: better document on_replica_result_finished Signed-off-by: machichima --- python/ray/serve/_private/request_router/request_router.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/ray/serve/_private/request_router/request_router.py b/python/ray/serve/_private/request_router/request_router.py index bf88725ebe8c..acb31941aa46 100644 --- a/python/ray/serve/_private/request_router/request_router.py +++ b/python/ray/serve/_private/request_router/request_router.py @@ -835,6 +835,12 @@ def on_replica_result_finished(self, replica_id: ReplicaID): This is used when a reserved slot is released without being dispatched (e.g., in choose_replica context manager cleanup). + + We cannot rely on on_new_queue_len_info() to correct the cache in this + path. The queue length cache is incremented optimistically when a slot is + reserved, before dispatch happens. If dispatch is never called or fails + before the request reaches the replica, no queue_len_info response is + produced, so the cache would otherwise remain inflated. """ if self._use_replica_queue_len_cache: num_ongoing_requests = self._replica_queue_len_cache.get(replica_id) or 0 From c01822ba921b83ccaced2a8e3454d3bf7e17a650 Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 23 Mar 2026 21:29:53 +0800 Subject: [PATCH 41/57] fix: remove redundant code as we always in cache mode Signed-off-by: machichima --- .../_private/request_router/request_router.py | 20 +++---------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/python/ray/serve/_private/request_router/request_router.py b/python/ray/serve/_private/request_router/request_router.py index acb31941aa46..1ad51886861f 100644 --- a/python/ray/serve/_private/request_router/request_router.py +++ b/python/ray/serve/_private/request_router/request_router.py @@ -1031,8 +1031,7 @@ async def _select_from_candidate_replicas( # Include replicas whose queues are full as not in the cache so we will # actively probe them. Otherwise we may end up in "deadlock" until their # cache entries expire. - total_load = queue_len or 0 - if queue_len is None or total_load >= r.max_ongoing_requests: + if queue_len is None or queue_len >= r.max_ongoing_requests: not_in_cache.append(r) elif queue_len < lowest_queue_len: lowest_queue_len = queue_len @@ -1051,13 +1050,7 @@ async def _select_from_candidate_replicas( # None is returned if we failed to get the queue len. continue - # In non-cache mode, `queue_len` does not include reserved slots. - # Add slot reservation count in that case. - if self._use_replica_queue_len_cache: - total_load = queue_len - else: - total_load = queue_len + len(r._reserved_slots) - if total_load < r.max_ongoing_requests and queue_len < lowest_queue_len: + if queue_len < r.max_ongoing_requests and queue_len < lowest_queue_len: lowest_queue_len = queue_len chosen_replica_id = r.replica_id elif len(not_in_cache) > 0: @@ -1357,14 +1350,7 @@ def select_available_replicas( available_replicas = [] for r in candidates: queue_len = self._replica_queue_len_cache.get(r.replica_id) - # In cache mode, queue_len already includes reservations via - # on_send_request/on_replica_result_finished callbacks. - # In non-cache mode, include reserved slots explicitly. - if self._use_replica_queue_len_cache: - total_load = queue_len or 0 - else: - total_load = (queue_len or 0) + len(r._reserved_slots) - if queue_len is None or total_load < r.max_ongoing_requests: + if queue_len is None or queue_len < r.max_ongoing_requests: available_replicas.append(r) return available_replicas From 7fe198b69994bfa90e4b2ca1b9d5ce9e6d92aae1 Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 23 Mar 2026 21:34:23 +0800 Subject: [PATCH 42/57] refactor: make reserved_slots comment accurate Signed-off-by: machichima --- python/ray/serve/_private/request_router/replica_wrapper.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/ray/serve/_private/request_router/replica_wrapper.py b/python/ray/serve/_private/request_router/replica_wrapper.py index 714470a3089d..ace9fc635e93 100644 --- a/python/ray/serve/_private/request_router/replica_wrapper.py +++ b/python/ray/serve/_private/request_router/replica_wrapper.py @@ -202,8 +202,7 @@ def __init__(self, replica_info: RunningReplicaInfo): self._actor_replica_wrapper = ActorReplicaWrapper(self._actor_handle) self._grpc_replica_wrapper = None - # Slot reservation tracking for choose_replica/dispatch pattern - # Maps slot_token -> True to track active reservations + # Active slot reservation tokens for the choose_replica/dispatch pattern. self._reserved_slots: Set[str] = set() @property From 446e7d3113281fc91e1784ab04f5c6a45f08672e Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 23 Mar 2026 21:41:34 +0800 Subject: [PATCH 43/57] fix: reuse the RequestMetadata created from choose_replica Signed-off-by: machichima --- .../serve/_private/request_router/common.py | 1 + python/ray/serve/_private/router.py | 1 + python/ray/serve/handle.py | 18 ++++++++++++------ python/ray/serve/tests/test_handle.py | 6 ++++++ 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/python/ray/serve/_private/request_router/common.py b/python/ray/serve/_private/request_router/common.py index 2f3a45fbb555..40f577d3332b 100644 --- a/python/ray/serve/_private/request_router/common.py +++ b/python/ray/serve/_private/request_router/common.py @@ -93,6 +93,7 @@ class ReplicaSelection: # Internal fields (not part of public API) _replica: "RunningReplica" _deployment_id: Optional[DeploymentID] + _request_metadata: RequestMetadata _method_name: str _slot_token: str # Token for reserved slot _dispatched: bool = field( diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 8fddbe79f586..5624388d388f 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -1125,6 +1125,7 @@ async def choose_replica( availability_zone=replica.availability_zone, _replica=replica, _deployment_id=None, # Injected by DeploymentHandle for dispatch-time validation. + _request_metadata=request_meta, _method_name=request_meta.call_method, _slot_token=slot_token, ) diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index 836808a40b36..fb23cf4a8e27 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -184,18 +184,23 @@ def _is_router_running_in_separate_loop(self) -> bool: return False return self.init_options._run_router_in_separate_loop - def _init_router_and_get_metadata(self) -> Tuple[Router, RequestMetadata]: + def _init_router(self) -> Router: if not self.is_initialized: self._init() + if self._router is None: + raise RuntimeError("Router is not initialized") + + return self._router + + def _init_router_and_get_metadata(self) -> Tuple[Router, RequestMetadata]: + router = self._init_router() + metadata = serve._private.default_impl.get_request_metadata( self.init_options, self.handle_options ) - if self._router is None: - raise RuntimeError("Router is not initialized") - - return self._router, metadata + return router, metadata def _options( self, _prefer_local_routing=DEFAULT.VALUE, **kwargs @@ -280,7 +285,8 @@ def _dispatch( f"for {selection._deployment_id}." ) - router, metadata = self._init_router_and_get_metadata() + metadata = selection._request_metadata + router = self._init_router() return router.dispatch(selection, metadata, *args, **kwargs), metadata def options( diff --git a/python/ray/serve/tests/test_handle.py b/python/ray/serve/tests/test_handle.py index cdfa5bb9a979..9acd423ee657 100644 --- a/python/ray/serve/tests/test_handle.py +++ b/python/ray/serve/tests/test_handle.py @@ -7,6 +7,7 @@ from ray.serve._private.common import ( OBJ_REF_NOT_SUPPORTED_ERROR, DeploymentID, + RequestMetadata, ) from ray.serve._private.replica_result import ( ActorReplicaResult, @@ -135,6 +136,11 @@ def test_dispatch_rejects_selection_from_different_deployment(): availability_zone=None, _replica=object(), _deployment_id=DeploymentID(name="deployment-b", app_name="app"), + _request_metadata=RequestMetadata( + request_id="request-id", + internal_request_id="internal-request-id", + call_method="__call__", + ), _method_name="__call__", _slot_token="slot-1", ) From 1c5c9dcfbabe5834993a9bee2bf6ae6017cddf6e Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 23 Mar 2026 22:01:50 +0800 Subject: [PATCH 44/57] refactor: fix docstring Signed-off-by: machichima --- python/ray/serve/tests/test_handle_1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/serve/tests/test_handle_1.py b/python/ray/serve/tests/test_handle_1.py index 3b2ea4aa1778..7e9c43e94fa4 100644 --- a/python/ray/serve/tests/test_handle_1.py +++ b/python/ray/serve/tests/test_handle_1.py @@ -460,7 +460,7 @@ async def handle_request(self, request: str): assert ( result["decode"]["actual_replica_id"] == result["decode"]["selected_replica_id"] ), ( - f"dispatch sent request to wrong replica for prefill: " + f"dispatch sent request to wrong replica for decode: " f"selected {result['decode']['selected_replica_id']}, but got response from {result['decode']['actual_replica_id']}" ) From 62ea65e75163a64cfe4d99dd9c5d8b4ef12058b0 Mon Sep 17 00:00:00 2001 From: machichima Date: Thu, 26 Mar 2026 22:19:19 +0800 Subject: [PATCH 45/57] test: multiple dispatch calls fail Signed-off-by: machichima --- python/ray/serve/tests/unit/test_router.py | 25 ++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/python/ray/serve/tests/unit/test_router.py b/python/ray/serve/tests/unit/test_router.py index 3d12cc4bb8ff..e3a3c1fe731a 100644 --- a/python/ray/serve/tests/unit/test_router.py +++ b/python/ray/serve/tests/unit/test_router.py @@ -1178,6 +1178,31 @@ async def test_dispatch_uses_reserved_slot( assert replica._requests_sent[0]["request_id"] == "test-request-1" assert replica._requests_sent[0]["with_rejection"] is False + async def test_multiple_dispatch_calls_fail( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """A ReplicaSelection can only be dispatched once.""" + router, fake_request_router = setup_router + + r1_id = ReplicaID( + unique_id="test-replica-1", deployment_id=DeploymentID(name="test") + ) + replica = FakeReplica(r1_id) + fake_request_router.set_replica_to_return(replica) + + request_metadata = RequestMetadata( + request_id="test-request-1", + internal_request_id="test-internal-request-1", + ) + + async with router.choose_replica(request_metadata) as selection: + await router.dispatch(selection, request_metadata) + + with pytest.raises(RuntimeError, match="already been dispatched"): + await router.dispatch(selection, request_metadata) + + assert len(replica._requests_sent) == 1 + def running_replica_info(replica_id: ReplicaID) -> RunningReplicaInfo: return RunningReplicaInfo( From 9cae53883abea34d6c222b71d38eb73335e32b49 Mon Sep 17 00:00:00 2001 From: machichima Date: Thu, 26 Mar 2026 22:33:33 +0800 Subject: [PATCH 46/57] test: dispatch won't be rejected even when reach the threshold Signed-off-by: machichima --- python/ray/serve/tests/unit/test_router.py | 31 ++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/python/ray/serve/tests/unit/test_router.py b/python/ray/serve/tests/unit/test_router.py index e3a3c1fe731a..06e620790d4c 100644 --- a/python/ray/serve/tests/unit/test_router.py +++ b/python/ray/serve/tests/unit/test_router.py @@ -1178,6 +1178,37 @@ async def test_dispatch_uses_reserved_slot( assert replica._requests_sent[0]["request_id"] == "test-request-1" assert replica._requests_sent[0]["with_rejection"] is False + async def test_dispatch_not_rejected_after_choose_replica( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """Dispatch won't be rejected even though max_ongoing_requests reaches its threshold""" + router, fake_request_router = setup_router + + r1_id = ReplicaID( + unique_id="test-replica-1", deployment_id=DeploymentID(name="test") + ) + replica = FakeReplica( + r1_id, + queue_len_info=ReplicaQueueLengthInfo( + # `accepted=False` simulates rejection due to max_ongoing_requests limit. + accepted=False, + num_ongoing_requests=999, + ), + ) + fake_request_router.set_replica_to_return(replica) + + request_metadata = RequestMetadata( + request_id="test-request-1", + internal_request_id="test-internal-request-1", + ) + + async with router.choose_replica(request_metadata) as selection: + result = await router.dispatch(selection, request_metadata) + assert result._replica_id == r1_id + + assert len(replica._requests_sent) == 1 + assert replica._requests_sent[0]["with_rejection"] is False + async def test_multiple_dispatch_calls_fail( self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] ): From 80b3c07737fe9b9a3092c3a99eb77d701fb41062 Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 27 Mar 2026 20:44:43 +0800 Subject: [PATCH 47/57] test: ensure specific error raised in choose_replica Signed-off-by: machichima --- python/ray/serve/tests/unit/test_router.py | 40 +++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/python/ray/serve/tests/unit/test_router.py b/python/ray/serve/tests/unit/test_router.py index 06e620790d4c..a85cb94e57f0 100644 --- a/python/ray/serve/tests/unit/test_router.py +++ b/python/ray/serve/tests/unit/test_router.py @@ -42,7 +42,11 @@ from ray.serve._private.test_utils import FakeCounter, FakeGauge, MockTimer from ray.serve._private.utils import decompress_metric_report, get_random_string from ray.serve.config import AutoscalingConfig -from ray.serve.exceptions import BackPressureError, ReplicaUnavailableError +from ray.serve.exceptions import ( + BackPressureError, + DeploymentUnavailableError, + ReplicaUnavailableError, +) class FakeReplicaResult(ReplicaResult): @@ -842,6 +846,40 @@ async def test_on_request_routed( class TestChooseReplica: """Tests for choose_replica() and dispatch() flow.""" + async def test_choose_replica_raises_deployment_unavailable( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """choose_replica fails when deployment is marked unavailable.""" + router, _ = setup_router + router._deployment_available = False + + request_metadata = RequestMetadata( + request_id="test-request-1", + internal_request_id="test-internal-request-1", + ) + + with pytest.raises(DeploymentUnavailableError): + async with router.choose_replica(request_metadata): + pass + + async def test_choose_replica_raises_backpressure( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """choose_replica raises BackPressureError when queue limit is reached.""" + router, _ = setup_router + router.update_deployment_config(DeploymentConfig(max_queued_requests=1)) + # Bump num_queued_requests to 1 to simulate a full queue + router._metrics_manager.inc_num_queued_requests() + + request_metadata = RequestMetadata( + request_id="test-request-1", + internal_request_id="test-internal-request-1", + ) + + with pytest.raises(BackPressureError): + async with router.choose_replica(request_metadata): + pass + async def test_basic_choose_and_dispatch( self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] ): From ad7c561739a73757ae672ce24b7ec567b20e0c72 Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 27 Mar 2026 20:52:22 +0800 Subject: [PATCH 48/57] test: streaming support integration test Signed-off-by: machichima --- python/ray/serve/tests/test_handle_1.py | 40 +++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/python/ray/serve/tests/test_handle_1.py b/python/ray/serve/tests/test_handle_1.py index 7e9c43e94fa4..97992d2ce187 100644 --- a/python/ray/serve/tests/test_handle_1.py +++ b/python/ray/serve/tests/test_handle_1.py @@ -385,6 +385,46 @@ async def handle_request(self, request: str): ) +@pytest.mark.asyncio +@pytest.mark.skipif( + RAY_SERVE_FORCE_LOCAL_TESTING_MODE, + reason="local_testing_mode doesn't support choose_replica/dispatch", +) +async def test_choose_replica_and_dispatch_streaming(serve_instance): + """Test choose_replica + dispatch with handle.options(stream=True).""" + + @serve.deployment(num_replicas=2) + class Backend: + async def stream(self, msg: str): + replica_id = get_replica_context().replica_id.unique_id + for i in range(3): + yield {"actual_replica_id": replica_id, "chunk": f"{msg}-{i}"} + + @serve.deployment + class StreamingProxy: + def __init__(self, backend: DeploymentHandle): + self.backend_stream = backend.stream.options(stream=True) + + async def handle_request(self, request: str): + async with self.backend_stream.choose_replica(request) as selection: + gen = self.backend_stream.dispatch(selection, request) + chunks = [item async for item in gen] + return {"selected_replica_id": selection.replica_id, "chunks": chunks} + + h = serve.run(StreamingProxy.bind(Backend.bind())) + result = await h.handle_request.remote("stream_test") + + assert [item["chunk"] for item in result["chunks"]] == [ + "stream_test-0", + "stream_test-1", + "stream_test-2", + ] + assert all( + item["actual_replica_id"] == result["selected_replica_id"] + for item in result["chunks"] + ) + + @pytest.mark.asyncio @pytest.mark.skipif( RAY_SERVE_FORCE_LOCAL_TESTING_MODE, From f32576a2a2a4f3bfc13cebd16dca2e51a9ec10e5 Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 27 Mar 2026 21:11:50 +0800 Subject: [PATCH 49/57] test: exception or early return release slot Signed-off-by: machichima --- python/ray/serve/tests/test_handle_1.py | 69 +++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/python/ray/serve/tests/test_handle_1.py b/python/ray/serve/tests/test_handle_1.py index 97992d2ce187..e78b14599aea 100644 --- a/python/ray/serve/tests/test_handle_1.py +++ b/python/ray/serve/tests/test_handle_1.py @@ -385,6 +385,75 @@ async def handle_request(self, request: str): ) +@pytest.mark.asyncio +@pytest.mark.skipif( + RAY_SERVE_FORCE_LOCAL_TESTING_MODE, + reason="local_testing_mode doesn't support choose_replica/dispatch", +) +async def test_choose_replica_early_return_releases_slot(serve_instance): + """Early return from choose_replica should release the reserved slot.""" + + @serve.deployment(num_replicas=1, max_ongoing_requests=1) + class Backend: + def process(self, msg: str): + return msg + + @serve.deployment + class Proxy: + def __init__(self, backend: DeploymentHandle): + self.backend = backend + + async def handle_request(self, request: str): + async with self.backend.process.choose_replica(request): + pass + + # Ensure the second choose_replica request work + async with self.backend.process.choose_replica(request) as selection: + response = await asyncio.wait_for( + self.backend.process.dispatch(selection, request), timeout=2 + ) + return response + + h = serve.run(Proxy.bind(Backend.bind())) + assert await h.handle_request.remote("test_message") == "test_message" + + +@pytest.mark.asyncio +@pytest.mark.skipif( + RAY_SERVE_FORCE_LOCAL_TESTING_MODE, + reason="local_testing_mode doesn't support choose_replica/dispatch", +) +async def test_choose_replica_exception_releases_slot(serve_instance): + """Exception in choose_replica context should release the reserved slot.""" + + @serve.deployment(num_replicas=1, max_ongoing_requests=1) + class Backend: + def process(self, msg: str): + return msg + + @serve.deployment + class Proxy: + def __init__(self, backend: DeploymentHandle): + self.backend = backend + + async def handle_request(self, request: str): + try: + async with self.backend.process.choose_replica(request): + raise RuntimeError("test exception") + except RuntimeError: + pass + + # Ensure the second choose_replica request work + async with self.backend.process.choose_replica(request) as selection: + response = await asyncio.wait_for( + self.backend.process.dispatch(selection, request), timeout=2 + ) + return response + + h = serve.run(Proxy.bind(Backend.bind())) + assert await h.handle_request.remote("test_message") == "test_message" + + @pytest.mark.asyncio @pytest.mark.skipif( RAY_SERVE_FORCE_LOCAL_TESTING_MODE, From 1af2c2ed24ec4ff3d7afcee2b40ade79ef46f98e Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 29 Mar 2026 08:48:22 +0800 Subject: [PATCH 50/57] fix: add missing callback param Signed-off-by: machichima --- python/ray/serve/_private/router.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 2d59f657ff7d..10d873f30b4d 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -1315,6 +1315,7 @@ async def dispatch( self._process_finished_request, replica.replica_id, pr.metadata.internal_request_id, + replica.actor_id, ) result.add_done_callback(callback) From 99595518d99fedbd264f73dba6b230501e8bc12d Mon Sep 17 00:00:00 2001 From: machichima Date: Fri, 10 Apr 2026 21:57:00 +0800 Subject: [PATCH 51/57] fix: prevent duplicate decrease queue length cache Signed-off-by: machichima --- python/ray/serve/_private/router.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 10d873f30b4d..ac9d3d37546f 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -869,9 +869,6 @@ def _process_finished_request( # Notify request router that request completed (for cleanup, e.g., token release) if self.request_router: self.request_router.on_request_completed(replica_id, internal_request_id) - # Decrement queue length cache for successfully dispatched requests. - # The increment happens in on_send_request during dispatch/assignment. - self.request_router.on_replica_result_finished(replica_id) actor_died_error = self._get_actor_died_error(result) if actor_died_error is not None: @@ -1318,6 +1315,12 @@ async def dispatch( replica.actor_id, ) result.add_done_callback(callback) + result.add_done_callback( + lambda _: self._event_loop.call_soon_threadsafe( + self.request_router.decrement_queue_len_cache, + replica.replica_id, + ) + ) return result From 779123c0bfed035a9ac7b17675ac30ea46010ed2 Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 20 Apr 2026 21:27:40 +0800 Subject: [PATCH 52/57] fix: use call_soon_threadsafe Signed-off-by: machichima --- python/ray/serve/_private/router.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index ac9d3d37546f..b41116abd1a2 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -1314,7 +1314,9 @@ async def dispatch( pr.metadata.internal_request_id, replica.actor_id, ) - result.add_done_callback(callback) + result.add_done_callback( + lambda _, cb=callback: self._event_loop.call_soon_threadsafe(cb, _) + ) result.add_done_callback( lambda _: self._event_loop.call_soon_threadsafe( self.request_router.decrement_queue_len_cache, From f978363bf1954ba76434f08e45b0201ff5a04fb4 Mon Sep 17 00:00:00 2001 From: machichima Date: Mon, 20 Apr 2026 22:04:10 +0800 Subject: [PATCH 53/57] fix: move resolve_request_arguments into try/except block Signed-off-by: machichima --- python/ray/serve/_private/router.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index b41116abd1a2..b5f295d4af5c 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -1287,13 +1287,11 @@ async def dispatch( metadata=request_meta, ) - # Resolve request arguments - if not pr.resolved: - await self._resolve_request_arguments(pr) - # Send the request without rejection since we already reserved a slot # The slot reservation guarantees that the replica will accept this request try: + if not pr.resolved: + await self._resolve_request_arguments(pr) result = replica.try_send_request(pr, with_rejection=False) except Exception: # Release the queue-length increment performed during choose_replica when dispatch fails. From 5d2154a6957713eca12ff0a45ac776002530be56 Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 21 Apr 2026 21:01:47 +0800 Subject: [PATCH 54/57] refactor: ReplicaSelection to replica_wrapper Signed-off-by: machichima --- .../ray/serve/_private/local_testing_mode.py | 2 +- .../serve/_private/request_router/__init__.py | 1 + .../serve/_private/request_router/common.py | 76 +------------------ .../request_router/replica_wrapper.py | 72 ++++++++++++++++++ python/ray/serve/_private/router.py | 6 +- python/ray/serve/handle.py | 2 +- python/ray/serve/tests/test_handle.py | 2 +- 7 files changed, 82 insertions(+), 79 deletions(-) diff --git a/python/ray/serve/_private/local_testing_mode.py b/python/ray/serve/_private/local_testing_mode.py index 1ce8bf7b304f..ddce4e1b9eb3 100644 --- a/python/ray/serve/_private/local_testing_mode.py +++ b/python/ray/serve/_private/local_testing_mode.py @@ -17,7 +17,7 @@ ) from ray.serve._private.replica import UserCallableWrapper from ray.serve._private.replica_result import ReplicaResult -from ray.serve._private.request_router.common import ReplicaSelection +from ray.serve._private.request_router.replica_wrapper import ReplicaSelection from ray.serve._private.router import Router from ray.serve._private.utils import GENERATOR_COMPOSITION_NOT_SUPPORTED_ERROR from ray.serve.deployment import Deployment diff --git a/python/ray/serve/_private/request_router/__init__.py b/python/ray/serve/_private/request_router/__init__.py index b4b761681077..cbbccfe9dc4b 100644 --- a/python/ray/serve/_private/request_router/__init__.py +++ b/python/ray/serve/_private/request_router/__init__.py @@ -3,6 +3,7 @@ PowerOfTwoChoicesRequestRouter, ) from ray.serve._private.request_router.replica_wrapper import ( # noqa: F401 + ReplicaSelection, RunningReplica, ) from ray.serve._private.request_router.request_router import ( # noqa: F401 diff --git a/python/ray/serve/_private/request_router/common.py b/python/ray/serve/_private/request_router/common.py index 40f577d3332b..b373f47f1528 100644 --- a/python/ray/serve/_private/request_router/common.py +++ b/python/ray/serve/_private/request_router/common.py @@ -2,18 +2,15 @@ import logging import time from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set +from typing import Any, Callable, Dict, List, Optional, Set -from ray.serve._private.common import DeploymentID, ReplicaID, RequestMetadata +from ray.serve._private.common import ReplicaID, RequestMetadata from ray.serve._private.constants import ( RAY_SERVE_QUEUE_LENGTH_CACHE_TIMEOUT_S, SERVE_LOGGER_NAME, ) from ray.util.annotations import PublicAPI -if TYPE_CHECKING: - from ray.serve._private.request_router.replica_wrapper import RunningReplica - logger = logging.getLogger(SERVE_LOGGER_NAME) @@ -66,75 +63,6 @@ class ReplicaQueueLengthCacheEntry: timestamp: float -@dataclass -class ReplicaSelection: - """Represents a selected replica, holding information for dispatch or coordination. - - This class is returned by the choose_replica() context manager. - The slot reservation lifecycle is managed by the context manager. - """ - - # Public, user-accessible fields - replica_id: str - """Unique identifier for the selected replica.""" - - node_ip: str - """IP address of the node running this replica.""" - - port: Optional[int] - """Port number for direct communication (if configured).""" - - node_id: str - """Ray node ID where the replica is running.""" - - availability_zone: Optional[str] - """Cloud availability zone of the replica's node.""" - - # Internal fields (not part of public API) - _replica: "RunningReplica" - _deployment_id: Optional[DeploymentID] - _request_metadata: RequestMetadata - _method_name: str - _slot_token: str # Token for reserved slot - _dispatched: bool = field( - default=False, init=False - ) # Tracks if dispatch was called - - @property - def address(self) -> str: - """Returns the replica address in host:port format.""" - if self.port: - return f"{self.node_ip}:{self.port}" - return self.node_ip - - def to_dict(self) -> Dict[str, Any]: - """Serialize public fields to a dictionary.""" - return { - "replica_id": self.replica_id, - "node_ip": self.node_ip, - "port": self.port, - "node_id": self.node_id, - "availability_zone": self.availability_zone, - } - - def _mark_dispatched(self) -> None: - """Internal: Mark this selection as dispatched (slot consumed). - - Raises: - RuntimeError: If the selection has already been dispatched. - """ - if self._dispatched: - raise RuntimeError( - f"ReplicaSelection for {self.replica_id} has already been dispatched. " - "Each selection can only be dispatched once." - ) - self._dispatched = True - - def _release_slot(self) -> None: - """Internal: Release the reserved slot.""" - self._replica.release_slot(self._slot_token) - - class ReplicaQueueLengthCache: def __init__( self, diff --git a/python/ray/serve/_private/request_router/replica_wrapper.py b/python/ray/serve/_private/request_router/replica_wrapper.py index 98a201e19beb..3f3aa8ea751f 100644 --- a/python/ray/serve/_private/request_router/replica_wrapper.py +++ b/python/ray/serve/_private/request_router/replica_wrapper.py @@ -3,6 +3,7 @@ import pickle import uuid from abc import ABC, abstractmethod +from dataclasses import dataclass, field from typing import Any, Dict, Optional, Set import grpc @@ -10,7 +11,9 @@ import ray from ray.actor import ActorHandle from ray.serve._private.common import ( + DeploymentID, ReplicaID, + RequestMetadata, RunningReplicaInfo, ) from ray.serve._private.constants import ( @@ -347,3 +350,72 @@ def release_slot(self, slot_token: str) -> None: f"Attempted to release unknown slot token {slot_token} " f"on replica {self.replica_id}" ) + + +@dataclass +class ReplicaSelection: + """Represents a selected replica, holding information for dispatch or coordination. + + This class is returned by the choose_replica() context manager. + The slot reservation lifecycle is managed by the context manager. + """ + + # Public, user-accessible fields + replica_id: str + """Unique identifier for the selected replica.""" + + node_ip: str + """IP address of the node running this replica.""" + + port: Optional[int] + """Port number for direct communication (if configured).""" + + node_id: str + """Ray node ID where the replica is running.""" + + availability_zone: Optional[str] + """Cloud availability zone of the replica's node.""" + + # Internal fields (not part of public API) + _replica: RunningReplica + _deployment_id: Optional[DeploymentID] + _request_metadata: RequestMetadata + _method_name: str + _slot_token: str # Token for reserved slot + _dispatched: bool = field( + default=False, init=False + ) # Tracks if dispatch was called + + @property + def address(self) -> str: + """Returns the replica address in host:port format.""" + if self.port: + return f"{self.node_ip}:{self.port}" + return self.node_ip + + def to_dict(self) -> Dict[str, Any]: + """Serialize public fields to a dictionary.""" + return { + "replica_id": self.replica_id, + "node_ip": self.node_ip, + "port": self.port, + "node_id": self.node_id, + "availability_zone": self.availability_zone, + } + + def _mark_dispatched(self) -> None: + """Internal: Mark this selection as dispatched (slot consumed). + + Raises: + RuntimeError: If the selection has already been dispatched. + """ + if self._dispatched: + raise RuntimeError( + f"ReplicaSelection for {self.replica_id} has already been dispatched. " + "Each selection can only be dispatched once." + ) + self._dispatched = True + + def _release_slot(self) -> None: + """Internal: Release the reserved slot.""" + self._replica.release_slot(self._slot_token) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index b5f295d4af5c..3968429660e1 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -55,11 +55,13 @@ ) from ray.serve._private.replica_result import ReplicaResult from ray.serve._private.request_router import PendingRequest, RequestRouter -from ray.serve._private.request_router.common import ReplicaSelection from ray.serve._private.request_router.pow_2_router import ( PowerOfTwoChoicesRequestRouter, ) -from ray.serve._private.request_router.replica_wrapper import RunningReplica +from ray.serve._private.request_router.replica_wrapper import ( + ReplicaSelection, + RunningReplica, +) from ray.serve._private.tracing_utils import ( create_propagated_context, is_span_recording, diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index fb23cf4a8e27..a92658e25a8d 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -41,7 +41,7 @@ InitHandleOptionsBase, ) from ray.serve._private.replica_result import ReplicaResult -from ray.serve._private.request_router.common import ReplicaSelection +from ray.serve._private.request_router.replica_wrapper import ReplicaSelection from ray.serve._private.router import Router from ray.serve._private.usage import ServeUsageTag from ray.serve._private.utils import ( diff --git a/python/ray/serve/tests/test_handle.py b/python/ray/serve/tests/test_handle.py index 9acd423ee657..2c4fbb5e735e 100644 --- a/python/ray/serve/tests/test_handle.py +++ b/python/ray/serve/tests/test_handle.py @@ -14,7 +14,7 @@ ReplicaResult, gRPCReplicaResult, ) -from ray.serve._private.request_router.common import ReplicaSelection +from ray.serve._private.request_router.replica_wrapper import ReplicaSelection from ray.serve.handle import DeploymentHandle from ray.serve.tests.conftest import * # noqa from ray.serve.tests.conftest import _shared_serve_instance # noqa From b7e376c7866bd86542fe98d2197880213c8263be Mon Sep 17 00:00:00 2001 From: machichima Date: Tue, 21 Apr 2026 21:38:43 +0800 Subject: [PATCH 55/57] test: remove redundant test Signed-off-by: machichima --- python/ray/serve/tests/unit/test_router.py | 31 ---------------------- 1 file changed, 31 deletions(-) diff --git a/python/ray/serve/tests/unit/test_router.py b/python/ray/serve/tests/unit/test_router.py index f6e5f78d52fd..d974d076b211 100644 --- a/python/ray/serve/tests/unit/test_router.py +++ b/python/ray/serve/tests/unit/test_router.py @@ -1407,37 +1407,6 @@ async def test_dispatch_uses_reserved_slot( assert replica._requests_sent[0]["request_id"] == "test-request-1" assert replica._requests_sent[0]["with_rejection"] is False - async def test_dispatch_not_rejected_after_choose_replica( - self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] - ): - """Dispatch won't be rejected even though max_ongoing_requests reaches its threshold""" - router, fake_request_router = setup_router - - r1_id = ReplicaID( - unique_id="test-replica-1", deployment_id=DeploymentID(name="test") - ) - replica = FakeReplica( - r1_id, - queue_len_info=ReplicaQueueLengthInfo( - # `accepted=False` simulates rejection due to max_ongoing_requests limit. - accepted=False, - num_ongoing_requests=999, - ), - ) - fake_request_router.set_replica_to_return(replica) - - request_metadata = RequestMetadata( - request_id="test-request-1", - internal_request_id="test-internal-request-1", - ) - - async with router.choose_replica(request_metadata) as selection: - result = await router.dispatch(selection, request_metadata) - assert result._replica_id == r1_id - - assert len(replica._requests_sent) == 1 - assert replica._requests_sent[0]["with_rejection"] is False - async def test_multiple_dispatch_calls_fail( self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] ): From d65ad18669985e25e406898c26003d65987a7cb6 Mon Sep 17 00:00:00 2001 From: machichima Date: Sun, 26 Apr 2026 17:48:46 +0800 Subject: [PATCH 56/57] fix: calling on_replica_result_finished only when on_send_request is called Signed-off-by: machichima --- python/ray/serve/_private/router.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 7122dbd4efc2..83aa6d0c1bd3 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -1005,6 +1005,7 @@ async def _route_and_send_request_once( result: Optional[ReplicaResult] = None replica: Optional[RunningReplica] = None callback_registered = False + queue_len_incremented = False try: # Resolve request arguments BEFORE incrementing queued requests. # This ensures that queue metrics reflect actual pending work, @@ -1032,6 +1033,7 @@ async def _route_and_send_request_once( result = replica.try_send_request(pr, with_rejection=with_rejection) # Proactively update the queue length cache. self.request_router.on_send_request(replica.replica_id) + queue_len_incremented = True # Keep track of requests that have been sent out to replicas if RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE: @@ -1126,7 +1128,11 @@ async def _route_and_send_request_once( self.request_router.on_request_completed( replica.replica_id, pr.metadata.internal_request_id ) - self.request_router.on_replica_result_finished(replica.replica_id) + # Only decrement if on_send_request was called (i.e., try_send_request + # succeeded and incremented the cache). If try_send_request raised error, + # on_send_request was never called so there is no +1 to undo. + if queue_len_incremented: + self.request_router.on_replica_result_finished(replica.replica_id) return None From c9509df7bbaabb0fda5e321f4babffc38ff0cb77 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sat, 9 May 2026 00:27:09 +0000 Subject: [PATCH 57/57] Reserve replica capacity Signed-off-by: Jeffrey Wang --- python/ray/serve/_private/common.py | 3 + python/ray/serve/_private/replica.py | 50 +++- .../request_router/replica_wrapper.py | 69 +++-- python/ray/serve/_private/router.py | 133 +++++++--- python/ray/serve/tests/unit/test_router.py | 240 +++++++++++++++++- 5 files changed, 433 insertions(+), 62 deletions(-) diff --git a/python/ray/serve/_private/common.py b/python/ray/serve/_private/common.py index c095a3ee44e5..29cd797791a3 100644 --- a/python/ray/serve/_private/common.py +++ b/python/ray/serve/_private/common.py @@ -820,6 +820,9 @@ class RequestMetadata: request_serialization: str = "cloudpickle" response_serialization: str = "cloudpickle" + # Token for a replica-side slot reserved by choose_replica(). + _reserved_slot_token: Optional[str] = None + @property def is_http_request(self) -> bool: return self._request_protocol == RequestProtocol.HTTP diff --git a/python/ray/serve/_private/replica.py b/python/ray/serve/_private/replica.py index c71b2694ae11..dea36d8f319d 100644 --- a/python/ray/serve/_private/replica.py +++ b/python/ray/serve/_private/replica.py @@ -1182,13 +1182,36 @@ def __init__( self._direct_ingress_grpc_server_task: Optional[asyncio.Task] = None self._num_queued_requests = 0 + self._reserved_slots: Set[str] = set() @property def max_ongoing_requests(self) -> int: return self._deployment_config.max_ongoing_requests def get_num_ongoing_requests(self) -> int: - return self._metrics_manager.get_num_ongoing_requests() + return self._metrics_manager.get_num_ongoing_requests() + len( + self._reserved_slots + ) + + async def reserve_slot( + self, request_metadata: RequestMetadata, slot_token: str + ) -> Tuple[bool, int]: + """Reserve replica capacity for a future dispatch call.""" + if not self._can_accept_request(request_metadata): + return False, self.get_num_ongoing_requests() + + await self._semaphore.acquire() + self._reserved_slots.add(slot_token) + return True, self.get_num_ongoing_requests() + + def release_slot(self, slot_token: str) -> Tuple[bool, int]: + """Release replica capacity reserved by choose_replica().""" + if slot_token not in self._reserved_slots: + return False, self.get_num_ongoing_requests() + + self._reserved_slots.remove(slot_token) + self._semaphore.release() + return True, self.get_num_ongoing_requests() def get_metadata(self) -> ReplicaMetadata: current_rank = ray.serve.context._get_internal_replica_context().rank @@ -1865,12 +1888,25 @@ def _on_request_failed(self, request_metadata: RequestMetadata, e: Exception): @asynccontextmanager async def _start_request(self, request_metadata: RequestMetadata): - async with self._semaphore: + reserved_slot_token = request_metadata._reserved_slot_token + if reserved_slot_token: + if reserved_slot_token not in self._reserved_slots: + raise RuntimeError( + "Request tried to consume an unknown reserved slot " + f"{reserved_slot_token}." + ) + self._reserved_slots.remove(reserved_slot_token) + else: + await self._semaphore.acquire() + + try: try: self._metrics_manager.inc_num_ongoing_requests(request_metadata) yield finally: self._metrics_manager.dec_num_ongoing_requests(request_metadata) + finally: + self._semaphore.release() async def _drain_ongoing_requests(self): """Wait for any ongoing requests to finish. @@ -2759,6 +2795,16 @@ def get_num_ongoing_requests(self) -> int: """ return self._replica_impl.get_num_ongoing_requests() + async def reserve_slot( + self, request_metadata: RequestMetadata, slot_token: str + ) -> Tuple[bool, int]: + """Reserve capacity for a future choose_replica/dispatch request.""" + return await self._replica_impl.reserve_slot(request_metadata, slot_token) + + def release_slot(self, slot_token: str) -> Tuple[bool, int]: + """Release capacity reserved by choose_replica().""" + return self._replica_impl.release_slot(slot_token) + async def is_allocated(self) -> str: """poke the replica to check whether it's alive. diff --git a/python/ray/serve/_private/request_router/replica_wrapper.py b/python/ray/serve/_private/request_router/replica_wrapper.py index 85a100425578..fc529e66bf22 100644 --- a/python/ray/serve/_private/request_router/replica_wrapper.py +++ b/python/ray/serve/_private/request_router/replica_wrapper.py @@ -1,5 +1,4 @@ import asyncio -import logging import pickle import uuid from abc import ABC, abstractmethod @@ -13,13 +12,11 @@ from ray.serve._private.common import ( DeploymentID, ReplicaID, + ReplicaQueueLengthInfo, RequestMetadata, RunningReplicaInfo, ) -from ray.serve._private.constants import ( - RAY_SERVE_REPLICA_GRPC_MAX_MESSAGE_LENGTH, - SERVE_LOGGER_NAME, -) +from ray.serve._private.constants import RAY_SERVE_REPLICA_GRPC_MAX_MESSAGE_LENGTH from ray.serve._private.replica_result import ( ActorReplicaResult, ReplicaResult, @@ -39,8 +36,6 @@ _is_tracing_enabled, ) -logger = logging.getLogger(SERVE_LOGGER_NAME) - class ReplicaWrapper(ABC): """This is used to abstract away details of the transport layer @@ -205,7 +200,8 @@ def __init__(self, replica_info: RunningReplicaInfo): self._actor_replica_wrapper = ActorReplicaWrapper(self._actor_handle) self._grpc_replica_wrapper = None - # Active slot reservation tokens for the choose_replica/dispatch pattern. + # Active local slot reservation tokens for Java replicas. Python replicas + # reserve capacity on the actor-side semaphore. self._reserved_slots: Set[str] = set() def update_replica_info(self, replica_info: RunningReplicaInfo) -> None: @@ -332,34 +328,58 @@ def try_send_request( return wrapper.send_request_python(pr, with_rejection=with_rejection) - def reserve_slot(self) -> str: + async def reserve_slot( + self, request_metadata: RequestMetadata + ) -> Tuple[str, ReplicaQueueLengthInfo]: """Reserve a slot on this replica for an upcoming request. Returns a unique token that can be used to release the slot later. This is used in the choose_replica/dispatch pattern to track reservations that haven't been dispatched yet. - - Note: This only tracks the reservation locally. The actual queue - length management is handled by the replica actor itself. """ + if self._replica_info.is_cross_language: + slot_token = str(uuid.uuid4()) + self._reserved_slots.add(slot_token) + return slot_token, ReplicaQueueLengthInfo( + accepted=True, + num_ongoing_requests=len(self._reserved_slots), + ) + slot_token = str(uuid.uuid4()) - self._reserved_slots.add(slot_token) - return slot_token + obj_ref = self._actor_handle.reserve_slot.remote(request_metadata, slot_token) + try: + accepted, num_ongoing_requests = await obj_ref + except asyncio.CancelledError: + ray.cancel(obj_ref) + self._actor_handle.release_slot.remote(slot_token) + raise + + return slot_token, ReplicaQueueLengthInfo( + accepted=accepted, + num_ongoing_requests=num_ongoing_requests, + ) - def release_slot(self, slot_token: str) -> None: + async def release_slot(self, slot_token: str) -> ReplicaQueueLengthInfo: """Release a previously reserved slot. This should be called if a request is not dispatched after reserving a slot (e.g., due to an error or cancellation). """ - if slot_token in self._reserved_slots: + if self._replica_info.is_cross_language: self._reserved_slots.discard(slot_token) - else: - logger.warning( - f"Attempted to release unknown slot token {slot_token} " - f"on replica {self.replica_id}" + return ReplicaQueueLengthInfo( + accepted=True, + num_ongoing_requests=len(self._reserved_slots), ) + _, num_ongoing_requests = await self._actor_handle.release_slot.remote( + slot_token + ) + return ReplicaQueueLengthInfo( + accepted=True, + num_ongoing_requests=num_ongoing_requests, + ) + @dataclass class ReplicaSelection: @@ -425,6 +445,11 @@ def _mark_dispatched(self) -> None: ) self._dispatched = True - def _release_slot(self) -> None: + async def _release_slot( + self, *, force: bool = False + ) -> Optional[ReplicaQueueLengthInfo]: """Internal: Release the reserved slot.""" - self._replica.release_slot(self._slot_token) + if self._dispatched and not force: + return None + + return await self._replica.release_slot(self._slot_token) diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 26eae779ec2b..d4463534214c 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -1294,19 +1294,46 @@ async def choose_replica( if not pr.resolved: await self._resolve_request_arguments(pr) - # Increment queued requests while choosing replica and reserving slot - num_curr_replicas = len(self.request_router.curr_replicas) - with self._metrics_manager.wrap_queued_request( - is_retry=False, num_curr_replicas=num_curr_replicas - ): - replica = await self.request_router._choose_replica_for_request(pr) + is_retry = False + while True: + num_curr_replicas = len(self.request_router.curr_replicas) + with self._metrics_manager.wrap_queued_request( + is_retry=is_retry, num_curr_replicas=num_curr_replicas + ): + replica = await self.request_router._choose_replica_for_request( + pr, is_retry=is_retry + ) - # Reserve a slot on the replica - slot_token = replica.reserve_slot() + # Reserve capacity on the replica actor. This must happen on + # the replica, not just in the router cache, so dispatch can + # send without the rejection protocol. + try: + slot_token, queue_info = await replica.reserve_slot( + request_meta + ) + except ActorDiedError as e: + self._handle_actor_died_error( + replica.replica_id, replica.actor_id, e + ) + is_retry = True + continue + except ActorUnavailableError: + self.request_router.on_replica_actor_unavailable( + replica.replica_id + ) + logger.warning( + f"{replica.replica_id} is temporarily unavailable." + ) + is_retry = True + continue - # Update queue length cache to reflect the reservation - # This ensures concurrent choose_replica calls see the correct load - self.request_router.on_send_request(replica.replica_id) + self.request_router.on_new_queue_len_info( + replica.replica_id, queue_info + ) + if queue_info.accepted: + break + + is_retry = True # Increment reserved slots metric (after queue metric is decremented) self._metrics_manager.inc_reserved_slots() @@ -1327,15 +1354,11 @@ async def choose_replica( try: yield selection finally: - # Always release the slot token (for tracking cleanup) - selection._release_slot() - - # Decrement cache only if this selection was never dispatched. - # Once dispatch is called, the request remains in-flight and should be - # accounted for until request completion is reflected via queue length - # callbacks/probes. - if not selection._dispatched: - self.request_router.on_replica_result_finished(replica.replica_id) + queue_info = await selection._release_slot() + if queue_info is not None: + self.request_router.on_new_queue_len_info( + replica.replica_id, queue_info + ) # Decrement reserved slots metric self._metrics_manager.dec_reserved_slots() @@ -1362,31 +1385,43 @@ async def dispatch( RuntimeError: If the selection has already been dispatched. ReplicaUnavailableError: If the replica is no longer available. """ - # Verify replica is still available - replica = selection._replica - if replica.replica_id not in self.request_router.curr_replicas: - raise ReplicaUnavailableError( - f"Replica {selection.replica_id} is no longer available" - ) - - # Mark as dispatched (this will raise if already dispatched) selection._mark_dispatched() + return await self._dispatch_to_marked_selection( + selection, request_meta, *request_args, **request_kwargs + ) + async def _dispatch_to_marked_selection( + self, + selection: ReplicaSelection, + request_meta: RequestMetadata, + *request_args, + **request_kwargs, + ) -> ReplicaResult: + """Dispatch a selection that has already been consumed by dispatch().""" + # Verify replica is still available + replica = selection._replica pr = PendingRequest( args=list(request_args), kwargs=request_kwargs, - metadata=request_meta, + metadata=replace(request_meta, _reserved_slot_token=selection._slot_token), ) # Send the request without rejection since we already reserved a slot # The slot reservation guarantees that the replica will accept this request try: + if replica.replica_id not in self.request_router.curr_replicas: + raise ReplicaUnavailableError( + f"Replica {selection.replica_id} is no longer available" + ) if not pr.resolved: await self._resolve_request_arguments(pr) result = replica.try_send_request(pr, with_rejection=False) - except Exception: - # Release the queue-length increment performed during choose_replica when dispatch fails. - self.request_router.on_replica_result_finished(replica.replica_id) + except BaseException: + queue_info = await selection._release_slot(force=True) + if queue_info is not None: + self.request_router.on_new_queue_len_info( + replica.replica_id, queue_info + ) raise # Keep track of requests that have been sent out to replicas @@ -1412,9 +1447,25 @@ async def dispatch( replica.replica_id, ) ) + result.add_done_callback( + lambda _: self._event_loop.call_soon_threadsafe( + lambda: self._event_loop.create_task( + self._release_slot_if_still_reserved(selection) + ) + ) + ) return result + async def _release_slot_if_still_reserved( + self, selection: ReplicaSelection + ) -> None: + """Best-effort cleanup if a dispatched request was cancelled before starting.""" + try: + await selection._release_slot(force=True) + except Exception: + logger.debug("Failed to release reserved replica slot.", exc_info=True) + async def broadcast( self, request_meta: RequestMetadata, @@ -1668,8 +1719,15 @@ def dispatch( **request_kwargs, ) -> concurrent.futures.Future[ReplicaResult]: """Dispatch request to a previously selected replica.""" + try: + selection._mark_dispatched() + except Exception as exc: + future = concurrent.futures.Future() + future.set_exception(exc) + return future + return self._wrap_asyncio_call_in_future( - self._asyncio_router.dispatch( + self._asyncio_router._dispatch_to_marked_selection( selection, request_meta, *request_args, **request_kwargs ) ) @@ -1890,8 +1948,15 @@ def dispatch( Returns an asyncio.Future wrapping the async dispatch call. """ + try: + selection._mark_dispatched() + except Exception as exc: + future = self._asyncio_loop.create_future() + future.set_exception(exc) + return future + return self._asyncio_loop.create_task( - self._asyncio_router.dispatch( + self._asyncio_router._dispatch_to_marked_selection( selection, request_meta, *request_args, **request_kwargs ) ) diff --git a/python/ray/serve/tests/unit/test_router.py b/python/ray/serve/tests/unit/test_router.py index a37c3c6fa468..4c4040f2a0cd 100644 --- a/python/ray/serve/tests/unit/test_router.py +++ b/python/ray/serve/tests/unit/test_router.py @@ -4,6 +4,7 @@ import sys import threading from collections import defaultdict +from dataclasses import replace from typing import Callable, Dict, List, Optional, Set, Tuple from unittest.mock import Mock, patch @@ -26,6 +27,7 @@ RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE, RAY_SERVE_METRICS_EXPORT_INTERVAL_MS, ) +from ray.serve._private.replica import Replica as ServeReplica from ray.serve._private.replica_result import ReplicaResult from ray.serve._private.request_router import ( PendingRequest, @@ -36,11 +38,16 @@ from ray.serve._private.router import ( QUEUED_REQUESTS_KEY, AsyncioRouter, + CurrentLoopRouter, RouterMetricsManager, SingletonThreadRouter, ) from ray.serve._private.test_utils import FakeCounter, FakeGauge, MockTimer -from ray.serve._private.utils import decompress_metric_report, get_random_string +from ray.serve._private.utils import ( + Semaphore, + decompress_metric_report, + get_random_string, +) from ray.serve.config import AutoscalingConfig, RequestRouterConfig from ray.serve.exceptions import ( BackPressureError, @@ -120,6 +127,8 @@ def __init__( self._reserved_slots: Set[str] = set() self._slot_counter = 0 self._requests_sent = [] # Track all requests sent to this replica + self._reject_reservation = False + self._reservation_queue_len = 1 # Create a minimal _replica_info object to satisfy router.py requirements self._replica_info = Mock() @@ -153,16 +162,31 @@ def is_cross_language(self) -> bool: def get_queue_len(self, *, deadline_s: float) -> int: raise NotImplementedError - def reserve_slot(self) -> str: + async def reserve_slot( + self, request_metadata: RequestMetadata + ) -> Tuple[str, ReplicaQueueLengthInfo]: """Reserve a slot and return a token.""" + if self._reject_reservation: + return "", ReplicaQueueLengthInfo( + accepted=False, + num_ongoing_requests=self._reservation_queue_len, + ) + self._slot_counter += 1 token = f"slot-token-{self._slot_counter}" self._reserved_slots.add(token) - return token + return token, ReplicaQueueLengthInfo( + accepted=True, + num_ongoing_requests=len(self._reserved_slots), + ) - def release_slot(self, slot_token: str) -> None: + async def release_slot(self, slot_token: str) -> ReplicaQueueLengthInfo: """Release a reserved slot.""" self._reserved_slots.discard(slot_token) + return ReplicaQueueLengthInfo( + accepted=True, + num_ongoing_requests=len(self._reserved_slots), + ) def send_request_with_slot( self, pr: PendingRequest, slot_token: str @@ -185,6 +209,9 @@ def try_send_request( raise self._error # Track the request + if pr.metadata._reserved_slot_token: + self._reserved_slots.discard(pr.metadata._reserved_slot_token) + self._requests_sent.append( { "request_id": pr.metadata.request_id, @@ -372,6 +399,52 @@ def dummy_request_metadata(is_streaming: bool = False) -> RequestMetadata: ) +class FakeReplicaMetricsManager: + def __init__(self): + self.num_ongoing_requests = 0 + + def get_num_ongoing_requests(self) -> int: + return self.num_ongoing_requests + + def inc_num_ongoing_requests(self, request_metadata: RequestMetadata): + self.num_ongoing_requests += 1 + + def dec_num_ongoing_requests(self, request_metadata: RequestMetadata): + self.num_ongoing_requests -= 1 + + +@pytest.mark.asyncio +class TestReplicaSlotReservation: + async def test_reserved_slot_counts_against_replica_capacity(self): + replica = ServeReplica.__new__(ServeReplica) + replica._deployment_config = Mock(max_ongoing_requests=1) + replica._metrics_manager = FakeReplicaMetricsManager() + replica._reserved_slots = set() + replica._semaphore = Semaphore(lambda: replica.max_ongoing_requests) + + request_metadata = dummy_request_metadata() + + slot_token = "slot-token-1" + accepted, queue_len = await replica.reserve_slot(request_metadata, slot_token) + assert accepted + assert slot_token in replica._reserved_slots + assert queue_len == 1 + assert replica.get_num_ongoing_requests() == 1 + + accepted, queue_len = await replica.reserve_slot( + request_metadata, "slot-token-2" + ) + assert not accepted + assert queue_len == 1 + + dispatch_metadata = replace(request_metadata, _reserved_slot_token=slot_token) + async with replica._start_request(dispatch_metadata): + assert slot_token not in replica._reserved_slots + assert replica.get_num_ongoing_requests() == 1 + + assert replica.get_num_ongoing_requests() == 0 + + @pytest.mark.asyncio class TestBroadcast: async def test_unavailable_fails_without_waiting_for_router_init(self): @@ -1385,6 +1458,84 @@ async def test_cache_updated_on_choose_replica( # It is decremented when request completion is observed. assert fake_request_router.replica_queue_len_cache.get(r1_id) == 1 + @pytest.mark.parametrize( + "setup_router", + [{"enable_queue_len_cache": True}], + indirect=True, + ) + async def test_current_loop_dispatch_marks_selection_before_task_runs( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """Wrapper dispatch should consume the selection before its task runs.""" + router, fake_request_router = setup_router + + r1_id = ReplicaID( + unique_id="test-replica-1", deployment_id=DeploymentID(name="test") + ) + replica = FakeReplica(r1_id) + fake_request_router.set_replica_to_return(replica) + + current_loop_router = CurrentLoopRouter.__new__(CurrentLoopRouter) + current_loop_router._asyncio_loop = asyncio.get_running_loop() + current_loop_router._asyncio_router = router + + request_metadata = RequestMetadata( + request_id="test-request-1", + internal_request_id="test-internal-request-1", + ) + + async with router.choose_replica(request_metadata) as selection: + dispatch_task = current_loop_router.dispatch(selection, request_metadata) + assert selection._dispatched + assert fake_request_router.replica_queue_len_cache.get(r1_id) == 1 + + assert fake_request_router.replica_queue_len_cache.get(r1_id) == 1 + assert len(replica._requests_sent) == 0 + + replica_result = await dispatch_task + assert fake_request_router.replica_queue_len_cache.get(r1_id) == 1 + + replica_result.fire_done_callbacks() + await asyncio.sleep(0) + assert fake_request_router.replica_queue_len_cache.get(r1_id) == 0 + + @pytest.mark.parametrize( + "setup_router", + [{"enable_queue_len_cache": True}], + indirect=True, + ) + async def test_current_loop_dispatch_failure_releases_cache( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """If wrapper dispatch fails after consuming a selection, it owns cleanup.""" + router, fake_request_router = setup_router + + r1_id = ReplicaID( + unique_id="test-replica-1", deployment_id=DeploymentID(name="test") + ) + replica = FakeReplica(r1_id) + fake_request_router.set_replica_to_return(replica) + + current_loop_router = CurrentLoopRouter.__new__(CurrentLoopRouter) + current_loop_router._asyncio_loop = asyncio.get_running_loop() + current_loop_router._asyncio_router = router + + request_metadata = RequestMetadata( + request_id="test-request-1", + internal_request_id="test-internal-request-1", + ) + + async with router.choose_replica(request_metadata) as selection: + fake_request_router._replica_to_return = None + dispatch_task = current_loop_router.dispatch(selection, request_metadata) + assert selection._dispatched + + assert fake_request_router.replica_queue_len_cache.get(r1_id) == 1 + + with pytest.raises(ReplicaUnavailableError): + await dispatch_task + assert fake_request_router.replica_queue_len_cache.get(r1_id) == 0 + @pytest.mark.parametrize( "setup_router", [{"enable_queue_len_cache": True}], @@ -1416,6 +1567,41 @@ async def test_cache_decremented_on_choose_without_dispatch( # After context exit without dispatch, cache should be decremented to 0 assert fake_request_router.replica_queue_len_cache.get(r1_id) == 0 + @pytest.mark.parametrize( + "setup_router", + [{"enable_queue_len_cache": True}], + indirect=True, + ) + async def test_choose_replica_retries_when_reservation_rejected( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """choose_replica should only yield after replica-side capacity is reserved.""" + router, fake_request_router = setup_router + + r1_id = ReplicaID( + unique_id="test-replica-1", deployment_id=DeploymentID(name="test") + ) + r2_id = ReplicaID( + unique_id="test-replica-2", deployment_id=DeploymentID(name="test") + ) + r1 = FakeReplica(r1_id) + r2 = FakeReplica(r2_id) + r1._reject_reservation = True + fake_request_router.set_replica_to_return(r1) + fake_request_router.set_replica_to_return_on_retry(r2) + + request_metadata = RequestMetadata( + request_id="test-request-1", + internal_request_id="test-internal-request-1", + ) + + async with router.choose_replica(request_metadata) as selection: + assert selection._replica == r2 + assert fake_request_router.replica_queue_len_cache.get(r1_id) == 1 + assert fake_request_router.replica_queue_len_cache.get(r2_id) == 1 + + assert fake_request_router.replica_queue_len_cache.get(r2_id) == 0 + @pytest.mark.parametrize( "setup_router", [{"enable_queue_len_cache": True}], @@ -1951,6 +2137,52 @@ def test_request_assignment( assert isinstance(future, concurrent.futures.Future) assert future.result()._replica_id == r1_id + @pytest.mark.asyncio + @pytest.mark.parametrize( + "setup_router", + [{"enable_queue_len_cache": True}], + indirect=True, + ) + async def test_dispatch_marks_selection_before_scheduled_coroutine_runs( + self, + setup_router: Tuple[AsyncioRouter, FakeRequestRouter], + setup_singleton_thread_router: SingletonThreadRouter, + monkeypatch, + ): + _, fake_request_router = setup_router + thread_router = setup_singleton_thread_router + + r1_id = ReplicaID( + unique_id="test-replica-1", deployment_id=DeploymentID(name="test") + ) + replica = FakeReplica(r1_id) + fake_request_router.set_replica_to_return(replica) + + pending_coros = [] + + def delay_asyncio_call(coro): + pending_coros.append(coro) + return concurrent.futures.Future() + + monkeypatch.setattr( + thread_router, "_wrap_asyncio_call_in_future", delay_asyncio_call + ) + + request_metadata = RequestMetadata( + request_id="test-request-1", + internal_request_id="test-internal-request-1", + ) + + async with thread_router.choose_replica(request_metadata) as selection: + dispatch_future = thread_router.dispatch(selection, request_metadata) + assert selection._dispatched + + assert fake_request_router.replica_queue_len_cache.get(r1_id) == 1 + assert len(pending_coros) == 1 + + dispatch_future.cancel() + pending_coros[0].close() + @pytest.mark.asyncio async def test_cancellation_propagation( self,