ray-project
diff --git a/‎python/ray/llm/_internal/serve/core/configs/accelerators.py‎
Lines changed: 59 additions & 20 deletions b/‎python/ray/llm/_internal/serve/core/configs/accelerators.py‎
Lines changed: 59 additions & 20 deletions
diff --git a/‎python/ray/llm/_internal/serve/core/server/llm_server.py‎
Lines changed: 31 additions & 19 deletions b/‎python/ray/llm/_internal/serve/core/server/llm_server.py‎
Lines changed: 31 additions & 19 deletions
diff --git a/‎python/ray/llm/tests/serve/cpu/configs/test_models.py‎
Lines changed: 9 additions & 15 deletions b/‎python/ray/llm/tests/serve/cpu/configs/test_models.py‎
Lines changed: 9 additions & 15 deletions
diff --git a/‎python/ray/llm/tests/serve/cpu/deployments/conftest.py‎
Lines changed: 15 additions & 0 deletions b/‎python/ray/llm/tests/serve/cpu/deployments/conftest.py‎
Lines changed: 15 additions & 0 deletions
@@ -8,7 +8,11 @@
 import ray.util.accelerators.accelerators as accelerators
 from ray.llm._internal.serve.observability.logging import get_logger
 from ray.util.placement_group import PlacementGroup, placement_group
-from ray.util.tpu import get_tpu_version_from_type, slice_placement_group
+from ray.util.tpu import (
+    get_chips_per_host,
+    get_tpu_version_from_type,
+    slice_placement_group,
+)
 
 logger = get_logger(__name__)
 
@@ -91,14 +95,38 @@ def create_placement_group(
     ) -> PlacementGroup:
         pass
 
-    @property
-    def requires_deferred_placement_group(self) -> bool:
-        """
-        If True, Ray Serve will not provision a placement group for the deployment.
-        Instead, creation is deferred to the replica at runtime.
-        Defaults to False.
-        """
-        return False
+    def get_placement_group_labels(
+        self, accelerator_type_str: Optional[str] = None
+    ) -> Optional[Dict[str, str]]:
+        """Returns labels to be applied to the placement group bundles.
+
+        This default implementation ignores `accelerator_type_str` and
+        returns None, which `apply_placement_group_bundle_labels` treats
+        as "no labels to add".
+        """
+        return None
+
+    def apply_placement_group_bundle_labels(
+        self,
+        deployment_options: Dict[str, Any],
+        accelerator_type_str: Optional[str],
+        num_bundles: int,
+    ) -> None:
+        """Safely applies hardware-specific labels to the deployment options.
+
+        Merges the labels returned by `get_placement_group_labels` into
+        `deployment_options["placement_group_bundle_label_selector"]`,
+        updating `deployment_options` in place. Does nothing when there are
+        no labels to apply.
+
+        Args:
+            deployment_options: Deployment options dict to update in place.
+            accelerator_type_str: Accelerator type forwarded to
+                `get_placement_group_labels`.
+            num_bundles: Number of selector entries to produce, one per
+                bundle index.
+        """
+        accel_labels = self.get_placement_group_labels(accelerator_type_str)
+        if not accel_labels:
+            return
+
+        # A missing key and an explicit None/empty value are both treated as
+        # "no existing selectors".
+        existing_selectors = (
+            deployment_options.get("placement_group_bundle_label_selector") or []
+        )
+        merged_selectors = []
+
+        for i in range(num_bundles):
+            # Copy the existing selector so the original dict is not mutated;
+            # indices without one (or with an empty one) start from {}.
+            labels = (
+                existing_selectors[i].copy()
+                if i < len(existing_selectors) and existing_selectors[i]
+                else {}
+            )
+            # On key conflicts the accelerator labels overwrite existing ones.
+            labels.update(accel_labels)
+            merged_selectors.append(labels)
+
+        # The result has exactly num_bundles entries; existing selectors at
+        # index >= num_bundles are not carried over.
+        deployment_options["placement_group_bundle_label_selector"] = merged_selectors
 
     @property
     @abstractmethod
@@ -180,10 +208,21 @@ def __init__(self, config: TPUConfig):
     def default_bundles(
         self, *, num_devices: int, accelerator_type_str: Optional[str] = None
     ):
+        """Builds the list of resource bundles for `num_devices` TPU devices.
+
+        When both a topology is configured and `accelerator_type_str` is
+        given, each bundle requests `chips_per_host` TPUs (the value returned
+        by `get_chips_per_host`) and there are
+        `max(1, num_devices // chips_per_host)` bundles. Otherwise there is
+        one single-TPU bundle per device.
+
+        When `accelerator_type_str` is given, every bundle additionally
+        requests 0.001 of the resource named by
+        `format_ray_accelerator_resource(accelerator_type_str)`.
+
+        Returns:
+            A list of independent copies of the bundle dict.
+        """
-        bundle = {"TPU": 1}
+        if self._config.topology and accelerator_type_str:
+            version = get_tpu_version_from_type(accelerator_type_str)
+            chips_per_host = get_chips_per_host(self._config.topology, version)
+
+            # Floor division: any remainder of num_devices is not represented
+            # in the bundles; max(1, ...) guarantees at least one bundle.
+            num_bundles = max(1, num_devices // chips_per_host)
+            bundle = {"TPU": chips_per_host}
+        else:
+            # Fallback to single-chip/single-host scheduling
+            num_bundles = num_devices
+            bundle = {"TPU": 1}
+
         if accelerator_type_str:
             bundle[format_ray_accelerator_resource(accelerator_type_str)] = 0.001
-        return [bundle.copy() for _ in range(num_devices)]
+
+        return [bundle.copy() for _ in range(num_bundles)]
 
     def create_placement_group(
         self,
@@ -239,15 +278,15 @@ def create_placement_group(
         )
         return self._slice_pg_wrapper.placement_group
 
-    @property
-    def requires_deferred_placement_group(self) -> bool:
-        """
-        If a TPU topology is specified, we defer PG creation so the replica can
-        provision a `SlicePlacementGroup` at runtime. This ensures multi-host
-        TPU slices are gang-scheduled atomically according to their physical
-        topology rather than fragmented across the cluster.
-        """
-        return bool(self._config.topology)
+    def get_placement_group_labels(
+        self, accelerator_type_str: Optional[str] = None
+    ) -> Optional[Dict[str, str]]:
+        """Returns TPU topology and accelerator-type labels for the bundles.
+
+        Labels are produced only when a topology is configured and an
+        accelerator type is supplied; in every other case returns None.
+        """
+        if self._config.topology and accelerator_type_str:
+            return {
+                "ray.io/tpu-topology": self._config.topology,
+                "ray.io/accelerator-type": accelerator_type_str,
+            }
+        return None
 
     @property
     def requires_remote_initialization(self) -> bool:
 
@@ -706,27 +706,40 @@ def get_deployment_options(cls, llm_config: "LLMConfig"):
         # deployment_options
         ray_actor_options = deployment_options.get("ray_actor_options", {})
 
-        if not engine_config.accelerator.requires_deferred_placement_group:
-            replica_actor_resources = {
-                "CPU": ray_actor_options.get("num_cpus", 1),
-                "GPU": ray_actor_options.get("num_gpus", 0),
-                **ray_actor_options.get("resources", {}),
-            }
-            if "memory" in ray_actor_options:
-                replica_actor_resources["memory"] = ray_actor_options["memory"]
+        replica_actor_resources = {
+            "CPU": ray_actor_options.get("num_cpus", 1),
+            "GPU": ray_actor_options.get("num_gpus", 0),
+            **ray_actor_options.get("resources", {}),
+        }
+        if "memory" in ray_actor_options:
+            replica_actor_resources["memory"] = ray_actor_options["memory"]
 
-            # TODO: Move this _merge_replica_actor_and_child_actor_bundles to a
-            # more generic place.
-            pg_bundles = _merge_replica_actor_and_child_actor_bundles(
-                engine_config.placement_bundles, replica_actor_resources
-            )
+        # TODO: Move this _merge_replica_actor_and_child_actor_bundles to a
+        # more generic place.
+        pg_bundles = _merge_replica_actor_and_child_actor_bundles(
+            engine_config.placement_bundles, replica_actor_resources
+        )
+
+        deployment_options.update(
+            {
+                "placement_group_bundles": pg_bundles,
+                "placement_group_strategy": engine_config.placement_strategy,
+            }
+        )
 
-            deployment_options.update(
-                {
-                    "placement_group_bundles": pg_bundles,
-                    "placement_group_strategy": engine_config.placement_strategy,
-                }
+        # Append hardware-specific `bundle_label_selectors` to the deployment options if needed
+        accelerator_type_str = (
+            getattr(
+                llm_config.accelerator_type, "value", str(llm_config.accelerator_type)
             )
+            if llm_config.accelerator_type
+            else None
+        )
+        engine_config.accelerator.apply_placement_group_bundle_labels(
+            deployment_options=deployment_options,
+            accelerator_type_str=accelerator_type_str,
+            num_bundles=len(pg_bundles),
+        )
 
         # Handle env vars from runtime_env
         default_runtime_env = ray.get_runtime_context().runtime_env
@@ -735,7 +748,6 @@ def get_deployment_options(cls, llm_config: "LLMConfig"):
                 "worker_process_setup_hook"
             ] = "ray.llm._internal.serve._worker_process_setup_hook"
 
-        ray_actor_options = deployment_options.get("ray_actor_options", {})
         ray_actor_options["runtime_env"] = {
             **default_runtime_env,
             # Existing runtime_env should take precedence over the default.
 
@@ -6,8 +6,6 @@
 
 from ray.llm._internal.common.utils.download_utils import NodeModelDownloadable
 from ray.llm._internal.serve.core.configs.accelerators import (
-    CPUAccelerator,
-    GPUAccelerator,
     TPUAccelerator,
     TPUConfig,
 )
@@ -403,19 +401,16 @@ def test_engine_config_infers_tpu_from_accelerator_type_string(self):
         assert isinstance(engine_config.accelerator, TPUAccelerator)
         assert engine_config.accelerator_type == "TPU-V6E"
 
-    def test_requires_deferred_placement_group(self):
-        """Test that requires_deferred_placement_group correctly identifies deferred PG requirements."""
-        cpu_accel = CPUAccelerator()
-        assert cpu_accel.requires_deferred_placement_group is False
+    def test_tpu_accelerator_get_placement_group_labels(self):
+        """Test that TPUAccelerator correctly generates topology labels for Serve."""
+        # No topology configured: no labels, even with an accelerator type.
+        tpu_accel_no_topology = TPUAccelerator(TPUConfig(kind="tpu"))
+        assert tpu_accel_no_topology.get_placement_group_labels("TPU-V6E") is None
 
-        gpu_accel = GPUAccelerator()
-        assert gpu_accel.requires_deferred_placement_group is False
-
-        tpu_accel_no_topo = TPUAccelerator(TPUConfig(kind="tpu"))
-        assert tpu_accel_no_topo.requires_deferred_placement_group is False
-
-        tpu_accel_with_topo = TPUAccelerator(TPUConfig(kind="tpu", topology="4x4"))
-        assert tpu_accel_with_topo.requires_deferred_placement_group is True
+        # Topology plus accelerator type: both labels are returned.
+        tpu_accel_with_topology = TPUAccelerator(TPUConfig(kind="tpu", topology="4x4"))
+        assert tpu_accel_with_topology.get_placement_group_labels("TPU-V6E") == {
+            "ray.io/tpu-topology": "4x4",
+            "ray.io/accelerator-type": "TPU-V6E",
+        }
 
     def test_tpu_accelerator_get_remote_options(self):
         """Test that TPUAccelerator get_remote_options returns an empty resources dict and label selector."""
@@ -427,7 +422,6 @@ def test_tpu_accelerator_get_remote_options(self):
         options_with_type = tpu_accel.get_remote_options("TPU-V6E")
         assert options_with_type == {
             "resources": {},
-            "accelerator_type": "TPU-V6E",
             "label_selector": {"ray.io/accelerator-type": "TPU-V6E"},
         }
 
 
@@ -1,6 +1,7 @@
 import pytest
 
 import ray
+from ray import serve
 from ray.tests.conftest import _ray_start_cluster
 
 
@@ -15,6 +16,19 @@ def llm_config_with_mock_engine(llm_config):
     yield llm_config
 
 
+@pytest.fixture(autouse=True)
+def isolated_serve_state(ray_tpu_cluster):
+    """
+    Automatically runs before and after every test to ensure Serve
+    starts on a random port and wipes its state, while reusing the
+    module scoped cluster.
+
+    Setup shuts down any running Serve instance and starts a new one with
+    HTTP port 0; teardown shuts Serve down again. Requesting
+    `ray_tpu_cluster` makes the cluster fixture run before Serve starts.
+    """
+    # Clear any Serve state left over from a previous test before starting.
+    serve.shutdown()
+    serve.start(http_options={"port": 0})
+    yield
+    # Teardown: runs after the test body finishes.
+    serve.shutdown()
+
+
 @pytest.fixture(scope="module")
 def ray_tpu_cluster():
     """
@@ -36,6 +50,7 @@ def ray_tpu_cluster():
                 "ray.io/tpu-slice-name": "test-slice",
                 "ray.io/tpu-worker-id": str(i),
                 "ray.io/tpu-pod-type": pod_type,
+                "ray.io/accelerator-type": "TPU-V6E",
             }
             resources = {"TPU": 4, "accelerator_type:TPU-V6E": 4}