ray-project
diff --git a/python/ray/llm/_internal/serve/core/configs/accelerators.py b/python/ray/llm/_internal/serve/core/configs/accelerators.py
Lines changed: 13 additions & 58 deletions
diff --git a/python/ray/llm/_internal/serve/engines/vllm/vllm_models.py b/python/ray/llm/_internal/serve/engines/vllm/vllm_models.py
Lines changed: 0 additions & 14 deletions
diff --git a/python/ray/llm/tests/serve/cpu/configs/test_models.py b/python/ray/llm/tests/serve/cpu/configs/test_models.py
Lines changed: 0 additions & 46 deletions
diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine_tpu.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine_tpu.py
Lines changed: 41 additions & 98 deletions
@@ -6,14 +6,9 @@
 from typing_extensions import Annotated
 
 import ray.util.accelerators.accelerators as accelerators
-from ray._private.accelerators.tpu import get_chips_per_host
 from ray.llm._internal.serve.observability.logging import get_logger
 from ray.util.placement_group import PlacementGroup, placement_group
-from ray.util.tpu import (
-    get_num_chips_from_topology,
-    get_tpu_version_from_type,
-    slice_placement_group,
-)
+from ray.util.tpu import get_tpu_version_from_type, slice_placement_group
 
 logger = get_logger(__name__)
 
@@ -32,21 +27,6 @@ def format_ray_accelerator_resource(accelerator_type_str: str) -> str:
     return f"accelerator_type:{accelerator_type_str}"
 
 
-def get_inferred_tensor_parallel_size(topology: Optional[str]) -> Optional[int]:
-    """Infers the tensor parallel size from the TPU topology."""
-    if not topology:
-        return None
-
-    try:
-        return get_num_chips_from_topology(topology)
-    except ValueError as e:
-        logger.warning(
-            f"Failed to infer tensor_parallel_size from topology '{topology}': {e}. "
-            "Defaulting to None."
-        )
-        return None
-
-
 def infer_hardware_kind_from_bundles(
     placement_group_config: Optional[Dict[str, Any]]
 ) -> Optional[str]:
@@ -200,35 +180,10 @@ def __init__(self, config: TPUConfig):
     def default_bundles(
         self, *, num_devices: int, accelerator_type_str: Optional[str] = None
     ):
-        if not self._config.topology:
-            # Fallback to per-chip bundles if no topology is specified
-            bundle = {"TPU": 1}
-            if accelerator_type_str:
-                bundle[format_ray_accelerator_resource(accelerator_type_str)] = 0.001
-            return [bundle.copy() for _ in range(num_devices)]
-
-        # Topology is specified, compute per-host bundles
-        if not accelerator_type_str:
-            raise ValueError(
-                "`accelerator_type` must be specified when `topology` is present "
-                "in order to compute TPU resource requirements."
-            )
-        version = get_tpu_version_from_type(accelerator_type_str)
-        chips_per_host = get_chips_per_host(self._config.topology, version)
-
-        if num_devices > chips_per_host and num_devices % chips_per_host != 0:
-            raise ValueError(
-                f"num_devices ({num_devices}) must be a multiple of "
-                f"chips_per_host ({chips_per_host}) for TPU topologies."
-            )
-
-        num_hosts = max(1, num_devices // chips_per_host)
-
-        tpu_resources = min(num_devices, chips_per_host)
-        bundle = {"TPU": tpu_resources}
-        bundle[format_ray_accelerator_resource(accelerator_type_str)] = 0.001
-
-        return [bundle.copy() for _ in range(num_hosts)]
+        bundle = {"TPU": 1}
+        if accelerator_type_str:
+            bundle[format_ray_accelerator_resource(accelerator_type_str)] = 0.001
+        return [bundle.copy() for _ in range(num_devices)]
 
     def create_placement_group(
         self,
@@ -299,15 +254,11 @@ def requires_remote_initialization(self) -> bool:
         return True
 
     def get_remote_options(self, accelerator_type_str: str = None):
-        # The PlacementGroupSchedulingStrategy natively handles routing the task to
-        # the correct hardware. We omit TPU resource requests to avoid consuming
-        # chips that the model engine workers must use.
-        options: Dict[str, Any] = {"resources": {}}
+        # TPUs use custom resource strings rather than a native kwarg
+        options: Dict[str, Any] = {"resources": {"TPU": 0.001}}
+
         if accelerator_type_str:
-            # Pin the task to the TPU accelerator to avoid scheduling on a CPU bundle.
-            options["label_selector"] = {
-                "ray.io/accelerator-type": accelerator_type_str
-            }
+            options["accelerator_type"] = accelerator_type_str
         return options
 
     def shutdown(self):
@@ -319,3 +270,7 @@ def shutdown(self):
                 logger.warning(f"Failed to shut down TPU slice PG: {e}")
             finally:
                 self._slice_pg_wrapper = None
+
+    def __del__(self):
+        """Ensure placement groups are cleaned up when this backend is garbage collected."""
+        self.shutdown()
@@ -23,7 +23,6 @@
     TPUAccelerator,
     TPUConfig,
     format_ray_accelerator_resource,
-    get_inferred_tensor_parallel_size,
 )
 from ray.llm._internal.serve.core.configs.llm_config import (
     AcceleratorType,
@@ -194,19 +193,6 @@ def from_llm_config(cls, llm_config: LLMConfig) -> "VLLMEngineConfig":
             mirror_config = llm_config.model_loading_config.model_source
 
         all_engine_kwargs = llm_config.engine_kwargs.copy()
-
-        # If tensor_parallel_size is not specified, try to infer it from topology
-        if "tensor_parallel_size" not in all_engine_kwargs:
-            if isinstance(llm_config.accelerator_config, TPUConfig):
-                total_chips = get_inferred_tensor_parallel_size(
-                    llm_config.accelerator_config.topology
-                )
-                if total_chips is not None:
-                    all_engine_kwargs["tensor_parallel_size"] = total_chips
-                    logger.info(
-                        f"Inferred tensor_parallel_size={total_chips} from TPUConfig."
-                    )
-
         engine_kwargs = {}
         frontend_kwargs = {}
 
 
@@ -417,52 +417,6 @@ def test_requires_deferred_placement_group(self):
         tpu_accel_with_topo = TPUAccelerator(TPUConfig(kind="tpu", topology="4x4"))
         assert tpu_accel_with_topo.requires_deferred_placement_group is True
 
-    @pytest.mark.parametrize(
-        "topology,num_devices,accelerator_type_str,expected_bundles_count,expected_chips_per_host",
-        [
-            ("1x1", 1, "TPU-V6E", 1, 1),
-            ("1x1", 1, "TPU-V7X", 1, 1),
-            ("4x4", 16, "TPU-V6E", 4, 4),
-            ("2x2x2", 8, "TPU-V5P", 2, 4),
-            ("2x2", 4, "TPU-V5LITEPOD", 1, 4),
-            ("2x2x1", 4, "TPU-V4", 1, 4),
-            ("2x4", 8, "TPU-V6E", 1, 8),
-        ],
-    )
-    def test_default_bundles_topology(
-        self,
-        topology,
-        num_devices,
-        accelerator_type_str,
-        expected_bundles_count,
-        expected_chips_per_host,
-    ):
-        """Test that different topologies return correct per-host bundles."""
-        tpu_accel = TPUAccelerator(TPUConfig(kind="tpu", topology=topology))
-        bundles = tpu_accel.default_bundles(
-            num_devices=num_devices, accelerator_type_str=accelerator_type_str
-        )
-
-        assert len(bundles) == expected_bundles_count
-        for bundle in bundles:
-            assert bundle["TPU"] == expected_chips_per_host
-            assert f"accelerator_type:{accelerator_type_str}" in bundle
-
-    def test_default_bundles_topology_missing_accelerator_type_raises(self):
-        """Test that ValueError is raised when topology is present but accelerator type is missing."""
-        tpu_accel = TPUAccelerator(TPUConfig(kind="tpu", topology="4x4"))
-        with pytest.raises(
-            ValueError,
-            match="`accelerator_type` must be specified when `topology` is present",
-        ):
-            tpu_accel.default_bundles(num_devices=16, accelerator_type_str=None)
-
-    def test_default_bundles_topology_non_multiple_num_devices_raises(self):
-        """Test that ValueError is raised when num_devices is not a multiple of chips_per_host."""
-        tpu_accel = TPUAccelerator(TPUConfig(kind="tpu", topology="4x4"))
-        with pytest.raises(ValueError, match="must be a multiple of chips_per_host"):
-            tpu_accel.default_bundles(num_devices=6, accelerator_type_str="TPU-V6E")
-
 
 if __name__ == "__main__":
     sys.exit(pytest.main(["-v", __file__]))
@@ -26,33 +26,29 @@ def test_tpu_slice_placement_group_creation_default_resources(ray_tpu_cluster):
     llm_config = LLMConfig(
         model_loading_config=ModelLoadingConfig(model_id="test-tpu-model"),
         accelerator_type="TPU-V6E",
-        accelerator_config=TPUConfig(kind="tpu", topology="4x4"),
+        accelerator_config={"kind": "tpu", "topology": "4x4"},
     )
 
     engine_config = llm_config.get_engine_config()
+    pg = engine_config.get_or_create_pg()
 
-    pg = None
-    try:
-        pg = engine_config.get_or_create_pg()
+    assert isinstance(pg, PlacementGroup)
 
-        assert isinstance(pg, PlacementGroup)
+    pg_table = placement_group_table(pg)
+    assert pg_table["strategy"] == "PACK"
 
-        pg_table = placement_group_table(pg)
-        assert pg_table["strategy"] == "PACK"
+    # 4x4 v6e = 16 chips. We default to 1 TPU chip per bundle.
+    assert len(pg_table["bundles"]) == 16
+    for bundle in pg_table["bundles"].values():
+        assert "TPU" in bundle
+        assert bundle["TPU"] == 1
 
-        # 4x4 v6e = 16 chips. We default to 4 TPU chips per bundle (per-host).
-        assert len(pg_table["bundles"]) == 4
-        for bundle in pg_table["bundles"].values():
-            assert "TPU" in bundle
-            assert bundle["TPU"] == 4.0
-    finally:
-        # Let the backend tear down its own resources if it has any
-        engine_config.accelerator.shutdown()
-        if pg is not None:
-            try:
-                ray.util.remove_placement_group(pg)
-            except Exception:
-                pass
+    # Let the backend tear down its own resources if it has any
+    engine_config.accelerator.shutdown()
+    try:
+        ray.util.remove_placement_group(pg)
+    except Exception:
+        pass  # Already cleaned up by the wrapper
 
 
 def test_tpu_slice_placement_group_creation_host_resources(ray_tpu_cluster):
@@ -63,36 +59,32 @@ def test_tpu_slice_placement_group_creation_host_resources(ray_tpu_cluster):
     llm_config = LLMConfig(
         model_loading_config=ModelLoadingConfig(model_id="test-tpu-model"),
         accelerator_type="TPU-V6E",
-        accelerator_config=TPUConfig(kind="tpu", topology="4x4"),
+        accelerator_config={"kind": "tpu", "topology": "4x4"},
         placement_group_config={
             "strategy": "STRICT_SPREAD",
-            "bundles": [{"TPU": 4}] * 4,
+            "bundles": [{"TPU": 4}],
         },
     )
 
     engine_config = llm_config.get_engine_config()
+    pg = engine_config.get_or_create_pg()
+
+    assert isinstance(pg, PlacementGroup)
 
-    pg = None
+    pg_table = placement_group_table(pg)
+    assert pg_table["strategy"] == "STRICT_SPREAD"
+    # We should provision 4 host-level bundles instead of the default 16 chip-level bundles.
+    assert len(pg_table["bundles"]) == 4
+    for bundle in pg_table["bundles"].values():
+        assert "TPU" in bundle
+        assert bundle["TPU"] == 4
+
+    # Let the backend tear down its own resources if it has any
+    engine_config.accelerator.shutdown()
     try:
-        pg = engine_config.get_or_create_pg()
-
-        assert isinstance(pg, PlacementGroup)
-
-        pg_table = placement_group_table(pg)
-        assert pg_table["strategy"] == "STRICT_SPREAD"
-        # We should provision 4 host-level bundles instead of the default 16 chip-level bundles.
-        assert len(pg_table["bundles"]) == 4
-        for bundle in pg_table["bundles"].values():
-            assert "TPU" in bundle
-            assert bundle["TPU"] == 4
-    finally:
-        # Let the backend tear down its own resources if it has any
-        engine_config.accelerator.shutdown()
-        if pg is not None:
-            try:
-                ray.util.remove_placement_group(pg)
-            except Exception:
-                pass
+        ray.util.remove_placement_group(pg)
+    except Exception:
+        pass  # Already cleaned up by the wrapper
 
 
 def test_single_tpu_fallback(ray_tpu_cluster):
@@ -229,17 +221,15 @@ def test_tpu_slice_placement_group_creation_cpu_driver_homogeneous_tpu_bundles_p
         pass
 
 
-def test_tpu_serve_deployment_default_host_level_bundles(ray_tpu_cluster):
+def test_tpu_serve_deployment_default_chip_level_bundles(ray_tpu_cluster):
     """
     Verifies that a Serve deployment created for a multi-host TPU slice defaults
-    to host-level bundles when no placement_group_config is specified.
+    to chip-level bundles when no placement_group_config is specified.
     """
-    from ray.llm._internal.serve.core.configs.accelerators import TPUConfig
-
     llm_config = LLMConfig(
         model_loading_config=ModelLoadingConfig(model_id="test-tpu-model"),
         accelerator_type="TPU-V6E",
-        accelerator_config=TPUConfig(kind="tpu", topology="4x4"),
+        accelerator_config={"kind": "tpu", "topology": "4x4"},
     )
 
     app = serve.deployment(LLMServer).bind(llm_config, engine_cls=PGCreationMockEngine)
@@ -266,10 +256,10 @@ def test_tpu_serve_deployment_default_host_level_bundles(ray_tpu_cluster):
     worker_pg = [pg for pg in active_pgs if pg not in head_pgs][0]
 
     assert worker_pg["strategy"] == "PACK"
-    # 4x4 topology = 16 chips. Default is 4 bundles of 4 TPUs (per-host).
-    assert len(worker_pg["bundles"]) == 4
+    # 4x4 topology = 16 chips. Default is 16 bundles of 1 TPU.
+    assert len(worker_pg["bundles"]) == 16
     for bundle in worker_pg["bundles"].values():
-        assert bundle.get("TPU", 0) == 4.0
+        assert bundle.get("TPU", 0) == 1
 
     serve.shutdown()
 
@@ -282,7 +272,7 @@ def test_tpu_serve_deployment_explicit_host_level_bundles(ray_tpu_cluster):
     llm_config = LLMConfig(
         model_loading_config=ModelLoadingConfig(model_id="test-tpu-model"),
         accelerator_type="TPU-V6E",
-        accelerator_config=TPUConfig(kind="tpu", topology="4x4"),
+        accelerator_config={"kind": "tpu", "topology": "4x4"},
         placement_group_config={"bundle_per_worker": {"TPU": 4}},
     )
 
@@ -318,52 +308,5 @@ def test_tpu_serve_deployment_explicit_host_level_bundles(ray_tpu_cluster):
     serve.shutdown()
 
 
-def test_tpu_serve_deployment_explicit_per_chip_bundles(ray_tpu_cluster):
-    """
-    Verifies that a user can explicitly request chip-level bundles (1 TPU per bundle)
-    for a full multi-host TPU slice via placement_group_config.
-    """
-    from ray.llm._internal.serve.core.configs.accelerators import TPUConfig
-
-    llm_config = LLMConfig(
-        model_loading_config=ModelLoadingConfig(model_id="test-tpu-model"),
-        accelerator_type="TPU-V6E",
-        accelerator_config=TPUConfig(kind="tpu", topology="4x4"),
-        placement_group_config={"bundle_per_worker": {"TPU": 1}},
-        engine_kwargs={"tensor_parallel_size": 16},
-    )
-
-    app = serve.deployment(LLMServer).bind(llm_config, engine_cls=PGCreationMockEngine)
-    serve.run(app)
-
-    pg_table = ray.util.placement_group_table()
-    active_pgs = list(
-        {k: v for k, v in pg_table.items() if v["state"] == "CREATED"}.values()
-    )
-
-    assert (
-        len(active_pgs) == 2
-    ), "Expected 2 PGs - one for TPU Head, one for worker bundles"
-
-    tpu_head_resource = "TPU-v6e-16-head"
-    head_pgs = [
-        pg
-        for pg in active_pgs
-        if len(pg["bundles"]) == 1
-        and tpu_head_resource in list(pg["bundles"].values())[0]
-    ]
-    assert len(head_pgs) == 1
-
-    worker_pg = [pg for pg in active_pgs if pg not in head_pgs][0]
-
-    assert worker_pg["strategy"] == "PACK"
-    # 4x4 topology = 16 chips. Explicitly requested 16 bundles of 1 TPU.
-    assert len(worker_pg["bundles"]) == 16
-    for bundle in worker_pg["bundles"].values():
-        assert bundle.get("TPU", 0) == 1.0
-
-    serve.shutdown()
-
-
 if __name__ == "__main__":
     sys.exit(pytest.main(["-v", __file__]))