Use Serve's TPUAcceleratorConfig in Ray LLM

ryanaoleary · ryanaoleary · commit 6f172f36df5a · 2026-05-12T11:46:08.000Z
Signed-off-by: Ryan O'Leary <ryanaoleary@google.com>

Update Ray LLM to use TPUAcceleratorConfig

Signed-off-by: Ryan O'Leary <ryanaoleary@google.com>
diff --git a/python/ray/llm/_internal/serve/core/configs/accelerators.py b/python/ray/llm/_internal/serve/core/configs/accelerators.py
@@ -7,6 +7,7 @@
 
 import ray.util.accelerators.accelerators as accelerators
 from ray.llm._internal.serve.observability.logging import get_logger
+from ray.serve.config import TPUAcceleratorConfig
 from ray.util.placement_group import PlacementGroup, placement_group
 from ray.util.tpu import get_tpu_version_from_type, slice_placement_group
 
@@ -111,6 +112,15 @@ def get_remote_options(self, accelerator_type_str: str = None) -> Dict[str, Any]
         """Returns the hardware-specific kwargs for ray.remote().options()."""
         pass
 
+    def get_deployment_options(
+        self,
+        *,
+        accelerator_type: Optional[str] = None,
+        placement_group_config: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        """Returns Serve deployment options specific to this accelerator."""
+        return {}
+
     def shutdown(self) -> None:
         """Release any resources owned by this backend. Idempotent."""
         return
@@ -261,6 +271,34 @@ def get_remote_options(self, accelerator_type_str: str = None):
             options["accelerator_type"] = accelerator_type_str
         return options
 
+    def get_deployment_options(
+        self,
+        *,
+        accelerator_type: Optional[str] = None,
+        placement_group_config: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, Any]:
+        if not self._config.topology:
+            return {}
+
+        if not accelerator_type:
+            raise ValueError(
+                "accelerator_type must be specified when "
+                "accelerator_config is a TPUConfig with topology."
+            )
+
+        version = get_tpu_version_from_type(accelerator_type)
+        resources_per_bundle = (placement_group_config or {}).get("bundle_per_worker")
+
+        return {
+            "accelerator_config": TPUAcceleratorConfig(
+                topology=self._config.topology,
+                accelerator_version=version,
+                num_slices=getattr(self._config, "num_slices", 1),
+                chips_per_vm=getattr(self._config, "chips_per_vm", None),
+                resources_per_bundle=resources_per_bundle,
+            )
+        }
+
     def shutdown(self):
         if self._slice_pg_wrapper is not None:
             try:
diff --git a/python/ray/llm/_internal/serve/core/server/llm_server.py b/python/ray/llm/_internal/serve/core/server/llm_server.py
@@ -740,4 +740,12 @@ def get_deployment_options(cls, llm_config: "LLMConfig"):
         }
         deployment_options["ray_actor_options"] = ray_actor_options
 
+        # Let the accelerator backend populate hardware-specific deployment options.
+        deployment_options.update(
+            engine_config.accelerator.get_deployment_options(
+                accelerator_type=llm_config.accelerator_type,
+                placement_group_config=llm_config.placement_group_config,
+            )
+        )
+
         return deployment_options
diff --git a/python/ray/llm/tests/serve/cpu/deployments/conftest.py b/python/ray/llm/tests/serve/cpu/deployments/conftest.py
@@ -20,6 +20,9 @@ def ray_tpu_cluster():
     """
     Simulates a Ray cluster with a multi-host TPU v6e-16 slice (4x4 topology).
     """
+    if ray.is_initialized():
+        ray.shutdown()
+
     pod_type = "v6e-16"
     topology = "4x4"
 
diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine_tpu.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine_tpu.py
@@ -1,4 +1,5 @@
 import sys
+import time
 
 import pytest
 
@@ -30,25 +31,28 @@ def test_tpu_slice_placement_group_creation_default_resources(ray_tpu_cluster):
     )
 
     engine_config = llm_config.get_engine_config()
-    pg = engine_config.get_or_create_pg()
-
-    assert isinstance(pg, PlacementGroup)
+    pg = None
+    try:
+        pg = engine_config.get_or_create_pg()
 
-    pg_table = placement_group_table(pg)
-    assert pg_table["strategy"] == "PACK"
+        assert isinstance(pg, PlacementGroup)
 
-    # 4x4 v6e = 16 chips. We default to 1 TPU chip per bundle.
-    assert len(pg_table["bundles"]) == 16
-    for bundle in pg_table["bundles"].values():
-        assert "TPU" in bundle
-        assert bundle["TPU"] == 1
+        pg_table = placement_group_table(pg)
+        assert pg_table["strategy"] == "PACK"
 
-    # Let the backend tear down its own resources if it has any
-    engine_config.accelerator.shutdown()
-    try:
-        ray.util.remove_placement_group(pg)
-    except Exception:
-        pass  # Already cleaned up by the wrapper
+        # 4x4 v6e = 16 chips. We default to 1 TPU chip per bundle.
+        assert len(pg_table["bundles"]) == 16
+        for bundle in pg_table["bundles"].values():
+            assert "TPU" in bundle
+            assert bundle["TPU"] == 1
+    finally:
+        # Let the backend tear down its own resources if it has any
+        engine_config.accelerator.shutdown()
+        if pg is not None:
+            try:
+                ray.util.remove_placement_group(pg)
+            except Exception:
+                pass
 
 
 def test_tpu_slice_placement_group_creation_host_resources(ray_tpu_cluster):
@@ -67,24 +71,27 @@ def test_tpu_slice_placement_group_creation_host_resources(ray_tpu_cluster):
     )
 
     engine_config = llm_config.get_engine_config()
-    pg = engine_config.get_or_create_pg()
-
-    assert isinstance(pg, PlacementGroup)
-
-    pg_table = placement_group_table(pg)
-    assert pg_table["strategy"] == "STRICT_SPREAD"
-    # We should provision 4 host-level bundles instead of the default 16 chip-level bundles.
-    assert len(pg_table["bundles"]) == 4
-    for bundle in pg_table["bundles"].values():
-        assert "TPU" in bundle
-        assert bundle["TPU"] == 4
-
-    # Let the backend tear down its own resources if it has any
-    engine_config.accelerator.shutdown()
+    pg = None
     try:
-        ray.util.remove_placement_group(pg)
-    except Exception:
-        pass  # Already cleaned up by the wrapper
+        pg = engine_config.get_or_create_pg()
+
+        assert isinstance(pg, PlacementGroup)
+
+        pg_table = placement_group_table(pg)
+        assert pg_table["strategy"] == "STRICT_SPREAD"
+        # We should provision 4 host-level bundles instead of the default 16 chip-level bundles.
+        assert len(pg_table["bundles"]) == 4
+        for bundle in pg_table["bundles"].values():
+            assert "TPU" in bundle
+            assert bundle["TPU"] == 4
+    finally:
+        # Let the backend tear down its own resources if it has any
+        engine_config.accelerator.shutdown()
+        if pg is not None:
+            try:
+                ray.util.remove_placement_group(pg)
+            except Exception:
+                pass
 
 
 def test_single_tpu_fallback(ray_tpu_cluster):
@@ -98,20 +105,23 @@ def test_single_tpu_fallback(ray_tpu_cluster):
     )
 
     engine_config = llm_config.get_engine_config()
-    pg = engine_config.get_or_create_pg()
-
-    pg_table = placement_group_table(pg)
+    pg = None
+    try:
+        pg = engine_config.get_or_create_pg()
 
-    # Verify it falls back to the default PACK strategy for 1 GPU/TPU
-    assert len(pg_table["bundles"]) == 1
-    assert pg_table["strategy"] == "PACK"
+        pg_table = placement_group_table(pg)
 
-    # Let the backend tear down its own resources if it has any
-    engine_config.accelerator.shutdown()
-    try:
-        ray.util.remove_placement_group(pg)
-    except Exception:
-        pass  # Already cleaned up by the wrapper
+        # Verify it falls back to the default PACK strategy for 1 GPU/TPU
+        assert len(pg_table["bundles"]) == 1
+        assert pg_table["strategy"] == "PACK"
+    finally:
+        # Let the backend tear down its own resources if it has any
+        engine_config.accelerator.shutdown()
+        if pg is not None:
+            try:
+                ray.util.remove_placement_group(pg)
+            except Exception:
+                pass
 
 
 def test_tpu_slice_placement_group_creation_bundle_per_worker(ray_tpu_cluster):
@@ -221,47 +231,49 @@ def test_tpu_slice_placement_group_creation_cpu_driver_homogeneous_tpu_bundles_p
         pass
 
 
-def test_tpu_serve_deployment_default_chip_level_bundles(ray_tpu_cluster):
+def test_tpu_serve_deployment_default_host_level_bundles(ray_tpu_cluster):
     """
     Verifies that a Serve deployment created for a multi-host TPU slice defaults
-    to chip-level bundles when no placement_group_config is specified.
+    to host-level bundles when no placement_group_config is specified.
     """
     llm_config = LLMConfig(
         model_loading_config=ModelLoadingConfig(model_id="test-tpu-model"),
         accelerator_type="TPU-V6E",
         accelerator_config={"kind": "tpu", "topology": "4x4"},
     )
 
-    app = serve.deployment(LLMServer).bind(llm_config, engine_cls=PGCreationMockEngine)
-    serve.run(app)
-
-    pg_table = ray.util.placement_group_table()
-    active_pgs = list(
-        {k: v for k, v in pg_table.items() if v["state"] == "CREATED"}.values()
+    serve_options = LLMServer.get_deployment_options(llm_config)
+    app = serve.deployment(**serve_options)(LLMServer).bind(
+        llm_config, engine_cls=PGCreationMockEngine
     )
-
-    assert (
-        len(active_pgs) == 2
-    ), "Expected 2 PGs - one for TPU Head, one for worker bundles"
-
-    tpu_head_resource = "TPU-v6e-16-head"
-    head_pgs = [
-        pg
-        for pg in active_pgs
-        if len(pg["bundles"]) == 1
-        and tpu_head_resource in list(pg["bundles"].values())[0]
-    ]
-    assert len(head_pgs) == 1
-
-    worker_pg = [pg for pg in active_pgs if pg not in head_pgs][0]
-
-    assert worker_pg["strategy"] == "PACK"
-    # 4x4 topology = 16 chips. Default is 16 bundles of 1 TPU.
-    assert len(worker_pg["bundles"]) == 16
-    for bundle in worker_pg["bundles"].values():
-        assert bundle.get("TPU", 0) == 1
-
-    serve.shutdown()
+    try:
+        serve.run(app)
+
+        # Wait for the head PG to be removed (eventual consistency).
+        start_time = time.time()
+        timeout = 10
+        while time.time() - start_time < timeout:
+            pg_table = ray.util.placement_group_table()
+            active_pgs = list(
+                {k: v for k, v in pg_table.items() if v["state"] == "CREATED"}.values()
+            )
+            if len(active_pgs) == 1:
+                break
+            time.sleep(0.5)
+
+        assert (
+            len(active_pgs) == 1
+        ), f"Expected exactly 1 active PG (the worker PG), but found {len(active_pgs)}. Head PG may not have been removed."
+
+        worker_pg = active_pgs[0]
+
+        assert worker_pg["strategy"] == "PACK"
+        # 4x4 topology = 16 chips. Default is host-level bundles (4 bundles of 4 TPUs).
+        assert len(worker_pg["bundles"]) == 4
+        for bundle in worker_pg["bundles"].values():
+            assert bundle.get("TPU", 0) == 4
+    finally:
+        serve.shutdown()
 
 
 def test_tpu_serve_deployment_explicit_host_level_bundles(ray_tpu_cluster):
@@ -276,36 +288,38 @@ def test_tpu_serve_deployment_explicit_host_level_bundles(ray_tpu_cluster):
         placement_group_config={"bundle_per_worker": {"TPU": 4}},
     )
 
-    app = serve.deployment(LLMServer).bind(llm_config, engine_cls=PGCreationMockEngine)
-    serve.run(app)
-
-    pg_table = ray.util.placement_group_table()
-    active_pgs = list(
-        {k: v for k, v in pg_table.items() if v["state"] == "CREATED"}.values()
+    serve_options = LLMServer.get_deployment_options(llm_config)
+    app = serve.deployment(**serve_options)(LLMServer).bind(
+        llm_config, engine_cls=PGCreationMockEngine
     )
-
-    assert (
-        len(active_pgs) == 2
-    ), "Expected 2 PGs - one for TPU Head, one for worker bundles"
-
-    tpu_head_resource = "TPU-v6e-16-head"
-    head_pgs = [
-        pg
-        for pg in active_pgs
-        if len(pg["bundles"]) == 1
-        and tpu_head_resource in list(pg["bundles"].values())[0]
-    ]
-    assert len(head_pgs) == 1
-
-    worker_pg = [pg for pg in active_pgs if pg not in head_pgs][0]
-
-    assert worker_pg["strategy"] == "PACK"
-    # 4x4 topology = 16 chips. With 4 TPUs per bundle, expect exactly 4 bundles.
-    assert len(worker_pg["bundles"]) == 4
-    for bundle in worker_pg["bundles"].values():
-        assert bundle.get("TPU", 0) == 4
-
-    serve.shutdown()
+    try:
+        serve.run(app)
+
+        # Wait for the head PG to be removed (eventual consistency).
+        start_time = time.time()
+        timeout = 10
+        while time.time() - start_time < timeout:
+            pg_table = ray.util.placement_group_table()
+            active_pgs = list(
+                {k: v for k, v in pg_table.items() if v["state"] == "CREATED"}.values()
+            )
+            if len(active_pgs) == 1:
+                break
+            time.sleep(0.5)
+
+        assert (
+            len(active_pgs) == 1
+        ), f"Expected exactly 1 active PG (the worker PG), but found {len(active_pgs)}. Head PG may not have been removed."
+
+        worker_pg = active_pgs[0]
+
+        assert worker_pg["strategy"] == "PACK"
+        # 4x4 topology = 16 chips. With 4 TPUs per bundle, expect exactly 4 bundles.
+        assert len(worker_pg["bundles"]) == 4
+        for bundle in worker_pg["bundles"].values():
+            assert bundle.get("TPU", 0) == 4
+    finally:
+        serve.shutdown()
 
 
 if __name__ == "__main__":
diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_server.py
@@ -8,6 +8,7 @@
 import pytest
 
 from ray import serve
+from ray.llm._internal.serve.core.configs.accelerators import TPUConfig
 from ray.llm._internal.serve.core.configs.llm_config import (
     LLMConfig,
     LoraConfig,
@@ -692,6 +693,28 @@ def test_deferred_placement_group_for_tpu_topology(self):
         assert "placement_group_bundles" not in serve_options
         assert "placement_group_strategy" not in serve_options
 
+    def test_tpu_accelerator_config_translation(self):
+        """Test that TPUConfig is correctly translated to Serve TPUAcceleratorConfig."""
+
+        llm_config = LLMConfig(
+            model_loading_config=ModelLoadingConfig(model_id="test-tpu-model"),
+            accelerator_type="TPU-V6E",
+            accelerator_config=TPUConfig(kind="tpu", topology="4x4"),
+            placement_group_config={"bundle_per_worker": {"TPU": 1}},
+            llm_engine="vLLM",
+        )
+
+        serve_options = LLMServer.get_deployment_options(llm_config)
+
+        assert "placement_group_bundles" not in serve_options
+        assert "placement_group_strategy" not in serve_options
+
+        assert "accelerator_config" in serve_options
+        acc_config = serve_options["accelerator_config"]
+        assert acc_config.topology == "4x4"
+        assert acc_config.accelerator_version == "v6e"
+        assert acc_config.resources_per_bundle == {"TPU": 1}
+
 
 if __name__ == "__main__":
     sys.exit(pytest.main(["-v", __file__]))
diff --git a/python/ray/serve/tests/test_accelerator_config.py b/python/ray/serve/tests/test_accelerator_config.py