Update Ray LLM to use TPUAcceleratorConfig

ryanaoleary · ryanaoleary · commit 812ef6315bc9 · 2026-05-11T20:59:49.000Z
Signed-off-by: Ryan O'Leary <ryanaoleary@google.com>
diff --git a/python/ray/llm/_internal/serve/core/server/llm_server.py b/python/ray/llm/_internal/serve/core/server/llm_server.py
@@ -22,6 +22,7 @@
     MODEL_RESPONSE_BATCH_TIMEOUT_MS,
     RAYLLM_VLLM_ENGINE_CLS_ENV,
 )
+from ray.llm._internal.serve.core.configs.accelerators import TPUConfig
 from ray.llm._internal.serve.core.configs.llm_config import (
     DiskMultiplexConfig,
     LLMConfig,
@@ -39,6 +40,8 @@
 from ray.llm._internal.serve.utils.server_utils import (
     get_serve_request_id,
 )
+from ray.serve.config import TPUAcceleratorConfig
+from ray.util.tpu import get_tpu_version_from_type
 
 if TYPE_CHECKING:
     from ray.llm._internal.serve.core.configs.openai_api_models import (
@@ -740,4 +743,22 @@ def get_deployment_options(cls, llm_config: "LLMConfig"):
         }
         deployment_options["ray_actor_options"] = ray_actor_options
 
+        if (
+            llm_config.accelerator_config is not None
+            and isinstance(llm_config.accelerator_config, TPUConfig)
+            and llm_config.accelerator_config.topology
+        ):
+            if not llm_config.accelerator_type:
+                raise ValueError(
+                    "llm_config.accelerator_type must be specified when "
+                    "accelerator_config is a TPUConfig."
+                )
+            version = get_tpu_version_from_type(llm_config.accelerator_type)
+
+            deployment_options["accelerator_config"] = TPUAcceleratorConfig(
+                topology=llm_config.accelerator_config.topology,
+                accelerator_version=version,
+                num_slices=1,
+            )
+
         return deployment_options
diff --git a/python/ray/llm/tests/serve/cpu/deployments/conftest.py b/python/ray/llm/tests/serve/cpu/deployments/conftest.py
@@ -20,6 +20,9 @@ def ray_tpu_cluster():
     """
     Simulates a Ray cluster with a multi-host TPU v6e-16 slice (4x4 topology).
     """
+    if ray.is_initialized():
+        ray.shutdown()
+
     pod_type = "v6e-16"
     topology = "4x4"
 
diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine_tpu.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine_tpu.py
@@ -30,25 +30,28 @@ def test_tpu_slice_placement_group_creation_default_resources(ray_tpu_cluster):
     )
 
     engine_config = llm_config.get_engine_config()
-    pg = engine_config.get_or_create_pg()
-
-    assert isinstance(pg, PlacementGroup)
+    pg = None
+    try:
+        pg = engine_config.get_or_create_pg()
 
-    pg_table = placement_group_table(pg)
-    assert pg_table["strategy"] == "PACK"
+        assert isinstance(pg, PlacementGroup)
 
-    # 4x4 v6e = 16 chips. We default to 1 TPU chip per bundle.
-    assert len(pg_table["bundles"]) == 16
-    for bundle in pg_table["bundles"].values():
-        assert "TPU" in bundle
-        assert bundle["TPU"] == 1
+        pg_table = placement_group_table(pg)
+        assert pg_table["strategy"] == "PACK"
 
-    # Let the backend tear down its own resources if it has any
-    engine_config.accelerator.shutdown()
-    try:
-        ray.util.remove_placement_group(pg)
-    except Exception:
-        pass  # Already cleaned up by the wrapper
+        # 4x4 v6e = 16 chips. We default to 1 TPU chip per bundle.
+        assert len(pg_table["bundles"]) == 16
+        for bundle in pg_table["bundles"].values():
+            assert "TPU" in bundle
+            assert bundle["TPU"] == 1
+    finally:
+        # Let the backend tear down its own resources if it has any
+        engine_config.accelerator.shutdown()
+        if pg is not None:
+            try:
+                ray.util.remove_placement_group(pg)
+            except Exception:
+                pass
 
 
 def test_tpu_slice_placement_group_creation_host_resources(ray_tpu_cluster):
@@ -67,24 +70,27 @@ def test_tpu_slice_placement_group_creation_host_resources(ray_tpu_cluster):
     )
 
     engine_config = llm_config.get_engine_config()
-    pg = engine_config.get_or_create_pg()
-
-    assert isinstance(pg, PlacementGroup)
-
-    pg_table = placement_group_table(pg)
-    assert pg_table["strategy"] == "STRICT_SPREAD"
-    # We should provision 4 host-level bundles instead of the default 16 chip-level bundles.
-    assert len(pg_table["bundles"]) == 4
-    for bundle in pg_table["bundles"].values():
-        assert "TPU" in bundle
-        assert bundle["TPU"] == 4
-
-    # Let the backend tear down its own resources if it has any
-    engine_config.accelerator.shutdown()
+    pg = None
     try:
-        ray.util.remove_placement_group(pg)
-    except Exception:
-        pass  # Already cleaned up by the wrapper
+        pg = engine_config.get_or_create_pg()
+
+        assert isinstance(pg, PlacementGroup)
+
+        pg_table = placement_group_table(pg)
+        assert pg_table["strategy"] == "STRICT_SPREAD"
+        # We should provision 4 host-level bundles instead of the default 16 chip-level bundles.
+        assert len(pg_table["bundles"]) == 4
+        for bundle in pg_table["bundles"].values():
+            assert "TPU" in bundle
+            assert bundle["TPU"] == 4
+    finally:
+        # Let the backend tear down its own resources if it has any
+        engine_config.accelerator.shutdown()
+        if pg is not None:
+            try:
+                ray.util.remove_placement_group(pg)
+            except Exception:
+                pass
 
 
 def test_single_tpu_fallback(ray_tpu_cluster):
@@ -98,20 +104,23 @@ def test_single_tpu_fallback(ray_tpu_cluster):
     )
 
     engine_config = llm_config.get_engine_config()
-    pg = engine_config.get_or_create_pg()
+    pg = None
+    try:
+        pg = engine_config.get_or_create_pg()
 
-    pg_table = placement_group_table(pg)
+        pg_table = placement_group_table(pg)
 
-    # Verify it falls back to the default PACK strategy for 1 GPU/TPU
-    assert len(pg_table["bundles"]) == 1
-    assert pg_table["strategy"] == "PACK"
-
-    # Let the backend tear down its own resources if it has any
-    engine_config.accelerator.shutdown()
-    try:
-        ray.util.remove_placement_group(pg)
-    except Exception:
-        pass  # Already cleaned up by the wrapper
+        # Verify it falls back to the default PACK strategy for 1 GPU/TPU
+        assert len(pg_table["bundles"]) == 1
+        assert pg_table["strategy"] == "PACK"
+    finally:
+        # Let the backend tear down its own resources if it has any
+        engine_config.accelerator.shutdown()
+        if pg is not None:
+            try:
+                ray.util.remove_placement_group(pg)
+            except Exception:
+                pass
 
 
 def test_tpu_slice_placement_group_creation_bundle_per_worker(ray_tpu_cluster):
@@ -233,35 +242,36 @@ def test_tpu_serve_deployment_default_chip_level_bundles(ray_tpu_cluster):
     )
 
     app = serve.deployment(LLMServer).bind(llm_config, engine_cls=PGCreationMockEngine)
-    serve.run(app)
-
-    pg_table = ray.util.placement_group_table()
-    active_pgs = list(
-        {k: v for k, v in pg_table.items() if v["state"] == "CREATED"}.values()
-    )
+    try:
+        serve.run(app)
 
-    assert (
-        len(active_pgs) == 2
-    ), "Expected 2 PGs - one for TPU Head, one for worker bundles"
+        pg_table = ray.util.placement_group_table()
+        active_pgs = list(
+            {k: v for k, v in pg_table.items() if v["state"] == "CREATED"}.values()
+        )
 
-    tpu_head_resource = "TPU-v6e-16-head"
-    head_pgs = [
-        pg
-        for pg in active_pgs
-        if len(pg["bundles"]) == 1
-        and tpu_head_resource in list(pg["bundles"].values())[0]
-    ]
-    assert len(head_pgs) == 1
+        assert (
+            len(active_pgs) == 2
+        ), "Expected 2 PGs - one for TPU Head, one for worker bundles"
 
-    worker_pg = [pg for pg in active_pgs if pg not in head_pgs][0]
+        tpu_head_resource = "TPU-v6e-16-head"
+        head_pgs = [
+            pg
+            for pg in active_pgs
+            if len(pg["bundles"]) == 1
+            and tpu_head_resource in list(pg["bundles"].values())[0]
+        ]
+        assert len(head_pgs) == 1
 
-    assert worker_pg["strategy"] == "PACK"
-    # 4x4 topology = 16 chips. Default is 16 bundles of 1 TPU.
-    assert len(worker_pg["bundles"]) == 16
-    for bundle in worker_pg["bundles"].values():
-        assert bundle.get("TPU", 0) == 1
+        worker_pg = [pg for pg in active_pgs if pg not in head_pgs][0]
 
-    serve.shutdown()
+        assert worker_pg["strategy"] == "PACK"
+        # 4x4 topology = 16 chips. Default is 16 bundles of 1 TPU.
+        assert len(worker_pg["bundles"]) == 16
+        for bundle in worker_pg["bundles"].values():
+            assert bundle.get("TPU", 0) == 1
+    finally:
+        serve.shutdown()
 
 
 def test_tpu_serve_deployment_explicit_host_level_bundles(ray_tpu_cluster):
@@ -277,35 +287,36 @@ def test_tpu_serve_deployment_explicit_host_level_bundles(ray_tpu_cluster):
     )
 
     app = serve.deployment(LLMServer).bind(llm_config, engine_cls=PGCreationMockEngine)
-    serve.run(app)
-
-    pg_table = ray.util.placement_group_table()
-    active_pgs = list(
-        {k: v for k, v in pg_table.items() if v["state"] == "CREATED"}.values()
-    )
-
-    assert (
-        len(active_pgs) == 2
-    ), "Expected 2 PGs - one for TPU Head, one for worker bundles"
-
-    tpu_head_resource = "TPU-v6e-16-head"
-    head_pgs = [
-        pg
-        for pg in active_pgs
-        if len(pg["bundles"]) == 1
-        and tpu_head_resource in list(pg["bundles"].values())[0]
-    ]
-    assert len(head_pgs) == 1
-
-    worker_pg = [pg for pg in active_pgs if pg not in head_pgs][0]
+    try:
+        serve.run(app)
 
-    assert worker_pg["strategy"] == "PACK"
-    # 4x4 topology = 16 chips. With 4 TPUs per bundle, expect exactly 4 bundles.
-    assert len(worker_pg["bundles"]) == 4
-    for bundle in worker_pg["bundles"].values():
-        assert bundle.get("TPU", 0) == 4
+        pg_table = ray.util.placement_group_table()
+        active_pgs = list(
+            {k: v for k, v in pg_table.items() if v["state"] == "CREATED"}.values()
+        )
 
-    serve.shutdown()
+        assert (
+            len(active_pgs) == 2
+        ), "Expected 2 PGs - one for TPU Head, one for worker bundles"
+
+        tpu_head_resource = "TPU-v6e-16-head"
+        head_pgs = [
+            pg
+            for pg in active_pgs
+            if len(pg["bundles"]) == 1
+            and tpu_head_resource in list(pg["bundles"].values())[0]
+        ]
+        assert len(head_pgs) == 1
+
+        worker_pg = [pg for pg in active_pgs if pg not in head_pgs][0]
+
+        assert worker_pg["strategy"] == "PACK"
+        # 4x4 topology = 16 chips. With 4 TPUs per bundle, expect exactly 4 bundles.
+        assert len(worker_pg["bundles"]) == 4
+        for bundle in worker_pg["bundles"].values():
+            assert bundle.get("TPU", 0) == 4
+    finally:
+        serve.shutdown()
 
 
 if __name__ == "__main__":
diff --git a/python/ray/serve/tests/test_accelerator_config.py b/python/ray/serve/tests/test_accelerator_config.py
@@ -60,7 +60,6 @@ def mock_tpu_cluster():
 
 def test_tpu_accelerator_config_integration(mock_tpu_cluster):
     """Test that AcceleratorConfig correctly creates SlicePlacementGroup in a mock cluster."""
-
     tpu_config = TPUAcceleratorConfig(topology="4x4", accelerator_version="v6e")
 
     request = CreatePlacementGroupRequest(
@@ -89,7 +88,6 @@ def test_tpu_accelerator_config_integration(mock_tpu_cluster):
     replica_pg.shutdown()
     assert replica_pg._slice_pg is None
 
-
 def test_tpu_accelerator_config_partial_failure_cleanup(mock_tpu_cluster):
     """Test that SlicePlacementGroup cleans up head PGs if a multi-slice reservation fails."""