Update Ray LLM to use TPUAcceleratorConfig

ryanaoleary · ryanaoleary · commit 35059d290b9e · 2026-05-07T23:19:01.000Z
diff --git a/python/ray/llm/_internal/serve/core/server/llm_server.py b/python/ray/llm/_internal/serve/core/server/llm_server.py
@@ -22,6 +22,7 @@
     MODEL_RESPONSE_BATCH_TIMEOUT_MS,
     RAYLLM_VLLM_ENGINE_CLS_ENV,
 )
+from ray.llm._internal.serve.core.configs.accelerators import TPUConfig
 from ray.llm._internal.serve.core.configs.llm_config import (
     DiskMultiplexConfig,
     LLMConfig,
@@ -39,6 +40,8 @@
 from ray.llm._internal.serve.utils.server_utils import (
     get_serve_request_id,
 )
+from ray.serve.config import TPUAcceleratorConfig
+from ray.util.tpu import get_tpu_version_from_type
 
 if TYPE_CHECKING:
     from ray.llm._internal.serve.core.configs.openai_api_models import (
@@ -737,4 +740,20 @@ def get_deployment_options(cls, llm_config: "LLMConfig"):
         }
         deployment_options["ray_actor_options"] = ray_actor_options
 
+        if llm_config.accelerator_config is not None and isinstance(
+            llm_config.accelerator_config, TPUConfig
+        ):
+            if not llm_config.accelerator_type:
+                raise ValueError(
+                    "llm_config.accelerator_type must be specified when "
+                    "accelerator_config is a TPUConfig."
+                )
+            version = get_tpu_version_from_type(llm_config.accelerator_type)
+
+            deployment_options["accelerator_config"] = TPUAcceleratorConfig(
+                topology=llm_config.accelerator_config.topology,
+                accelerator_version=version,
+                num_slices=1,
+            )
+
         return deployment_options
diff --git a/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine_tpu.py b/python/ray/llm/tests/serve/cpu/deployments/llm/test_llm_engine_tpu.py
@@ -37,11 +37,11 @@ def test_tpu_slice_placement_group_creation_default_resources(ray_tpu_cluster):
     pg_table = placement_group_table(pg)
     assert pg_table["strategy"] == "PACK"
 
-    # 4x4 v6e = 16 chips. We default to 1 TPU chip per bundle.
-    assert len(pg_table["bundles"]) == 16
+    # 4x4 v6e = 16 chips. We default to 4 TPU chips per bundle (per-host).
+    assert len(pg_table["bundles"]) == 4
     for bundle in pg_table["bundles"].values():
         assert "TPU" in bundle
-        assert bundle["TPU"] == 1
+        assert bundle["TPU"] == 4.0
 
     # Let the backend tear down its own resources if it has any
     engine_config.accelerator.shutdown()
@@ -62,7 +62,7 @@ def test_tpu_slice_placement_group_creation_host_resources(ray_tpu_cluster):
         accelerator_config={"kind": "tpu", "topology": "4x4"},
         placement_group_config={
             "strategy": "STRICT_SPREAD",
-            "bundles": [{"TPU": 4}],
+            "bundles": [{"TPU": 4}] * 4,
         },
     )
 
@@ -256,10 +256,10 @@ def test_tpu_serve_deployment_default_chip_level_bundles(ray_tpu_cluster):
     worker_pg = [pg for pg in active_pgs if pg not in head_pgs][0]
 
     assert worker_pg["strategy"] == "PACK"
-    # 4x4 topology = 16 chips. Default is 16 bundles of 1 TPU.
-    assert len(worker_pg["bundles"]) == 16
+    # 4x4 topology = 16 chips. Default is 4 bundles of 4 TPUs (per-host).
+    assert len(worker_pg["bundles"]) == 4
     for bundle in worker_pg["bundles"].values():
-        assert bundle.get("TPU", 0) == 1
+        assert bundle.get("TPU", 0) == 4.0
 
     serve.shutdown()