Skip to content

Commit 744e5b3

Browse files
authored
Add v6e samples for Ray Serve LLM Multi-Host TPU guide (#2078)
* add v6e samples for Serve+multi-host TPU guide
* style: remove extra whitespace before END region tags
* refactor: make serve_tpu_multihost.py dynamically configurable via env vars
* refactor: clean up serving script comments
* Upgrade base image to vllm-tpu:v0.21.0 and pin Ray nightly wheel
* chore: remove unreferenced ray-cluster sample
* chore: use serve-svc for gradio host
* feat: add observability label to rayservice
* fix: use local disk for vLLM XLA cache to prevent GCS race conditions
1 parent d50f908 commit 744e5b3

7 files changed

Lines changed: 444 additions & 0 deletions

File tree

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# [START gke_ai_ml_gke_ray_rayserve_llm_tpu_dockerfile]
# Image for the Ray Serve LLM multi-host TPU sample: the vLLM TPU base image
# plus a pinned Ray nightly wheel and the serving script.
FROM vllm/vllm-tpu:v0.21.0

ENV VLLM_TARGET_DEVICE=tpu
# Keep the XLA compilation cache on local disk. The sample manifests mount a
# shared Cloud Storage FUSE bucket at /data on every host, and the RayService
# sample overrides this variable to /tmp/vllm_xla_cache to avoid multi-host
# races on that mount; the image default matches that path so the cache never
# lands on the shared bucket when the override is absent.
ENV VLLM_XLA_CACHE_PATH=/tmp/vllm_xla_cache

USER root

# Ray is pinned to one specific nightly wheel (master commit 75b85027...).
# NOTE(review): with --no-deps the dependencies of the [llm] extra are not
# installed by the second command; presumably this avoids replacing the vLLM
# build shipped in the base image -- confirm this is intended.
RUN pip install --no-cache-dir -U \
    "https://s3-us-west-2.amazonaws.com/ray-wheels/master/75b85027a859439fae5634e49aa6443f6fbecfeb/ray-3.0.0.dev0-cp312-cp312-manylinux2014_x86_64.whl" && \
    pip install --no-cache-dir --no-deps "ray[llm]"

COPY serve_tpu_multihost.py /home/ray/serve_tpu_multihost.py
# [END gke_ai_ml_gke_ray_rayserve_llm_tpu_dockerfile]
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# [START gke_ai_ml_gke_ray_rayserve_llm_tpu_components_gradio]
# Gradio chat front end for the vllm-tpu-multihost RayService, followed by the
# ClusterIP Service that exposes it inside the cluster.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gradio
  labels:
    app: gradio
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gradio
  template:
    metadata:
      labels:
        app: gradio
    spec:
      containers:
        - name: gradio
          image: us-docker.pkg.dev/google-samples/containers/gke/gradio-app:v1.0.7
          ports:
            - containerPort: 7860
          env:
            - name: CONTEXT_PATH
              value: '/v1/chat/completions'
            # Serve service of the RayService; port 8000 matches the
            # http_options.port set in its serveConfigV2.
            - name: HOST
              value: 'http://vllm-tpu-multihost-serve-svc:8000'
            - name: LLM_ENGINE
              value: 'openai-chat'
            - name: MODEL_ID
              value: 'google/gemma-4-31B-it'
            # Quoted on purpose: env values must be strings, not booleans.
            - name: DISABLE_SYSTEM_MESSAGE
              value: 'true'
          resources:
            limits:
              cpu: '500m'
              memory: '512Mi'
            requests:
              cpu: '250m'
              memory: '512Mi'
---
apiVersion: v1
kind: Service
metadata:
  name: gradio
spec:
  type: ClusterIP
  selector:
    app: gradio
  ports:
    # Service port 8080 forwards to the container's port 7860.
    - port: 8080
      targetPort: 7860
      protocol: TCP
# [END gke_ai_ml_gke_ray_rayserve_llm_tpu_components_gradio]
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# [START gke_ai_ml_gke_ray_rayserve_llm_tpu_components_model_downloader_job]
# One-shot Job that downloads google/gemma-4-31B-it from Hugging Face into the
# bucket mounted at /data through the Cloud Storage FUSE CSI driver.
apiVersion: batch/v1
kind: Job
metadata:
  name: model-downloader
spec:
  # Clean up the finished Job 60 seconds after it completes.
  ttlSecondsAfterFinished: 60
  template:
    metadata:
      annotations:
        gke-gcsfuse/volumes: 'true'
        gke-gcsfuse/memory-limit: '0'
    spec:
      serviceAccountName: ${KSA_NAME}
      restartPolicy: OnFailure
      volumes:
        - name: gcs-fuse-csi-ephemeral
          csi:
            driver: gcsfuse.csi.storage.gke.io
            volumeAttributes:
              bucketName: ${GS_BUCKET}
              mountOptions: 'implicit-dirs'
      containers:
        - name: downloader
          image: python:3.10-slim
          env:
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-secret
                  key: hf_api_token
          volumeMounts:
            - name: gcs-fuse-csi-ephemeral
              mountPath: /data
          command: ['/bin/sh', '-c']
          # The script swaps filelock.FileLock for a no-op class before
          # huggingface_hub is imported, then snapshots the model into /data.
          # NOTE(review): presumably file locking is unreliable on the GCSFuse
          # mount -- confirm that is the reason for the no-op lock.
          args:
            - |
              pip install -U huggingface_hub filelock

              python -c '
              import filelock

              class DummyLock:
                  def __init__(self, *args, **kwargs): pass
                  def __enter__(self): return self
                  def __exit__(self, *args): pass
                  def acquire(self, *args, **kwargs): pass
                  def release(self, *args, **kwargs): pass

              filelock.FileLock = DummyLock

              from huggingface_hub import snapshot_download
              snapshot_download(
                  repo_id="google/gemma-4-31B-it",
                  local_dir="/data/google/gemma-4-31B-it"
              )
              '
# [END gke_ai_ml_gke_ray_rayserve_llm_tpu_components_model_downloader_job]
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# [START gke_ai_ml_gke_ray_rayserve_llm_tpu_networking_all_netdev_template]
# Template for a ResourceClaim that requests every device of the
# netdev.google.com device class (allocationMode: All). The RayService worker
# pods reference it by name as "all-netdev".
apiVersion: resource.k8s.io/v1
kind: ResourceClaimTemplate
metadata:
  name: all-netdev
spec:
  spec:
    devices:
      requests:
        - name: req-netdev
          exactly:
            allocationMode: All
            deviceClassName: netdev.google.com
# [END gke_ai_ml_gke_ray_rayserve_llm_tpu_networking_all_netdev_template]
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# [START gke_ai_ml_gke_ray_rayserve_llm_tpu_networking_dranet_compute_class]
# ComputeClass with node pool auto-creation and DRA networking enabled; its
# single priority rule targets the ct6e-standard-4t machine type.
apiVersion: cloud.google.com/v1
kind: ComputeClass
metadata:
  name: dranet-compute-class
spec:
  nodePoolAutoCreation:
    enabled: true
  nodePoolConfig:
    dra:
      networking:
        enabled: true
  priorities:
    - acceleratorNetworkProfile: auto
      machineType: ct6e-standard-4t
# [END gke_ai_ml_gke_ray_rayserve_llm_tpu_networking_dranet_compute_class]
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# [START gke_ai_ml_gke_ray_rayserve_rayservice_tpu_v6e_multihost]
# RayService that serves the "llm" application on a multi-host TPU v6e slice:
# one CPU-only head pod plus one worker group replica spanning 4 hosts with
# 4 TPU chips each (4x4 topology). Model files are read from the bucket
# mounted at /data through the Cloud Storage FUSE CSI driver.
apiVersion: ray.io/v1
kind: RayService
metadata:
  name: vllm-tpu-multihost
  labels:
    ai.gke.io/model: 'gemma-4-31B-it'
    ai.gke.io/inference-server: 'vllm'
spec:
  serveConfigV2: |
    http_options:
      host: 0.0.0.0
      port: 8000
    applications:
      - name: llm
        import_path: ai-ml.gke-ray.rayserve.llm.tpu.serve_tpu_multihost:deployment
        runtime_env:
          working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip"
          env_vars:
            # Use local disk to prevent multi-host GCSFuse race conditions
            VLLM_XLA_CACHE_PATH: "/tmp/vllm_xla_cache"
  rayClusterConfig:
    headGroupSpec:
      rayStartParams: {}
      template:
        metadata:
          annotations:
            gke-gcsfuse/volumes: 'true'
            gke-gcsfuse/cpu-limit: '0'
            gke-gcsfuse/memory-limit: '0'
            gke-gcsfuse/ephemeral-storage-limit: '0'
        spec:
          serviceAccountName: $KSA_NAME
          volumes:
            - name: dshm
              emptyDir:
                medium: Memory
            # NOTE(review): no container mounts this volume; presumably the
            # GCSFuse sidecar picks it up by name as its cache -- confirm.
            - name: gke-gcsfuse-cache
              emptyDir:
                medium: Memory
            - name: gcs-fuse-csi-ephemeral
              csi:
                driver: gcsfuse.csi.storage.gke.io
                volumeAttributes:
                  bucketName: $GS_BUCKET
                  mountOptions: 'implicit-dirs'
          containers:
            - name: ray-head
              image: $CUSTOM_IMAGE_URI
              imagePullPolicy: Always
              ports:
                - name: gcs
                  containerPort: 6379
                - name: dashboard
                  containerPort: 8265
                - name: client
                  containerPort: 10001
                - name: serve
                  containerPort: 8000
              resources:
                requests:
                  cpu: '2'
                  memory: 16Gi
                limits:
                  cpu: '2'
                  memory: 16Gi
              volumeMounts:
                - name: dshm
                  mountPath: /dev/shm
                - name: gcs-fuse-csi-ephemeral
                  mountPath: /data
    workerGroupSpecs:
      - groupName: tpu-group
        # Exactly one slice: replicas, minReplicas and maxReplicas are all 1.
        replicas: 1
        minReplicas: 1
        maxReplicas: 1
        # 4 hosts x 4 chips per host = the 16 chips of the 4x4 topology.
        numOfHosts: 4
        rayStartParams: {}
        template:
          metadata:
            annotations:
              gke-gcsfuse/volumes: 'true'
              gke-gcsfuse/cpu-limit: '0'
              gke-gcsfuse/memory-limit: '0'
              gke-gcsfuse/ephemeral-storage-limit: '0'
          spec:
            serviceAccountName: $KSA_NAME
            # NOTE(review): no cloud.google.com/compute-class selector is set
            # here; confirm how these pods are steered onto nodes created by
            # the dranet-compute-class ComputeClass.
            nodeSelector:
              cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
              cloud.google.com/gke-tpu-topology: 4x4
            # Pod-level claim built from the all-netdev ResourceClaimTemplate;
            # the container consumes it through resources.claims below.
            resourceClaims:
              - name: netdev
                resourceClaimTemplateName: all-netdev
            volumes:
              - name: dshm
                emptyDir:
                  medium: Memory
              # NOTE(review): no container mounts this volume; presumably the
              # GCSFuse sidecar picks it up by name as its cache -- confirm.
              - name: gke-gcsfuse-cache
                emptyDir:
                  medium: Memory
              - name: gcs-fuse-csi-ephemeral
                csi:
                  driver: gcsfuse.csi.storage.gke.io
                  volumeAttributes:
                    bucketName: $GS_BUCKET
                    mountOptions: 'implicit-dirs'
            containers:
              - name: ray-worker
                image: $CUSTOM_IMAGE_URI
                imagePullPolicy: Always
                resources:
                  requests:
                    cpu: '20'
                    google.com/tpu: '4'
                    memory: 200Gi
                  limits:
                    cpu: '20'
                    google.com/tpu: '4'
                    memory: 200Gi
                  claims:
                    - name: netdev
                env:
                  - name: HF_HOME
                    value: '/data/huggingface'
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-secret
                        key: hf_api_token
                  - name: JAX_PLATFORMS
                    value: 'tpu,cpu'
                  # NODE_IP must stay ahead of VBAR_CONTROL_SERVICE_URL, which
                  # expands $(NODE_IP) from the variable defined before it.
                  - name: NODE_IP
                    valueFrom:
                      fieldRef:
                        fieldPath: status.hostIP
                  - name: VBAR_CONTROL_SERVICE_URL
                    value: '$(NODE_IP):8353'
                  - name: TPU_MULTIHOST_BACKEND
                    value: 'ray'
                  - name: TPU_BACKEND_TYPE
                    value: 'jax'
                  - name: ENABLE_PJRT_COMPATIBILITY
                    value: 'true'
                volumeMounts:
                  - name: dshm
                    mountPath: /dev/shm
                  - name: gcs-fuse-csi-ephemeral
                    mountPath: /data
# [END gke_ai_ml_gke_ray_rayserve_rayservice_tpu_v6e_multihost]

0 commit comments

Comments
 (0)