Add v6e samples for Ray Serve LLM Multi-Host TPU guide (#2078)

ryanaoleary · web-flow · commit 744e5b3c38b6 · 2026-06-11T19:49:36.000Z
* add v6e samples for Serve+multi-host TPU guide

* style: remove extra whitespace before END region tags

* refactor: make serve_tpu_multihost.py dynamically configurable via env vars

* refactor: clean up serving script comments

* Upgrade base image to vllm-tpu:v0.21.0 and pin Ray nightly wheel

* chore: remove unreferenced ray-cluster sample

* chore: use serve-svc for gradio host

* feat: add observability label to rayservice

* fix: use local disk for vLLM XLA cache to prevent GCS race conditions
diff --git a/ai-ml/gke-ray/rayserve/llm/tpu/Dockerfile b/ai-ml/gke-ray/rayserve/llm/tpu/Dockerfile
@@ -0,0 +1,33 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_ai_ml_gke_ray_rayserve_llm_tpu_dockerfile]
+FROM vllm/vllm-tpu:v0.21.0
+
+ENV VLLM_TARGET_DEVICE=tpu
+# Keep the XLA compilation cache on local disk. The sample manifests mount a
+# GCS FUSE bucket at /data, and a cache shared there by multiple hosts can race.
+ENV VLLM_XLA_CACHE_PATH=/tmp/vllm_xla_cache
+
+USER root
+
+# Install a pinned Ray nightly wheel, then request the "llm" extra with
+# --no-deps so pip does not install or replace any other package in the image.
+RUN pip install --no-cache-dir -U \
+    "https://s3-us-west-2.amazonaws.com/ray-wheels/master/75b85027a859439fae5634e49aa6443f6fbecfeb/ray-3.0.0.dev0-cp312-cp312-manylinux2014_x86_64.whl" && \
+    pip install --no-cache-dir --no-deps "ray[llm]"
+
+# Ship the Serve application script inside the image.
+COPY serve_tpu_multihost.py /home/ray/serve_tpu_multihost.py
+# [END gke_ai_ml_gke_ray_rayserve_llm_tpu_dockerfile]
diff --git a/ai-ml/gke-ray/rayserve/llm/tpu/components/gradio.yaml b/ai-ml/gke-ray/rayserve/llm/tpu/components/gradio.yaml
@@ -0,0 +1,68 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_ai_ml_gke_ray_rayserve_llm_tpu_components_gradio]
+apiVersion: apps/v1
+kind: Deployment  # Single-replica Deployment running the gradio-app sample image.
+metadata:
+  name: gradio
+  labels:
+    app: gradio
+spec:
+  replicas: 1
+  selector:  # Selects the pods created from the template below (app: gradio).
+    matchLabels:
+      app: gradio
+  template:
+    metadata:
+      labels:
+        app: gradio
+    spec:
+      containers:
+      - name: gradio
+        image: us-docker.pkg.dev/google-samples/containers/gke/gradio-app:v1.0.7
+        resources:
+          requests:
+            cpu: "250m"
+            memory: "512Mi"
+          limits:
+            cpu: "500m"
+            memory: "512Mi"
+        env:
+        - name: CONTEXT_PATH  # Presumably the request path appended to HOST; confirm in gradio-app.
+          value: "/v1/chat/completions"
+        - name: HOST  # Port 8000 matches http_options.port in the RayService serveConfigV2.
+          value: "http://vllm-tpu-multihost-serve-svc:8000"  # Presumably the KubeRay serve Service; confirm.
+        - name: LLM_ENGINE
+          value: "openai-chat"
+        - name: MODEL_ID  # Same model repo the model-downloader Job fetches.
+          value: "google/gemma-4-31B-it"
+        - name: DISABLE_SYSTEM_MESSAGE
+          value: "true"
+        ports:
+        - containerPort: 7860  # Target of the gradio Service below.
+---
+apiVersion: v1
+kind: Service  # ClusterIP Service in front of the gradio pods.
+metadata:
+  name: gradio
+spec:
+  selector:
+    app: gradio
+  ports:
+  - protocol: TCP
+    port: 8080  # Service port; forwarded to container port 7860.
+    targetPort: 7860
+  type: ClusterIP
+# [END gke_ai_ml_gke_ray_rayserve_llm_tpu_components_gradio]
diff --git a/ai-ml/gke-ray/rayserve/llm/tpu/components/model-downloader-job.yaml b/ai-ml/gke-ray/rayserve/llm/tpu/components/model-downloader-job.yaml
@@ -0,0 +1,72 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_ai_ml_gke_ray_rayserve_llm_tpu_components_model_downloader_job]
+apiVersion: batch/v1
+kind: Job  # One-off Job that downloads the model snapshot into the bucket mounted at /data.
+metadata:
+  name: model-downloader
+spec:
+  ttlSecondsAfterFinished: 60  # Clean up the finished Job after 60 seconds.
+  template:
+    metadata:
+      annotations:  # GCS FUSE CSI sidecar settings for this pod.
+        gke-gcsfuse/volumes: "true"
+        gke-gcsfuse/memory-limit: "0"
+    spec:
+      serviceAccountName: ${KSA_NAME}  # Placeholder; substitute before applying.
+      restartPolicy: OnFailure
+      containers:
+      - name: downloader
+        image: python:3.10-slim
+        command: ["/bin/sh", "-c"]
+        # The script swaps filelock.FileLock for a no-op class before importing
+        # huggingface_hub, presumably because locking on the GCS FUSE mount is
+        # unreliable (confirm), then snapshots the repo into /data.
+        args:
+        - |
+          pip install -U huggingface_hub filelock
+
+          python -c '
+          import filelock
+
+          class DummyLock:
+              def __init__(self, *args, **kwargs): pass
+              def __enter__(self): return self
+              def __exit__(self, *args): pass
+              def acquire(self, *args, **kwargs): pass
+              def release(self, *args, **kwargs): pass
+
+          filelock.FileLock = DummyLock
+
+          from huggingface_hub import snapshot_download
+          snapshot_download(
+              repo_id="google/gemma-4-31B-it",
+              local_dir="/data/google/gemma-4-31B-it"
+          )
+          '
+        env:
+        - name: HF_TOKEN  # Hugging Face token read from the hf-secret Secret.
+          valueFrom:
+            secretKeyRef:
+              name: hf-secret
+              key: hf_api_token
+        volumeMounts:
+        - name: gcs-fuse-csi-ephemeral
+          mountPath: /data  # Download target; backed by the bucket below.
+      volumes:
+      - name: gcs-fuse-csi-ephemeral
+        csi:
+          driver: gcsfuse.csi.storage.gke.io
+          volumeAttributes:
+            bucketName: ${GS_BUCKET}  # Placeholder; substitute before applying.
+            mountOptions: "implicit-dirs"
+# [END gke_ai_ml_gke_ray_rayserve_llm_tpu_components_model_downloader_job]
diff --git a/ai-ml/gke-ray/rayserve/llm/tpu/networking/all-netdev-template.yaml b/ai-ml/gke-ray/rayserve/llm/tpu/networking/all-netdev-template.yaml
@@ -0,0 +1,28 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_ai_ml_gke_ray_rayserve_llm_tpu_networking_all_netdev_template]
+apiVersion: resource.k8s.io/v1
+kind: ResourceClaimTemplate  # Template from which per-pod ResourceClaims are created.
+metadata:
+  name: all-netdev  # Referenced by resourceClaimTemplateName in the RayService worker pod spec.
+spec:
+  spec:  # Spec of each ResourceClaim generated from this template.
+    devices:
+      requests:
+      - name: req-netdev
+        exactly:
+          deviceClassName: netdev.google.com
+          allocationMode: All  # Claim all matching devices rather than a fixed count.
+# [END gke_ai_ml_gke_ray_rayserve_llm_tpu_networking_all_netdev_template]
diff --git a/ai-ml/gke-ray/rayserve/llm/tpu/networking/dranet-compute-class.yaml b/ai-ml/gke-ray/rayserve/llm/tpu/networking/dranet-compute-class.yaml
@@ -0,0 +1,30 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_ai_ml_gke_ray_rayserve_llm_tpu_networking_dranet_compute_class]
+apiVersion: cloud.google.com/v1
+kind: ComputeClass  # GKE custom compute class describing the nodes to provision.
+metadata:
+  name: dranet-compute-class
+spec:
+  nodePoolAutoCreation:  # Allow node pools for this class to be created automatically.
+    enabled: true
+  nodePoolConfig:
+    dra:
+      networking:  # Turn on DRA-managed networking for these nodes.
+        enabled: true
+  priorities:  # Single entry, so there is no fallback machine type.
+  - machineType: ct6e-standard-4t  # Presumably the 4-chip v6e host type used by the RayService workers; confirm.
+    acceleratorNetworkProfile: auto
+# [END gke_ai_ml_gke_ray_rayserve_llm_tpu_networking_dranet_compute_class]
diff --git a/ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v6e-multihost.yaml b/ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v6e-multihost.yaml
@@ -0,0 +1,163 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_ai_ml_gke_ray_rayserve_rayservice_tpu_v6e_multihost]
+apiVersion: ray.io/v1
+kind: RayService  # Ray cluster plus the Ray Serve application it hosts.
+metadata:
+  name: vllm-tpu-multihost
+  labels:
+    ai.gke.io/model: "gemma-4-31B-it"
+    ai.gke.io/inference-server: "vllm"
+spec:
+  serveConfigV2: |
+    http_options:
+      host: 0.0.0.0
+      port: 8000
+    applications:
+      - name: llm
+        import_path: ai-ml.gke-ray.rayserve.llm.tpu.serve_tpu_multihost:deployment
+        runtime_env:
+          working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip"
+          env_vars:
+            # Use local disk to prevent multi-host GCSFuse race conditions
+            VLLM_XLA_CACHE_PATH: "/tmp/vllm_xla_cache"
+  rayClusterConfig:
+    headGroupSpec:  # Head pod: requests CPU and memory only, no TPU.
+      rayStartParams: {}
+      template:
+        metadata:
+          annotations:  # GCS FUSE CSI sidecar settings; "0" presumably leaves that limit unset (confirm in GKE docs).
+            gke-gcsfuse/volumes: "true"
+            gke-gcsfuse/cpu-limit: "0"
+            gke-gcsfuse/memory-limit: "0"
+            gke-gcsfuse/ephemeral-storage-limit: "0"
+        spec:
+          serviceAccountName: $KSA_NAME  # Placeholder; substitute before applying.
+          containers:
+          - name: ray-head
+            image: $CUSTOM_IMAGE_URI  # Placeholder for the image built from the Dockerfile in this directory.
+            imagePullPolicy: Always
+            ports:
+            - containerPort: 6379
+              name: gcs
+            - containerPort: 8265
+              name: dashboard
+            - containerPort: 10001
+              name: client
+            - containerPort: 8000  # Matches http_options.port in serveConfigV2.
+              name: serve
+            resources:
+              limits:
+                cpu: "2"
+                memory: 16Gi
+              requests:
+                cpu: "2"
+                memory: 16Gi
+            volumeMounts:
+            - name: dshm
+              mountPath: /dev/shm
+            - name: gcs-fuse-csi-ephemeral
+              mountPath: /data
+          volumes:
+          - name: dshm  # Memory-backed /dev/shm.
+            emptyDir:
+              medium: Memory
+          - name: gke-gcsfuse-cache  # Not mounted by the Ray container; presumably consumed by the GCS FUSE sidecar (confirm).
+            emptyDir:
+              medium: Memory
+          - name: gcs-fuse-csi-ephemeral  # Bucket mounted at /data through the GCS FUSE CSI driver.
+            csi:
+              driver: gcsfuse.csi.storage.gke.io
+              volumeAttributes:
+                bucketName: $GS_BUCKET  # Placeholder; substitute before applying.
+                mountOptions: "implicit-dirs"
+    workerGroupSpecs:
+    - groupName: tpu-group
+      replicas: 1  # min = max = 1, so the group size is fixed.
+      minReplicas: 1
+      maxReplicas: 1
+      numOfHosts: 4  # 4 hosts x 4 TPU chips each = 16 chips, matching the 4x4 topology selected below.
+      rayStartParams: {}
+      template:
+        metadata:
+          annotations:  # Same GCS FUSE CSI sidecar settings as the head pod.
+            gke-gcsfuse/volumes: "true"
+            gke-gcsfuse/cpu-limit: "0"
+            gke-gcsfuse/memory-limit: "0"
+            gke-gcsfuse/ephemeral-storage-limit: "0"
+        spec:
+          serviceAccountName: $KSA_NAME  # Placeholder; substitute before applying.
+          containers:
+            - name: ray-worker
+              image: $CUSTOM_IMAGE_URI  # Same placeholder image as the head container.
+              imagePullPolicy: Always
+              resources:
+                limits:
+                  cpu: "20"
+                  google.com/tpu: "4"
+                  memory: 200Gi
+                requests:
+                  cpu: "20"
+                  google.com/tpu: "4"
+                  memory: 200Gi
+                claims:
+                - name: netdev  # Refers to the pod-level resourceClaims entry below.
+              env:
+                - name: HF_HOME  # Hugging Face cache directory under the /data bucket mount.
+                  value: "/data/huggingface"
+                - name: HF_TOKEN  # Hugging Face token read from the hf-secret Secret.
+                  valueFrom:
+                    secretKeyRef:
+                      name: hf-secret
+                      key: hf_api_token
+                - name: JAX_PLATFORMS
+                  value: "tpu,cpu"
+                - name: NODE_IP  # Host IP from the downward API; used by VBAR_CONTROL_SERVICE_URL.
+                  valueFrom:
+                    fieldRef:
+                      fieldPath: status.hostIP
+                - name: VBAR_CONTROL_SERVICE_URL
+                  value: $(NODE_IP):8353  # $(NODE_IP) is expanded by Kubernetes from the variable defined above.
+                - name: TPU_MULTIHOST_BACKEND
+                  value: "ray"
+                - name: TPU_BACKEND_TYPE
+                  value: "jax"
+                - name: ENABLE_PJRT_COMPATIBILITY
+                  value: "true"
+              volumeMounts:
+              - name: dshm
+                mountPath: /dev/shm
+              - name: gcs-fuse-csi-ephemeral
+                mountPath: /data
+          volumes:
+          - name: dshm  # Memory-backed /dev/shm.
+            emptyDir:
+              medium: Memory
+          - name: gke-gcsfuse-cache  # Not mounted by the Ray container; presumably consumed by the GCS FUSE sidecar (confirm).
+            emptyDir:
+              medium: Memory
+          - name: gcs-fuse-csi-ephemeral  # Bucket mounted at /data through the GCS FUSE CSI driver.
+            csi:
+              driver: gcsfuse.csi.storage.gke.io
+              volumeAttributes:
+                bucketName: $GS_BUCKET  # Placeholder; substitute before applying.
+                mountOptions: "implicit-dirs"
+          resourceClaims:
+            - name: netdev
+              resourceClaimTemplateName: all-netdev  # ResourceClaimTemplate from networking/all-netdev-template.yaml.
+          nodeSelector:  # NOTE(review): no compute-class selector here; confirm how dranet-compute-class is applied.
+            cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
+            cloud.google.com/gke-tpu-topology: 4x4
+# [END gke_ai_ml_gke_ray_rayserve_rayservice_tpu_v6e_multihost]
diff --git a/ai-ml/gke-ray/rayserve/llm/tpu/serve_tpu_multihost.py b/ai-ml/gke-ray/rayserve/llm/tpu/serve_tpu_multihost.py