feat(vllm): Update to vllm v0.17.1, lmcache to v0.4.1, and switch vllm-tensorizer build to dedicated buildkit endpoints (#132)

c2w-sea · Rexwang8 · web-flow · commit 4fdf652fc758 · 2026-03-18T08:54:04.000-07:00
* Update vllm version to v0.17.1.
* Update lmcache to v0.4.1. Also move the LMCache version parameter to the build config.
* Keep the same flashinfer version (v0.6.4). Checked OpenAI's vllm image and vllm's v0.17.1 runtime requirements to confirm the version.
* The vllm upgrade caused the docker buildkit job to consistently OOM. Per the CBS team's suggestion, we switch to dedicated buildkit endpoints for vllm-tensorizer to launch pods with a much higher memory limit (~500G vs ~60G).

---------

Co-authored-by: rexwang8 <rexkingsbackyard@gmail.com>
diff --git a/.github/configurations/vllm-tensorizer.yml b/.github/configurations/vllm-tensorizer.yml
@@ -1,7 +1,9 @@
 vllm-commit:
-  - 'v0.16.0'
+  - 'v0.17.1'
 flashinfer-commit:
   - 'v0.6.4'
+lmcache-commit:
+  - 'v0.4.1'
 builder-base-image:
   - 'ghcr.io/coreweave/ml-containers/torch:17ad6db-nccl-cuda12.9.1-ubuntu22.04-nccl2.29.2-1-torch2.10.0-vision0.25.0-audio2.10.0-abi1'
 final-base-image:
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -24,6 +24,11 @@ on:
         description: "Platforms for which to build (default: linux/amd64,linux/arm64)"
         type: string
         default: linux/amd64,linux/arm64
+      dedicated-buildkit:
+        required: false
+        description: "Instead of shared consumer endpoints, use dedicated BuildKit endpoints (BUILDKIT_DEDICATED_0) backed by high-memory-limit worker pods to prevent OOMs during large builds."
+        type: boolean
+        default: false
     outputs:
       outcome:
         description: "The outcome of the build"
@@ -62,10 +67,10 @@ jobs:
         uses: docker/setup-buildx-action@v3.7.1
         with:
           driver: remote
-          endpoint: ${{ secrets.BUILDKIT_CONSUMER_AMD64_ENDPOINT }}
+          endpoint: ${{ inputs.dedicated-buildkit && secrets.BUILDKIT_DEDICATED_0_AMD64_ENDPOINT || secrets.BUILDKIT_CONSUMER_AMD64_ENDPOINT }}
           platforms: linux/amd64
           append: |
-            - endpoint: ${{ secrets.BUILDKIT_CONSUMER_ARM64_ENDPOINT }}
+            - endpoint: ${{ inputs.dedicated-buildkit && secrets.BUILDKIT_DEDICATED_0_ARM64_ENDPOINT || secrets.BUILDKIT_CONSUMER_ARM64_ENDPOINT }}
               platforms: linux/arm64
         env:
           BUILDER_NODE_0_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }}
diff --git a/.github/workflows/vllm-tensorizer.yml b/.github/workflows/vllm-tensorizer.yml
@@ -22,9 +22,11 @@ jobs:
     with:
       image-name: vllm-tensorizer
       folder: vllm-tensorizer
+      dedicated-buildkit: true
       tag-suffix: ${{ matrix.vllm-commit }}
       build-args: |
         VLLM_COMMIT=${{ matrix.vllm-commit }}
         FLASHINFER_COMMIT=${{ matrix.flashinfer-commit }}
+        LMCACHE_COMMIT=${{ matrix.lmcache-commit }}
         BUILDER_BASE_IMAGE=${{ matrix.builder-base-image }}
         FINAL_BASE_IMAGE=${{ matrix.final-base-image }}
diff --git a/vllm-tensorizer/Dockerfile b/vllm-tensorizer/Dockerfile
@@ -79,7 +79,7 @@ RUN git clone --filter=tree:0 --no-single-branch --no-checkout \
 
 FROM alpine/git:2.36.3 AS lmcache-downloader
 WORKDIR /git
-ARG LMCACHE_COMMIT='v0.3.13'
+ARG LMCACHE_COMMIT
 RUN git clone --filter=tree:0 --no-single-branch --no-checkout \
       https://github.com/LMCache/LMCache && \
     git -C LMCache checkout "${LMCACHE_COMMIT}"