[NV] llm-d: switch base to vllm/vllm-openai:v0.22.0 (pin tag)

ezrasilvera · ezrasilvera · commit 8aa1c18e79f6 · 2026-06-04T21:03:11.000+03:00
Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
diff --git a/benchmarks/llm-d/Dockerfile b/benchmarks/llm-d/Dockerfile
@@ -1,16 +1,19 @@
 # Combined image for the InferenceX llm-d-vllm framework.
 #
-# Base = ghcr.io/llm-d/llm-d-cuda which already ships vLLM + DeepEP +
-# NVSHMEM + GDRCopy. We add the EPP, the routing-sidecar, and Envoy on top
-# so every node in a SLURM allocation can play any role (prefill, decode,
-# or coordinator) from a single image.
+# Base = vllm/vllm-openai (vLLM with the OpenAI-compatible API server).
+# We add the EPP, the routing-sidecar, and Envoy on top so every node in
+# a SLURM allocation can play any role (prefill, decode, or coordinator)
+# from a single image. DeepEP / NVSHMEM / GDRCopy are NOT bundled by
+# this base; they are not used by the simple 1P+1D recipe
+# (LWS_GROUP_SIZE=1 short-circuits the wide-EP NVSHMEM env in
+# server.sh). Wide-EP recipes will need a base that ships them.
 #
 # Configs (epp-config.yaml, envoy.yaml, per-topology recipes) are NOT
 # baked in. They are mounted at runtime by job.slurm so config-only
 # iteration does not require an image rebuild. See
 # benchmarks/multi_node/llm-d/job.slurm for the expected mount layout.
 
-FROM ghcr.io/llm-d/llm-d-cuda:v0.7.0
+FROM vllm/vllm-openai:v0.22.0
 
 COPY --from=ghcr.io/llm-d/llm-d-router-endpoint-picker-dev:main \
        /app/epp /usr/local/bin/epp