Updating dsv4 b200 vllm version (#1384)

wzhao18 · functionstackx · web-flow · commit 06186467c50b · 2026-05-29T19:10:00.000-04:00
* Try updating b200 dsv4

* add changelog

* Set MAX_CUDAGRAPH_CAPTURE_SIZE to 2048 unconditionally

* Update Docker image for dsv4-fp4-b200-vllm

* Update vLLM image tag in perf-changelog.yaml

Updated the vLLM image tag to specify the nightly version.

* Update Docker image tag for dsv4-fp4-b200-vllm

* Update vLLM image tag to v0.22.0

* Update conc-end values in nvidia-master.yaml

---------

Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -1756,7 +1756,7 @@ dsv4-fp4-b200-sglang:
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }
 
 dsv4-fp4-b200-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b200-dsv4
@@ -1770,7 +1770,8 @@ dsv4-fp4-b200-vllm:
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 64 }
       - { tp: 8, ep: 8, conc-start: 128, conc-end: 128 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 4096 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
     - isl: 8192
       osl: 1024
       search-space:
diff --git a/benchmarks/single_node/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/dsv4_fp4_b200_vllm.sh
@@ -42,13 +42,9 @@ if [ "${EP_SIZE:-1}" -gt 1 ]; then
     EP_ARGS=(--enable-expert-parallel)
 fi
 
-# Mega-MoE backend and the lower GMU only kick in on the DP-attn path,
-# per the vLLM v0.20.0 DeepSeek-V4-Pro recipe. All configs share the
-# FULL_AND_PIECEWISE compilation config.
 GMU_ARGS=()
 MOE_ARGS=()
 if [ "${DP_ATTENTION}" = "true" ]; then
-    GMU_ARGS=(--gpu-memory-utilization 0.85)
     MOE_ARGS=(--moe-backend deep_gemm_mega_moe)
 fi
 
@@ -58,10 +54,9 @@ else
     MAX_NUM_BATCHED_TOKENS=2048
 fi
 
+MAX_CUDAGRAPH_CAPTURE_SIZE=2048
+
 BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN"
-if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ]; then
-    BENCHMARK_MAX_MODEL_LEN=4096
-fi
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN")
@@ -90,7 +85,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
     --tool-call-parser deepseek_v4 \
     --enable-auto-tool-choice \
     --reasoning-parser deepseek_v4 \
-    --max-cudagraph-capture-size 2048 \
+    --max-cudagraph-capture-size "$MAX_CUDAGRAPH_CAPTURE_SIZE" \
     --max-model-len "$SERVE_MAX_MODEL_LEN" \
     --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 &
 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3208,3 +3208,9 @@
     - "1k1k and 8k1k STP hightpt and lowlat srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/ (resolved from upstream srt-slurm PR #160 via srtctl resolve-override)"
     - "Wire glm5/fp8 model + dynamo-sglang framework branches into runners/launch_gb300-nv.sh with SA upstream defaults (SLURM_PARTITION=batch_1, SLURM_ACCOUNT=benchmark, SQUASH_FILE under /home/sa-shared/gharunners/squash/)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1557
+
+- config-keys:
+    - dsv4-fp4-b200-vllm
+  description:
+    - "Update vLLM image tag to v0.22.0"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1384