[AMD][ROCM] dsv4-fp4-mi355x-vllm, Bump vLLM ROCm image to (nightly-4f940896) (#1546)

seungrokj · claude · web-flow · commit f21079620999 · 2026-05-21T13:24:24.000+08:00
* Bump vLLM ROCm image for dsv4-fp4-mi355x-vllm (nightly-4f940896)

  Update vllm/vllm-openai-rocm image from nightly-b50646e5 (May 18) to
  nightly-4f940896 (May 20) for dsv4-fp4-mi355x-vllm config.

* Update perf-changelog with PR #1546

* Update server args for dsv4-fp4-mi355x-vllm: add compilation-config, bump gpu-util to 0.8

* Update dsv4_fp4_mi355x_vllm.sh

* Update dsv4_fp4_mi355x_vllm.sh

* Update amd-master.yaml

* Update dsv4_fp4_mi355x_vllm.sh

* Apply HF_HUB_CACHE_MOUNT override for vllm framework on DeepSeek-V4-Pro

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* Remove VLLM_CACHE_ROOT workaround for dsv4 fp4 vllm

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* Add --async-scheduling and --no-enable-prefix-caching to dsv4 fp4 vllm

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: seungrokj <seungrokj@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
@@ -1814,7 +1814,7 @@ dsv4-fp4-mi355x-sglang:
 # gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
 # probe to validate the ROCm DP+EP path.
 dsv4-fp4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:nightly-b50646e5effd7cb5884cd96fdff4c53c18521198
+  image: vllm/vllm-openai-rocm:nightly-4f940896a32c9e2a0eba7f50d521bf5f6b4de458
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
@@ -1826,11 +1826,11 @@ dsv4-fp4-mi355x-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 128 }
+      - { tp: 8, conc-start: 4, conc-end: 512 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 128 }
+      - { tp: 8, conc-start: 4, conc-end: 512 }
 
 # Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
 # PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks
diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh b/benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh
@@ -18,11 +18,8 @@ set -eo pipefail
 # parameters (w13_weight_scale / w2_weight_scale), so safetensors
 # loading raises KeyError.
 #
-# --quantization deepseek_v4_fp8 forces the FP4-aware
-# DeepseekV4FP8Config instead of relying on model_type auto-detection.
-# That keeps the mixed-precision checkpoint on the intended MoE path
-# and avoids falling back to plain Fp8Config, which rejects
-# triton_unfused.
+# --compilation-config mode=3 with cudagraph_mode=FULL_AND_PIECEWISE replaces
+# --enforce-eager, enabling CUDA graph capture for improved throughput on MI355X.
 
 source "$(dirname "$0")/../benchmark_lib.sh"
 
@@ -48,10 +45,6 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
 fi
 
 export VLLM_ROCM_USE_AITER=1
-export VLLM_ROCM_USE_AITER_LINEAR=1
-# Loading the ~960 GB checkpoint into KV/weights can exceed the default
-# engine-ready timeout on first run from cold HF cache.
-export VLLM_ENGINE_READY_TIMEOUT_S=3600
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
@@ -77,20 +70,16 @@ set -x
 vllm serve $MODEL --port $PORT \
     "${PARALLEL_ARGS[@]}" \
     "${EP_ARGS[@]}" \
+    --async-scheduling \
+    --no-enable-prefix-caching \
     --distributed-executor-backend mp \
-    --gpu-memory-utilization 0.6 \
-    --max-model-len $MAX_MODEL_LEN \
-    --max-num-seqs 128 \
-    --max-num-batched-tokens 8192 \
+    --gpu-memory-utilization 0.8 \
     --kv-cache-dtype fp8 \
     --trust-remote-code \
-    --enforce-eager \
-    --async-scheduling \
-    --quantization deepseek_v4_fp8 \
     --moe-backend triton_unfused \
-    --no-enable-prefix-caching \
     --tokenizer-mode deepseek_v4 \
-    --reasoning-parser deepseek_v4 > $SERVER_LOG 2>&1 &
+    --reasoning-parser deepseek_v4 \
+    --compilation-config '{"mode":3,"cudagraph_mode":"FULL_AND_PIECEWISE"}' > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3056,3 +3056,9 @@
   description:
     - "Update SGLang image from v0.5.11-cu130 (5d old) to v0.5.12-cu130"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1475
+
+- config-keys:
+    - dsv4-fp4-mi355x-vllm
+  description:
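+    - "Bump vLLM ROCm image from nightly-b50646e5effd7cb5884cd96fdff4c53c18521198 to nightly-4f940896a32c9e2a0eba7f50d521bf5f6b4de458; replace --enforce-eager with --compilation-config mode=3 / FULL_AND_PIECEWISE, raise --gpu-memory-utilization from 0.6 to 0.8, drop the --max-model-len / --max-num-seqs / --max-num-batched-tokens caps, and extend the TP8 sweep conc-end from 128 to 512"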
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1546
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
@@ -218,7 +218,7 @@ else
     fi
 
     # to prevent reading outdated saved model. use a fresh model from hf repo
-    if [[ "$FRAMEWORK" == "atom" ]] && [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Pro" ]]; then
+    if [[ ("$FRAMEWORK" == "vllm" || "$FRAMEWORK" == "atom") ]] && [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Pro" ]]; then
         export HF_HUB_CACHE_MOUNT="/it-share/hf-hub-cache/"
     fi