Skip to content

Commit f210796

Browse files
seungrokj and claude authored
[AMD][ROCM] dsv4-fp4-mi355x-vllm, Bump vLLM ROCm image to (nightly-4f940896) (#1546)
* Bump vLLM ROCm image for dsv4-fp4-mi355x-vllm (nightly-4f940896)

  Update vllm/vllm-openai-rocm image from nightly-b50646e5 (May 18) to nightly-4f940896 (May 20) for dsv4-fp4-mi355x-vllm config.

* Update perf-changelog with PR #1546

* Update server args for dsv4-fp4-mi355x-vllm: add compilation-config, bump gpu-util to 0.8

* Update dsv4_fp4_mi355x_vllm.sh

* Update dsv4_fp4_mi355x_vllm.sh

* Update amd-master.yaml

* Update dsv4_fp4_mi355x_vllm.sh

* Apply HF_HUB_CACHE_MOUNT override for vllm framework on DeepSeek-V4-Pro

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* Remove VLLM_CACHE_ROOT workaround for dsv4 fp4 vllm

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* Add --async-scheduling and --no-enable-prefix-caching to dsv4 fp4 vllm

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: seungrokj <seungrokj@users.noreply.github.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent c178e28 commit f210796

4 files changed

Lines changed: 17 additions & 22 deletions

File tree

.github/configs/amd-master.yaml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1814,7 +1814,7 @@ dsv4-fp4-mi355x-sglang:
18141814
# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
18151815
# probe to validate the ROCm DP+EP path.
18161816
dsv4-fp4-mi355x-vllm:
1817-
image: vllm/vllm-openai-rocm:nightly-b50646e5effd7cb5884cd96fdff4c53c18521198
1817+
image: vllm/vllm-openai-rocm:nightly-4f940896a32c9e2a0eba7f50d521bf5f6b4de458
18181818
model: deepseek-ai/DeepSeek-V4-Pro
18191819
model-prefix: dsv4
18201820
runner: mi355x
@@ -1826,11 +1826,11 @@ dsv4-fp4-mi355x-vllm:
18261826
- isl: 1024
18271827
osl: 1024
18281828
search-space:
1829-
- { tp: 8, conc-start: 4, conc-end: 128 }
1829+
- { tp: 8, conc-start: 4, conc-end: 512 }
18301830
- isl: 8192
18311831
osl: 1024
18321832
search-space:
1833-
- { tp: 8, conc-start: 4, conc-end: 128 }
1833+
- { tp: 8, conc-start: 4, conc-end: 512 }
18341834

18351835
# Day-0 single-sequence marker for DeepSeek-V4 on ATOM (ROCm/ATOM#650).
18361836
# PR1 of the ATOM DSv4 series still uses torch sparse-attention fallbacks

benchmarks/single_node/dsv4_fp4_mi355x_vllm.sh

Lines changed: 7 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -18,11 +18,8 @@ set -eo pipefail
1818
# parameters (w13_weight_scale / w2_weight_scale), so safetensors
1919
# loading raises KeyError.
2020
#
21-
# --quantization deepseek_v4_fp8 forces the FP4-aware
22-
# DeepseekV4FP8Config instead of relying on model_type auto-detection.
23-
# That keeps the mixed-precision checkpoint on the intended MoE path
24-
# and avoids falling back to plain Fp8Config, which rejects
25-
# triton_unfused.
21+
# --compilation-config mode=3 with FULL_AND_PIECEWISE cudagraph mode
22+
# enables full CUDA graph capture for improved throughput on MI355X.
2623

2724
source "$(dirname "$0")/../benchmark_lib.sh"
2825

@@ -48,10 +45,6 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
4845
fi
4946

5047
export VLLM_ROCM_USE_AITER=1
51-
export VLLM_ROCM_USE_AITER_LINEAR=1
52-
# Loading the ~960 GB checkpoint into KV/weights can exceed the default
53-
# engine-ready timeout on first run from cold HF cache.
54-
export VLLM_ENGINE_READY_TIMEOUT_S=3600
5548

5649
SERVER_LOG=/workspace/server.log
5750
PORT=${PORT:-8888}
@@ -77,20 +70,16 @@ set -x
7770
vllm serve $MODEL --port $PORT \
7871
"${PARALLEL_ARGS[@]}" \
7972
"${EP_ARGS[@]}" \
73+
--async-scheduling \
74+
--no-enable-prefix-caching \
8075
--distributed-executor-backend mp \
81-
--gpu-memory-utilization 0.6 \
82-
--max-model-len $MAX_MODEL_LEN \
83-
--max-num-seqs 128 \
84-
--max-num-batched-tokens 8192 \
76+
--gpu-memory-utilization 0.8 \
8577
--kv-cache-dtype fp8 \
8678
--trust-remote-code \
87-
--enforce-eager \
88-
--async-scheduling \
89-
--quantization deepseek_v4_fp8 \
9079
--moe-backend triton_unfused \
91-
--no-enable-prefix-caching \
9280
--tokenizer-mode deepseek_v4 \
93-
--reasoning-parser deepseek_v4 > $SERVER_LOG 2>&1 &
81+
--reasoning-parser deepseek_v4 \
82+
--compilation-config '{"mode":3,"cudagraph_mode":"FULL_AND_PIECEWISE"}' > $SERVER_LOG 2>&1 &
9483

9584
SERVER_PID=$!
9685

perf-changelog.yaml

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3056,3 +3056,9 @@
30563056
description:
30573057
- "Update SGLang image from v0.5.11-cu130 (5d old) to v0.5.12-cu130"
30583058
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1475
3059+
3060+
- config-keys:
3061+
- dsv4-fp4-mi355x-vllm
3062+
description:
3063+
- "Bump vLLM ROCm image from nightly-b50646e5effd7cb5884cd96fdff4c53c18521198 to nightly-4f940896a32c9e2a0eba7f50d521bf5f6b4de458"
3064+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1546

runners/launch_mi355x-amds.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -218,7 +218,7 @@ else
218218
fi
219219

220220
# To prevent reading an outdated saved model, use a fresh model from the HF repo.
221-
if [[ "$FRAMEWORK" == "atom" ]] && [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Pro" ]]; then
221+
if [[ ("$FRAMEWORK" == "vllm" || "$FRAMEWORK" == "atom") ]] && [[ "$MODEL" == "deepseek-ai/DeepSeek-V4-Pro" ]]; then
222222
export HF_HUB_CACHE_MOUNT="/it-share/hf-hub-cache/"
223223
fi
224224

0 commit comments

Comments
 (0)