@@ -18,11 +18,8 @@ set -eo pipefail
1818# parameters (w13_weight_scale / w2_weight_scale), so safetensors
1919# loading raises KeyError.
2020#
21- # --quantization deepseek_v4_fp8 forces the FP4-aware
22- # DeepseekV4FP8Config instead of relying on model_type auto-detection.
23- # That keeps the mixed-precision checkpoint on the intended MoE path
24- # and avoids falling back to plain Fp8Config, which rejects
25- # triton_unfused.
21+ # --compilation-config mode=3 with FULL_AND_PIECEWISE cudagraph mode
22+ # enables full CUDA graph capture for improved throughput on MI355X.
2623
2724source "$(dirname "$0")/../benchmark_lib.sh"
2825
@@ -48,10 +45,6 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
4845fi
4946
5047export VLLM_ROCM_USE_AITER=1
51- export VLLM_ROCM_USE_AITER_LINEAR=1
52- # Loading the ~960 GB checkpoint into KV/weights can exceed the default
53- # engine-ready timeout on first run from cold HF cache.
54- export VLLM_ENGINE_READY_TIMEOUT_S=3600
5548
5649SERVER_LOG=/workspace/server.log
5750PORT=${PORT:-8888}
@@ -77,20 +70,16 @@ set -x
7770vllm serve $MODEL --port $PORT \
7871 "${PARALLEL_ARGS[@]}" \
7972 "${EP_ARGS[@]}" \
73+ --async-scheduling \
74+ --no-enable-prefix-caching \
8075 --distributed-executor-backend mp \
81- --gpu-memory-utilization 0.6 \
82- --max-model-len $MAX_MODEL_LEN \
83- --max-num-seqs 128 \
84- --max-num-batched-tokens 8192 \
76+ --gpu-memory-utilization 0.8 \
8577 --kv-cache-dtype fp8 \
8678 --trust-remote-code \
87- --enforce-eager \
88- --async-scheduling \
89- --quantization deepseek_v4_fp8 \
9079 --moe-backend triton_unfused \
91- --no-enable-prefix-caching \
9280 --tokenizer-mode deepseek_v4 \
93- --reasoning-parser deepseek_v4 > $SERVER_LOG 2>&1 &
81+ --reasoning-parser deepseek_v4 \
82+ --compilation-config '{"mode":3,"cudagraph_mode":"FULL_AND_PIECEWISE"}' > $SERVER_LOG 2>&1 &
9483
9584SERVER_PID=$!
9685
0 commit comments