Skip to content

Commit dc720aa

Browse files
Merge remote-tracking branch 'upstream/amd/vllm_disagg_mvp_dev2' into amd/vllm_disagg_minimax_fp8_cdna3_v2
2 parents 973351e + e8b2230 commit dc720aa

17 files changed

Lines changed: 1057 additions & 94 deletions

.github/configs/amd-master.yaml

Lines changed: 50 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1611,7 +1611,7 @@ minimaxm2.5-fp8-mi325x-vllm-disagg:
16111611
- "VLLM_MORIIO_CONNECTOR_READ_MODE=1"
16121612

16131613
dsr1-fp4-mi355x-sglang-disagg:
1614-
image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501
1614+
image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
16151615
model: amd/DeepSeek-R1-0528-MXFP4-v2
16161616
model-prefix: dsr1
16171617
runner: mi355x-disagg
@@ -1800,6 +1800,25 @@ dsr1-fp4-mi355x-sglang-disagg:
18001800
- "DECODE_NODES=2"
18011801
- "DECODE_MTP_SIZE=0"
18021802

1803+
# 1*DEP8 + 1*DEP8
1804+
- spec-decoding: "none"
1805+
conc-list: [ 128, 256, 512 ]
1806+
prefill:
1807+
num-worker: 1
1808+
tp: 8
1809+
ep: 8
1810+
dp-attn: true
1811+
additional-settings:
1812+
- "PREFILL_NODES=1"
1813+
decode:
1814+
num-worker: 1
1815+
tp: 8
1816+
ep: 8
1817+
dp-attn: true
1818+
additional-settings:
1819+
- "DECODE_NODES=1"
1820+
- "DECODE_MTP_SIZE=0"
1821+
18031822
# 2*DEP8 + 1*DEP8
18041823
- spec-decoding: "none"
18051824
conc-list: [ 1024, 2048, 4096 ]
@@ -1820,7 +1839,7 @@ dsr1-fp4-mi355x-sglang-disagg:
18201839
- "DECODE_MTP_SIZE=0"
18211840

18221841
dsr1-fp4-mi355x-sglang-disagg-mtp:
1823-
image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501
1842+
image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
18241843
model: amd/DeepSeek-R1-0528-MXFP4-v2
18251844
model-prefix: dsr1
18261845
runner: mi355x-disagg
@@ -1990,24 +2009,43 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
19902009
- "DECODE_NODES=2"
19912010
- "DECODE_MTP_SIZE=2"
19922011

1993-
# 1P2D TP4
2012+
# 1*DEP8 + 1*DEP8
19942013
- spec-decoding: "mtp"
1995-
conc-list: [ 64, 128, 256 ]
2014+
conc-list: [ 128, 512 ]
19962015
prefill:
19972016
num-worker: 1
1998-
tp: 4
1999-
ep: 1
2000-
dp-attn: false
2017+
tp: 8
2018+
ep: 8
2019+
dp-attn: true
20012020
additional-settings:
20022021
- "PREFILL_NODES=1"
20032022
decode:
2004-
num-worker: 2
2023+
num-worker: 1
20052024
tp: 8
2006-
ep: 1
2007-
dp-attn: false
2025+
ep: 8
2026+
dp-attn: true
20082027
additional-settings:
2009-
- "DECODE_NODES=2"
2010-
- "DECODE_MTP_SIZE=2"
2028+
- "DECODE_NODES=1"
2029+
- "DECODE_MTP_SIZE=1"
2030+
2031+
# 1*DEP8 + 1*DEP8
2032+
- spec-decoding: "mtp"
2033+
conc-list: [ 64, 256 ]
2034+
prefill:
2035+
num-worker: 1
2036+
tp: 8
2037+
ep: 8
2038+
dp-attn: true
2039+
additional-settings:
2040+
- "PREFILL_NODES=1"
2041+
decode:
2042+
num-worker: 1
2043+
tp: 8
2044+
ep: 8
2045+
dp-attn: true
2046+
additional-settings:
2047+
- "DECODE_NODES=1"
2048+
- "DECODE_MTP_SIZE=1"
20112049

20122050
# 2*DEP8 + 1*DEP8
20132051
- spec-decoding: "mtp"

.github/configs/nvidia-master.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2352,7 +2352,7 @@ glm5-fp4-b200-sglang-mtp:
23522352
# does not have a B300-specific recipe, so this config reuses the existing
23532353
# GLM-5 FP4 B200 SGLang recipe as-is until B300-specific tuning is available.
23542354
glm5-fp4-b300-sglang:
2355-
image: lmsysorg/sglang:v0.5.12-cu130
2355+
image: lmsysorg/sglang:v0.5.11-cu130
23562356
model: nvidia/GLM-5-NVFP4
23572357
model-prefix: glm5
23582358
runner: b300
@@ -2373,7 +2373,7 @@ glm5-fp4-b300-sglang:
23732373
- { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
23742374

23752375
glm5-fp4-b300-sglang-mtp:
2376-
image: lmsysorg/sglang:v0.5.12-cu130
2376+
image: lmsysorg/sglang:v0.5.11-cu130
23772377
model: nvidia/GLM-5-NVFP4
23782378
model-prefix: glm5
23792379
runner: b300
@@ -4754,11 +4754,11 @@ minimaxm2.5-fp8-h200-vllm:
47544754
- isl: 1024
47554755
osl: 1024
47564756
search-space:
4757-
- { tp: 8, conc-start: 4, conc-end: 128 }
4757+
- { tp: 4, conc-start: 1, conc-end: 256 }
47584758
- isl: 8192
47594759
osl: 1024
47604760
search-space:
4761-
- { tp: 8, conc-start: 4, conc-end: 128 }
4761+
- { tp: 4, conc-start: 1, conc-end: 256 }
47624762

47634763
# Diverged from minimaxm2.5-fp8-h200-vllm (agentic-coding sibling). Metadata is
47644764
# identical to origin/main's minimaxm2.5-fp8-h200-vllm; the split exists because this

benchmarks/benchmark_lib.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ mkdir -p "$PYTHONPYCACHEPREFIX" 2>/dev/null || true
1515

1616
GPU_MONITOR_PID=""
1717
GPU_METRICS_CSV="/workspace/gpu_metrics.csv"
18+
export GPU_METRICS_CSV
1819

1920
# Start background GPU monitoring that logs metrics every second to CSV.
2021
# Auto-detects NVIDIA (nvidia-smi) or AMD (amd-smi) GPUs.
@@ -32,6 +33,7 @@ start_gpu_monitor() {
3233
done
3334

3435
GPU_METRICS_CSV="$output"
36+
export GPU_METRICS_CSV
3537

3638
if command -v nvidia-smi &>/dev/null; then
3739
nvidia-smi --query-gpu=timestamp,index,power.draw,temperature.gpu,clocks.current.sm,clocks.current.memory,utilization.gpu,utilization.memory \

benchmarks/multi_node/amd_utils/env.sh

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,11 @@ if [[ -z "$IBDEVICES" ]]; then
2222
DETECTED=$(ibv_devinfo 2>/dev/null | grep "hca_id:" | awk '{print $2}' | paste -sd',')
2323
if [[ -n "$DETECTED" ]]; then
2424
export IBDEVICES="$DETECTED"
25+
echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES via ibv_devinfo on $(hostname -s)"
2526
else
26-
echo "WARNING: Unable to detect RDMA devices. Set IBDEVICES explicitly." >&2
27+
echo "ERROR: Unable to detect RDMA devices. Set IBDEVICES explicitly." >&2
28+
exit 1
2729
fi
28-
echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $(hostname -s)"
2930
else
3031
echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)"
3132
fi
@@ -52,6 +53,10 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then
5253
# =========================================================================
5354
# vLLM/Nixl-specific environment
5455
# =========================================================================
56+
export VLLM_USE_V1=1
57+
export VLLM_SERVER_DEV_MODE=0
58+
export VLLM_DISABLE_REQUEST_ID_RANDOMIZATION=1
59+
5560
set -x
5661

5762
export VLLM_MORIIO_QP_PER_TRANSFER=4
@@ -129,7 +134,8 @@ else
129134
export SGLANG_USE_AITER=1
130135

131136
export SGLANG_MORI_DISPATCH_DTYPE=auto
132-
export SGLANG_MORI_FP8_COMB=true
137+
export MORI_COMBINE_DTYPE_PREFILL=fp8_direct_cast
138+
export MORI_COMBINE_DTYPE_DECODE=fp8
133139
export SGLANG_MORI_QP_PER_TRANSFER=4
134140
export SGLANG_MORI_NUM_WORKERS=4
135141
export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000
@@ -148,7 +154,7 @@ else
148154

149155
# Enable spec v2
150156
export SGLANG_ENABLE_SPEC_V2=1
151-
export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
157+
export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=0
152158

153159
export SGLANG_LOG_MS=true
154160
export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32

benchmarks/multi_node/amd_utils/job.slurm

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,7 @@ DOCKER_ENV_COMMON=(
388388
-e DECODE_ENABLE_EP=\$DECODE_ENABLE_EP
389389
-e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP
390390
-e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE
391+
-e IS_MULTINODE=\$IS_MULTINODE
391392
)
392393

393394
# Engine-specific env vars

benchmarks/multi_node/amd_utils/server_sglang.sh

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ BENCH_REQUEST_RATE="${BENCH_REQUEST_RATE:-inf}"
3333
BENCH_NUM_PROMPTS_MULTIPLIER="${BENCH_NUM_PROMPTS_MULTIPLIER:-10}"
3434
BENCH_MAX_CONCURRENCY="${BENCH_MAX_CONCURRENCY:-512}"
3535

36+
# Extract the maximum concurrency from the x-delimited list
37+
BENCH_MAX_CONC_VALUE=$(echo "$BENCH_MAX_CONCURRENCY" | tr 'x' '\n' | sort -n | tail -1)
38+
3639
# Dry Run for debugging purpose
3740
DRY_RUN="${DRY_RUN:-0}"
3841

@@ -184,6 +187,15 @@ else
184187
prefill_enable_two_batch_overlap="false"
185188
fi
186189

190+
# When both DP and EP are enabled, override max-running-requests with max bench concurrency
191+
if [[ "$PREFILL_ENABLE_DP" == "true" ]] && [[ "$PREFILL_ENABLE_EP" == "true" ]]; then
192+
prefill_max_running_requests=$BENCH_MAX_CONC_VALUE
193+
prefill_dp_ranks=$PREFILL_TP_SIZE
194+
# MORI_MAX_DISPATCH_TOKENS_PREFILL stays at 8192 (no change)
195+
MORI_MOE_MAX_INPUT_TOKENS_PREFILL=$((MORI_MAX_DISPATCH_TOKENS_PREFILL * prefill_dp_ranks / 2))
196+
echo "[DP+EP override] Prefill: max-running-requests=$prefill_max_running_requests, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_PREFILL"
197+
fi
198+
187199
# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp)
188200
if [[ "$DECODE_ENABLE_DP" == "true" ]]; then
189201
decode_cuda_graph_bs=($(seq $DECODE_CUDA_GRAPH_BS_DP_START $DECODE_CUDA_GRAPH_BS_DP_END))
@@ -196,6 +208,18 @@ else
196208
decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP
197209
fi
198210

211+
# When both DP and EP are enabled, override max-running-requests and dispatch tokens
212+
if [[ "$DECODE_ENABLE_DP" == "true" ]] && [[ "$DECODE_ENABLE_EP" == "true" ]]; then
213+
decode_max_running_requests=$BENCH_MAX_CONC_VALUE
214+
decode_dp_ranks=$DECODE_TP_SIZE
215+
MORI_MAX_DISPATCH_TOKENS_DECODE=$((BENCH_MAX_CONC_VALUE / decode_dp_ranks))
216+
MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * decode_dp_ranks * 7 / 10))
217+
# Update derived variable
218+
SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
219+
export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD
220+
echo "[DP+EP override] Decode: max-running-requests=$decode_max_running_requests, DISPATCH_TOKENS=$MORI_MAX_DISPATCH_TOKENS_DECODE, MOE_MAX_INPUT=$MORI_MOE_MAX_INPUT_TOKENS_DECODE, INTER_KERNEL_SWITCH=$SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD"
221+
fi
222+
199223
# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
200224
PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
201225
if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
@@ -343,11 +367,6 @@ if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]
343367
DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g')
344368
unset MORI_MOE_MAX_INPUT_TOKENS_PREFILL
345369
unset MORI_MOE_MAX_INPUT_TOKENS_DECODE
346-
# NOTE: that currently with fp8_combine set, the evals do not pass on InferenceX eval harness
347-
# or on SGLang native harness for high concurrency 4k and get nowhere near the golden score of
348-
# 0.95 on even basic GSM8k grade school math as confirmed by @billishyahao from AMD
349-
# and as confirmed by @Oseltamivir. This was initially merged with @billishyahao promising
350-
# that a fast-follow PR would fix the evals via quant correction in the fp8 combine.
351370
fi
352371

353372
# =============================================================================
@@ -398,7 +417,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
398417
PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
399418
fi
400419
set +x
401-
PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
420+
PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
402421
--model-path $MODEL_DIR/$MODEL_NAME \
403422
--disaggregation-mode prefill \
404423
--disaggregation-ib-device ${IBDEVICES} \
@@ -630,7 +649,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
630649
PREFILL_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL}"
631650
fi
632651
set +x
633-
PREFILL_CMD="${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
652+
PREFILL_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_PREFILL} ${PREFILL_SDMA_ENV} ${PREFILL_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
634653
--model-path $MODEL_DIR/${MODEL_NAME} \
635654
--disaggregation-mode prefill \
636655
--disaggregation-ib-device ${IBDEVICES} \
@@ -698,7 +717,7 @@ else
698717
DECODE_MORI_MOE_ENV="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE}"
699718
fi
700719
set +x
701-
DECODE_CMD="${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
720+
DECODE_CMD="SGLANG_MORI_COMBINE_DTYPE=${MORI_COMBINE_DTYPE_DECODE} ${DECODE_MORI_MOE_ENV} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
702721
--model-path ${MODEL_DIR}/${MODEL_NAME} \
703722
--disaggregation-mode decode \
704723
--disaggregation-ib-device ${IBDEVICES} \
@@ -758,4 +777,4 @@ else
758777
fi
759778

760779
echo "Script completed successfully"
761-
exit 0
780+
exit 0

benchmarks/multi_node/amd_utils/server_vllm.sh

Lines changed: 1 addition & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -196,34 +196,6 @@ python3 $WS_PATH/sync.py barrier \
196196
--wait-for-all-ports \
197197
--timeout 600
198198

199-
# =============================================================================
200-
# ETCD Server Setup
201-
# =============================================================================
202-
203-
# echo "Proceeding to start etcd server on $host_name"
204-
# bash ${WS_PATH}/start_etcd.sh > /dev/null 2>&1 &
205-
# etcd_pid=$!
206-
207-
# echo "Waiting at etcd server barrier on $host_name"
208-
# python3 $WS_PATH/sync.py barrier \
209-
# --node-ips ${IPADDRS} \
210-
# --node-ports 2379 \
211-
# --wait-for-all-ports \
212-
# --timeout 300
213-
214-
# echo "All etcd servers are up : $host_name"
215-
# sleep 3
216-
217-
# echo "etcd endpoint health=================="
218-
# etcdctl endpoint health 2>&1 || /usr/local/bin/etcd/etcdctl endpoint health 2>&1 || true
219-
# echo "======================================"
220-
221-
# python3 $WS_PATH/sync.py barrier \
222-
# --node-ips ${IPADDRS} \
223-
# --node-ports 2379 \
224-
# --wait-for-all-ports \
225-
# --timeout 300
226-
227199
# =============================================================================
228200
# Cluster Topology Configuration
229201
# =============================================================================
@@ -246,15 +218,10 @@ echo "Decode node IPs: ${DECODE_ARGS}"
246218
# MoRI-IO proxy ZMQ registration port (must match vllm-router --vllm-discovery-address)
247219
PROXY_PING_PORT="${PROXY_PING_PORT:-36367}"
248220

249-
# vLLM environment (UCX transport vars are set at the Docker level in job.slurm)
221+
# vLLM runtime environment (static vars moved to env.sh; these depend on per-node state)
250222
setup_vllm_env() {
251-
export VLLM_USE_V1=1
252-
export VLLM_SERVER_DEV_MODE=0
253223
export VLLM_NIXL_SIDE_CHANNEL_HOST=${rdma_ip}
254224
export VLLM_NIXL_SIDE_CHANNEL_PORT=5600
255-
# Workaround: disable request-ID randomization so MoRI-IO connector can
256-
# match completion IDs between prefill and decode without PR #34907 patch.
257-
export VLLM_DISABLE_REQUEST_ID_RANDOMIZATION=1
258225
for env_pair in ${MODEL_ENVS}; do
259226
export "$env_pair"
260227
done

benchmarks/single_node/glm5_fp4_b300.sh

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,6 @@ nvidia-smi
2424

2525
if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
2626

27-
# Downgrade flashinfer to the version pinned in sglang v0.5.11 to test the
28-
# trtllm batched-GEMM regression suspicion from sgl-project/sglang#25563
29-
# (suggested by @trevor-m). sglang v0.5.12's pyproject.toml moved from
30-
# flashinfer_python==0.6.8.post1 → 0.6.11.post1, and the trtllm GEMM crash
31-
# at bs=128 + EAGLE on B300 appeared in the same image bump.
32-
pip install --no-deps "flashinfer_python==0.6.8.post1" "flashinfer_cubin==0.6.8.post1"
33-
3427
SERVER_LOG=/workspace/server.log
3528
PORT=${PORT:-8888}
3629

benchmarks/single_node/glm5_fp4_b300_mtp.sh

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,6 @@ nvidia-smi
2424
if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
2525

2626
pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1"
27-
# Downgrade flashinfer to the version pinned in sglang v0.5.11 to test the
28-
# trtllm batched-GEMM regression suspicion from sgl-project/sglang#25563
29-
# (suggested by @trevor-m). sglang v0.5.12's pyproject.toml moved from
30-
# flashinfer_python==0.6.8.post1 → 0.6.11.post1, and the trtllm GEMM crash
31-
# at bs=128 + EAGLE on B300 appeared in the same image bump.
32-
pip install --no-deps "flashinfer_python==0.6.8.post1" "flashinfer_cubin==0.6.8.post1"
3327

3428
export SGL_ENABLE_JIT_DEEPGEMM=1
3529
export SGLANG_ENABLE_SPEC_V2=1

0 commit comments

Comments
 (0)