Skip to content

Commit 93a0207

Browse files
committed
add Qwen3.5-FP4-MI355X-SGLang-Disagg configuration
1 parent d2ee68c commit 93a0207

4 files changed

Lines changed: 180 additions & 0 deletions

File tree

.github/configs/amd-master.yaml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,61 @@ qwen3.5-fp4-mi355x-sglang-mtp:
392392
- { tp: 2, conc-start: 4, conc-end: 256, spec-decoding: mtp }
393393
- { tp: 4, conc-start: 4, conc-end: 16, spec-decoding: mtp }
394394

395+
qwen3.5-fp4-mi355x-sglang-disagg:
396+
image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501
397+
model: amd/Qwen3.5-397B-A17B-MXFP4
398+
model-prefix: qwen3.5
399+
runner: mi355x-disagg
400+
precision: fp4
401+
framework: sglang-disagg
402+
multinode: true
403+
disagg: true
404+
scenarios:
405+
fixed-seq-len:
406+
- isl: 1024
407+
osl: 1024
408+
search-space:
409+
# Mirrors qwen3.5-fp8-mi355x-sglang-disagg TP8/EP1 low-concurrency sweep
410+
- spec-decoding: "none"
411+
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
412+
prefill:
413+
num-worker: 1
414+
tp: 8
415+
ep: 1
416+
dp-attn: false
417+
additional-settings:
418+
- "PREFILL_NODES=1"
419+
decode:
420+
num-worker: 1
421+
tp: 8
422+
ep: 1
423+
dp-attn: false
424+
additional-settings:
425+
- "DECODE_NODES=1"
426+
- "DECODE_MTP_SIZE=0"
427+
428+
- isl: 8192
429+
osl: 1024
430+
search-space:
431+
# Mirrors qwen3.5-fp8-mi355x-sglang-disagg TP8/EP1 with DP-attention sweep
432+
- spec-decoding: "none"
433+
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
434+
prefill:
435+
num-worker: 1
436+
tp: 8
437+
ep: 1
438+
dp-attn: true
439+
additional-settings:
440+
- "PREFILL_NODES=1"
441+
decode:
442+
num-worker: 1
443+
tp: 8
444+
ep: 1
445+
dp-attn: true
446+
additional-settings:
447+
- "DECODE_NODES=1"
448+
- "DECODE_MTP_SIZE=0"
449+
395450
qwen3.5-fp8-mi300x-sglang:
396451
image: lmsysorg/sglang:v0.5.12-rocm720-mi30x
397452
model: Qwen/Qwen3.5-397B-A17B-FP8

benchmarks/multi_node/amd_utils/models.yaml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,37 @@ DeepSeek-R1-0528:
161161
chunked_prefill_size: 262144
162162
cuda_graph_bs_range: "1-128"
163163

164+
Qwen3.5-397B-A17B-MXFP4:
165+
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori --moe-dense-tp-size 1"
166+
mtp_flags: ""
167+
dp_flags: "--moe-a2a-backend mori --enable-dp-attention --enable-dp-lm-head"
168+
prefill:
169+
mem_fraction_static: 0.8
170+
disable_radix_cache: true
171+
dp:
172+
max_running_requests: 24
173+
chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
174+
cuda_graph_bs: "1 2 3"
175+
no_dp:
176+
max_running_requests: 128
177+
chunked_prefill_size: 262144
178+
cuda_graph_bs_range: "1-128"
179+
decode:
180+
mem_fraction_static: 0.85
181+
prefill_round_robin_balance: true
182+
dp:
183+
max_running_requests: 4096
184+
chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
185+
cuda_graph_bs_range: "1-160"
186+
ep_only:
187+
max_running_requests: 256
188+
chunked_prefill_size: 262144
189+
cuda_graph_bs_range: "1-256"
190+
no_dp:
191+
max_running_requests: 128
192+
chunked_prefill_size: 262144
193+
cuda_graph_bs_range: "1-128"
194+
164195
DeepSeek-R1-0528-MXFP4-Preview:
165196
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
166197
mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#!/usr/bin/env bash
# Submits one prefill/decode-disaggregated SGLang benchmark job through
# amd_utils/submit.sh and prints whatever submit.sh wrote to stdout (stored
# here as JOB_ID).
#
# Required environment (validated by check_env_vars below):
#   CONC_LIST, ISL, OSL, IMAGE, SPEC_DECODING, MODEL_PATH,
#   PREFILL_NUM_WORKERS, PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN,
#   DECODE_NUM_WORKERS, DECODE_TP, DECODE_EP, DECODE_DP_ATTN,
#   PREFILL_NODES, DECODE_NODES, RANDOM_RANGE_RATIO
# Also read, but NOT validated here:
#   GITHUB_WORKSPACE (repo root used for the cd), MODEL_NAME (re-exported),
#   NODE_LIST (optional trailing argument to submit.sh).
# NOTE(review): SPEC_DECODING is required but never referenced again in this
# script; presumably it is consumed downstream via the environment - confirm.
2+
3+
# Load shared helpers from the parent directory of this script.
# NOTE(review): check_env_vars is presumably defined there - confirm.
source "$(dirname "$0")/../benchmark_lib.sh"
4+
5+
# Validate every variable the job submission depends on.
check_env_vars \
6+
CONC_LIST \
7+
ISL \
8+
OSL \
9+
IMAGE \
10+
SPEC_DECODING \
11+
MODEL_PATH \
12+
PREFILL_NUM_WORKERS \
13+
PREFILL_TP \
14+
PREFILL_EP \
15+
PREFILL_DP_ATTN \
16+
DECODE_NUM_WORKERS \
17+
DECODE_TP \
18+
DECODE_EP \
19+
DECODE_DP_ATTN \
20+
PREFILL_NODES \
21+
DECODE_NODES \
22+
RANDOM_RANGE_RATIO
23+
24+
# When a Slurm job ID is present, log it together with the node name.
if [[ -n "$SLURM_JOB_ID" ]]; then
25+
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
26+
fi
27+
28+
# Trace every command from this point on.
set -x
29+
30+
# Use upstreamed multi_node scripts (no external clone needed)
31+
# Abort if the directory is missing; submit.sh is invoked relative to it.
cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
32+
33+
# Set up SGL launch script-specific environment variables
34+
export TIME_LIMIT="08:00:00"
35+
export MODEL_PATH=$MODEL_PATH
36+
# MODEL_NAME is not in the check_env_vars list above, so it may be exported empty.
export MODEL_NAME=$MODEL_NAME
37+
# The image is passed on under the name CONTAINER_IMAGE.
export CONTAINER_IMAGE=$IMAGE
38+
39+
# Prefill expert parallelism: an EP size of 1 (or unset/empty) means disabled;
# any other value enables it. -eq compares numerically.
if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
40+
export PREFILL_ENABLE_EP=false
41+
else
42+
export PREFILL_ENABLE_EP=true
43+
fi
44+
45+
# Prefill DP-attention: enabled only for the exact string "true".
if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
46+
export PREFILL_ENABLE_DP=true
47+
else
48+
export PREFILL_ENABLE_DP=false
49+
fi
50+
51+
# Decode expert parallelism: same mapping as the prefill side.
if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
52+
export DECODE_ENABLE_EP=false
53+
else
54+
export DECODE_ENABLE_EP=true
55+
fi
56+
57+
# Decode DP-attention: enabled only for the exact string "true".
if [[ "$DECODE_DP_ATTN" == "true" ]]; then
58+
export DECODE_ENABLE_DP=true
59+
else
60+
export DECODE_ENABLE_DP=false
61+
fi
62+
63+
# Submit one job for the requested ISL/OSL
64+
# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented
65+
# by a list of numbers delimited by 'x'. This is because of how the underlying launch script
66+
# expects the concurrencies.
67+
# Positional arguments, in order: prefill nodes, prefill workers, decode nodes,
# decode workers, ISL, OSL, 'x'-joined concurrencies, the literal "inf",
# prefill EP/DP flags, decode EP/DP flags, prefill TP, decode TP,
# random range ratio, optional node list.
# NOTE(review): the meaning of "inf" is defined by submit.sh - confirm there.
# ${NODE_LIST:-} is left unquoted so that an unset/empty value adds no argument.
JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
68+
$PREFILL_NUM_WORKERS \
69+
$DECODE_NODES \
70+
$DECODE_NUM_WORKERS \
71+
$ISL $OSL "${CONC_LIST// /x}" inf \
72+
${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
73+
${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
74+
${PREFILL_TP} ${DECODE_TP} \
75+
${RANDOM_RANGE_RATIO} \
76+
${NODE_LIST:-})
77+
78+
# $? here is the exit status of the command substitution above, i.e. submit.sh.
if [[ $? -ne 0 ]]; then
79+
echo "Failed to submit job" >&2
80+
exit 1
81+
fi
82+
83+
# Print the captured submit.sh output on stdout.
echo "$JOB_ID"

perf-changelog.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2934,3 +2934,14 @@
29342934
description:
29352935
- "Update SGLang ROCm image from v0.5.11/v0.5.10rc0 to v0.5.12-rocm720-mi35x-20260517"
29362936
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1440
2937+
2938+
- config-keys:
2939+
- qwen3.5-fp4-mi355x-sglang-disagg
2940+
description:
2941+
- "Add SGLang PD-disaggregation config for Qwen3.5-397B-A17B-MXFP4 on MI355X"
2942+
- "Image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501"
2943+
- "Model: amd/Qwen3.5-397B-A17B-MXFP4"
2944+
- "1P1D TP8/EP1: 1k1k pure-TP, 8k1k with DP-attention; conc 8-512"
2945+
- "New launch script: benchmarks/multi_node/qwen3.5_fp4_mi355x_sglang-disagg.sh"
2946+
- "New models.yaml entry: Qwen3.5-397B-A17B-MXFP4 (aiter attn, mori transfer, fp8_e4m3 KV cache)"
2947+
pr-link: XXX

0 commit comments

Comments
 (0)