Skip to content

Commit 2dca601

Browse files
authored
[AMD] improve dsr1 fp4 disagg (#1584)
* [AMD] improve dsr1 fp4 disagg
  > Co-authored-by: billishyahao <bill.he@amd.com>
  > Co-authored-by: Duyi-Wang <duyi.wang@amd.com>
* add perf changelog
* fix
* fix regression
* sync config
* fix
* fix
* suppress aiter log
* fix
* fix
* fix
1 parent f9aafa9 commit 2dca601

3 files changed

Lines changed: 70 additions & 10 deletions

File tree

.github/configs/amd-master.yaml

Lines changed: 60 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1861,8 +1861,8 @@ dsr1-fp4-mi355x-sglang-disagg:
18611861
- "DECODE_NODES=1"
18621862
- "DECODE_MTP_SIZE=0"
18631863

1864-
dsr1-fp4-mi355x-sglang-disagg-mtp:
1865-
image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
1864+
dsr1-fp4-mi355x-sglang-disagg-1k1k-mtp:
1865+
image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
18661866
model: amd/DeepSeek-R1-0528-MXFP4-v2
18671867
model-prefix: dsr1
18681868
runner: mi355x-disagg
@@ -1970,7 +1970,19 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
19701970
additional-settings:
19711971
- "DECODE_NODES=1"
19721972
- "DECODE_MTP_SIZE=1"
1973+
19731974

1975+
dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
1976+
image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
1977+
model: amd/DeepSeek-R1-0528-MXFP4-v2
1978+
model-prefix: dsr1
1979+
runner: mi355x-disagg
1980+
precision: fp4
1981+
framework: sglang-disagg
1982+
multinode: true
1983+
disagg: true
1984+
scenarios:
1985+
fixed-seq-len:
19741986
- isl: 8192
19751987
osl: 1024
19761988
search-space:
@@ -2015,7 +2027,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
20152027

20162028
# 1P2D TP8
20172029
- spec-decoding: "mtp"
2018-
conc-list: [ 64, 128, 256 ]
2030+
conc-list: [ 32, 64 ]
20192031
prefill:
20202032
num-worker: 1
20212033
tp: 8
@@ -2030,11 +2042,11 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
20302042
dp-attn: false
20312043
additional-settings:
20322044
- "DECODE_NODES=2"
2033-
- "DECODE_MTP_SIZE=2"
2045+
- "DECODE_MTP_SIZE=3"
20342046

20352047
# 1*DEP8 + 1*DEP8
20362048
- spec-decoding: "mtp"
2037-
conc-list: [ 128, 512 ]
2049+
conc-list: [ 640, 512 ]
20382050
prefill:
20392051
num-worker: 1
20402052
tp: 8
@@ -2049,11 +2061,11 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
20492061
dp-attn: true
20502062
additional-settings:
20512063
- "DECODE_NODES=1"
2052-
- "DECODE_MTP_SIZE=1"
2064+
- "DECODE_MTP_SIZE=3"
20532065

20542066
# 1*DEP8 + 1*DEP8
20552067
- spec-decoding: "mtp"
2056-
conc-list: [ 64, 256 ]
2068+
conc-list: [ 256 ]
20572069
prefill:
20582070
num-worker: 1
20592071
tp: 8
@@ -2068,7 +2080,46 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
20682080
dp-attn: true
20692081
additional-settings:
20702082
- "DECODE_NODES=1"
2071-
- "DECODE_MTP_SIZE=1"
2083+
- "DECODE_MTP_SIZE=3"
2084+
2085+
2086+
# 1*DEP8 + 1*DEP8
2087+
- spec-decoding: "mtp"
2088+
conc-list: [ 128 ]
2089+
prefill:
2090+
num-worker: 1
2091+
tp: 8
2092+
ep: 8
2093+
dp-attn: true
2094+
additional-settings:
2095+
- "PREFILL_NODES=1"
2096+
decode:
2097+
num-worker: 1
2098+
tp: 8
2099+
ep: 8
2100+
dp-attn: true
2101+
additional-settings:
2102+
- "DECODE_NODES=1"
2103+
- "DECODE_MTP_SIZE=3"
2104+
2105+
# 1*DEP8 + 1*DEP8
2106+
- spec-decoding: "mtp"
2107+
conc-list: [ 64 ]
2108+
prefill:
2109+
num-worker: 1
2110+
tp: 8
2111+
ep: 8
2112+
dp-attn: true
2113+
additional-settings:
2114+
- "PREFILL_NODES=1"
2115+
decode:
2116+
num-worker: 1
2117+
tp: 8
2118+
ep: 8
2119+
dp-attn: true
2120+
additional-settings:
2121+
- "DECODE_NODES=1"
2122+
- "DECODE_MTP_SIZE=3"
20722123

20732124
# 2*DEP8 + 1*DEP8
20742125
- spec-decoding: "mtp"
@@ -2088,7 +2139,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
20882139
additional-settings:
20892140
- "DECODE_NODES=1"
20902141
- "DECODE_MTP_SIZE=1"
2091-
2142+
20922143

20932144
# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the
20942145
# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the

benchmarks/multi_node/amd_utils/env.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,9 +124,11 @@ else
124124
# =========================================================================
125125

126126
export SGLANG_USE_AITER=1
127+
export AITER_LOG_LEVEL=ERROR
127128

128129
export SGLANG_MORI_DISPATCH_DTYPE=auto
129-
export SGLANG_MORI_FP8_COMB=true
130+
export MORI_COMBINE_DTYPE_PREFILL=fp8_direct_cast
131+
export MORI_COMBINE_DTYPE_DECODE=fp8
130132
export SGLANG_MORI_QP_PER_TRANSFER=4
131133
export SGLANG_MORI_NUM_WORKERS=4
132134
export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000

perf-changelog.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3201,6 +3201,13 @@
32013201
- "MoRI conn.py overlay (48e459bd) via job.slurm; launcher qwen3.5_fp4_mi355x_sglang-disagg.sh"
32023202
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1579
32033203

3204+
- config-keys:
3205+
- dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp
3206+
description:
3206+
- "Bump the image to the May 29 build (v0.5.12.post1-rocm720-mi35x-20260529)"
3208+
- "Add new conc 128/256 sweep points"
3209+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1584
3210+
32043211
- config-keys:
32053212
- glm5-fp8-gb300-dynamo-sglang
32063213
description:

0 commit comments

Comments
 (0)