Skip to content

Commit 437e01a

Browse files
1am9trash and yichiche authored
[AMD][MI35X] Qwen3.5-fp8 SGLang single-node benchmark (#1669)
* [AMD] Switch Qwen3.5 FP8 MI355X benchmarks to aiter attention backend
* [AMD] Bump Qwen3.5 FP8 MI355X image to v0.5.12.post1-rocm720-mi35x-20260528
* Update change log
* Change mem-fraction-static back to 0.8
* Add context len args and limit batch to CONC size
* Fix typo
* Fix typo
* Change config to tp4

---------

Co-authored-by: jacky.cheng <yichiche@amd.com>
1 parent 7388788 commit 437e01a

4 files changed

Lines changed: 31 additions & 19 deletions

File tree

.github/configs/amd-master.yaml

Lines changed: 6 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -261,7 +261,7 @@ qwen3.5-fp8-mi325x-sglang:
261261
- { tp: 8, conc-start: 4, conc-end: 64 }
262262

263263
qwen3.5-fp8-mi355x-sglang:
264-
image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
264+
image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528
265265
model: Qwen/Qwen3.5-397B-A17B-FP8
266266
model-prefix: qwen3.5
267267
runner: mi355x
@@ -273,17 +273,14 @@ qwen3.5-fp8-mi355x-sglang:
273273
- isl: 1024
274274
osl: 1024
275275
search-space:
276-
- { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
277-
- { tp: 8, ep: 8, conc-start: 64, conc-end: 256 }
278-
- { tp: 2, ep: 2, conc-start: 128, conc-end: 256 }
276+
- { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
279277
- isl: 8192
280278
osl: 1024
281279
search-space:
282-
- { tp: 2, ep: 2, conc-start: 4, conc-end: 32 }
283-
- { tp: 4, ep: 1, conc-start: 32, conc-end: 256 }
280+
- { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
284281

285282
qwen3.5-fp8-mi355x-sglang-mtp:
286-
image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
283+
image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528
287284
model: Qwen/Qwen3.5-397B-A17B-FP8
288285
model-prefix: qwen3.5
289286
runner: mi355x
@@ -295,14 +292,11 @@ qwen3.5-fp8-mi355x-sglang-mtp:
295292
- isl: 1024
296293
osl: 1024
297294
search-space:
298-
- { tp: 8, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp }
299-
- { tp: 8, ep: 8, conc-start: 64, conc-end: 256, spec-decoding: mtp }
300-
- { tp: 2, ep: 2, conc-start: 128, conc-end: 256, spec-decoding: mtp }
295+
- { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
301296
- isl: 8192
302297
osl: 1024
303298
search-space:
304-
- { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp }
305-
- { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp }
299+
- { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
306300

307301
# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). Metadata is
308302
# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this

benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x.sh

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -18,9 +18,11 @@ fi
1818

1919
if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
2020

21+
export SGLANG_USE_AITER_UNIFIED_ATTN=1
22+
export SGLANG_USE_AITER=1
23+
2124
SERVER_LOG=/workspace/server.log
2225
CONTEXT_LENGTH=$((ISL + OSL + 20))
23-
MAX_PREFILL_TOKENS=32768
2426

2527
EVAL_CONTEXT_ARGS=""
2628
if [ "${EVAL_ONLY}" = "true" ]; then
@@ -32,7 +34,7 @@ fi
3234
start_gpu_monitor
3335

3436
python3 -m sglang.launch_server \
35-
--attention-backend triton \
37+
--attention-backend aiter \
3638
--model-path $MODEL \
3739
--host=0.0.0.0 \
3840
--port $PORT \
@@ -41,11 +43,14 @@ python3 -m sglang.launch_server \
4143
--trust-remote-code \
4244
--tokenizer-worker-num 6 \
4345
--enable-aiter-allreduce-fusion \
46+
--max-running-requests $CONC \
4447
--cuda-graph-max-bs $CONC \
4548
--disable-radix-cache \
46-
--max-prefill-tokens $MAX_PREFILL_TOKENS \
49+
--chunked-prefill-size 32768 \
4750
--scheduler-recv-interval 30 \
48-
--mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
51+
--mem-fraction-static 0.8 \
52+
--model-loader-extra-config '{"enable_multithread_load": true}' \
53+
--page-size 16 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
4954

5055
SERVER_PID=$!
5156

benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_mtp.sh

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -18,9 +18,11 @@ fi
1818

1919
if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
2020

21+
export SGLANG_USE_AITER_UNIFIED_ATTN=1
22+
export SGLANG_USE_AITER=1
23+
2124
SERVER_LOG=/workspace/server.log
2225
CONTEXT_LENGTH=$((ISL + OSL + 20))
23-
MAX_PREFILL_TOKENS=32768
2426

2527
EVAL_CONTEXT_ARGS=""
2628
if [ "${EVAL_ONLY}" = "true" ]; then
@@ -32,7 +34,7 @@ fi
3234
start_gpu_monitor
3335

3436
python3 -m sglang.launch_server \
35-
--attention-backend triton \
37+
--attention-backend aiter \
3638
--model-path $MODEL \
3739
--host=0.0.0.0 \
3840
--port $PORT \
@@ -41,11 +43,14 @@ python3 -m sglang.launch_server \
4143
--trust-remote-code \
4244
--tokenizer-worker-num 6 \
4345
--enable-aiter-allreduce-fusion \
46+
--max-running-requests $CONC \
4447
--cuda-graph-max-bs $CONC \
4548
--disable-radix-cache \
46-
--max-prefill-tokens $MAX_PREFILL_TOKENS \
49+
--chunked-prefill-size 32768 \
4750
--scheduler-recv-interval 30 \
4851
--mem-fraction-static 0.8 \
52+
--model-loader-extra-config '{"enable_multithread_load": true}' \
53+
--page-size 16 \
4954
--speculative-algorithm EAGLE \
5055
--speculative-num-steps 3 \
5156
--speculative-eagle-topk 1 \

perf-changelog.yaml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3481,6 +3481,14 @@
34813481
- "Set max-running-requests=256, chunked-prefill-size=16384, mem-fraction-static=0.8, cuda-graph-max-bs=CONC, and enable symm-mem"
34823482
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1544
34833483

3484+
- config-keys:
3485+
- qwen3.5-fp8-mi355x-sglang
3486+
- qwen3.5-fp8-mi355x-sglang-mtp
3487+
description:
3488+
- "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528."
3489+
- "Update script for aiter attention backend from triton."
3490+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1669
3491+
34843492
- config-keys:
34853493
- minimaxm2.5-fp8-h200-vllm
34863494
description:

0 commit comments

Comments
 (0)