Skip to content

Commit b28da84

Browse files
committed
[AMD] Switch Qwen3.5 FP8 MI355X benchmarks to aiter attention backend
1 parent ea4f575 commit b28da84

2 files changed

Lines changed: 18 additions & 14 deletions

File tree

benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x.sh

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -18,21 +18,21 @@ fi
1818

1919
if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
2020

21+
export SGLANG_USE_AITER_UNIFIED_ATTN=1
22+
export SGLANG_USE_AITER=1
23+
2124
SERVER_LOG=/workspace/server.log
22-
CONTEXT_LENGTH=$((ISL + OSL + 20))
23-
MAX_PREFILL_TOKENS=32768
2425

2526
EVAL_CONTEXT_ARGS=""
2627
if [ "${EVAL_ONLY}" = "true" ]; then
2728
setup_eval_context
2829
EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
29-
else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH"
3030
fi
3131
# Start GPU monitoring (power, temperature, clocks every second)
3232
start_gpu_monitor
3333

3434
python3 -m sglang.launch_server \
35-
--attention-backend triton \
35+
--attention-backend aiter \
3636
--model-path $MODEL \
3737
--host=0.0.0.0 \
3838
--port $PORT \
@@ -41,11 +41,13 @@ python3 -m sglang.launch_server \
4141
--trust-remote-code \
4242
--tokenizer-worker-num 6 \
4343
--enable-aiter-allreduce-fusion \
44-
--cuda-graph-max-bs $CONC \
44+
--max-running-requests 512 \
4545
--disable-radix-cache \
46-
--max-prefill-tokens $MAX_PREFILL_TOKENS \
46+
--chunked-prefill-size 32768 \
4747
--scheduler-recv-interval 30 \
48-
--mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
48+
--mem-fraction-static 0.9 \
49+
--model-loader-extra-config '{"enable_multithread_load": true}' \
50+
--page-size 16 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
4951

5052
SERVER_PID=$!
5153

benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_mtp.sh

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -18,21 +18,21 @@ fi
1818

1919
if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
2020

21+
export SGLANG_USE_AITER_UNIFIED_ATTN=1
22+
export SGLANG_USE_AITER=1
23+
2124
SERVER_LOG=/workspace/server.log
22-
CONTEXT_LENGTH=$((ISL + OSL + 20))
23-
MAX_PREFILL_TOKENS=32768
2425

2526
EVAL_CONTEXT_ARGS=""
2627
if [ "${EVAL_ONLY}" = "true" ]; then
2728
setup_eval_context
2829
EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
29-
else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH"
3030
fi
3131
# Start GPU monitoring (power, temperature, clocks every second)
3232
start_gpu_monitor
3333

3434
python3 -m sglang.launch_server \
35-
--attention-backend triton \
35+
--attention-backend aiter \
3636
--model-path $MODEL \
3737
--host=0.0.0.0 \
3838
--port $PORT \
@@ -41,11 +41,13 @@ python3 -m sglang.launch_server \
4141
--trust-remote-code \
4242
--tokenizer-worker-num 6 \
4343
--enable-aiter-allreduce-fusion \
44-
--cuda-graph-max-bs $CONC \
44+
--max-running-requests 512 \
4545
--disable-radix-cache \
46-
--max-prefill-tokens $MAX_PREFILL_TOKENS \
46+
--chunked-prefill-size 32768 \
4747
--scheduler-recv-interval 30 \
48-
--mem-fraction-static 0.8 \
48+
--mem-fraction-static 0.9 \
49+
--model-loader-extra-config '{"enable_multithread_load": true}' \
50+
--page-size 16 \
4951
--speculative-algorithm EAGLE \
5052
--speculative-num-steps 3 \
5153
--speculative-eagle-topk 1 \

0 commit comments

Comments (0)