Skip to content

Commit 50efc8f

Browse files
committed
Remove non-mtp update
1 parent a7b115d commit 50efc8f

3 files changed

Lines changed: 9 additions & 18 deletions

File tree

.github/configs/amd-master.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -261,7 +261,7 @@ qwen3.5-fp8-mi325x-sglang:
261261
- { tp: 8, conc-start: 4, conc-end: 64 }
262262

263263
qwen3.5-fp8-mi355x-sglang:
264-
image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528
264+
image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517
265265
model: Qwen/Qwen3.5-397B-A17B-FP8
266266
model-prefix: qwen3.5
267267
runner: mi355x

benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x.sh

Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -18,21 +18,21 @@ fi
1818

1919
if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
2020

21-
export SGLANG_USE_AITER_UNIFIED_ATTN=1
22-
export SGLANG_USE_AITER=1
23-
2421
SERVER_LOG=/workspace/server.log
22+
CONTEXT_LENGTH=$((ISL + OSL + 20))
23+
MAX_PREFILL_TOKENS=32768
2524

2625
EVAL_CONTEXT_ARGS=""
2726
if [ "${EVAL_ONLY}" = "true" ]; then
2827
setup_eval_context
2928
EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
29+
else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH"
3030
fi
3131
# Start GPU monitoring (power, temperature, clocks every second)
3232
start_gpu_monitor
3333

3434
python3 -m sglang.launch_server \
35-
--attention-backend aiter \
35+
--attention-backend triton \
3636
--model-path $MODEL \
3737
--host=0.0.0.0 \
3838
--port $PORT \
@@ -41,13 +41,11 @@ python3 -m sglang.launch_server \
4141
--trust-remote-code \
4242
--tokenizer-worker-num 6 \
4343
--enable-aiter-allreduce-fusion \
44-
--max-running-requests 512 \
44+
--cuda-graph-max-bs $CONC \
4545
--disable-radix-cache \
46-
--chunked-prefill-size 32768 \
46+
--max-prefill-tokens $MAX_PREFILL_TOKENS \
4747
--scheduler-recv-interval 30 \
48-
--mem-fraction-static 0.9 \
49-
--model-loader-extra-config '{"enable_multithread_load": true}' \
50-
--page-size 16 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
48+
--mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
5149

5250
SERVER_PID=$!
5351

perf-changelog.yaml

Lines changed: 1 addition & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -3475,16 +3475,9 @@
34753475
- "Set max-running-requests=256, chunked-prefill-size=16384, mem-fraction-static=0.8, cuda-graph-max-bs=CONC, and enable symm-mem"
34763476
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1544
34773477

3478-
- config-keys:
3479-
- qwen3.5-fp8-mi355x-sglang
3480-
description:
3481-
- "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528."
3482-
- "Update script for aiter attention backend."
3483-
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1669
3484-
34853478
- config-keys:
34863479
- qwen3.5-fp8-mi355x-sglang-mtp
34873480
description:
34883481
- "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528."
34893482
- "Update script for aiter attention backend."
3490-
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1669
3483+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1671

0 commit comments

Comments
 (0)