From b28da845ab413dc3e2af8d64de11146565442a61 Mon Sep 17 00:00:00 2001 From: "jacky.cheng" Date: Fri, 5 Jun 2026 02:00:45 +0000 Subject: [PATCH 1/4] [AMD] Switch Qwen3.5 FP8 MI355X benchmarks to aiter attention backend --- .../fixed_seq_len/qwen3.5_fp8_mi355x.sh | 16 +++++++++------- .../fixed_seq_len/qwen3.5_fp8_mi355x_mtp.sh | 16 +++++++++------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x.sh index 1661df465..591eea283 100644 --- a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x.sh @@ -18,21 +18,21 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +export SGLANG_USE_AITER_UNIFIED_ATTN=1 +export SGLANG_USE_AITER=1 + SERVER_LOG=/workspace/server.log -CONTEXT_LENGTH=$((ISL + OSL + 20)) -MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" -else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor python3 -m sglang.launch_server \ - --attention-backend triton \ + --attention-backend aiter \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ @@ -41,11 +41,13 @@ python3 -m sglang.launch_server \ --trust-remote-code \ --tokenizer-worker-num 6 \ --enable-aiter-allreduce-fusion \ - --cuda-graph-max-bs $CONC \ + --max-running-requests 512 \ --disable-radix-cache \ - --max-prefill-tokens $MAX_PREFILL_TOKENS \ + --chunked-prefill-size 32768 \ --scheduler-recv-interval 30 \ - --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.9 \ + --model-loader-extra-config '{"enable_multithread_load": true}' \ + --page-size 16 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_mtp.sh index 38230cc88..f422eae64 100755 --- a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_mtp.sh @@ -18,21 +18,21 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +export SGLANG_USE_AITER_UNIFIED_ATTN=1 +export SGLANG_USE_AITER=1 + SERVER_LOG=/workspace/server.log -CONTEXT_LENGTH=$((ISL + OSL + 20)) -MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" -else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor python3 -m sglang.launch_server \ - --attention-backend triton \ + --attention-backend aiter \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ @@ -41,11 +41,13 @@ python3 -m sglang.launch_server \ --trust-remote-code \ --tokenizer-worker-num 6 \ --enable-aiter-allreduce-fusion \ - --cuda-graph-max-bs $CONC \ + --max-running-requests 512 \ --disable-radix-cache \ - --max-prefill-tokens $MAX_PREFILL_TOKENS \ + --chunked-prefill-size 32768 \ --scheduler-recv-interval 30 \ - --mem-fraction-static 0.8 \ + --mem-fraction-static 0.9 \ + --model-loader-extra-config '{"enable_multithread_load": true}' \ + --page-size 16 \ --speculative-algorithm EAGLE \ --speculative-num-steps 3 \ --speculative-eagle-topk 1 \ From 3b685b4f89e0dd6de905ce828bc8c2778fd61564 Mon Sep 17 00:00:00 2001 From: "jacky.cheng" Date: Fri, 5 Jun 2026 02:06:32 +0000 Subject: [PATCH 2/4] [AMD] Bump Qwen3.5 FP8 MI355X image to v0.5.12.post1-rocm720-mi35x-20260528 --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 1ad705468..add9c8d54 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -261,7 +261,7 @@ qwen3.5-fp8-mi325x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x @@ -283,7 +283,7 @@ qwen3.5-fp8-mi355x-sglang: - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } qwen3.5-fp8-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x From a7b115de69a9f882186ef81e0213f45b972fa306 Mon Sep 17 00:00:00 2001 From: thomawan Date: Fri, 5 Jun 2026 10:39:54 +0800 Subject: [PATCH 3/4] Update change log --- perf-changelog.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a5f3f3478..14d3c3014 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3474,3 +3474,17 @@ - "Use scheduler-recv-interval values 2/60/30/1200/600/1920 for conc 1-4/8/16/32/64/128-256" - "Set max-running-requests=256, chunked-prefill-size=16384, mem-fraction-static=0.8, cuda-graph-max-bs=CONC, and enable symm-mem" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1544 + +- config-keys: + - qwen3.5-fp8-mi355x-sglang + description: + - "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528." + - "Update script for aiter attention backend." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1669 + +- config-keys: + - qwen3.5-fp8-mi355x-sglang-mtp + description: + - "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528." + - "Update script for aiter attention backend." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1669 From 50efc8fe8138e0b905354891e2d1e29c6682bdcc Mon Sep 17 00:00:00 2001 From: thomawan Date: Fri, 5 Jun 2026 11:19:46 +0800 Subject: [PATCH 4/4] Remove non-mtp update --- .github/configs/amd-master.yaml | 2 +- .../fixed_seq_len/qwen3.5_fp8_mi355x.sh | 16 +++++++--------- perf-changelog.yaml | 9 +-------- 3 files changed, 9 insertions(+), 18 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index add9c8d54..542ce7d56 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -261,7 +261,7 @@ qwen3.5-fp8-mi325x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528 + image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x diff --git a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x.sh index 591eea283..1661df465 100644 --- a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x.sh @@ -18,21 +18,21 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi -export SGLANG_USE_AITER_UNIFIED_ATTN=1 -export SGLANG_USE_AITER=1 - SERVER_LOG=/workspace/server.log +CONTEXT_LENGTH=$((ISL + OSL + 20)) +MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor python3 -m sglang.launch_server \ - --attention-backend aiter \ + --attention-backend triton \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ @@ -41,13 +41,11 @@ python3 -m sglang.launch_server \ --trust-remote-code \ --tokenizer-worker-num 6 \ --enable-aiter-allreduce-fusion \ - --max-running-requests 512 \ + --cuda-graph-max-bs $CONC \ --disable-radix-cache \ - --chunked-prefill-size 32768 \ + --max-prefill-tokens $MAX_PREFILL_TOKENS \ --scheduler-recv-interval 30 \ - --mem-fraction-static 0.9 \ - --model-loader-extra-config '{"enable_multithread_load": true}' \ - --page-size 16 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 14d3c3014..c3a256ba9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3475,16 +3475,9 @@ - "Set max-running-requests=256, chunked-prefill-size=16384, mem-fraction-static=0.8, cuda-graph-max-bs=CONC, and enable symm-mem" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1544 -- config-keys: - - qwen3.5-fp8-mi355x-sglang - description: - - "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528." - - "Update script for aiter attention backend." - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1669 - - config-keys: - qwen3.5-fp8-mi355x-sglang-mtp description: - "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528." - "Update script for aiter attention backend." - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1669 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1671