diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 1ad705468..542ce7d56 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -283,7 +283,7 @@ qwen3.5-fp8-mi355x-sglang: - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } qwen3.5-fp8-mi355x-sglang-mtp: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260517 + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x diff --git a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_mtp.sh index 38230cc88..f422eae64 100755 --- a/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_mi355x_mtp.sh @@ -18,21 +18,21 @@ fi if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +export SGLANG_USE_AITER_UNIFIED_ATTN=1 +export SGLANG_USE_AITER=1 + SERVER_LOG=/workspace/server.log -CONTEXT_LENGTH=$((ISL + OSL + 20)) -MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" -else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor python3 -m sglang.launch_server \ - --attention-backend triton \ + --attention-backend aiter \ --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ @@ -41,11 +41,13 @@ python3 -m sglang.launch_server \ --trust-remote-code \ --tokenizer-worker-num 6 \ --enable-aiter-allreduce-fusion \ - --cuda-graph-max-bs $CONC \ + --max-running-requests 512 \ --disable-radix-cache \ - --max-prefill-tokens $MAX_PREFILL_TOKENS \ + --chunked-prefill-size 32768 \ --scheduler-recv-interval 30 \ - --mem-fraction-static 0.8 \ + --mem-fraction-static 0.9 \ + --model-loader-extra-config '{"enable_multithread_load": true}' \ + --page-size 16 \ --speculative-algorithm EAGLE \ --speculative-num-steps 3 \ --speculative-eagle-topk 1 \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a5f3f3478..c3a256ba9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3474,3 +3474,10 @@ - "Use scheduler-recv-interval values 2/60/30/1200/600/1920 for conc 1-4/8/16/32/64/128-256" - "Set max-running-requests=256, chunked-prefill-size=16384, mem-fraction-static=0.8, cuda-graph-max-bs=CONC, and enable symm-mem" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1544 + +- config-keys: + - qwen3.5-fp8-mi355x-sglang-mtp + description: + - "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528." + - "Update script for aiter attention backend." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1671