File tree Expand file tree Collapse file tree
benchmarks/single_node/fixed_seq_len Expand file tree Collapse file tree Original file line number Diff line number Diff line change 4242 EP=()
4343fi
4444
45+ if [ "$ISL" = "8192" ]; then
46+ ATTN_BACKEND="FLASH_ATTN"
47+ AUTOTUNE_FLAG=()
48+ else
49+ ATTN_BACKEND="FLASHINFER"
50+ AUTOTUNE_FLAG=(--enable-flashinfer-autotune)
51+ fi
52+
4553# Start GPU monitoring (power, temperature, clocks every second)
4654start_gpu_monitor
4755
@@ -55,8 +63,8 @@ vllm serve "$MODEL" --port "$PORT" \
5563--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
5664--kv-cache-dtype fp8 \
5765--moe-backend triton \
58- --attention-backend FLASHINFER \
59- --enable-flashinfer-autotune \
66+ --attention-backend "$ATTN_BACKEND" \
67+ "${AUTOTUNE_FLAG[@]}" \
6068--compilation-config "$COMPILATION_CONFIG" \
6169--no-enable-prefix-caching \
6270--trust-remote-code > "$SERVER_LOG" 2>&1 &
Original file line number Diff line number Diff line change 34883488 - "Bump image to lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260528."
34893489 - "Update script for aiter attention backend."
34903490 pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1669
3491+
3492+ - config-keys:
3493+ - minimaxm2.5-fp8-h200-vllm
3494+ description:
3495+ - "Switch attention backend from FLASHINFER to FLASH_ATTN for the 8k/1k cell of MiniMax-M2.5 FP8 H200 vLLM."
3496+ - "1k/1k cell not changed in this PR: at 1k/1k all three measured configs."
3497+ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1668
You can’t perform that action at this time.
0 commit comments