fix(qwen3.5_fp8_b300): use --mm-attention-backend triton_attn

claude-fix-bot · functionstackx · commit e1d3a18155de · 2026-05-20T01:48:23.000-04:00
Same workaround as PR #1422: bypass the sm_103 assertion failure that the broken flash-attn cute kernel triggers in the Qwen-3.5-VL vision encoder by switching only the multimodal attention path (--mm-attention-backend) to triton_attn. The text decoder still uses --attention-backend trtllm_mha. See sgl-project/sglang#25564 and Dao-AILab/flash-attention#2572 for the upstream root cause and the in-flight fix.
diff --git a/benchmarks/single_node/qwen3.5_fp8_b300.sh b/benchmarks/single_node/qwen3.5_fp8_b300.sh
@@ -40,6 +40,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.
 --kv-cache-dtype fp8_e4m3 \
 --mamba-ssm-dtype bfloat16 \
 --attention-backend trtllm_mha \
+--mm-attention-backend triton_attn \
 --moe-runner-backend flashinfer_trtllm \
 --cuda-graph-max-bs $CONC \
 --max-running-requests $CONC \
diff --git a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh
@@ -40,6 +40,7 @@ SGLANG_ENABLE_SPEC_V2=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --mod
 --kv-cache-dtype fp8_e4m3 \
 --mamba-ssm-dtype bfloat16 \
 --attention-backend trtllm_mha \
+--mm-attention-backend triton_attn \
 --moe-runner-backend flashinfer_trtllm \
 --cuda-graph-max-bs $CONC \
 --max-running-requests $CONC \