11#! /usr/bin/env bash
22
33# Qwen-3.5-397B-A17B FP8 on H100 via sglang.
4- # Mirrors qwen3.5_fp8_h200.sh but with tighter memory accommodations:
5- # H100 has 80GB HBM3 vs H200's 141GB HBM3e, so weights + KV cache fit
6- # more snugly. Mem-fraction-static lowered from 0.8 → 0.75 and
7- # chunked-prefill-size from 16384 → 8192 to leave more headroom.
8- # Sweep tops out at conc=32 instead of 64 for the same reason.
4+ # Uses TP8/EP1 at conc 1-8, TP8/EP8 at conc 16-64,
5+ # and TP8/EP8 with DP attention at conc 128-256.
96
107source " $( dirname " $0 " ) /../../benchmark_lib.sh"
118
129check_env_vars \
1310 MODEL \
1411 TP \
12+ DP_ATTENTION \
1513 CONC \
1614 ISL \
1715 OSL \
@@ -34,7 +32,47 @@ if [ "${EVAL_ONLY}" = "true" ]; then
3432 MAX_SEQ_LEN=" $EVAL_MAX_MODEL_LEN "
3533fi
3634
37- echo " CONC: $CONC , ISL: $ISL , OSL: $OSL , MAX_SEQ_LEN: $MAX_SEQ_LEN "
35+ PARALLEL_ARGS=(--tp " $TP " )
36+ if [ " ${EP_SIZE} " -gt 1 ]; then
37+ PARALLEL_ARGS+=(--expert-parallel-size " $EP_SIZE " )
38+ fi
39+
40+ SCHEDULER_RECV_INTERVAL=
41+ if [ " ${DP_ATTENTION} " != " true" ]; then
42+ case " $CONC " in
43+ 1|2|4)
44+ SCHEDULER_RECV_INTERVAL=2
45+ ;;
46+ 8)
47+ SCHEDULER_RECV_INTERVAL=60
48+ ;;
49+ 16)
50+ SCHEDULER_RECV_INTERVAL=30
51+ ;;
52+ 32)
53+ SCHEDULER_RECV_INTERVAL=1200
54+ ;;
55+ 64)
56+ SCHEDULER_RECV_INTERVAL=600
57+ ;;
58+ * )
59+ echo " Unsupported CONC=$CONC for qwen3.5 FP8 H100 SGLang recipe" >&2
60+ exit 1
61+ ;;
62+ esac
63+ fi
64+
65+ SCHEDULER_ARGS=()
66+ if [ -n " $SCHEDULER_RECV_INTERVAL " ]; then
67+ SCHEDULER_ARGS=(--scheduler-recv-interval " $SCHEDULER_RECV_INTERVAL " )
68+ fi
69+ if [ " ${DP_ATTENTION} " = " true" ]; then
70+ PARALLEL_ARGS+=(--dp-size " $TP " --enable-dp-attention)
71+ fi
72+
73+ echo " TP: $TP , EP_SIZE: $EP_SIZE , DP_ATTENTION: $DP_ATTENTION , CONC: $CONC , ISL: $ISL , OSL: $OSL , MAX_SEQ_LEN: $MAX_SEQ_LEN "
74+ echo " SCHEDULER_RECV_INTERVAL: ${SCHEDULER_RECV_INTERVAL:- none} "
75+ echo " SCHEDULER_ARGS: ${SCHEDULER_ARGS[*]} "
3876
3977start_gpu_monitor
4078
@@ -43,15 +81,14 @@ python3 -m sglang.launch_server \
4381 --model " $MODEL " \
4482 --host 0.0.0.0 \
4583 --port " $PORT " \
46- --tp " $TP " \
47- --expert-parallel-size " $EP_SIZE " \
84+ " ${PARALLEL_ARGS[@]} " \
4885 --reasoning-parser qwen3 \
4986 --tool-call-parser qwen3_coder \
5087 --enable-flashinfer-allreduce-fusion \
51- --max-running-requests 64 \
52- --chunked-prefill-size 8192 \
88+ --max-running-requests 256 \
89+ --chunked-prefill-size 16384 \
5390 --decode-log-interval 1 \
54- --mem-fraction-static 0.75 \
91+ --mem-fraction-static 0.8 \
5592 --cuda-graph-max-bs " $CONC " \
5693 --context-length " $MAX_SEQ_LEN " \
5794 --kv-cache-dtype fp8_e4m3 \
@@ -61,7 +98,9 @@ python3 -m sglang.launch_server \
6198 --tokenizer-worker-num 6 \
6299 --mamba-ssm-dtype bfloat16 \
63100 --disable-radix-cache \
101+ --enable-symm-mem \
64102 --trust-remote-code \
103+ " ${SCHEDULER_ARGS[@]} " \
65104 > " $SERVER_LOG " 2>&1 &
66105
67106SERVER_PID=$!
0 commit comments