#!/usr/bin/env bash

# Qwen-3.5-397B-A17B FP8 on H100 via sglang.
4- # Mirrors qwen3.5_fp8_h200.sh but with tighter memory accommodations:
5- # H100 has 80GB HBM3 vs H200's 141GB HBM3e, so weights + KV cache fit
6- # more snugly. Mem-fraction-static lowered from 0.8 → 0.75 and
7- # chunked-prefill-size from 16384 → 8192 to leave more headroom.
8- # Sweep tops out at conc=32 instead of 64 for the same reason.
# Uses TP8/EP1 at conc 1-8, TP8/EP8 at conc 16-64,
# and TP8/EP8 with DP attention at conc 128-256.
96
107source " $( dirname " $0 " ) /../benchmark_lib.sh"
118
129check_env_vars \
1310 MODEL \
1411 TP \
12+ DP_ATTENTION \
1513 CONC \
1614 ISL \
1715 OSL \
@@ -35,7 +33,47 @@ if [ "${EVAL_ONLY}" = "true" ]; then
3533 MAX_SEQ_LEN=" $EVAL_MAX_MODEL_LEN "
3634fi
3735
38- echo " CONC: $CONC , ISL: $ISL , OSL: $OSL , MAX_SEQ_LEN: $MAX_SEQ_LEN "
# Parallelism flags for the server launch, kept in an array so each value
# stays a single word when expanded as "${PARALLEL_ARGS[@]}".
# Reads: TP, EP_SIZE. Writes: PARALLEL_ARGS.
PARALLEL_ARGS=(--tp "$TP")
# Pass --expert-parallel-size only when EP_SIZE is greater than 1.
# EP_SIZE defaults to 1 for this comparison only, so an empty or unset value
# skips the flag without a "[: integer expression expected" error.
if [ "${EP_SIZE:-1}" -gt 1 ]; then
  PARALLEL_ARGS+=(--expert-parallel-size "$EP_SIZE")
fi
40+
# Choose the scheduler receive interval from the requested concurrency.
# Reads: DP_ATTENTION, CONC. Writes: SCHEDULER_RECV_INTERVAL.
# The value stays empty when DP_ATTENTION is "true"; CONC is not validated
# on that path.
# Exits 1 with a message on stderr when DP attention is off and CONC is not
# one of 1, 2, 4, 8, 16, 32, 64.
# NOTE(review): the intervals are not monotonic in CONC (8 -> 60, 16 -> 30,
# 32 -> 1200, 64 -> 600); presumably tuned per concurrency -- confirm.
SCHEDULER_RECV_INTERVAL=
if [ "${DP_ATTENTION}" != "true" ]; then
  case "$CONC" in
    1|2|4)
      SCHEDULER_RECV_INTERVAL=2
      ;;
    8)
      SCHEDULER_RECV_INTERVAL=60
      ;;
    16)
      SCHEDULER_RECV_INTERVAL=30
      ;;
    32)
      SCHEDULER_RECV_INTERVAL=1200
      ;;
    64)
      SCHEDULER_RECV_INTERVAL=600
      ;;
    *)
      echo "Unsupported CONC=$CONC for qwen3.5 FP8 H100 SGLang recipe" >&2
      exit 1
      ;;
  esac
fi
65+
# Turn the chosen settings into launch arguments and log the configuration.
# Reads: SCHEDULER_RECV_INTERVAL, DP_ATTENTION, TP, EP_SIZE, CONC, ISL, OSL,
#        MAX_SEQ_LEN.
# Writes: SCHEDULER_ARGS; appends to PARALLEL_ARGS.

# SCHEDULER_ARGS stays an empty array when no interval was chosen, so
# expanding "${SCHEDULER_ARGS[@]}" contributes no words to the command line.
SCHEDULER_ARGS=()
if [ -n "$SCHEDULER_RECV_INTERVAL" ]; then
  SCHEDULER_ARGS=(--scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL")
fi
# With DP attention enabled, --dp-size is given the same value as --tp.
if [ "${DP_ATTENTION}" = "true" ]; then
  PARALLEL_ARGS+=(--dp-size "$TP" --enable-dp-attention)
fi

echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"
echo "SCHEDULER_RECV_INTERVAL: ${SCHEDULER_RECV_INTERVAL:-none}"
echo "SCHEDULER_ARGS: ${SCHEDULER_ARGS[*]}"
3977
4078start_gpu_monitor
4179
@@ -44,15 +82,14 @@ python3 -m sglang.launch_server \
4482 --model " $MODEL " \
4583 --host 0.0.0.0 \
4684 --port " $PORT " \
47- --tp " $TP " \
48- --expert-parallel-size " $EP_SIZE " \
85+ " ${PARALLEL_ARGS[@]} " \
4986 --reasoning-parser qwen3 \
5087 --tool-call-parser qwen3_coder \
5188 --enable-flashinfer-allreduce-fusion \
52- --max-running-requests 64 \
53- --chunked-prefill-size 8192 \
89+ --max-running-requests 256 \
90+ --chunked-prefill-size 16384 \
5491 --decode-log-interval 1 \
55- --mem-fraction-static 0.75 \
92+ --mem-fraction-static 0.8 \
5693 --cuda-graph-max-bs " $CONC " \
5794 --context-length " $MAX_SEQ_LEN " \
5895 --kv-cache-dtype fp8_e4m3 \
@@ -62,7 +99,9 @@ python3 -m sglang.launch_server \
6299 --tokenizer-worker-num 6 \
63100 --mamba-ssm-dtype bfloat16 \
64101 --disable-radix-cache \
102+ --enable-symm-mem \
65103 --trust-remote-code \
104+ " ${SCHEDULER_ARGS[@]} " \
66105 > " $SERVER_LOG " 2>&1 &
67106
68107SERVER_PID=$!
0 commit comments