Skip to content

Commit 0607969

Browse files
Tune H100 Qwen SGLang Pareto recipe
1 parent eb8350e commit 0607969

3 files changed

Lines changed: 65 additions & 13 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9236,11 +9236,15 @@ qwen3.5-fp8-h100-sglang:
92369236
- isl: 1024
92379237
osl: 1024
92389238
search-space:
9239-
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
9239+
- { tp: 8, ep: 1, conc-start: 1, conc-end: 8 }
9240+
- { tp: 8, ep: 8, conc-start: 16, conc-end: 64 }
9241+
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
92409242
- isl: 8192
92419243
osl: 1024
92429244
search-space:
9243-
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
9245+
- { tp: 8, ep: 1, conc-start: 1, conc-end: 8 }
9246+
- { tp: 8, ep: 8, conc-start: 16, conc-end: 64 }
9247+
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
92449248

92459249
qwen3.5-fp8-h100-sglang-mtp:
92469250
image: lmsysorg/sglang:v0.5.12-cu130

benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h100.sh

Lines changed: 50 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,15 @@
11
#!/usr/bin/env bash
22

33
# Qwen-3.5-397B-A17B FP8 on H100 via sglang.
4-
# Mirrors qwen3.5_fp8_h200.sh but with tighter memory accommodations:
5-
# H100 has 80GB HBM3 vs H200's 141GB HBM3e, so weights + KV cache fit
6-
# more snugly. Mem-fraction-static lowered from 0.8 → 0.75 and
7-
# chunked-prefill-size from 16384 → 8192 to leave more headroom.
8-
# Sweep tops out at conc=32 instead of 64 for the same reason.
4+
# Uses TP8/EP1 at conc 1-8, TP8/EP8 at conc 16-64,
5+
# and TP8/EP8 with DP attention at conc 128-256.
96

107
source "$(dirname "$0")/../../benchmark_lib.sh"
118

129
check_env_vars \
1310
MODEL \
1411
TP \
12+
DP_ATTENTION \
1513
CONC \
1614
ISL \
1715
OSL \
@@ -34,7 +32,47 @@ if [ "${EVAL_ONLY}" = "true" ]; then
3432
MAX_SEQ_LEN="$EVAL_MAX_MODEL_LEN"
3533
fi
3634

37-
echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"
35+
PARALLEL_ARGS=(--tp "$TP")
36+
if [ "${EP_SIZE}" -gt 1 ]; then
37+
PARALLEL_ARGS+=(--expert-parallel-size "$EP_SIZE")
38+
fi
39+
40+
SCHEDULER_RECV_INTERVAL=
41+
if [ "${DP_ATTENTION}" != "true" ]; then
42+
case "$CONC" in
43+
1|2|4)
44+
SCHEDULER_RECV_INTERVAL=2
45+
;;
46+
8)
47+
SCHEDULER_RECV_INTERVAL=60
48+
;;
49+
16)
50+
SCHEDULER_RECV_INTERVAL=30
51+
;;
52+
32)
53+
SCHEDULER_RECV_INTERVAL=1200
54+
;;
55+
64)
56+
SCHEDULER_RECV_INTERVAL=600
57+
;;
58+
*)
59+
echo "Unsupported CONC=$CONC for qwen3.5 FP8 H100 SGLang recipe" >&2
60+
exit 1
61+
;;
62+
esac
63+
fi
64+
65+
SCHEDULER_ARGS=()
66+
if [ -n "$SCHEDULER_RECV_INTERVAL" ]; then
67+
SCHEDULER_ARGS=(--scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL")
68+
fi
69+
if [ "${DP_ATTENTION}" = "true" ]; then
70+
PARALLEL_ARGS+=(--dp-size "$TP" --enable-dp-attention)
71+
fi
72+
73+
echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"
74+
echo "SCHEDULER_RECV_INTERVAL: ${SCHEDULER_RECV_INTERVAL:-none}"
75+
echo "SCHEDULER_ARGS: ${SCHEDULER_ARGS[*]}"
3876

3977
start_gpu_monitor
4078

@@ -43,15 +81,14 @@ python3 -m sglang.launch_server \
4381
--model "$MODEL" \
4482
--host 0.0.0.0 \
4583
--port "$PORT" \
46-
--tp "$TP" \
47-
--expert-parallel-size "$EP_SIZE" \
84+
"${PARALLEL_ARGS[@]}" \
4885
--reasoning-parser qwen3 \
4986
--tool-call-parser qwen3_coder \
5087
--enable-flashinfer-allreduce-fusion \
51-
--max-running-requests 64 \
52-
--chunked-prefill-size 8192 \
88+
--max-running-requests 256 \
89+
--chunked-prefill-size 16384 \
5390
--decode-log-interval 1 \
54-
--mem-fraction-static 0.75 \
91+
--mem-fraction-static 0.8 \
5592
--cuda-graph-max-bs "$CONC" \
5693
--context-length "$MAX_SEQ_LEN" \
5794
--kv-cache-dtype fp8_e4m3 \
@@ -61,7 +98,9 @@ python3 -m sglang.launch_server \
6198
--tokenizer-worker-num 6 \
6299
--mamba-ssm-dtype bfloat16 \
63100
--disable-radix-cache \
101+
--enable-symm-mem \
64102
--trust-remote-code \
103+
"${SCHEDULER_ARGS[@]}" \
65104
> "$SERVER_LOG" 2>&1 &
66105

67106
SERVER_PID=$!

perf-changelog.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3444,3 +3444,12 @@
34443444
- "Add MiniMax-M2.5 NVFP4 GB300 disaggregated multinode vLLM benchmarks via Dynamo"
34453445
- "Add 1k1k/8k1k minimax recipe set under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/"
34463446
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1641
3447+
3448+
- config-keys:
3449+
- qwen3.5-fp8-h100-sglang
3450+
description:
3451+
- "Tune Qwen3.5-397B-A17B-FP8 H100 SGLang aggregate recipe for 1k/1k and 8k/1k sweeps"
3452+
- "Use TP8/EP1 for conc 1-8, TP8/EP8 for conc 16-64, and TP8/EP8 DP-attention for conc 128-256"
3453+
- "Use scheduler-recv-interval 2 for conc 1/2/4, 60 for conc 8, 30 for conc 16, 1200 for conc 32, and 600 for conc 64 when DP attention is off; leave it unset when DP attention is on"
3454+
- "Set max-running-requests=256, chunked-prefill-size=16384, mem-fraction-static=0.8, cuda-graph-max-bs=CONC, and enable symm-mem"
3455+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1544

0 commit comments

Comments
 (0)