Skip to content

Commit 21edeff

Browse files
Tune H100 Qwen SGLang Pareto recipe
1 parent 70bf3a2 commit 21edeff

3 files changed

Lines changed: 65 additions & 13 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9210,11 +9210,15 @@ qwen3.5-fp8-h100-sglang:
92109210
- isl: 1024
92119211
osl: 1024
92129212
search-space:
9213-
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
9213+
- { tp: 8, ep: 1, conc-start: 1, conc-end: 8 }
9214+
- { tp: 8, ep: 8, conc-start: 16, conc-end: 64 }
9215+
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
92149216
- isl: 8192
92159217
osl: 1024
92169218
search-space:
9217-
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
9219+
- { tp: 8, ep: 1, conc-start: 1, conc-end: 8 }
9220+
- { tp: 8, ep: 8, conc-start: 16, conc-end: 64 }
9221+
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
92189222

92199223
qwen3.5-fp8-h100-sglang-mtp:
92209224
image: lmsysorg/sglang:v0.5.12-cu130

benchmarks/single_node/qwen3.5_fp8_h100.sh

Lines changed: 50 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,15 @@
11
#!/usr/bin/env bash
22

33
# Qwen-3.5-397B-A17B FP8 on H100 via sglang.
4-
# Mirrors qwen3.5_fp8_h200.sh but with tighter memory accommodations:
5-
# H100 has 80GB HBM3 vs H200's 141GB HBM3e, so weights + KV cache fit
6-
# more snugly. Mem-fraction-static lowered from 0.8 → 0.75 and
7-
# chunked-prefill-size from 16384 → 8192 to leave more headroom.
8-
# Sweep tops out at conc=32 instead of 64 for the same reason.
4+
# Uses TP8/EP1 at conc 1-8, TP8/EP8 at conc 16-64,
5+
# and TP8/EP8 with DP attention at conc 128-256.
96

107
source "$(dirname "$0")/../benchmark_lib.sh"
118

129
check_env_vars \
1310
MODEL \
1411
TP \
12+
DP_ATTENTION \
1513
CONC \
1614
ISL \
1715
OSL \
@@ -35,7 +33,47 @@ if [ "${EVAL_ONLY}" = "true" ]; then
3533
MAX_SEQ_LEN="$EVAL_MAX_MODEL_LEN"
3634
fi
3735

38-
echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"
36+
PARALLEL_ARGS=(--tp "$TP")
37+
if [ "${EP_SIZE}" -gt 1 ]; then
38+
PARALLEL_ARGS+=(--expert-parallel-size "$EP_SIZE")
39+
fi
40+
41+
SCHEDULER_RECV_INTERVAL=
42+
if [ "${DP_ATTENTION}" != "true" ]; then
43+
case "$CONC" in
44+
1|2|4)
45+
SCHEDULER_RECV_INTERVAL=2
46+
;;
47+
8)
48+
SCHEDULER_RECV_INTERVAL=60
49+
;;
50+
16)
51+
SCHEDULER_RECV_INTERVAL=30
52+
;;
53+
32)
54+
SCHEDULER_RECV_INTERVAL=1200
55+
;;
56+
64)
57+
SCHEDULER_RECV_INTERVAL=600
58+
;;
59+
*)
60+
echo "Unsupported CONC=$CONC for qwen3.5 FP8 H100 SGLang recipe" >&2
61+
exit 1
62+
;;
63+
esac
64+
fi
65+
66+
SCHEDULER_ARGS=()
67+
if [ -n "$SCHEDULER_RECV_INTERVAL" ]; then
68+
SCHEDULER_ARGS=(--scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL")
69+
fi
70+
if [ "${DP_ATTENTION}" = "true" ]; then
71+
PARALLEL_ARGS+=(--dp-size "$TP" --enable-dp-attention)
72+
fi
73+
74+
echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"
75+
echo "SCHEDULER_RECV_INTERVAL: ${SCHEDULER_RECV_INTERVAL:-none}"
76+
echo "SCHEDULER_ARGS: ${SCHEDULER_ARGS[*]}"
3977

4078
start_gpu_monitor
4179

@@ -44,15 +82,14 @@ python3 -m sglang.launch_server \
4482
--model "$MODEL" \
4583
--host 0.0.0.0 \
4684
--port "$PORT" \
47-
--tp "$TP" \
48-
--expert-parallel-size "$EP_SIZE" \
85+
"${PARALLEL_ARGS[@]}" \
4986
--reasoning-parser qwen3 \
5087
--tool-call-parser qwen3_coder \
5188
--enable-flashinfer-allreduce-fusion \
52-
--max-running-requests 64 \
53-
--chunked-prefill-size 8192 \
89+
--max-running-requests 256 \
90+
--chunked-prefill-size 16384 \
5491
--decode-log-interval 1 \
55-
--mem-fraction-static 0.75 \
92+
--mem-fraction-static 0.8 \
5693
--cuda-graph-max-bs "$CONC" \
5794
--context-length "$MAX_SEQ_LEN" \
5895
--kv-cache-dtype fp8_e4m3 \
@@ -62,7 +99,9 @@ python3 -m sglang.launch_server \
6299
--tokenizer-worker-num 6 \
63100
--mamba-ssm-dtype bfloat16 \
64101
--disable-radix-cache \
102+
--enable-symm-mem \
65103
--trust-remote-code \
104+
"${SCHEDULER_ARGS[@]}" \
66105
> "$SERVER_LOG" 2>&1 &
67106

68107
SERVER_PID=$!

perf-changelog.yaml

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2850,6 +2850,15 @@
28502850
- "Add Qwen-3.5-397B-A17B FP8 sglang recipes (off + MTP/EAGLE) for H100 on lmsysorg/sglang:v0.5.12-cu130"
28512851
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1509
28522852

2853+
- config-keys:
2854+
- qwen3.5-fp8-h100-sglang
2855+
description:
2856+
- "Tune Qwen3.5-397B-A17B-FP8 H100 SGLang aggregate recipe for 1k/1k and 8k/1k sweeps"
2857+
- "Use TP8/EP1 for conc 1-8, TP8/EP8 for conc 16-64, and TP8/EP8 DP-attention for conc 128-256"
2858+
- "Use scheduler-recv-interval values 2/60/30/1200/600 for non-DP conc 1-4/8/16/32/64"
2859+
- "Set max-running-requests=256, chunked-prefill-size=16384, mem-fraction-static=0.8, cuda-graph-max-bs=CONC, and enable symm-mem"
2860+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1544
2861+
28532862
- config-keys:
28542863
- dsr1-fp8-mi325x-sglang-mtp
28552864
description:

0 commit comments

Comments
 (0)