Skip to content

Commit 29fb678

Browse files
Use TEP (TP8/EP8 without DP attention) for Qwen H100 high concurrency (conc 128-256)
1 parent 0607969 commit 29fb678

3 files changed

Lines changed: 8 additions & 6 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9238,13 +9238,13 @@ qwen3.5-fp8-h100-sglang:
92389238
search-space:
92399239
- { tp: 8, ep: 1, conc-start: 1, conc-end: 8 }
92409240
- { tp: 8, ep: 8, conc-start: 16, conc-end: 64 }
9241-
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
9241+
- { tp: 8, ep: 8, conc-start: 128, conc-end: 256 }
92429242
- isl: 8192
92439243
osl: 1024
92449244
search-space:
92459245
- { tp: 8, ep: 1, conc-start: 1, conc-end: 8 }
92469246
- { tp: 8, ep: 8, conc-start: 16, conc-end: 64 }
9247-
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
9247+
- { tp: 8, ep: 8, conc-start: 128, conc-end: 256 }
92489248

92499249
qwen3.5-fp8-h100-sglang-mtp:
92509250
image: lmsysorg/sglang:v0.5.12-cu130

benchmarks/single_node/fixed_seq_len/qwen3.5_fp8_h100.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
#!/usr/bin/env bash
22

33
# Qwen-3.5-397B-A17B FP8 on H100 via sglang.
4-
# Uses TP8/EP1 at conc 1-8, TP8/EP8 at conc 16-64,
5-
# and TP8/EP8 with DP attention at conc 128-256.
4+
# Uses TP8/EP1 at conc 1-8 and TP8/EP8 at conc 16-256.
65

76
source "$(dirname "$0")/../../benchmark_lib.sh"
87

@@ -55,6 +54,9 @@ if [ "${DP_ATTENTION}" != "true" ]; then
5554
64)
5655
SCHEDULER_RECV_INTERVAL=600
5756
;;
57+
128|256)
58+
SCHEDULER_RECV_INTERVAL=1920
59+
;;
5860
*)
5961
echo "Unsupported CONC=$CONC for qwen3.5 FP8 H100 SGLang recipe" >&2
6062
exit 1

perf-changelog.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3449,7 +3449,7 @@
34493449
- qwen3.5-fp8-h100-sglang
34503450
description:
34513451
- "Tune Qwen3.5-397B-A17B-FP8 H100 SGLang aggregate recipe for 1k/1k and 8k/1k sweeps"
3452-
- "Use TP8/EP1 for conc 1-8, TP8/EP8 for conc 16-64, and TP8/EP8 DP-attention for conc 128-256"
3453-
- "Use scheduler-recv-interval values 2/60/30/1200/600 for non-DP conc 1-4/8/16/32/64"
3452+
- "Use TP8/EP1 for conc 1-8 and TP8/EP8 for conc 16-256"
3453+
- "Use scheduler-recv-interval values 2/60/30/1200/600/1920 for conc 1-4/8/16/32/64/128-256"
34543454
- "Set max-running-requests=256, chunked-prefill-size=16384, mem-fraction-static=0.8, cuda-graph-max-bs=CONC, and enable symm-mem"
34553455
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1544

0 commit comments

Comments (0)