Skip to content

Commit 74f1a45

Browse files
committed
Add DSV4 GB300 wide-EP sweep configs, remove dominated old configs
Add 5 wide-EP sweep configs (EP=12/16/24/32/40) from Weiliang, and remove 4 old configs that are now dominated (no longer on the frontier).
1 parent 437e01a commit 74f1a45

9 files changed

Lines changed: 289 additions & 206 deletions

.github/configs/amd-master.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1128,7 +1128,7 @@ gptoss-fp4-mi325x-vllm:
11281128

11291129
gptoss-fp4-mi355x-vllm:
11301130
image: vllm/vllm-openai-rocm:v0.22.0
1131-
model: openai/gpt-oss-120b
1131+
model: amd/gpt-oss-120b-w-mxfp4-a-fp8
11321132
model-prefix: gptoss
11331133
runner: mi355x
11341134
precision: fp4

.github/configs/nvidia-master.yaml

Lines changed: 46 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1756,7 +1756,7 @@ dsv4-fp4-b200-sglang:
17561756
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }
17571757

17581758
dsv4-fp4-b200-vllm:
1759-
image: vllm/vllm-openai:nightly-3f0a91bb96f8d72e0498b95c166e817deae14d62
1759+
image: vllm/vllm-openai:v0.22.0
17601760
model: deepseek-ai/DeepSeek-V4-Pro
17611761
model-prefix: dsv4
17621762
runner: b200-dsv4
@@ -8781,75 +8781,90 @@ dsv4-fp4-gb300-dynamo-sglang:
87818781
tp: 16
87828782
ep: 16
87838783
dp-attn: true
8784-
# WideEP TP=16 decode: 4p1d-dep4-dep16. 8 nodes.
8785-
- conc-list: [1024]
8784+
# Low concurrency: 1p1d-tp4-tp4. 2 nodes.
8785+
- conc-list: [1]
87868786
prefill:
8787-
num-worker: 4
8787+
num-worker: 1
8788+
tp: 4
8789+
ep: 1
8790+
dp-attn: false
8791+
additional-settings:
8792+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml"
8793+
decode:
8794+
num-worker: 1
8795+
tp: 4
8796+
ep: 1
8797+
dp-attn: false
8798+
# --- Weiliang wide-EP sweep (srt-slurm PR#173), 18 nodes total; "xP+yD" below = x prefill nodes + y decode nodes ---
8799+
# EP=12: 15P+3D, conc=12000.
8800+
- conc-list: [12000]
8801+
prefill:
8802+
num-worker: 15
87888803
tp: 4
87898804
ep: 4
87908805
dp-attn: true
87918806
additional-settings:
8792-
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml"
8807+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-15p1d-dep4-dep12-18-c12000.yaml"
87938808
decode:
87948809
num-worker: 1
8795-
tp: 16
8796-
ep: 16
8810+
tp: 12
8811+
ep: 12
87978812
dp-attn: true
8798-
# WideEP TP=16 decode: 8p1d-dep4-dep16. 12 nodes.
8799-
- conc-list: [4096]
8813+
# EP=16: 14P+4D, conc=8192.
8814+
- conc-list: [8192]
88008815
prefill:
8801-
num-worker: 8
8816+
num-worker: 14
88028817
tp: 4
88038818
ep: 4
88048819
dp-attn: true
88058820
additional-settings:
8806-
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml"
8821+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-18-c8192.yaml"
88078822
decode:
88088823
num-worker: 1
88098824
tp: 16
88108825
ep: 16
88118826
dp-attn: true
8812-
# Low concurrency: 1p1d-tp4-tp4. 2 nodes.
8813-
- conc-list: [1]
8827+
# EP=24: 12P+6D, conc=3000.
8828+
- conc-list: [3000]
88148829
prefill:
8815-
num-worker: 1
8830+
num-worker: 12
88168831
tp: 4
8817-
ep: 1
8818-
dp-attn: false
8832+
ep: 4
8833+
dp-attn: true
88198834
additional-settings:
8820-
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml"
8835+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep24-18-c3000.yaml"
88218836
decode:
88228837
num-worker: 1
8823-
tp: 4
8824-
ep: 1
8825-
dp-attn: false
8826-
# Mid concurrency: 10p1d-dep4-dep16. 14 nodes.
8827-
- conc-list: [8192]
8838+
tp: 24
8839+
ep: 24
8840+
dp-attn: true
8841+
# EP=32: 10P+8D, conc=2500.
8842+
- conc-list: [2500]
88288843
prefill:
88298844
num-worker: 10
88308845
tp: 4
88318846
ep: 4
88328847
dp-attn: true
88338848
additional-settings:
8834-
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml"
8849+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep32-18-c2500.yaml"
88358850
decode:
88368851
num-worker: 1
8837-
tp: 16
8838-
ep: 16
8852+
tp: 32
8853+
ep: 32
88398854
dp-attn: true
8840-
# Max concurrency: 12p1d-dep4-dep12. 15 nodes.
8841-
- conc-list: [21504]
8855+
# EP=40: 8P+10D, conc=2048.
8856+
- conc-list: [2048]
88428857
prefill:
8843-
num-worker: 12
8858+
num-worker: 8
88448859
tp: 4
88458860
ep: 4
88468861
dp-attn: true
88478862
additional-settings:
8848-
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml"
8863+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep40-18-c2048.yaml"
88498864
decode:
88508865
num-worker: 1
8851-
tp: 12
8852-
ep: 12
8866+
tp: 40
8867+
ep: 40
88538868
dp-attn: true
88548869

88558870
glm5-fp8-b200-dynamo-sglang:

benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml renamed to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep32-18-c2500.yaml

Lines changed: 22 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,8 @@
1-
name: "disagg-gb300-4p1d-dep4-dep16-8-c1024"
2-
3-
# 8k/1k high-throughput topology for the wideep DSV4-Pro setup.
4-
#
5-
# Schema/values come from PR #1213 (513cbef) — that PR introduced the
6-
# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override`
7-
# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our
8-
# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't
9-
# support either: `zip_override_*_hightpt` rejects with `Unknown field`
10-
# and `benchmark` only validates at top level. So this file inlines the
11-
# wideep [0] override and lifts `benchmark` back out — same operational
12-
# values, schema the pinned srtctl will accept.
13-
#
14-
# Other adjustments back to the InferenceX cluster shape: container &
15-
# model.path restored to the aliases mapped in launch_gb300.sh's
16-
# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and
17-
# `deepseek-v4-pro`); `dynamo.install: true` added so the container
18-
# (which has no dynamo baked in) installs from the pinned hash.
19-
#
20-
# Cluster-specific items NOT inlined (require InferenceX-side equivalents):
21-
# - slurm.partition (yangminl's gb300-cw uses `hpc-mid`)
22-
# - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path)
23-
# - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis
24-
# showed only `expert_location_dispatch.py` topk_ids int32 cast is an
25-
# active runtime diff vs container sglang; other patched files are
26-
# env-gated dead code under the same SGLANG_OPT_* flags this yaml
27-
# already sets.
28-
#
29-
# DG-related env intentionally diverged (DG cache path is host-specific):
30-
# - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host)
31-
# - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache)
32-
# This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead.
1+
name: "disagg-gb300-10p1d-dep4-dep32-18-c2500"
2+
3+
# Weiliang wide-EP sweep point: EP=32, 10 prefill nodes + 8 decode nodes = 18 nodes, conc=2500.
4+
# Matches srt-slurm PR#173 zip_override EP=32 topology.
5+
# Env vars and sglang_config are taken from InferenceX main (not from Weiliang's 0510 image).
336

347
model:
358
path: "deepseek-v4-pro"
@@ -50,18 +23,19 @@ sbatch_directives:
5023
resources:
5124
gpu_type: "gb300"
5225
gpus_per_node: 4
53-
prefill_nodes: 4
54-
prefill_workers: 4
26+
prefill_nodes: 10
27+
prefill_workers: 10
5528
gpus_per_prefill: 4
56-
decode_nodes: 4
29+
decode_nodes: 8
5730
decode_workers: 1
58-
gpus_per_decode: 16
31+
gpus_per_decode: 32
5932

6033
frontend:
6134
type: dynamo
62-
enable_multiple_frontends: false
35+
enable_multiple_frontends: true
36+
num_additional_frontends: 8
6337
env:
64-
DYN_ROUTER_LOAD_BLOCK_SIZE: "1"
38+
DYN_ROUTER_LOAD_BLOCK_SIZE: "1"
6539
args:
6640
router-mode: "kv"
6741
router-kv-overlap-score-weight: 0
@@ -120,7 +94,6 @@ backend:
12094
SGLANG_LOG_FORWARD_ITERS: "1"
12195
SGLANG_LOG_MS: "1"
12296
SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
123-
# is single-node only and corrupts results in 2-node decode setups.
12497

12598
sglang_config:
12699
prefill:
@@ -141,9 +114,11 @@ backend:
141114

142115
disaggregation-mode: "prefill"
143116
disaggregation-transfer-backend: mooncake
117+
enable-dp-lm-head: true
144118

145119
mem-fraction-static: 0.90
146120
max-running-requests: 512
121+
cuda-graph-max-bs: 512
147122
chunked-prefill-size: 32768
148123

149124
decode:
@@ -153,30 +128,31 @@ backend:
153128
skip-tokenizer-init: true
154129
stream-interval: 60
155130

156-
load-balance-method: "total_requests"
157131
moe-a2a-backend: "megamoe"
158132

133+
moe-dense-tp-size: 1
134+
159135
disaggregation-mode: "decode"
160136
disaggregation-transfer-backend: mooncake
161137
disaggregation-decode-polling-interval: 8
162138

163139
mem-fraction-static: 0.94
164-
swa-full-tokens-ratio: 0.056
140+
swa-full-tokens-ratio: 0.20
165141
context-length: 9216
166-
tensor-parallel-size: 16
167-
data-parallel-size: 16
168-
expert-parallel-size: 16
142+
tensor-parallel-size: 32
143+
data-parallel-size: 32
144+
expert-parallel-size: 32
169145
enable-dp-attention: true
170146
enable-dp-lm-head: true
171-
max-running-requests: 21504
147+
max-running-requests: 18432
172148
cuda-graph-max-bs: 1280
173149

174150

175151
benchmark:
176152
type: "sa-bench"
177153
isl: 8192
178154
osl: 1024
179-
concurrencies: "1024"
155+
concurrencies: "2500"
180156
req_rate: "inf"
181157
use_chat_template: false
182158
custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"

0 commit comments

Comments
 (0)