Skip to content

Commit 6715c7a

Browse files
Oseltamivir and claude committed
bench: split dsv4 gb300 sglang mtp 8p1d into 40960 + 65536 single-conc jobs
Run two separate Slurm allocations instead of one combined two-concurrency bench, so that the second concurrency is not killed by the per-allocation wall-time budget. The 65536 recipe also lowers num_prompts_mult to 2 and num_warmup_mult to 1 to keep total work within the wall time at the higher concurrency.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 4e2658e commit 6715c7a

3 files changed

Lines changed: 177 additions & 3 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8724,9 +8724,9 @@ dsv4-fp4-gb300-dynamo-sglang-mtp3:
87248724
tp: 16
87258725
ep: 16
87268726
dp-attn: true
8727-
# Mid curve 8p1d-dep8-dep8. 18 nodes.
8727+
# Mid curve 8p1d-dep8-dep8. 18 nodes. Conc 40960 (file kept as c32768.yaml).
87288728
- spec-decoding: mtp
8729-
conc-list: [40960, 65536]
8729+
conc-list: [40960]
87308730
prefill:
87318731
num-worker: 8
87328732
tp: 8
@@ -8739,6 +8739,21 @@ dsv4-fp4-gb300-dynamo-sglang-mtp3:
87398739
tp: 8
87408740
ep: 8
87418741
dp-attn: true
8742+
# Same shape, conc 65536. Lower num_prompts_mult and num_warmup_mult so it fits the wall budget.
8743+
- spec-decoding: mtp
8744+
conc-list: [65536]
8745+
prefill:
8746+
num-worker: 8
8747+
tp: 8
8748+
ep: 8
8749+
dp-attn: true
8750+
additional-settings:
8751+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-8p1d-dep8-dep8-mtp-c65536.yaml"
8752+
decode:
8753+
num-worker: 1
8754+
tp: 8
8755+
ep: 8
8756+
dp-attn: true
87428757
- spec-decoding: mtp
87438758
conc-list: [16384]
87448759
prefill:

benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-8p1d-dep8-dep8-mtp-c32768.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ benchmark:
151151
isl: 8192
152152
osl: 256
153153
random_range_ratio: 1.0
154-
concurrencies: "40960x65536"
154+
concurrencies: "40960"
155155
req_rate: "inf"
156156
use_chat_template: true
157157
custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"
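benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-8p1d-dep8-dep8-mtp-c65536.yaml

Lines changed: 159 additions & 0 deletions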
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
name: "dsv4-pro-gb300-disagg-8k1k-mid-curve-8p1d-dep8-dep8-mtp-c65536"
2+
3+
frontend:
4+
type: dynamo
5+
enable_multiple_frontends: true
6+
num_additional_frontends: 8
7+
8+
dynamo:
9+
hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c"
10+
install: true
11+
12+
model:
13+
path: "deepseek-v4-pro"
14+
container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e"
15+
precision: "mxfp4"
16+
17+
sbatch_directives:
18+
cpus-per-task: "144"
19+
mem: "0"
20+
21+
resources:
22+
gpu_type: "gb300"
23+
gpus_per_node: 4
24+
prefill_nodes: 16
25+
prefill_workers: 8
26+
gpus_per_prefill: 8
27+
decode_nodes: 2
28+
decode_workers: 1
29+
gpus_per_decode: 8
30+
31+
backend:
32+
type: sglang
33+
34+
prefill_environment:
35+
PYTHONUNBUFFERED: "1"
36+
SGLANG_RADIX_DISABLE_REUSE: "1"
37+
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
38+
SGLANG_DEFAULT_THINKING: "1"
39+
SGLANG_DSV4_REASONING_EFFORT: "max"
40+
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
41+
SGLANG_OPT_USE_JIT_NORM: "1"
42+
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
43+
SGLANG_OPT_USE_TOPK_V2: "1"
44+
45+
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
46+
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
47+
SGLANG_OPT_USE_FAST_MASK_EP: "1"
48+
SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
49+
SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
50+
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
51+
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
52+
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
53+
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
54+
55+
NCCL_MNNVL_ENABLE: "1"
56+
NCCL_CUMEM_ENABLE: "1"
57+
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
58+
MC_FORCE_MNNVL: "1"
59+
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
60+
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
61+
SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
62+
63+
decode_environment:
64+
PYTHONUNBUFFERED: "1"
65+
SGLANG_RADIX_DISABLE_REUSE: "1"
66+
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
67+
SGLANG_DEFAULT_THINKING: "1"
68+
SGLANG_DSV4_REASONING_EFFORT: "max"
69+
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
70+
SGLANG_OPT_USE_JIT_NORM: "1"
71+
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
72+
SGLANG_OPT_USE_TOPK_V2: "1"
73+
74+
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
75+
SGLANG_OPT_USE_FAST_MASK_EP: "1"
76+
SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
77+
SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
78+
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
79+
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
80+
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
81+
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
82+
83+
NCCL_MNNVL_ENABLE: "1"
84+
NCCL_CUMEM_ENABLE: "1"
85+
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
86+
MC_FORCE_MNNVL: "1"
87+
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
88+
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
89+
SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
90+
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only.
91+
92+
sglang_config:
93+
prefill:
94+
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
95+
model-path: "/model/"
96+
trust-remote-code: true
97+
tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec.
98+
99+
disaggregation-mode: "prefill"
100+
disaggregation-transfer-backend: mooncake
101+
102+
tensor-parallel-size: 8
103+
data-parallel-size: 8
104+
expert-parallel-size: 8
105+
106+
enable-dp-attention: true
107+
enable-dp-lm-head: true
108+
109+
moe-a2a-backend: "deepep"
110+
deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
111+
112+
mem-fraction-static: 0.9
113+
max-running-requests: 1024
114+
cuda-graph-max-bs: 1024
115+
chunked-prefill-size: 32768
116+
stream-interval: 60
117+
118+
decode:
119+
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
120+
model-path: "/model/"
121+
trust-remote-code: true
122+
tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec.
123+
124+
disaggregation-mode: "decode"
125+
disaggregation-transfer-backend: mooncake
126+
127+
tensor-parallel-size: 8
128+
data-parallel-size: 8
129+
expert-parallel-size: 8
130+
131+
enable-dp-attention: true
132+
enable-dp-lm-head: true
133+
134+
moe-a2a-backend: "deepep"
135+
deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
136+
137+
speculative-algo: "EAGLE"
138+
speculative-num-steps: 3
139+
speculative-eagle-topk: 1
140+
speculative-num-draft-tokens: 4
141+
142+
mem-fraction-static: 0.9
143+
max-running-requests: 3072
144+
cuda-graph-max-bs: 1024
145+
swa-full-tokens-ratio: 0.15
146+
context-length: 16384
147+
stream-interval: 60
148+
149+
benchmark:
150+
type: "sa-bench"
151+
isl: 8192
152+
osl: 256
153+
random_range_ratio: 1.0
154+
concurrencies: "65536"
155+
num_prompts_mult: 2
156+
num_warmup_mult: 1
157+
req_rate: "inf"
158+
use_chat_template: true
159+
custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"

0 commit comments

Comments
 (0)