Skip to content

Commit 8407412

Browse files
Oseltamivirclaude
andcommitted
bench: add gb200 sglang mtp3 aggregated sweep (6 conc points)
Disaggregated GB200 SGLang MTP3 produces 0 output tokens across all concurrency points because dynamo's multi-node prefill registration only completes for node-rank-0 DP ranks. Both mooncake and nixl transports fail at the same SGLang code path (decode._update_handshake_waiters). NGC prebuilt image isn't a viable swap either: upstream's NGC config drops MTP, DP attention, and EP, plus requires RDMA we don't have. Add a parallel aggregated sweep that bypasses the dynamo disagg registration entirely. Single agg worker per recipe (TP=8 across 2 nodes), DP attention + EAGLE MTP, lmsysorg nightly image with CAR_V2 disabled (TP spans 2 nodes). max_running_requests scaled per conc; above ~1024 the worker queues server-side, so high-conc throughput numbers reflect single-pool capacity rather than disagg-style scaling. decode num-worker=0 in the matrix entries signals aggregated to the workflow. The 7 broken disagg entries remain so the matrix can be dispatched as-is for side-by-side comparison once disagg is fixed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent f8a6681 commit 8407412

7 files changed

Lines changed: 712 additions & 0 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8342,6 +8342,97 @@ dsv4-fp4-gb200-dynamo-sglang-mtp3:
83428342
ep: 8
83438343
dp-attn: true
83448344

8345+
# Aggregated fallback sweep: single agg worker, TP=8 across 2 nodes,
8346+
# DP attention + EAGLE MTP. Sidesteps the dynamo multi-node prefill
8347+
# registration bug that makes the disagg entries above produce 0
8348+
# output tokens on GB200 (see runs 25785003012, 25812320128). Setting decode
8349+
# num-worker=0 signals aggregated mode to the workflow. High-conc points queue
8350+
# server-side since there is only one worker pool.
8351+
- spec-decoding: mtp
8352+
conc-list: [512]
8353+
prefill:
8354+
num-worker: 1
8355+
tp: 8
8356+
ep: 8
8357+
dp-attn: true
8358+
additional-settings:
8359+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/agg-gb200-tp8-dep8-mtp-c512.yaml"
8360+
decode:
8361+
num-worker: 0
8362+
tp: 8
8363+
ep: 8
8364+
dp-attn: true
8365+
- spec-decoding: mtp
8366+
conc-list: [2048]
8367+
prefill:
8368+
num-worker: 1
8369+
tp: 8
8370+
ep: 8
8371+
dp-attn: true
8372+
additional-settings:
8373+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/agg-gb200-tp8-dep8-mtp-c2048.yaml"
8374+
decode:
8375+
num-worker: 0
8376+
tp: 8
8377+
ep: 8
8378+
dp-attn: true
8379+
- spec-decoding: mtp
8380+
conc-list: [4096]
8381+
prefill:
8382+
num-worker: 1
8383+
tp: 8
8384+
ep: 8
8385+
dp-attn: true
8386+
additional-settings:
8387+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/agg-gb200-tp8-dep8-mtp-c4096.yaml"
8388+
decode:
8389+
num-worker: 0
8390+
tp: 8
8391+
ep: 8
8392+
dp-attn: true
8393+
- spec-decoding: mtp
8394+
conc-list: [8192]
8395+
prefill:
8396+
num-worker: 1
8397+
tp: 8
8398+
ep: 8
8399+
dp-attn: true
8400+
additional-settings:
8401+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/agg-gb200-tp8-dep8-mtp-c8192.yaml"
8402+
decode:
8403+
num-worker: 0
8404+
tp: 8
8405+
ep: 8
8406+
dp-attn: true
8407+
- spec-decoding: mtp
8408+
conc-list: [12288]
8409+
prefill:
8410+
num-worker: 1
8411+
tp: 8
8412+
ep: 8
8413+
dp-attn: true
8414+
additional-settings:
8415+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/agg-gb200-tp8-dep8-mtp-c12288.yaml"
8416+
decode:
8417+
num-worker: 0
8418+
tp: 8
8419+
ep: 8
8420+
dp-attn: true
8421+
- spec-decoding: mtp
8422+
conc-list: [16384]
8423+
prefill:
8424+
num-worker: 1
8425+
tp: 8
8426+
ep: 8
8427+
dp-attn: true
8428+
additional-settings:
8429+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/agg-gb200-tp8-dep8-mtp-c16384.yaml"
8430+
decode:
8431+
num-worker: 0
8432+
tp: 8
8433+
ep: 8
8434+
dp-attn: true
8435+
83458436
# MTP variant of dsv4-fp4-gb200-dynamo-vllm. Uses the vLLM 0.20.1 image
83468437
# and hand-picked 8k/1k Pareto points mirrored from NVIDIA/srt-slurm.
83478438
dsv4-fp4-gb200-dynamo-vllm-mtp2:
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
name: "dsv4-pro-gb200-agg-8k1k-tp8-dep8-mtp-c12288"
2+
3+
frontend:
4+
type: dynamo
5+
enable_multiple_frontends: true
6+
num_additional_frontends: 8
7+
8+
dynamo:
9+
hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c"
10+
install: true
11+
12+
# 100-minute readiness window (600 attempts x 10 s; default 30 min) so the
13+
# per-worker dynamo source build can finish before the health poll gives up.
14+
health_check:
15+
max_attempts: 600
16+
interval_seconds: 10
17+
18+
model:
19+
path: "deepseek-v4-pro"
20+
container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e"
21+
precision: "mxfp4"
22+
23+
sbatch_directives:
24+
cpus-per-task: "144"
25+
mem: "0"
26+
27+
# Note: single agg worker on 2 nodes. Concurrencies above ~1024 (the
28+
# max-running-requests cap) queue server-side instead of running concurrently;
29+
# throughput numbers are still meaningful but TTFT/E2EL include queueing delay.
30+
resources:
31+
gpu_type: "gb200"
32+
gpus_per_node: 4
33+
agg_nodes: 2
34+
agg_workers: 1
35+
gpus_per_agg: 8
36+
37+
backend:
38+
type: sglang
39+
40+
aggregated_environment:
41+
PYTHONUNBUFFERED: "1"
42+
SGLANG_RADIX_DISABLE_REUSE: "1"
43+
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
44+
SGLANG_DEFAULT_THINKING: "1"
45+
SGLANG_DSV4_REASONING_EFFORT: "max"
46+
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
47+
SGLANG_OPT_USE_JIT_NORM: "1"
48+
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
49+
SGLANG_OPT_USE_TOPK_V2: "1"
50+
51+
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
52+
SGLANG_OPT_USE_FAST_MASK_EP: "1"
53+
SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
54+
SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
55+
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
56+
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
57+
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
58+
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
59+
60+
NCCL_MNNVL_ENABLE: "1"
61+
NCCL_CUMEM_ENABLE: "1"
62+
MC_FORCE_MNNVL: "1"
63+
SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
64+
# CAR_V2 is single-node only; TP=8 spans 2 nodes so we disable it.
65+
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0"
66+
67+
sglang_config:
68+
aggregated:
69+
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
70+
model-path: "/model/"
71+
trust-remote-code: true
72+
tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec.
73+
74+
tensor-parallel-size: 8
75+
data-parallel-size: 8
76+
expert-parallel-size: 8
77+
78+
enable-dp-attention: true
79+
enable-dp-lm-head: true
80+
81+
moe-a2a-backend: "deepep"
82+
deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
83+
84+
speculative-algo: "EAGLE"
85+
speculative-num-steps: 3
86+
speculative-eagle-topk: 1
87+
speculative-num-draft-tokens: 4
88+
89+
mem-fraction-static: 0.85
90+
max-running-requests: 1024
91+
cuda-graph-max-bs: 1024
92+
swa-full-tokens-ratio: 0.15
93+
context-length: 16384
94+
stream-interval: 60
95+
chunked-prefill-size: 32768
96+
97+
benchmark:
98+
type: "sa-bench"
99+
isl: 8192
100+
osl: 256
101+
random_range_ratio: 1.0
102+
concurrencies: "12288"
103+
req_rate: "inf"
104+
use_chat_template: true
105+
custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
name: "dsv4-pro-gb200-agg-8k1k-tp8-dep8-mtp-c16384"
2+
3+
frontend:
4+
type: dynamo
5+
enable_multiple_frontends: true
6+
num_additional_frontends: 8
7+
8+
dynamo:
9+
hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c"
10+
install: true
11+
12+
# 100-minute readiness window (600 attempts x 10 s; default 30 min) so the
13+
# per-worker dynamo source build can finish before the health poll gives up.
14+
health_check:
15+
max_attempts: 600
16+
interval_seconds: 10
17+
18+
model:
19+
path: "deepseek-v4-pro"
20+
container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e"
21+
precision: "mxfp4"
22+
23+
sbatch_directives:
24+
cpus-per-task: "144"
25+
mem: "0"
26+
27+
# Note: single agg worker on 2 nodes. Concurrencies above ~1024 (the
28+
# max-running-requests cap) queue server-side instead of running concurrently;
29+
# throughput numbers are still meaningful but TTFT/E2EL include queueing delay.
30+
resources:
31+
gpu_type: "gb200"
32+
gpus_per_node: 4
33+
agg_nodes: 2
34+
agg_workers: 1
35+
gpus_per_agg: 8
36+
37+
backend:
38+
type: sglang
39+
40+
aggregated_environment:
41+
PYTHONUNBUFFERED: "1"
42+
SGLANG_RADIX_DISABLE_REUSE: "1"
43+
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
44+
SGLANG_DEFAULT_THINKING: "1"
45+
SGLANG_DSV4_REASONING_EFFORT: "max"
46+
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
47+
SGLANG_OPT_USE_JIT_NORM: "1"
48+
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
49+
SGLANG_OPT_USE_TOPK_V2: "1"
50+
51+
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
52+
SGLANG_OPT_USE_FAST_MASK_EP: "1"
53+
SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
54+
SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
55+
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
56+
SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
57+
SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
58+
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
59+
60+
NCCL_MNNVL_ENABLE: "1"
61+
NCCL_CUMEM_ENABLE: "1"
62+
MC_FORCE_MNNVL: "1"
63+
SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
64+
# CAR_V2 is single-node only; TP=8 spans 2 nodes so we disable it.
65+
SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0"
66+
67+
sglang_config:
68+
aggregated:
69+
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
70+
model-path: "/model/"
71+
trust-remote-code: true
72+
tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec.
73+
74+
tensor-parallel-size: 8
75+
data-parallel-size: 8
76+
expert-parallel-size: 8
77+
78+
enable-dp-attention: true
79+
enable-dp-lm-head: true
80+
81+
moe-a2a-backend: "deepep"
82+
deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
83+
84+
speculative-algo: "EAGLE"
85+
speculative-num-steps: 3
86+
speculative-eagle-topk: 1
87+
speculative-num-draft-tokens: 4
88+
89+
mem-fraction-static: 0.85
90+
max-running-requests: 1024
91+
cuda-graph-max-bs: 1024
92+
swa-full-tokens-ratio: 0.15
93+
context-length: 16384
94+
stream-interval: 60
95+
chunked-prefill-size: 32768
96+
97+
benchmark:
98+
type: "sa-bench"
99+
isl: 8192
100+
osl: 256
101+
random_range_ratio: 1.0
102+
concurrencies: "16384"
103+
req_rate: "inf"
104+
use_chat_template: true
105+
custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"

0 commit comments

Comments
 (0)