Skip to content

Commit f5ef131

Browse files
committed
Update B300 DSV4 SGLang sweep
1 parent cf0aa5b commit f5ef131

3 files changed

Lines changed: 62 additions & 111 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 11 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1999,44 +1999,32 @@ dsr1-fp8-b300-sglang:
19991999
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
20002000
- { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
20012001

2002-
# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
2003-
# lists B200 (not B300) as the Blackwell target. This config reuses the
2004-
# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300
2005-
# until a B300-specific recipe ships. Prefix caching is disabled.
2006-
# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
2002+
# DeepSeek-V4-Pro on B300 with SGLang (non-MTP). Both the 1k/1k and 8k/1k
2003+
# scenarios sweep the 8k/1k submission frontier from the 2026-05-19 Pareto HTML:
2004+
# TP-only low-latency line: TP8/EP1, no DP attention, c1-c64
2005+
# DP-attention throughput line: DEP8, DP attention, c512-c2048
20072006
dsv4-fp4-b300-sglang:
2008-
image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15
2007+
image: lmsysorg/sglang:nightly-dev-cu13-20260522-7cf193fe
20092008
model: deepseek-ai/DeepSeek-V4-Pro
20102009
model-prefix: dsv4
20112010
runner: b300
20122011
precision: fp4
20132012
framework: sglang
20142013
multinode: false
2015-
# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
2016-
# are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh by CONC:
2017-
# low-latency (CONC <= 32): TP-only
2018-
# balanced (32 < CONC <= 128): + DP-attn
2019-
# max-throughput (CONC > 128): + DP-attn
2020-
# Split so result filenames (ep=, dpa=) accurately reflect the recipe.
2021-
# ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
2022-
# while low-latency leaves ep_size at the default of 1.
2014+
# benchmarks/single_node/dsv4_fp4_b300_sglang.sh maps dp-attn=false to the
2015+
# TP-only recipe and dp-attn=true to the mixed-chunk DEP8 throughput recipe.
20232016
scenarios:
20242017
fixed-seq-len:
20252018
- isl: 1024
20262019
osl: 1024
20272020
search-space:
2028-
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
2029-
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
2030-
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
2031-
- { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 }
2021+
- { tp: 8, ep: 1, dp-attn: false, conc-list: [1, 2, 4, 8, 16, 32, 64] }
2022+
- { tp: 8, ep: 8, dp-attn: true, conc-list: [512, 768, 1024, 1536, 2048] }
20322023
- isl: 8192
20332024
osl: 1024
20342025
search-space:
2035-
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
2036-
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
2037-
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
2038-
- { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
2039-
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
2026+
- { tp: 8, ep: 1, dp-attn: false, conc-list: [1, 2, 4, 8, 16, 32, 64] }
2027+
- { tp: 8, ep: 8, dp-attn: true, conc-list: [512, 768, 1024, 1536, 2048] }
20402028

20412029
# DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
20422030
# selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by

benchmarks/single_node/dsv4_fp4_b300_sglang.sh

Lines changed: 43 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ fi
2525
nvidia-smi
2626

2727
# Common SGLANG env vars (apply to every config).
28-
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
2928
export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
3029
export SGLANG_OPT_USE_JIT_NORM=1
3130
export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
@@ -48,6 +47,8 @@ EVAL_CONTEXT_ARGS=""
4847
if [ "${EVAL_ONLY}" = "true" ]; then
4948
setup_eval_context
5049
EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
50+
else
51+
EVAL_CONTEXT_ARGS="--context-length 16384"
5152
fi
5253

5354
start_gpu_monitor --output "$PWD/gpu_metrics.csv"
@@ -60,105 +61,59 @@ else
6061
SWA_FULL_TOKENS_RATIO=0.1
6162
fi
6263

63-
# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm
64-
# script's pattern). DP-attention runs the empirically-tuned high-concurrency
65-
# recipe (flashinfer_mxfp4 runner + halved prefill chunks + prefill-delayer);
66-
# single-instance uses flashinfer_mxfp4 with the cookbook defaults.
64+
# Pick the launch recipe from DP_ATTENTION, following the two-line submission
65+
# frontier: "true" selects DEP8/DP-attn (throughput); otherwise TP8/no-DP-attn (low latency).
6766
DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
6867

69-
# Default; the DP-attn branch below overrides to 0.94.
70-
MEM_FRACTION_STATIC=0.90
71-
7268
if [ "${DP_ATTENTION}" = "true" ]; then
69+
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
70+
export SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION=8
7371
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
7472
export SGLANG_OPT_USE_FAST_MASK_EP=1
7573
export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
7674
export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
7775
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
78-
# ep=8 in the yaml signals the mega_moe deepep backend; check high-conc
79-
# recipes first (they also have ep=8) so they aren't shadowed by the
80-
# medium-conc EP_SIZE=8 branch below.
81-
if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ] || [ "$CONC" = "8192" ]; then
82-
export NVSHMEM_DISABLE_IB=1
83-
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
84-
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
85-
export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
86-
if [ "$CONC" = "2048" ]; then
87-
export SGLANG_LOG_FORWARD_ITERS=1
88-
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
89-
CUDA_GRAPH_MAX_BS=288
90-
MAX_RUNNING_REQUESTS=2560
91-
MEM_FRACTION_STATIC=0.87
92-
SWA_FULL_TOKENS_RATIO=0.06
93-
TOKENIZER_WORKER_NUM=4
94-
elif [ "$CONC" = "4096" ]; then
95-
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
96-
CUDA_GRAPH_MAX_BS=544
97-
MAX_RUNNING_REQUESTS=4352
98-
MEM_FRACTION_STATIC=0.835
99-
SWA_FULL_TOKENS_RATIO=0.075
100-
TOKENIZER_WORKER_NUM=8
101-
else
102-
export SGLANG_OPT_USE_ONLINE_COMPRESS=1
103-
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256
104-
CUDA_GRAPH_MAX_BS=1088
105-
MAX_RUNNING_REQUESTS=8192
106-
MEM_FRACTION_STATIC=0.80
107-
SWA_FULL_TOKENS_RATIO=0.3
108-
TOKENIZER_WORKER_NUM=16
109-
fi
110-
PARALLEL_ARGS=(
111-
--dp-size "$TP"
112-
--enable-dp-attention
113-
--moe-a2a-backend deepep
114-
--cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
115-
--deepep-config "$DEEPEP_CONFIG"
116-
--chunked-prefill-size 65536
117-
--tokenizer-worker-num "$TOKENIZER_WORKER_NUM"
118-
--enable-prefill-delayer
119-
)
120-
if [ "$CONC" = "4096" ]; then
121-
PARALLEL_ARGS+=(--decode-log-interval 5)
122-
fi
123-
if [ "$CONC" = "8192" ]; then
124-
PARALLEL_ARGS+=(--stream-interval 30)
125-
fi
126-
elif [ "${EP_SIZE}" = "8" ]; then
127-
export NVSHMEM_DISABLE_IB=1
128-
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
129-
export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
130-
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550
131-
PARALLEL_ARGS=(
132-
--dp-size "$TP"
133-
--enable-dp-attention
134-
--moe-a2a-backend deepep
135-
--cuda-graph-max-bs 550
136-
--deepep-config "$DEEPEP_CONFIG"
137-
--chunked-prefill-size 16384
138-
--enable-prefill-delayer
139-
)
140-
MAX_RUNNING_REQUESTS=768
141-
MEM_FRACTION_STATIC=0.94
142-
else
143-
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
144-
export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
145-
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
146-
PARALLEL_ARGS=(
147-
--dp-size "$TP"
148-
--enable-dp-attention
149-
--moe-runner-backend flashinfer_mxfp4
150-
--disable-flashinfer-autotune
151-
--deepep-config "$DEEPEP_CONFIG"
152-
--chunked-prefill-size 16384
153-
--enable-prefill-delayer
154-
)
155-
MEM_FRACTION_STATIC=0.94
156-
fi
76+
export NVSHMEM_DISABLE_IB=1
77+
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
78+
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
79+
export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
80+
export SGLANG_OPT_USE_ONLINE_COMPRESS=1
81+
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=2048
82+
export SGLANG_EXPERIMENTAL_ENABLE_PIECEWISE_CUDA_GRAPH_MOE_A2A=1
83+
export NCCL_MNNVL_ENABLE=1
84+
export NCCL_CUMEM_ENABLE=1
85+
export MC_FORCE_MNNVL=1
86+
export SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True
87+
88+
MEM_FRACTION_STATIC=0.835
89+
MAX_RUNNING_REQUESTS=4352
90+
SWA_FULL_TOKENS_RATIO=0.075
91+
PARALLEL_ARGS=(
92+
--dp-size "$TP"
93+
--enable-dp-attention
94+
--moe-a2a-backend deepep
95+
--deepep-config "$DEEPEP_CONFIG"
96+
--cuda-graph-max-bs 544
97+
--enable-mixed-chunk
98+
--chunked-prefill-size 16384
99+
--max-prefill-tokens 16384
100+
--tokenizer-worker-num 8
101+
--decode-log-interval 5
102+
--stream-interval 30
103+
)
157104
else
105+
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=1
106+
MEM_FRACTION_STATIC=0.90
107+
MAX_RUNNING_REQUESTS=512
158108
PARALLEL_ARGS=(
159109
--moe-runner-backend flashinfer_mxfp4
160110
--chunked-prefill-size 8192
161111
--disable-flashinfer-autotune
112+
--cuda-graph-max-bs 512
113+
--tokenizer-worker-num 8
114+
--decode-log-interval 60
115+
--stream-interval 30
116+
--scheduler-recv-interval 30
162117
)
163118
fi
164119

@@ -177,7 +132,7 @@ PYTHONNOUSERSITE=1 sglang serve \
177132
--port $PORT \
178133
--trust-remote-code \
179134
--tp $TP \
180-
--max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \
135+
--max-running-requests "$MAX_RUNNING_REQUESTS" \
181136
--mem-fraction-static "$MEM_FRACTION_STATIC" \
182137
--swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \
183138
"${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &

perf-changelog.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3089,3 +3089,11 @@
30893089
description:
30903090
- "Update SGLang image from v0.5.10.post1-cu130 / v0.5.11-cu130 (30d old) to v0.5.12-cu130"
30913091
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1451
3092+
3093+
- config-keys:
3094+
- dsv4-fp4-b300-sglang
3095+
description:
3096+
- "Update DeepSeek-V4-Pro FP4 B300 SGLang non-MTP sweep to the 2026-05-19 8k/1k submission frontier: TP8 no-DP-attention c1-c64 and DEP8 DP-attention c512/c768/c1024/c1536/c2048"
3097+
- "Use lmsysorg/sglang:nightly-dev-cu13-20260522-7cf193fe to pick up the merged SGLang warmup path"
3098+
- "Map dp-attn=false to TP8 flashinfer_mxfp4 with chunked-prefill 8192; map dp-attn=true to DEP8 mixed-chunk DeepEP/DeepGEMM throughput settings"
3099+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/0

0 commit comments

Comments
 (0)