Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 7 additions & 17 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2003,37 +2003,27 @@ dsr1-fp8-b300-sglang:
# DeepSeek-V4-Pro on B300 with sglang (non-MTP).
# Uses nightly image with megamoe backend for high-concurrency profiles.
dsv4-fp4-b300-sglang:
image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b
image: lmsysorg/sglang:nightly-dev-cu13-20260604-14ed9b44
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
precision: fp4
framework: sglang
multinode: false
# Recipes are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh
# by CONC:
# CONC 1|32: TP-only, flashinfer_mxfp4
# CONC 512: DP-attn, flashinfer_mxfp4
# CONC 2048-8192: DP-attn, megamoe
# ep is implicit in sglang: --moe-a2a-backend megamoe forces ep_size=tp_size,
# while low-latency leaves ep_size at the default of 1.
# The benchmark script maps dp-attn=false to the TP-only recipe and
# dp-attn=true to the mixed-chunk DEP8 throughput recipe.
scenarios:
fixed-seq-len:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 }
- { tp: 8, ep: 1, dp-attn: false, conc-list: [1, 2, 4, 8, 16, 32, 64] }
- { tp: 8, ep: 8, dp-attn: true, conc-list: [512, 768, 1024, 1536, 2048] }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
- { tp: 8, ep: 1, dp-attn: false, conc-list: [1, 2, 4, 8, 16, 32, 64] }
- { tp: 8, ep: 8, dp-attn: true, conc-list: [512, 768, 1024, 1536, 2048] }
Comment thread
cursor[bot] marked this conversation as resolved.

# DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
# selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
Expand Down
152 changes: 60 additions & 92 deletions benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,135 +12,104 @@ check_env_vars \
RANDOM_RANGE_RATIO \
RESULT_FILENAME

# `hf download` creates the target dir if missing and is itself idempotent.
# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
# download location and use the repo ID itself as MODEL_PATH.
# Either way, MODEL_PATH is what the server is launched with.
if [[ -n "${MODEL_PATH:-}" ]]; then
if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
hf download "$MODEL" --local-dir "$MODEL_PATH"
fi
else
hf download "$MODEL"
export MODEL_PATH="$MODEL"
fi

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip
# `hf download`. Only fetch when MODEL looks like a HF repo ID.
if [[ "$MODEL" != /* ]]; then
hf download "$MODEL"
fi

nvidia-smi

# ─── Common env vars (all profiles) ───────────────────────────────────────────
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
# Common SGLANG env vars (apply to every config).
export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
export SGLANG_OPT_USE_JIT_NORM=1
export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
export SGLANG_OPT_USE_TOPK_V2=1
export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1

# This config uses a standard sglang dev image (lmsysorg/sglang:nightly-dev-cu13-20260604-14ed9b44), which
# installs sglang under /sgl-workspace, so launch_b300-nv.sh bind-mounts our repo
# at /workspace (it only switches to /ix for the deepseek-v4 editable images).
# Paths in this script are $PWD-relative so they work under either mount dir.

SERVER_LOG="$PWD/server.log"
PORT=${PORT:-8888}

echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
else
EVAL_CONTEXT_ARGS="--context-length 16384"
fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

# ─── Per-concurrency launch profile ──────────────────────────────────────────
# Each block sets: PARALLEL_ARGS, MEM_FRACTION_STATIC, SWA_FULL_TOKENS_RATIO,
# and optionally MAX_RUNNING_REQUESTS plus profile-specific env vars.
#
# SWA ratio: 1k inputs need more SWA cache headroom than 8k inputs; 0.5 was
# 1k inputs need more SWA cache headroom on B300 than 8k inputs do; 0.5 was
# tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default.
if [[ "$ISL" == "1024" ]]; then
SWA_FULL_TOKENS_RATIO=0.5
else
SWA_FULL_TOKENS_RATIO=0.1
fi

if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then
# TP-only, no DP attention
MEM_FRACTION_STATIC=0.90
SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1)
PARALLEL_ARGS=(
--moe-runner-backend flashinfer_mxfp4
--chunked-prefill-size 8192
--disable-flashinfer-autotune
)

elif [ "$CONC" = "512" ]; then
# DP attention, flashinfer_mxfp4
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
MEM_FRACTION_STATIC=0.94
SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1)
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-runner-backend flashinfer_mxfp4
--disable-flashinfer-autotune
--chunked-prefill-size 16384
--enable-prefill-delayer
)
# Pick the launch recipe based on the two-line submission frontier:
# TP8/no-DP-attn for low latency and DEP8/DP-attn for throughput.

elif [ "$CONC" = "2048" ]; then
# DP attention, megamoe
if [ "${DP_ATTENTION}" = "true" ]; then
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
export SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION=8
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export SGLANG_OPT_USE_FAST_MASK_EP=1
export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_LOG_FORWARD_ITERS=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
MEM_FRACTION_STATIC=0.87
SWA_FULL_TOKENS_RATIO=0.06
MAX_RUNNING_REQUESTS=2560
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-a2a-backend megamoe
--cuda-graph-max-bs 288
--chunked-prefill-size 65536
--tokenizer-worker-num 4
--enable-prefill-delayer
)
export SGLANG_OPT_USE_ONLINE_COMPRESS=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=2048
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND=1
export SGLANG_EXPERIMENTAL_ENABLE_PIECEWISE_CUDA_GRAPH_MOE_A2A=1
export NCCL_MNNVL_ENABLE=1
export NCCL_CUMEM_ENABLE=1
export MC_FORCE_MNNVL=1
export SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True

elif [ "$CONC" = "4096" ]; then
# DP attention, megamoe
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
MEM_FRACTION_STATIC=0.835
SWA_FULL_TOKENS_RATIO=0.075
MAX_RUNNING_REQUESTS=4352
SWA_FULL_TOKENS_RATIO=0.075
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-a2a-backend megamoe
--cuda-graph-max-bs 544
--chunked-prefill-size 65536
--enable-mixed-chunk
--chunked-prefill-size 16384
--max-prefill-tokens 16384
--tokenizer-worker-num 8
--enable-prefill-delayer
--decode-log-interval 5
--stream-interval 30
)

elif [ "$CONC" = "8192" ]; then
# DP attention, megamoe
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_OPT_USE_ONLINE_COMPRESS=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256
MEM_FRACTION_STATIC=0.80
SWA_FULL_TOKENS_RATIO=0.3
MAX_RUNNING_REQUESTS=8192
else
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=1
MEM_FRACTION_STATIC=0.90
MAX_RUNNING_REQUESTS=512
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-a2a-backend megamoe
--cuda-graph-max-bs 1088
--chunked-prefill-size 65536
--tokenizer-worker-num 16
--enable-prefill-delayer
--moe-runner-backend flashinfer_mxfp4
--chunked-prefill-size 8192
--disable-flashinfer-autotune
--cuda-graph-max-bs 512
--tokenizer-worker-num 8
--decode-log-interval 60
--stream-interval 30
--scheduler-recv-interval 30
)

else
echo "ERROR: unsupported CONC=$CONC" >&2
exit 1
fi

# Print all SGLANG_* env vars to both the CI step log and server.log so the
Expand All @@ -153,12 +122,12 @@ fi

set -x
PYTHONNOUSERSITE=1 sglang serve \
--model-path $MODEL_PATH --served-model-name $MODEL \
--model-path $MODEL \

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Server ignores staged MODEL_PATH

High Severity

On B300, launch_b300-nv.sh keeps MODEL as the Hugging Face id and sets MODEL_PATH to pre-staged weights, but this script passes --model-path $MODEL and no longer uses MODEL_PATH or --served-model-name, unlike dsr1_fp4_b300.sh and dsv4_fp4_b300_sglang_mtp.sh.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit fbeb15a. Configure here.

--host 0.0.0.0 \
--port $PORT \
--trust-remote-code \
--tp $TP \
--max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \
--max-running-requests "$MAX_RUNNING_REQUESTS" \
--mem-fraction-static "$MEM_FRACTION_STATIC" \
--swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \
"${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
Expand All @@ -168,7 +137,6 @@ SERVER_PID=$!
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas
pip install -q --upgrade transformers

run_benchmark_serving \
--model "$MODEL" \
Expand Down
8 changes: 8 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3474,3 +3474,11 @@
- "Use scheduler-recv-interval values 2/60/30/1200/600/1920 for conc 1-4/8/16/32/64/128-256"
- "Set max-running-requests=256, chunked-prefill-size=16384, mem-fraction-static=0.8, cuda-graph-max-bs=CONC, and enable symm-mem"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1544

- config-keys:
- dsv4-fp4-b300-sglang
description:
- "Update DeepSeek-V4-Pro FP4 B300 SGLang non-MTP sweep to the 2026-05-19 8k/1k submission frontier: TP8 no-DP-attention c1-c64 and DEP8 DP-attention c512/c768/c1024/c1536/c2048"
- "Use lmsysorg/sglang:nightly-dev-cu13-20260604-14ed9b44 to pick up the merged SGLang warmup path"
- "Map dp-attn=false to TP8 flashinfer_mxfp4 with chunked-prefill 8192; map dp-attn=true to DEP8 mixed-chunk MegaMoE throughput settings"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1575