diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3d6a17ff7..c4e9bf621 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2003,37 +2003,27 @@ dsr1-fp8-b300-sglang: # DeepSeek-V4-Pro on B300 with sglang (non-MTP). # Uses nightly image with megamoe backend for high-concurrency profiles. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b + image: lmsysorg/sglang:nightly-dev-cu13-20260604-14ed9b44 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 precision: fp4 framework: sglang multinode: false - # Recipes are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh - # by CONC: - # CONC 1|32: TP-only, flashinfer_mxfp4 - # CONC 512: DP-attn, flashinfer_mxfp4 - # CONC 2048-8192: DP-attn, megamoe - # ep is implicit in sglang: --moe-a2a-backend megamoe forces ep_size=tp_size, - # while low-latency leaves ep_size at the default of 1. + # The benchmark script maps dp-attn=false to the TP-only recipe and + # dp-attn=true to the mixed-chunk DEP8 throughput recipe. 
scenarios: fixed-seq-len: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 } + - { tp: 8, ep: 1, dp-attn: false, conc-list: [1, 2, 4, 8, 16, 32, 64] } + - { tp: 8, ep: 8, dp-attn: true, conc-list: [512, 768, 1024, 1536, 2048] } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } + - { tp: 8, ep: 1, dp-attn: false, conc-list: [1, 2, 4, 8, 16, 32, 64] } + - { tp: 8, ep: 8, dp-attn: true, conc-list: [512, 768, 1024, 1536, 2048] } # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh index b451dee0d..f87749d7c 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh @@ -12,29 +12,32 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME -# `hf download` creates the target dir if missing and is itself idempotent. -# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE -# Either way, MODEL_PATH is what the server is launched with. -if [[ -n "${MODEL_PATH:-}" ]]; then - if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then - hf download "$MODEL" --local-dir "$MODEL_PATH" - fi -else - hf download "$MODEL" - export MODEL_PATH="$MODEL" -fi - if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi +# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip +# `hf download`. Only fetch when MODEL looks like a HF repo ID. +if [[ "$MODEL" != /* ]]; then + hf download "$MODEL" +fi + nvidia-smi -# ─── Common env vars (all profiles) ─────────────────────────────────────────── -export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +# Common SGLANG env vars (apply to every config). export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 +export SGLANG_OPT_USE_JIT_NORM=1 +export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 +export SGLANG_OPT_USE_TOPK_V2=1 +export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 + +# This config uses a standard sglang dev image (lmsysorg/sglang:nightly-dev-cu13-20260604-14ed9b44), which +# installs sglang under /sgl-workspace, so launch_b300-nv.sh bind-mounts our repo +# at /workspace (it only switches to /ix for the deepseek-v4 editable images). +# Paths in this script are $PWD-relative so they work under either mount dir. SERVER_LOG="$PWD/server.log" +PORT=${PORT:-8888} echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" @@ -42,105 +45,71 @@ EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +else + EVAL_CONTEXT_ARGS="--context-length 16384" fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" -# ─── Per-concurrency launch profile ────────────────────────────────────────── -# Each block sets: PARALLEL_ARGS, MEM_FRACTION_STATIC, SWA_FULL_TOKENS_RATIO, -# and optionally MAX_RUNNING_REQUESTS plus profile-specific env vars. 
-# -# SWA ratio: 1k inputs need more SWA cache headroom than 8k inputs; 0.5 was +# 1k inputs need more SWA cache headroom on B300 than 8k inputs do; 0.5 was # tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default. +if [[ "$ISL" == "1024" ]]; then + SWA_FULL_TOKENS_RATIO=0.5 +else + SWA_FULL_TOKENS_RATIO=0.1 +fi -if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then - # TP-only, no DP attention - MEM_FRACTION_STATIC=0.90 - SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1) - PARALLEL_ARGS=( - --moe-runner-backend flashinfer_mxfp4 - --chunked-prefill-size 8192 - --disable-flashinfer-autotune - ) - -elif [ "$CONC" = "512" ]; then - # DP attention, flashinfer_mxfp4 - export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 - MEM_FRACTION_STATIC=0.94 - SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1) - PARALLEL_ARGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-runner-backend flashinfer_mxfp4 - --disable-flashinfer-autotune - --chunked-prefill-size 16384 - --enable-prefill-delayer - ) +# Pick the launch recipe based on the two-line submission frontier: +# TP8/no-DP-attn for low latency and DEP8/DP-attn for throughput. 
-elif [ "$CONC" = "2048" ]; then - # DP attention, megamoe +if [ "${DP_ATTENTION}" = "true" ]; then + export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 + export SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION=8 export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 + export SGLANG_OPT_USE_FAST_MASK_EP=1 + export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 + export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 export NVSHMEM_DISABLE_IB=1 export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 - export SGLANG_LOG_FORWARD_ITERS=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 - MEM_FRACTION_STATIC=0.87 - SWA_FULL_TOKENS_RATIO=0.06 - MAX_RUNNING_REQUESTS=2560 - PARALLEL_ARGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend megamoe - --cuda-graph-max-bs 288 - --chunked-prefill-size 65536 - --tokenizer-worker-num 4 - --enable-prefill-delayer - ) + export SGLANG_OPT_USE_ONLINE_COMPRESS=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=2048 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND=1 + export SGLANG_EXPERIMENTAL_ENABLE_PIECEWISE_CUDA_GRAPH_MOE_A2A=1 + export NCCL_MNNVL_ENABLE=1 + export NCCL_CUMEM_ENABLE=1 + export MC_FORCE_MNNVL=1 + export SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True -elif [ "$CONC" = "4096" ]; then - # DP attention, megamoe - export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 - export NVSHMEM_DISABLE_IB=1 - export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 MEM_FRACTION_STATIC=0.835 - SWA_FULL_TOKENS_RATIO=0.075 MAX_RUNNING_REQUESTS=4352 + SWA_FULL_TOKENS_RATIO=0.075 PARALLEL_ARGS=( --dp-size "$TP" --enable-dp-attention --moe-a2a-backend megamoe --cuda-graph-max-bs 544 - --chunked-prefill-size 65536 + --enable-mixed-chunk + --chunked-prefill-size 16384 + --max-prefill-tokens 16384 --tokenizer-worker-num 8 - --enable-prefill-delayer --decode-log-interval 5 + --stream-interval 30 ) - -elif [ "$CONC" = "8192" ]; then - # DP attention, megamoe 
- export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 - export NVSHMEM_DISABLE_IB=1 - export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 - export SGLANG_OPT_USE_ONLINE_COMPRESS=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256 - MEM_FRACTION_STATIC=0.80 - SWA_FULL_TOKENS_RATIO=0.3 - MAX_RUNNING_REQUESTS=8192 +else + export SGLANG_JIT_DEEPGEMM_PRECOMPILE=1 + MEM_FRACTION_STATIC=0.90 + MAX_RUNNING_REQUESTS=512 PARALLEL_ARGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend megamoe - --cuda-graph-max-bs 1088 - --chunked-prefill-size 65536 - --tokenizer-worker-num 16 - --enable-prefill-delayer + --moe-runner-backend flashinfer_mxfp4 + --chunked-prefill-size 8192 + --disable-flashinfer-autotune + --cuda-graph-max-bs 512 + --tokenizer-worker-num 8 + --decode-log-interval 60 --stream-interval 30 + --scheduler-recv-interval 30 ) - -else - echo "ERROR: unsupported CONC=$CONC" >&2 - exit 1 fi # Print all SGLANG_* env vars to both the CI step log and server.log so the @@ -153,12 +122,12 @@ fi set -x PYTHONNOUSERSITE=1 sglang serve \ - --model-path $MODEL_PATH --served-model-name $MODEL \ + --model-path $MODEL \ --host 0.0.0.0 \ --port $PORT \ --trust-remote-code \ --tp $TP \ - --max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \ + --max-running-requests "$MAX_RUNNING_REQUESTS" \ --mem-fraction-static "$MEM_FRACTION_STATIC" \ --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \ "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & @@ -168,7 +137,6 @@ SERVER_PID=$! 
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" pip install -q datasets pandas -pip install -q --upgrade transformers run_benchmark_serving \ --model "$MODEL" \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a5f3f3478..96093ae60 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3474,3 +3474,11 @@ - "Use scheduler-recv-interval values 2/60/30/1200/600/1920 for conc 1-4/8/16/32/64/128-256" - "Set max-running-requests=256, chunked-prefill-size=16384, mem-fraction-static=0.8, cuda-graph-max-bs=CONC, and enable symm-mem" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1544 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Update DeepSeek-V4-Pro FP4 B300 SGLang non-MTP sweep to the 2026-05-19 8k/1k submission frontier, applied to both the 1k/1k and 8k/1k scenarios: TP8 no-DP-attention c1-c64 and DEP8 DP-attention c512/c768/c1024/c1536/c2048" + - "Use lmsysorg/sglang:nightly-dev-cu13-20260604-14ed9b44 to pick up the merged SGLang warmup path" + - "Map dp-attn=false to TP8 flashinfer_mxfp4 with chunked-prefill 8192; map dp-attn=true to DEP8 mixed-chunk MegaMoE throughput settings" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1575