SemiAnalysisAI · Ankur-singh · May 22, 2026 · May 22, 2026 · May 27, 2026 · Jun 4, 2026
@@ -2003,37 +2003,27 @@ dsr1-fp8-b300-sglang:
 # DeepSeek-V4-Pro on B300 with sglang (non-MTP).
 # Uses nightly image with megamoe backend for high-concurrency profiles.
 dsv4-fp4-b300-sglang:
-  image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b
+  image: lmsysorg/sglang:nightly-dev-cu13-20260604-14ed9b44
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
   precision: fp4
   framework: sglang
   multinode: false
-  # Recipes are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh
-  # by CONC:
-  #   CONC 1|32:         TP-only, flashinfer_mxfp4
-  #   CONC 512:          DP-attn, flashinfer_mxfp4
-  #   CONC 2048-8192:    DP-attn, megamoe
-  # ep is implicit in sglang: --moe-a2a-backend megamoe forces ep_size=tp_size,
-  # while low-latency leaves ep_size at the default of 1.
+  # benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh maps dp-attn=false to the
+  # TP-only (TP8) recipe and dp-attn=true to the mixed-chunk DEP8 throughput recipe.
   scenarios:
     fixed-seq-len:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
-      - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
-      - { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 }
+      - { tp: 8, ep: 1, dp-attn: false, conc-list: [1, 2, 4, 8, 16, 32, 64] }
+      - { tp: 8, ep: 8, dp-attn: true, conc-list: [512, 768, 1024, 1536, 2048] }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
-      - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
-      - { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
+      - { tp: 8, ep: 1, dp-attn: false, conc-list: [1, 2, 4, 8, 16, 32, 64] }
+      - { tp: 8, ep: 8, dp-attn: true, conc-list: [512, 768, 1024, 1536, 2048] }
 
   # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
   # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_b300_sglang.sh
@@ -12,135 +12,104 @@ check_env_vars \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
-# `hf download` creates the target dir if missing and is itself idempotent. 
-# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
-# Either way, MODEL_PATH is what the server is launched with.
-if [[ -n "${MODEL_PATH:-}" ]]; then
-    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
-        hf download "$MODEL" --local-dir "$MODEL_PATH"
-    fi
-else
-    hf download "$MODEL"
-    export MODEL_PATH="$MODEL"
-fi
-
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
 
+# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip
+# `hf download`. Only fetch when MODEL is not an absolute path, i.e. an HF repo ID.
+if [[ "$MODEL" != /* ]]; then
+    hf download "$MODEL"
+fi
+
 nvidia-smi
 
-# ─── Common env vars (all profiles) ───────────────────────────────────────────
-export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+# Common SGLANG env vars (apply to every config).
 export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
+export SGLANG_OPT_USE_JIT_NORM=1
+export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
+export SGLANG_OPT_USE_TOPK_V2=1
+export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
+
+# This config uses a standard sglang dev image (lmsysorg/sglang:nightly-dev-cu13-20260604-14ed9b44), which
+# installs sglang under /sgl-workspace, so launch_b300-nv.sh bind-mounts our repo
+# at /workspace (it only switches to /ix for the deepseek-v4 editable images).
+# Paths in this script are $PWD-relative so they work under either mount dir.
 
 SERVER_LOG="$PWD/server.log"
+PORT=${PORT:-8888}
 
 echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
     EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+else
+    EVAL_CONTEXT_ARGS="--context-length 16384"
 fi
 
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
-# ─── Per-concurrency launch profile ──────────────────────────────────────────
-# Each block sets: PARALLEL_ARGS, MEM_FRACTION_STATIC, SWA_FULL_TOKENS_RATIO,
-# and optionally MAX_RUNNING_REQUESTS plus profile-specific env vars.
-#
-# SWA ratio: 1k inputs need more SWA cache headroom than 8k inputs; 0.5 was
+# 1k inputs need more SWA cache headroom on B300 than 8k inputs do; 0.5 was
 # tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default.
+if [[ "$ISL" == "1024" ]]; then
+    SWA_FULL_TOKENS_RATIO=0.5
+else
+    SWA_FULL_TOKENS_RATIO=0.1
+fi
 
-if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then
-    # TP-only, no DP attention
-    MEM_FRACTION_STATIC=0.90
-    SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1)
-    PARALLEL_ARGS=(
-        --moe-runner-backend flashinfer_mxfp4
-        --chunked-prefill-size 8192
-        --disable-flashinfer-autotune
-    )
-
-elif [ "$CONC" = "512" ]; then
-    # DP attention, flashinfer_mxfp4
-    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
-    MEM_FRACTION_STATIC=0.94
-    SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1)
-    PARALLEL_ARGS=(
-        --dp-size "$TP"
-        --enable-dp-attention
-        --moe-runner-backend flashinfer_mxfp4
-        --disable-flashinfer-autotune
-        --chunked-prefill-size 16384
-        --enable-prefill-delayer
-    )
+# Pick the launch recipe for the two-line submission frontier via DP_ATTENTION: TP8/no-DP-attn
+# for low latency, DEP8/DP-attn for throughput (which overrides SWA_FULL_TOKENS_RATIO above).
 
-elif [ "$CONC" = "2048" ]; then
-    # DP attention, megamoe
+if [ "${DP_ATTENTION}" = "true" ]; then
+    export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+    export SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION=8
     export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
+    export SGLANG_OPT_USE_FAST_MASK_EP=1
+    export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
+    export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
     export NVSHMEM_DISABLE_IB=1
     export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
-    export SGLANG_LOG_FORWARD_ITERS=1
-    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
-    MEM_FRACTION_STATIC=0.87
-    SWA_FULL_TOKENS_RATIO=0.06
-    MAX_RUNNING_REQUESTS=2560
-    PARALLEL_ARGS=(
-        --dp-size "$TP"
-        --enable-dp-attention
-        --moe-a2a-backend megamoe
-        --cuda-graph-max-bs 288
-        --chunked-prefill-size 65536
-        --tokenizer-worker-num 4
-        --enable-prefill-delayer
-    )
+    export SGLANG_OPT_USE_ONLINE_COMPRESS=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=2048
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_MXF4_KIND=1
+    export SGLANG_EXPERIMENTAL_ENABLE_PIECEWISE_CUDA_GRAPH_MOE_A2A=1
+    export NCCL_MNNVL_ENABLE=1
+    export NCCL_CUMEM_ENABLE=1
+    export MC_FORCE_MNNVL=1
+    export SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True
 
-elif [ "$CONC" = "4096" ]; then
-    # DP attention, megamoe
-    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
-    export NVSHMEM_DISABLE_IB=1
-    export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
-    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
     MEM_FRACTION_STATIC=0.835
-    SWA_FULL_TOKENS_RATIO=0.075
     MAX_RUNNING_REQUESTS=4352
+    SWA_FULL_TOKENS_RATIO=0.075
     PARALLEL_ARGS=(
         --dp-size "$TP"
         --enable-dp-attention
         --moe-a2a-backend megamoe
         --cuda-graph-max-bs 544
-        --chunked-prefill-size 65536
+        --enable-mixed-chunk
+        --chunked-prefill-size 16384
+        --max-prefill-tokens 16384
         --tokenizer-worker-num 8
-        --enable-prefill-delayer
         --decode-log-interval 5
+        --stream-interval 30
     )
-
-elif [ "$CONC" = "8192" ]; then
-    # DP attention, megamoe
-    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
-    export NVSHMEM_DISABLE_IB=1
-    export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
-    export SGLANG_OPT_USE_ONLINE_COMPRESS=1
-    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256
-    MEM_FRACTION_STATIC=0.80
-    SWA_FULL_TOKENS_RATIO=0.3
-    MAX_RUNNING_REQUESTS=8192
+else
+    export SGLANG_JIT_DEEPGEMM_PRECOMPILE=1
+    MEM_FRACTION_STATIC=0.90
+    MAX_RUNNING_REQUESTS=512
     PARALLEL_ARGS=(
-        --dp-size "$TP"
-        --enable-dp-attention
-        --moe-a2a-backend megamoe
-        --cuda-graph-max-bs 1088
-        --chunked-prefill-size 65536
-        --tokenizer-worker-num 16
-        --enable-prefill-delayer
+        --moe-runner-backend flashinfer_mxfp4
+        --chunked-prefill-size 8192
+        --disable-flashinfer-autotune
+        --cuda-graph-max-bs 512
+        --tokenizer-worker-num 8
+        --decode-log-interval 60
         --stream-interval 30
+        --scheduler-recv-interval 30
     )
-
-else
-    echo "ERROR: unsupported CONC=$CONC" >&2
-    exit 1
 fi
 
 # Print all SGLANG_* env vars to both the CI step log and server.log so the
@@ -153,12 +122,12 @@ fi
 
 set -x
 PYTHONNOUSERSITE=1 sglang serve \
-    --model-path $MODEL_PATH --served-model-name $MODEL \
+    --model-path $MODEL \
     --host 0.0.0.0 \
     --port $PORT \
     --trust-remote-code \
     --tp $TP \
-    --max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \
+    --max-running-requests "$MAX_RUNNING_REQUESTS" \
     --mem-fraction-static "$MEM_FRACTION_STATIC" \
     --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \
     "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
@@ -168,7 +137,6 @@ SERVER_PID=$!
 wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
 
 pip install -q datasets pandas
-pip install -q --upgrade transformers
 
 run_benchmark_serving \
     --model "$MODEL" \

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3474,3 +3474,11 @@
     - "Use scheduler-recv-interval values 2/60/30/1200/600/1920 for conc 1-4/8/16/32/64/128-256"
     - "Set max-running-requests=256, chunked-prefill-size=16384, mem-fraction-static=0.8, cuda-graph-max-bs=CONC, and enable symm-mem"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1544
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "Update DeepSeek-V4-Pro FP4 B300 SGLang non-MTP sweep to the 2026-05-19 8k/1k submission frontier: TP8 no-DP-attention c1-c64 and DEP8 DP-attention c512/c768/c1024/c1536/c2048"
+    - "Use lmsysorg/sglang:nightly-dev-cu13-20260604-14ed9b44 to pick up the merged SGLang warmup path"
+    - "Map dp-attn=false to TP8 flashinfer_mxfp4 with chunked-prefill 8192; map dp-attn=true to DEP8 mixed-chunk MegaMoE throughput settings"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1575