Update B300 DSV4 SGLang sweep

YAMY1234 · YAMY1234 · commit 7534b4400d10 · 2026-05-21T23:17:33.000-07:00
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -1999,44 +1999,32 @@ dsr1-fp8-b300-sglang:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
 
-# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
-# lists B200 (not B300) as the Blackwell target. This config reuses the
-# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300
-# until a B300-specific recipe ships. Prefix caching is disabled.
-# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
+# DeepSeek-V4-Pro on B300 with SGLang (non-MTP). Both scenarios (1k/1k and
+# 8k/1k) sweep the 8k/1k submission frontier from the 2026-05-19 Pareto HTML:
+#   TP-only low-latency line: TP8/EP1, no DP attention, c1-c64
+#   DP-attention throughput line: DEP8, DP attention, c512-c2048
 dsv4-fp4-b300-sglang:
-  image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15
+  image: lmsysorg/sglang:nightly-dev-cu13-20260522-7cf193fe
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
   precision: fp4
   framework: sglang
   multinode: false
-  # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
-  # are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh by CONC:
-  #   low-latency    (CONC <= 32):       TP-only
-  #   balanced       (32 < CONC <= 128): + DP-attn
-  #   max-throughput (CONC > 128):       + DP-attn
-  # Split so result filenames (ep=, dpa=) accurately reflect the recipe.
-  # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
-  # while low-latency leaves ep_size at the default of 1.
+  # The benchmark script maps dp-attn=false to the TP-only recipe and
+  # dp-attn=true to the mixed-chunk DEP8 throughput recipe.
   scenarios:
     fixed-seq-len:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
-      - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 }
+      - { tp: 8, ep: 1, dp-attn: false, conc-list: [1, 2, 4, 8, 16, 32, 64] }
+      - { tp: 8, ep: 8, dp-attn: true, conc-list: [512, 768, 1024, 1536, 2048] }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
-      - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
+      - { tp: 8, ep: 1, dp-attn: false, conc-list: [1, 2, 4, 8, 16, 32, 64] }
+      - { tp: 8, ep: 8, dp-attn: true, conc-list: [512, 768, 1024, 1536, 2048] }
 
   # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
   # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -25,7 +25,6 @@ fi
 nvidia-smi
 
 # Common SGLANG env vars (apply to every config).
-export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
 export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
 export SGLANG_OPT_USE_JIT_NORM=1
 export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
@@ -48,6 +47,8 @@ EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
     EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+else
+    EVAL_CONTEXT_ARGS="--context-length 16384"
 fi
 
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
@@ -60,105 +61,59 @@ else
     SWA_FULL_TOKENS_RATIO=0.1
 fi
 
-# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm
-# script's pattern). DP-attention runs the empirically-tuned high-concurrency
-# recipe (flashinfer_mxfp4 runner + halved prefill chunks + prefill-delayer);
-# single-instance uses flashinfer_mxfp4 with the cookbook defaults.
+# Pick the launch recipe from DP_ATTENTION, following the two-line submission
+# frontier: TP8/no-DP-attn for low latency and DEP8/DP-attn for throughput.
 DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
 
-# Default; the DP-attn branch below overrides to 0.94.
-MEM_FRACTION_STATIC=0.90
-
 if [ "${DP_ATTENTION}" = "true" ]; then
+    export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+    export SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION=8
     export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
     export SGLANG_OPT_USE_FAST_MASK_EP=1
     export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
     export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
     export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
-    # ep=8 in the yaml signals the mega_moe deepep backend; check high-conc
-    # recipes first (they also have ep=8) so they aren't shadowed by the
-    # medium-conc EP_SIZE=8 branch below.
-    if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ] || [ "$CONC" = "8192" ]; then
-        export NVSHMEM_DISABLE_IB=1
-        export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
-        export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
-        export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
-        if [ "$CONC" = "2048" ]; then
-            export SGLANG_LOG_FORWARD_ITERS=1
-            export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
-            CUDA_GRAPH_MAX_BS=288
-            MAX_RUNNING_REQUESTS=2560
-            MEM_FRACTION_STATIC=0.87
-            SWA_FULL_TOKENS_RATIO=0.06
-            TOKENIZER_WORKER_NUM=4
-        elif [ "$CONC" = "4096" ]; then
-            export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
-            CUDA_GRAPH_MAX_BS=544
-            MAX_RUNNING_REQUESTS=4352
-            MEM_FRACTION_STATIC=0.835
-            SWA_FULL_TOKENS_RATIO=0.075
-            TOKENIZER_WORKER_NUM=8
-        else
-            export SGLANG_OPT_USE_ONLINE_COMPRESS=1
-            export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256
-            CUDA_GRAPH_MAX_BS=1088
-            MAX_RUNNING_REQUESTS=8192
-            MEM_FRACTION_STATIC=0.80
-            SWA_FULL_TOKENS_RATIO=0.3
-            TOKENIZER_WORKER_NUM=16
-        fi
-        PARALLEL_ARGS=(
-            --dp-size "$TP"
-            --enable-dp-attention
-            --moe-a2a-backend deepep
-            --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
-            --deepep-config "$DEEPEP_CONFIG"
-            --chunked-prefill-size 65536
-            --tokenizer-worker-num "$TOKENIZER_WORKER_NUM"
-            --enable-prefill-delayer
-        )
-        if [ "$CONC" = "4096" ]; then
-            PARALLEL_ARGS+=(--decode-log-interval 5)
-        fi
-        if [ "$CONC" = "8192" ]; then
-            PARALLEL_ARGS+=(--stream-interval 30)
-        fi
-    elif [ "${EP_SIZE}" = "8" ]; then
-        export NVSHMEM_DISABLE_IB=1
-        export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
-        export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
-        export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550
-        PARALLEL_ARGS=(
-            --dp-size "$TP"
-            --enable-dp-attention
-            --moe-a2a-backend deepep
-            --cuda-graph-max-bs 550
-            --deepep-config "$DEEPEP_CONFIG"
-            --chunked-prefill-size 16384
-            --enable-prefill-delayer
-        )
-        MAX_RUNNING_REQUESTS=768
-        MEM_FRACTION_STATIC=0.94
-    else
-        export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
-        export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
-        export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
-        PARALLEL_ARGS=(
-            --dp-size "$TP"
-            --enable-dp-attention
-            --moe-runner-backend flashinfer_mxfp4
-            --disable-flashinfer-autotune
-            --deepep-config "$DEEPEP_CONFIG"
-            --chunked-prefill-size 16384
-            --enable-prefill-delayer
-        )
-        MEM_FRACTION_STATIC=0.94
-    fi
+    export NVSHMEM_DISABLE_IB=1
+    export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
+    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
+    export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
+    export SGLANG_OPT_USE_ONLINE_COMPRESS=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=2048
+    export SGLANG_EXPERIMENTAL_ENABLE_PIECEWISE_CUDA_GRAPH_MOE_A2A=1
+    export NCCL_MNNVL_ENABLE=1
+    export NCCL_CUMEM_ENABLE=1
+    export MC_FORCE_MNNVL=1
+    export SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True
+
+    MEM_FRACTION_STATIC=0.835
+    MAX_RUNNING_REQUESTS=4352
+    SWA_FULL_TOKENS_RATIO=0.075
+    PARALLEL_ARGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend deepep
+        --deepep-config "$DEEPEP_CONFIG"
+        --cuda-graph-max-bs 544
+        --enable-mixed-chunk
+        --chunked-prefill-size 16384
+        --max-prefill-tokens 16384
+        --tokenizer-worker-num 8
+        --decode-log-interval 5
+        --stream-interval 30
+    )
 else
+    export SGLANG_JIT_DEEPGEMM_PRECOMPILE=1
+    MEM_FRACTION_STATIC=0.90
+    MAX_RUNNING_REQUESTS=512
     PARALLEL_ARGS=(
         --moe-runner-backend flashinfer_mxfp4
         --chunked-prefill-size 8192
         --disable-flashinfer-autotune
+        --cuda-graph-max-bs 512
+        --tokenizer-worker-num 8
+        --decode-log-interval 60
+        --stream-interval 30
+        --scheduler-recv-interval 30
     )
 fi
 
@@ -177,7 +132,7 @@ PYTHONNOUSERSITE=1 sglang serve \
     --port $PORT \
     --trust-remote-code \
     --tp $TP \
-    --max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \
+    --max-running-requests "$MAX_RUNNING_REQUESTS" \
     --mem-fraction-static "$MEM_FRACTION_STATIC" \
     --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \
     "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3089,3 +3089,11 @@
   description:
     - "Update SGLang image from v0.5.10.post1-cu130 / v0.5.11-cu130 (30d old) to v0.5.12-cu130"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1451
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "Update DeepSeek-V4-Pro FP4 B300 SGLang non-MTP sweep to the 2026-05-19 8k/1k submission frontier: TP8 no-DP-attention c1-c64 and DEP8 DP-attention c512/c768/c1024/c1536/c2048"
+    - "Use lmsysorg/sglang:nightly-dev-cu13-20260522-7cf193fe to pick up the merged SGLang warmup path"
+    - "Map dp-attn=false to TP8 flashinfer_mxfp4 with chunked-prefill 8192; map dp-attn=true to DEP8 mixed-chunk DeepEP/DeepGEMM throughput settings"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1552