SemiAnalysisAI
diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
Lines changed: 46 additions & 31 deletions
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep32-18-c2500.yaml
rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep32-18-c2500.yaml
Lines changed: 22 additions & 46 deletions
@@ -1128,7 +1128,7 @@ gptoss-fp4-mi325x-vllm:
 
 gptoss-fp4-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.22.0
-  model: openai/gpt-oss-120b
+  model: amd/gpt-oss-120b-w-mxfp4-a-fp8
   model-prefix: gptoss
   runner: mi355x
   precision: fp4
 
@@ -1756,7 +1756,7 @@ dsv4-fp4-b200-sglang:
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }
 
 dsv4-fp4-b200-vllm:
-  image: vllm/vllm-openai:nightly-3f0a91bb96f8d72e0498b95c166e817deae14d62
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b200-dsv4
@@ -8781,75 +8781,90 @@ dsv4-fp4-gb300-dynamo-sglang:
           tp: 16
           ep: 16
           dp-attn: true
-      # WideEP TP=16 decode: 4p1d-dep4-dep16. 8 nodes.
-      - conc-list: [1024]
+      # Low concurrency: 1p1d-tp4-tp4. 2 nodes.
+      - conc-list: [1]
         prefill:
-          num-worker: 4
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+      # --- Weiliang wide-EP sweep (srt-slurm PR#173), 18 nodes total (xP+yD = prefill nodes + decode nodes) ---
+      # EP=12: 15P+3D, conc=12000.
+      - conc-list: [12000]
+        prefill:
+          num-worker: 15
           tp: 4
           ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep16-8-c1024.yaml"
+          - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-15p1d-dep4-dep12-18-c12000.yaml"
         decode:
           num-worker: 1
-          tp: 16
-          ep: 16
+          tp: 12
+          ep: 12
           dp-attn: true
-      # WideEP TP=16 decode: 8p1d-dep4-dep16. 12 nodes.
-      - conc-list: [4096]
+      # EP=16: 14P+4D, conc=8192.
+      - conc-list: [8192]
         prefill:
-          num-worker: 8
+          num-worker: 14
           tp: 4
           ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep16-12-c4096.yaml"
+          - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-14p1d-dep4-dep16-18-c8192.yaml"
         decode:
           num-worker: 1
           tp: 16
           ep: 16
           dp-attn: true
-      # Low concurrency: 1p1d-tp4-tp4. 2 nodes.
-      - conc-list: [1]
+      # EP=24: 12P+6D, conc=3000.
+      - conc-list: [3000]
         prefill:
-          num-worker: 1
+          num-worker: 12
           tp: 4
-          ep: 1
-          dp-attn: false
+          ep: 4
+          dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-1p1d-tp4-tp4-2-c1.yaml"
+          - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep24-18-c3000.yaml"
         decode:
           num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-      # Mid concurrency: 10p1d-dep4-dep16. 14 nodes.
-      - conc-list: [8192]
+          tp: 24
+          ep: 24
+          dp-attn: true
+      # EP=32: 10P+8D, conc=2500.
+      - conc-list: [2500]
         prefill:
           num-worker: 10
           tp: 4
           ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep16-14-c8192.yaml"
+          - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-10p1d-dep4-dep32-18-c2500.yaml"
         decode:
           num-worker: 1
-          tp: 16
-          ep: 16
+          tp: 32
+          ep: 32
           dp-attn: true
-      # Max concurrency: 12p1d-dep4-dep12. 15 nodes.
-      - conc-list: [21504]
+      # EP=40: 8P+10D, conc=2048.
+      - conc-list: [2048]
         prefill:
-          num-worker: 12
+          num-worker: 8
           tp: 4
           ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-12p1d-dep4-dep12-15-c21504.yaml"
+          - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-8p1d-dep4-dep40-18-c2048.yaml"
         decode:
           num-worker: 1
-          tp: 12
-          ep: 12
+          tp: 40
+          ep: 40
           dp-attn: true
 
 glm5-fp8-b200-dynamo-sglang:
 
@@ -1,35 +1,8 @@
-name: "disagg-gb300-4p1d-dep4-dep16-8-c1024"
-
-# 8k/1k high-throughput topology for the wideep DSV4-Pro setup.
-#
-# Schema/values come from PR #1213 (513cbef) — that PR introduced the
-# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override`
-# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our
-# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't
-# support either: `zip_override_*_hightpt` rejects with `Unknown field`
-# and `benchmark` only validates at top level. So this file inlines the
-# wideep [0] override and lifts `benchmark` back out — same operational
-# values, schema the pinned srtctl will accept.
-#
-# Other adjustments back to the InferenceX cluster shape: container &
-# model.path restored to the aliases mapped in launch_gb300.sh's
-# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and
-# `deepseek-v4-pro`); `dynamo.install: true` added so the container
-# (which has no dynamo baked in) installs from the pinned hash.
-#
-# Cluster-specific items NOT inlined (require InferenceX-side equivalents):
-#   - slurm.partition (yangminl's gb300-cw uses `hpc-mid`)
-#   - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path)
-#   - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis
-#     showed only `expert_location_dispatch.py` topk_ids int32 cast is an
-#     active runtime diff vs container sglang; other patched files are
-#     env-gated dead code under the same SGLANG_OPT_* flags this yaml
-#     already sets.
-#
-# DG-related env intentionally diverged (DG cache path is host-specific):
-#   - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host)
-#   - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache)
-#   This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead.
+name: "disagg-gb300-10p1d-dep4-dep32-18-c2500"
+
+# Weiliang wide-EP sweep point: EP=32, 10P+8D = 18 nodes, conc=2500.
+# Matches srt-slurm PR#173 zip_override EP=32 topology.
+# Env vars and sglang_config from InferenceX main (not Weiliang's 0510 image).
 
 model:
   path: "deepseek-v4-pro"
@@ -50,18 +23,19 @@ sbatch_directives:
 resources:
   gpu_type: "gb300"
   gpus_per_node: 4
-  prefill_nodes: 4
-  prefill_workers: 4
+  prefill_nodes: 10
+  prefill_workers: 10
   gpus_per_prefill: 4
-  decode_nodes: 4
+  decode_nodes: 8
   decode_workers: 1
-  gpus_per_decode: 16
+  gpus_per_decode: 32
 
 frontend:
   type: dynamo
-  enable_multiple_frontends: false
+  enable_multiple_frontends: true
+  num_additional_frontends: 8
   env:
-    DYN_ROUTER_LOAD_BLOCK_SIZE: "1" 
+    DYN_ROUTER_LOAD_BLOCK_SIZE: "1"
   args:
     router-mode: "kv"
     router-kv-overlap-score-weight: 0
@@ -120,7 +94,6 @@ backend:
     SGLANG_LOG_FORWARD_ITERS: "1"
     SGLANG_LOG_MS: "1"
     SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
-    # is single-node only and corrupts results in 2-node decode setups.
 
   sglang_config:
     prefill:
@@ -141,9 +114,11 @@ backend:
 
       disaggregation-mode: "prefill"
       disaggregation-transfer-backend: mooncake
+      enable-dp-lm-head: true
 
       mem-fraction-static: 0.90
       max-running-requests: 512
+      cuda-graph-max-bs: 512
       chunked-prefill-size: 32768
 
     decode:
@@ -153,30 +128,31 @@ backend:
       skip-tokenizer-init: true
       stream-interval: 60
 
-      load-balance-method: "total_requests"
       moe-a2a-backend: "megamoe"
 
+      moe-dense-tp-size: 1
+
       disaggregation-mode: "decode"
       disaggregation-transfer-backend: mooncake
       disaggregation-decode-polling-interval: 8
 
       mem-fraction-static: 0.94
-      swa-full-tokens-ratio: 0.056
+      swa-full-tokens-ratio: 0.20
       context-length: 9216
-      tensor-parallel-size: 16
-      data-parallel-size: 16
-      expert-parallel-size: 16
+      tensor-parallel-size: 32
+      data-parallel-size: 32
+      expert-parallel-size: 32
       enable-dp-attention: true
       enable-dp-lm-head: true
-      max-running-requests: 21504
+      max-running-requests: 18432
       cuda-graph-max-bs: 1280
 
 
 benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "1024"
+  concurrencies: "2500"
   req_rate: "inf"
   use_chat_template: false
   custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"