SemiAnalysisAI
diff --git a/‎.github/configs/nvidia-master.yaml‎
Lines changed: 196 additions & 0 deletions b/‎.github/configs/nvidia-master.yaml‎
Lines changed: 196 additions & 0 deletions
diff --git a/‎benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml‎
Lines changed: 175 additions & 0 deletions b/‎benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml‎
Lines changed: 175 additions & 0 deletions
@@ -9234,3 +9234,199 @@ qwen3.5-fp8-h100-sglang-mtp:
       osl: 1024
       search-space:
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
+
+# GLM-5 NVFP4 on the gb300-nv runner via dynamo-sglang, run multinode with
+# disaggregated prefill and decode workers. Each search-space entry pairs a
+# conc-list with one prefill/decode topology; the prefill additional-settings
+# carry a CONFIG_FILE entry naming the recipe yaml for that topology.
+glm5-fp4-gb300-dynamo-sglang:
+  image: lmsysorg/sglang:v0.5.11-cu130
+  model: nvidia/GLM-5-NVFP4
+  model-prefix: glm5
+  runner: gb300-nv
+  precision: fp4
+  framework: dynamo-sglang
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    # ---------- 8k1k high-throughput (wide-EP TP=32 decode) ----------
+    # Decode is a single TP=32/EP=32 worker in every entry; the prefill worker
+    # count grows with the target concurrency.
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 5p1d wide-EP. 13 nodes (5P @ TP=4 + 1D @ TP=32 on 8 nodes).
+      - conc-list: [2048]
+        prefill:
+          num-worker: 5
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml"
+        decode:
+          num-worker: 1
+          tp: 32
+          ep: 32
+          dp-attn: true
+      # 7p1d wide-EP. 15 nodes.
+      - conc-list: [3072]
+        prefill:
+          num-worker: 7
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml"
+        decode:
+          num-worker: 1
+          tp: 32
+          ep: 32
+          dp-attn: true
+      # 10p1d wide-EP. 18 nodes.
+      - conc-list: [4096]
+        prefill:
+          num-worker: 10
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml"
+        decode:
+          num-worker: 1
+          tp: 32
+          ep: 32
+          dp-attn: true
+    # ---------- 8k1k low-latency (per-node TP=4 decode workers) ----------
+    # All four topologies run at conc 1024; only the decode worker count
+    # changes. Decode workers here use dp-attn: false.
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 1p3d. 4 nodes (1P + 3 D workers @ 1 node each).
+      - conc-list: [1024]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml"
+        decode:
+          num-worker: 3
+          tp: 4
+          ep: 1
+          dp-attn: false
+      # 1p5d. 6 nodes.
+      - conc-list: [1024]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml"
+        decode:
+          num-worker: 5
+          tp: 4
+          ep: 1
+          dp-attn: false
+      # 1p9d. 10 nodes.
+      - conc-list: [1024]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml"
+        decode:
+          num-worker: 9
+          tp: 4
+          ep: 1
+          dp-attn: false
+      # 1p15d. 16 nodes.
+      - conc-list: [1024]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_3.yaml"
+        decode:
+          num-worker: 15
+          tp: 4
+          ep: 1
+          dp-attn: false
+    # ---------- 1k1k high-throughput (wide-EP TP=32 decode) ----------
+    # Decode stays at one TP=32/EP=32 worker; fewer prefill workers are used
+    # as the target concurrency drops.
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 3p1d wide-EP. 11 nodes. conc 16500.
+      - conc-list: [16500]
+        prefill:
+          num-worker: 3
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_0.yaml"
+        decode:
+          num-worker: 1
+          tp: 32
+          ep: 32
+          dp-attn: true
+      # 2p1d wide-EP. 10 nodes. conc 8300.
+      - conc-list: [8300]
+        prefill:
+          num-worker: 2
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_1.yaml"
+        decode:
+          num-worker: 1
+          tp: 32
+          ep: 32
+          dp-attn: true
+      # 1p1d wide-EP. 9 nodes. conc sweep 2500x1024x512x256.
+      - conc-list: [2500, 1024, 512, 256]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_2.yaml"
+        decode:
+          num-worker: 1
+          tp: 32
+          ep: 32
+          dp-attn: true
+    # ---------- 1k1k low-latency (per-node TP=4 decode workers) ----------
+    # Both entries use the same 1p17d topology and differ only in conc-list
+    # and recipe file.
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # 1p17d low-latency, bs=32 sweep. 18 nodes.
+      - conc-list: [512, 256, 128, 64]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml"
+        decode:
+          num-worker: 17
+          tp: 4
+          ep: 1
+          dp-attn: false
+      # 1p17d low-latency, bs=1 (single-stream). 18 nodes.
+      # NOTE(review): conc-list is [32] across 17 decode workers, which is more
+      # than one stream per worker; confirm against 1k1k_stp_lowlat_1.yaml.
+      - conc-list: [32]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml"
+        decode:
+          num-worker: 17
+          tp: 4
+          ep: 1
+          dp-attn: false
@@ -0,0 +1,175 @@
+name: "gb300-fp4-glm5_1k1k_lowlat_0"
+
+# Ported from upstream srt-slurm recipes/gb300-fp4/glm5.yaml (PR #152).
+# Upstream uses a single combined file with `zip_override_*` arrays
+# expanded by srtctl across zip indices. We split into one flat yaml
+# per concrete topology to match the InferenceX dsv4 sglang convention
+# (see ../deepseek-v4/8k1k/*.yaml). All shared base envs and the
+# prefill sglang_config are inlined here verbatim from the upstream
+# `base:` block; the decode block is the upstream base plus the
+# topology-specific override from this zip index.
+
+# Model path, container image and precision used by this recipe.
+model:
+  path: "glm-5-fp4"
+  container: "lmsysorg/sglang:v0.5.11-cu130"
+  precision: "fp4"
+
+# Released dynamo wheel — upstream recipe uses dynamo.version: "1.1.0".
+# launch_gb300-cw.sh stages /configs/dynamo-wheels for `hash:` source
+# builds (dsv4 path); the version path uses a released wheel and does
+# not depend on that cache.
+dynamo:
+  version: "1.1.0"
+
+# Quoted so the HH:MM:SS value stays a string rather than being typed
+# by the YAML parser.
+slurm:
+  time_limit: "03:00:00"
+
+# Mirror dsv4 sglang recipes: cpus-per-task=144 avoids the 1-CPU
+# default that turns dynamo install + sglang weight load into a serial
+# crawl; mem=0 grants whole-node memory.
+sbatch_directives:
+  cpus-per-task: "144"
+  mem: "0"
+
+# 1p17d topology: 1 prefill node plus 17 decode nodes (18 nodes total),
+# with 4 GPUs and exactly one worker per node.
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_nodes: 17
+  decode_workers: 17
+  gpus_per_decode: 4
+
+# dynamo frontend paired with the sglang backend.
+frontend:
+  type: dynamo
+
+backend:
+  type: sglang
+
+  # Environment variables for the prefill side. Every value is quoted so it
+  # stays a string ("1", "true", "True") instead of becoming a YAML int/bool.
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    # The three disaggregation limits below share the same value, 100000.
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    MC_TE_METRIC: "true"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+
+  # Environment variables for the decode side: the same 13 variables and
+  # values as prefill_environment, plus two decode-only variables at the end.
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    MC_TE_METRIC: "true"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    # Decode-only additions (not present in prefill_environment).
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512"
+    SGLANG_MOE_NVFP4_DISPATCH: "1"
+  # sglang flags, given separately for the prefill and decode sides.
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "GLM-5-FP4"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: "nixl"
+
+      # Size limits
+      # NOTE(review): context-length 9600 is well above this recipe's
+      # isl + osl (1024 + 1024); presumably carried over from the shared
+      # upstream base. Confirm it is intended for the 1k1k case.
+      max-running-requests: 256
+      cuda-graph-max-bs: 256
+      mem-fraction-static: 0.7
+      context-length: 9600
+      chunked-prefill-size: 32768
+      max-prefill-tokens: 8192
+
+      # Parallelism
+      # dp-attention is enabled with data-parallel-size equal to
+      # tensor-parallel-size (4); expert parallelism is off (size 1).
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 1
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      load-balance-method: "total_tokens"
+
+      # Backend
+      nsa-decode-backend: "trtllm"
+      nsa-prefill-backend: "trtllm"
+      moe-runner-backend: "flashinfer_trtllm"
+      fp4-gemm-backend: "flashinfer_cutlass"
+
+      # Other flags
+      # disable-shared-experts-fusion: true
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      # Single-quoted so the embedded JSON keeps its double quotes verbatim.
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+    decode:
+      # Decode-side sglang flags: the shared base settings first, then the
+      # low-latency topology override. Each key appears exactly once.
+
+      # Model configuration
+      served-model-name: "GLM-5-FP4"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: "nixl"
+
+      # Memory and token limits
+      mem-fraction-static: 0.8
+      context-length: 9600
+
+      # Backend
+      # moe-runner-backend is intentionally not set here: the low-latency
+      # override below sets it to "flashinfer_trtllm". Repeating the base
+      # value ("flashinfer_cutedsl") would create a duplicate mapping key,
+      # which last-wins parsers silently drop and strict parsers reject.
+      nsa-decode-backend: "trtllm"
+      nsa-prefill-backend: "trtllm"
+      fp4-gemm-backend: "flashinfer_cutlass"
+
+      # Detokenizer
+      skip-tokenizer-init: true
+      stream-interval: 30
+
+      # Other flags
+      # disable-shared-experts-fusion: true
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      # Single-quoted so the embedded JSON keeps its double quotes verbatim.
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+      # Parallelism (override from upstream zip_override_*_lowlat)
+      tensor-parallel-size: 4
+      expert-parallel-size: 1
+      data-parallel-size: 1
+      enable-flashinfer-allreduce-fusion: true
+      moe-runner-backend: "flashinfer_trtllm"
+      # Per-worker batch cap for the low-latency sweep.
+      max-running-requests: 32
+      cuda-graph-max-bs: 32
+
+
+
+# Up to 360 attempts at 10-second intervals, i.e. about one hour in total.
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+# sa-bench run at isl/osl 1024/1024. The "x"-separated concurrencies match
+# conc-list [512, 256, 128, 64] of the nvidia-master.yaml entry that points
+# at 1k1k_stp_lowlat_0.yaml.
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "512x256x128x64"
+  req_rate: "inf"