glm5-fp4-gb300-dynamo-sglang: extend 8k1k low-lat sweep with 1p17d topology (#1583)

Ankur-singh · functionstackx · web-flow · commit e0cd8f7ddb5b · 2026-05-29T19:10:24.000-04:00
* glm5-fp4-gb300-dynamo-sglang: extend 8k1k low-lat sweep with 1p17d topology Mirrors NVIDIA/srt-slurm#175: adds a 5th 8k1k_stp_lowlat_4 recipe with decode_nodes/workers=17, and lowers per-zip-index decode max-running-requests / cuda-graph-max-bs from a flat 4096 to 128/64/32/16/1 across lowlat_0..4. Benchmark concurrencies follow suit: 128/64/32/16/12. nvidia-master.yaml conc-list updated to match for each of the five 1p{3,5,9,15,17}d entries. * perf-changelog: set PR link to #1583 --------- Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -9296,7 +9296,7 @@ glm5-fp4-gb300-dynamo-sglang:
       osl: 1024
       search-space:
       # 1p3d. 4 nodes (1P + 3 D workers @ 1 node each).
-      - conc-list: [1024]
+      - conc-list: [128]
         prefill:
           num-worker: 1
           tp: 4
@@ -9310,7 +9310,7 @@ glm5-fp4-gb300-dynamo-sglang:
           ep: 1
           dp-attn: false
       # 1p5d. 6 nodes.
-      - conc-list: [1024]
+      - conc-list: [64]
         prefill:
           num-worker: 1
           tp: 4
@@ -9324,7 +9324,7 @@ glm5-fp4-gb300-dynamo-sglang:
           ep: 1
           dp-attn: false
       # 1p9d. 10 nodes.
-      - conc-list: [1024]
+      - conc-list: [32]
         prefill:
           num-worker: 1
           tp: 4
@@ -9338,7 +9338,7 @@ glm5-fp4-gb300-dynamo-sglang:
           ep: 1
           dp-attn: false
       # 1p15d. 16 nodes.
-      - conc-list: [1024]
+      - conc-list: [16]
         prefill:
           num-worker: 1
           tp: 4
@@ -9351,6 +9351,20 @@ glm5-fp4-gb300-dynamo-sglang:
           tp: 4
           ep: 1
           dp-attn: false
+      # 1p17d. 18 nodes.
+      - conc-list: [12]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_4.yaml"
+        decode:
+          num-worker: 17
+          tp: 4
+          ep: 1
+          dp-attn: false
     # ---------- 1k1k high-throughput (wide-EP TP=32 decode) ----------
     - isl: 1024
       osl: 1024
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml
@@ -158,8 +158,8 @@ backend:
       enable-flashinfer-allreduce-fusion: true
 
       moe-runner-backend: "flashinfer_trtllm"
-      max-running-requests: 4096
-      cuda-graph-max-bs:    4096
+      max-running-requests: 128
+      cuda-graph-max-bs:    128
 
 
 
@@ -171,5 +171,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "1024"
+  concurrencies: "128"
   req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml
@@ -158,8 +158,8 @@ backend:
       enable-flashinfer-allreduce-fusion: true
 
       moe-runner-backend: "flashinfer_trtllm"
-      max-running-requests: 4096
-      cuda-graph-max-bs:    4096
+      max-running-requests: 64
+      cuda-graph-max-bs:    64
 
 
 
@@ -171,5 +171,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "1024"
+  concurrencies: "64"
   req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml
@@ -158,8 +158,8 @@ backend:
       enable-flashinfer-allreduce-fusion: true
 
       moe-runner-backend: "flashinfer_trtllm"
-      max-running-requests: 4096
-      cuda-graph-max-bs:    4096
+      max-running-requests: 32
+      cuda-graph-max-bs:    32
 
 
 
@@ -171,5 +171,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "1024"
+  concurrencies: "32"
   req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_3.yaml
@@ -158,8 +158,8 @@ backend:
       enable-flashinfer-allreduce-fusion: true
 
       moe-runner-backend: "flashinfer_trtllm"
-      max-running-requests: 4096
-      cuda-graph-max-bs:    4096
+      max-running-requests: 16
+      cuda-graph-max-bs:    16
 
 
 
@@ -171,5 +171,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "1024"
+  concurrencies: "16"
   req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_4.yaml
@@ -0,0 +1,175 @@
+name: "gb300-fp4-glm5_8k1k_lowlat_4"
+
+# Ported from upstream srt-slurm recipes/gb300-fp4/glm5.yaml (PR #152; lowlat_4 zip index per NVIDIA/srt-slurm#175).
+# Upstream uses a single combined file with `zip_override_*` arrays
+# expanded by srtctl across zip indices. We split into one flat yaml
+# per concrete topology to match the InferenceX dsv4 sglang convention
+# (see ../deepseek-v4/8k1k/*.yaml). All shared base envs and the
+# prefill sglang_config are inlined here verbatim from the upstream
+# `base:` block; the decode block is the upstream base plus the
+# topology-specific override from this zip index.
+
+model:
+  path: "glm-5-fp4"
+  container: "lmsysorg/sglang:v0.5.11-cu130"
+  precision: "fp4"
+
+# Released dynamo wheel — upstream recipe uses dynamo.version: "1.1.0".
+# launch_gb300-cw.sh stages /configs/dynamo-wheels for `hash:` source
+# builds (dsv4 path); the version path uses a released wheel and does
+# not depend on that cache.
+dynamo:
+  version: "1.1.0"
+
+slurm:
+  time_limit: "03:00:00"
+
+# Mirror dsv4 sglang recipes: cpus-per-task=144 avoids the 1-CPU
+# default that turns dynamo install + sglang weight load into a serial
+# crawl; mem=0 grants whole-node memory.
+sbatch_directives:
+  cpus-per-task: "144"
+  mem: "0"
+
+resources:  # 1p17d topology: 1 prefill node + 17 decode nodes = 18 nodes
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4  # equals gpus_per_node, so the prefill worker fills one node
+  decode_nodes: 17
+  decode_workers: 17  # equals decode_nodes: one decode worker per node
+  gpus_per_decode: 4  # equals gpus_per_node, so each decode worker fills one node
+
+frontend:
+  type: dynamo
+
+backend:
+  type: sglang
+
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    MC_TE_METRIC: "true"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    MC_TE_METRIC: "true"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512"
+    SGLANG_MOE_NVFP4_DISPATCH: "1"
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "GLM-5-FP4"
+      trust-remote-code: true
+      quantization: "modelopt_fp4"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Disaggregation mode
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: "nixl"
+
+      # Size limits
+      max-running-requests: 256
+      cuda-graph-max-bs: 256
+      mem-fraction-static: 0.7
+      context-length: 9600
+      chunked-prefill-size: 32768
+      max-prefill-tokens: 8192
+
+      # Parallelism
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 1
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      load-balance-method: "total_tokens"
+
+      # Backend
+      nsa-decode-backend: "trtllm"
+      nsa-prefill-backend: "trtllm"
+      moe-runner-backend: "flashinfer_trtllm"
+      fp4-gemm-backend: "flashinfer_cutlass"
+
+      # Other flags
+      # disable-shared-experts-fusion: true
+      enable-flashinfer-allreduce-fusion: true
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+
+    decode:  # decode-worker flags: shared base values first, topology overrides last
+      # Model configuration
+      served-model-name: "GLM-5-FP4"
+      trust-remote-code: true
+
+      quantization: "modelopt_fp4"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Disaggregation mode
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: "nixl"
+
+      # Memory and token limits
+      mem-fraction-static: 0.8
+      context-length: 9600
+
+      # Backend
+      nsa-decode-backend: "trtllm"
+      nsa-prefill-backend: "trtllm"
+      # moe-runner-backend: "flashinfer_cutedsl"  (base value; superseded by the override below -- kept as a comment to avoid a duplicate key)
+      fp4-gemm-backend: "flashinfer_cutlass"
+
+      # Detokenizer
+      skip-tokenizer-init: true
+      stream-interval: 30
+
+      # Other flags
+      # disable-shared-experts-fusion: true
+      disable-radix-cache: true
+      weight-loader-prefetch-checkpoints: true
+      model-loader-extra-config: '{"enable_multithread_load": true}'
+      # Parallelism and per-topology overrides (from upstream zip_override_*_lowlat)
+      tensor-parallel-size: 4
+      expert-parallel-size: 1
+      data-parallel-size:   1
+      enable-flashinfer-allreduce-fusion: true
+
+      moe-runner-backend: "flashinfer_trtllm"  # sole definition of this key in the decode map
+      max-running-requests: 1
+      cuda-graph-max-bs:    1
+
+
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "12"
+  req_rate: "inf"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3214,3 +3214,9 @@
   description:
     - "Update vLLM image tag to v0.22.0"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1384
+
+- config-keys:
+    - glm5-fp4-gb300-dynamo-sglang
+  description:
+    - "Update GB300 FP4 GLM-5 8k1k low-latency sweep to mirror NVIDIA/srt-slurm#175: add a 5th 1p17d topology (decode_nodes/workers=17), and lower decode max-running-requests / cuda-graph-max-bs / benchmark concurrency per-zip-index from a flat 4096/1024 to 128/64/32/16/1 (mrr & cuda-graph) and 128/64/32/16/12 (concurrency)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1583