Skip to content

Commit 3c1eaed

Browse files
Add GLM-5 FP4 GB300 dynamo-sglang disagg config (#1514)
* Add GLM-5 FP4 GB300 dynamo-sglang disagg config

  Ports upstream srt-slurm PR #152 (recipes/gb300-fp4/glm5.yaml) into InferenceX as 12 flat per-topology recipes under benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp4/ covering the 1k1k/8k1k maxtpt and lowlat sweeps.

  Wires glm5-fp4-gb300-dynamo-sglang through runners/launch_gb300-nv.sh: adds the glm5/fp4 model branch (SRT_SLURM_MODEL_PREFIX="glm-5-fp4", MODEL_PATH=/scratch/models/GLM-5-NVFP4), overlays the recipes onto the sa-submission-q2-2026 srt-slurm checkout, and exposes the recipe `container:` value via the srtslurm.yaml containers map.

* perf-changelog: set PR link to #1514

* update runner

---------

Co-authored-by: functionstackx <47992694+functionstackx@users.noreply.github.com>
1 parent 6e01f1e commit 3c1eaed

15 files changed

Lines changed: 2392 additions & 1 deletion

.github/configs/nvidia-master.yaml

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9234,3 +9234,199 @@ qwen3.5-fp8-h100-sglang-mtp:
92349234
osl: 1024
92359235
search-space:
92369236
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
9237+
9238+
glm5-fp4-gb300-dynamo-sglang:
9239+
image: lmsysorg/sglang:v0.5.11-cu130
9240+
model: nvidia/GLM-5-NVFP4
9241+
model-prefix: glm5
9242+
runner: gb300-nv
9243+
precision: fp4
9244+
framework: dynamo-sglang
9245+
multinode: true
9246+
disagg: true
9247+
scenarios:
9248+
fixed-seq-len:
9249+
# ---------- 8k1k high-throughput (wide-EP TP=32 decode) ----------
9250+
- isl: 8192
9251+
osl: 1024
9252+
search-space:
9253+
# 5p1d wide-EP. 13 nodes (5P @ TP=4 + 1D @ TP=32 on 8 nodes).
9254+
- conc-list: [2048]
9255+
prefill:
9256+
num-worker: 5
9257+
tp: 4
9258+
ep: 1
9259+
dp-attn: true
9260+
additional-settings:
9261+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_0.yaml"
9262+
decode:
9263+
num-worker: 1
9264+
tp: 32
9265+
ep: 32
9266+
dp-attn: true
9267+
# 7p1d wide-EP. 15 nodes.
9268+
- conc-list: [3072]
9269+
prefill:
9270+
num-worker: 7
9271+
tp: 4
9272+
ep: 1
9273+
dp-attn: true
9274+
additional-settings:
9275+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_1.yaml"
9276+
decode:
9277+
num-worker: 1
9278+
tp: 32
9279+
ep: 32
9280+
dp-attn: true
9281+
# 10p1d wide-EP. 18 nodes.
9282+
- conc-list: [4096]
9283+
prefill:
9284+
num-worker: 10
9285+
tp: 4
9286+
ep: 1
9287+
dp-attn: true
9288+
additional-settings:
9289+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_maxtpt_2.yaml"
9290+
decode:
9291+
num-worker: 1
9292+
tp: 32
9293+
ep: 32
9294+
dp-attn: true
9295+
# ---------- 8k1k low-latency (per-node TP=4 decode workers) ----------
9296+
- isl: 8192
9297+
osl: 1024
9298+
search-space:
9299+
# 1p3d. 4 nodes (1P + 3 D workers @ 1 node each).
9300+
- conc-list: [1024]
9301+
prefill:
9302+
num-worker: 1
9303+
tp: 4
9304+
ep: 1
9305+
dp-attn: true
9306+
additional-settings:
9307+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml"
9308+
decode:
9309+
num-worker: 3
9310+
tp: 4
9311+
ep: 1
9312+
dp-attn: false
9313+
# 1p5d. 6 nodes.
9314+
- conc-list: [1024]
9315+
prefill:
9316+
num-worker: 1
9317+
tp: 4
9318+
ep: 1
9319+
dp-attn: true
9320+
additional-settings:
9321+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml"
9322+
decode:
9323+
num-worker: 5
9324+
tp: 4
9325+
ep: 1
9326+
dp-attn: false
9327+
# 1p9d. 10 nodes.
9328+
- conc-list: [1024]
9329+
prefill:
9330+
num-worker: 1
9331+
tp: 4
9332+
ep: 1
9333+
dp-attn: true
9334+
additional-settings:
9335+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml"
9336+
decode:
9337+
num-worker: 9
9338+
tp: 4
9339+
ep: 1
9340+
dp-attn: false
9341+
# 1p15d. 16 nodes.
9342+
- conc-list: [1024]
9343+
prefill:
9344+
num-worker: 1
9345+
tp: 4
9346+
ep: 1
9347+
dp-attn: true
9348+
additional-settings:
9349+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/8k1k/disagg/stp/8k1k_stp_lowlat_3.yaml"
9350+
decode:
9351+
num-worker: 15
9352+
tp: 4
9353+
ep: 1
9354+
dp-attn: false
9355+
# ---------- 1k1k high-throughput (wide-EP TP=32 decode) ----------
9356+
- isl: 1024
9357+
osl: 1024
9358+
search-space:
9359+
# 3p1d wide-EP. 11 nodes. conc 16500.
9360+
- conc-list: [16500]
9361+
prefill:
9362+
num-worker: 3
9363+
tp: 4
9364+
ep: 1
9365+
dp-attn: true
9366+
additional-settings:
9367+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_0.yaml"
9368+
decode:
9369+
num-worker: 1
9370+
tp: 32
9371+
ep: 32
9372+
dp-attn: true
9373+
# 2p1d wide-EP. 10 nodes. conc 8300.
9374+
- conc-list: [8300]
9375+
prefill:
9376+
num-worker: 2
9377+
tp: 4
9378+
ep: 1
9379+
dp-attn: true
9380+
additional-settings:
9381+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_1.yaml"
9382+
decode:
9383+
num-worker: 1
9384+
tp: 32
9385+
ep: 32
9386+
dp-attn: true
9387+
# 1p1d wide-EP. 9 nodes. conc sweep 2500x1024x512x256.
9388+
- conc-list: [2500, 1024, 512, 256]
9389+
prefill:
9390+
num-worker: 1
9391+
tp: 4
9392+
ep: 1
9393+
dp-attn: true
9394+
additional-settings:
9395+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_maxtpt_2.yaml"
9396+
decode:
9397+
num-worker: 1
9398+
tp: 32
9399+
ep: 32
9400+
dp-attn: true
9401+
# ---------- 1k1k low-latency (per-node TP=4 decode workers) ----------
9402+
- isl: 1024
9403+
osl: 1024
9404+
search-space:
9405+
# 1p17d low-latency, bs=32 sweep. 18 nodes.
9406+
- conc-list: [512, 256, 128, 64]
9407+
prefill:
9408+
num-worker: 1
9409+
tp: 4
9410+
ep: 1
9411+
dp-attn: true
9412+
additional-settings:
9413+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml"
9414+
decode:
9415+
num-worker: 17
9416+
tp: 4
9417+
ep: 1
9418+
dp-attn: false
9419+
# 1p17d low-latency, bs=1 (single-stream). 18 nodes.
9420+
- conc-list: [32]
9421+
prefill:
9422+
num-worker: 1
9423+
tp: 4
9424+
ep: 1
9425+
dp-attn: true
9426+
additional-settings:
9427+
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp4/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml"
9428+
decode:
9429+
num-worker: 17
9430+
tp: 4
9431+
ep: 1
9432+
dp-attn: false
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
name: "gb300-fp4-glm5_1k1k_lowlat_0"
2+
3+
# Ported from upstream srt-slurm recipes/gb300-fp4/glm5.yaml (PR #152).
4+
# Upstream uses a single combined file with `zip_override_*` arrays
5+
# expanded by srtctl across zip indices. We split into one flat yaml
6+
# per concrete topology to match the InferenceX dsv4 sglang convention
7+
# (see ../deepseek-v4/8k1k/*.yaml). All shared base envs and the
8+
# prefill sglang_config are inlined here verbatim from the upstream
9+
# `base:` block; the decode block is the upstream base plus the
10+
# topology-specific override from this zip index.
11+
12+
model:
13+
path: "glm-5-fp4"
14+
container: "lmsysorg/sglang:v0.5.11-cu130"
15+
precision: "fp4"
16+
17+
# Released dynamo wheel — upstream recipe uses dynamo.version: "1.1.0".
18+
# launch_gb300-cw.sh stages /configs/dynamo-wheels for `hash:` source
19+
# builds (dsv4 path); the version path uses a released wheel and does
20+
# not depend on that cache.
21+
dynamo:
22+
version: "1.1.0"
23+
24+
slurm:
25+
time_limit: "03:00:00"
26+
27+
# Mirror dsv4 sglang recipes: cpus-per-task=144 avoids the 1-CPU
28+
# default that turns dynamo install + sglang weight load into a serial
29+
# crawl; mem=0 grants whole-node memory.
30+
sbatch_directives:
31+
cpus-per-task: "144"
32+
mem: "0"
33+
34+
resources:
35+
gpu_type: "gb300"
36+
gpus_per_node: 4
37+
prefill_nodes: 1
38+
prefill_workers: 1
39+
gpus_per_prefill: 4
40+
decode_nodes: 17
41+
decode_workers: 17
42+
gpus_per_decode: 4
43+
44+
frontend:
45+
type: dynamo
46+
47+
backend:
48+
type: sglang
49+
50+
prefill_environment:
51+
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
52+
PYTHONUNBUFFERED: "1"
53+
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
54+
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
55+
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
56+
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
57+
MC_TE_METRIC: "true"
58+
MC_FORCE_MNNVL: "1"
59+
NCCL_MNNVL_ENABLE: "1"
60+
NCCL_CUMEM_ENABLE: "1"
61+
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
62+
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
63+
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
64+
65+
decode_environment:
66+
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
67+
PYTHONUNBUFFERED: "1"
68+
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
69+
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
70+
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
71+
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
72+
MC_TE_METRIC: "true"
73+
MC_FORCE_MNNVL: "1"
74+
NCCL_MNNVL_ENABLE: "1"
75+
NCCL_CUMEM_ENABLE: "1"
76+
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
77+
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
78+
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
79+
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512"
80+
SGLANG_MOE_NVFP4_DISPATCH: "1"
81+
82+
sglang_config:
83+
prefill:
84+
# Model configuration
85+
served-model-name: "GLM-5-FP4"
86+
trust-remote-code: true
87+
quantization: "modelopt_fp4"
88+
kv-cache-dtype: "fp8_e4m3"
89+
90+
# Disaggregation mode
91+
disaggregation-mode: "prefill"
92+
disaggregation-transfer-backend: "nixl"
93+
94+
# Size limits
95+
max-running-requests: 256
96+
cuda-graph-max-bs: 256
97+
mem-fraction-static: 0.7
98+
context-length: 9600
99+
chunked-prefill-size: 32768
100+
max-prefill-tokens: 8192
101+
102+
# Parallelism
103+
tensor-parallel-size: 4
104+
data-parallel-size: 4
105+
expert-parallel-size: 1
106+
enable-dp-attention: true
107+
enable-dp-lm-head: true
108+
load-balance-method: "total_tokens"
109+
110+
# Backend
111+
nsa-decode-backend: "trtllm"
112+
nsa-prefill-backend: "trtllm"
113+
moe-runner-backend: "flashinfer_trtllm"
114+
fp4-gemm-backend: "flashinfer_cutlass"
115+
116+
# Other flags
117+
# disable-shared-experts-fusion: true
118+
enable-flashinfer-allreduce-fusion: true
119+
disable-radix-cache: true
120+
weight-loader-prefetch-checkpoints: true
121+
model-loader-extra-config: '{"enable_multithread_load": true}'
122+
123+
decode:
124+
# Model configuration
125+
served-model-name: "GLM-5-FP4"
126+
trust-remote-code: true
127+
128+
quantization: "modelopt_fp4"
129+
kv-cache-dtype: "fp8_e4m3"
130+
131+
# Disaggregation mode
132+
disaggregation-mode: "decode"
133+
disaggregation-transfer-backend: "nixl"
134+
135+
# Memory and token limits
136+
mem-fraction-static: 0.8
137+
context-length: 9600
138+
139+
# Backend
140+
nsa-decode-backend: "trtllm"
141+
nsa-prefill-backend: "trtllm"
142+
moe-runner-backend: "flashinfer_cutedsl"
143+
fp4-gemm-backend: "flashinfer_cutlass"
144+
145+
# Detokenizer
146+
skip-tokenizer-init: true
147+
stream-interval: 30
148+
149+
# Other flags
150+
# disable-shared-experts-fusion: true
151+
disable-radix-cache: true
152+
weight-loader-prefetch-checkpoints: true
153+
model-loader-extra-config: '{"enable_multithread_load": true}'
154+
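# Topology-specific overrides from upstream zip_override_*_lowlat: parallelism, MoE runner backend and batch limits. NOTE(review): moe-runner-backend is set again below ("flashinfer_trtllm") after the "flashinfer_cutedsl" entry under "# Backend" above; duplicate keys only resolve to the later value on last-wins YAML loaders — confirm the loader, or drop the earlier entry.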
155+
tensor-parallel-size: 4
156+
expert-parallel-size: 1
157+
data-parallel-size: 1
158+
enable-flashinfer-allreduce-fusion: true
159+
160+
moe-runner-backend: "flashinfer_trtllm"
161+
max-running-requests: 32
162+
cuda-graph-max-bs: 32
163+
164+
165+
166+
health_check:
167+
max_attempts: 360
168+
interval_seconds: 10
169+
170+
benchmark:
171+
type: "sa-bench"
172+
isl: 1024
173+
osl: 1024
174+
concurrencies: "512x256x128x64"
175+
req_rate: "inf"

0 commit comments

Comments
 (0)