Skip to content

Commit 3481ea1

Browse files
committed
Add glm5-fp4-mi355x-sglang-disagg MI355X PD-disagg recipe
Mirror glm5-fp8-mi355x-sglang-disagg with amd/GLM-5-MXFP4 on the v0.5.12.post1 image; add models.yaml GLM-5-MXFP4 entry, launcher script, and GLM-5 runtime hooks for disaggregated serving.
1 parent c4e397d commit 3481ea1

6 files changed

Lines changed: 182 additions & 3 deletions

File tree

.github/configs/amd-master.yaml

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,59 @@ glm5-fp8-mi355x-sglang-disagg:
533533
- "DECODE_NODES=1"
534534
- "DECODE_MTP_SIZE=0"
535535

536+
glm5-fp4-mi355x-sglang-disagg:
537+
image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523
538+
model: amd/GLM-5-MXFP4
539+
model-prefix: glm5
540+
runner: mi355x-disagg
541+
precision: fp4
542+
framework: sglang-disagg
543+
multinode: true
544+
disagg: true
545+
scenarios:
546+
fixed-seq-len:
547+
- isl: 1024
548+
osl: 1024
549+
search-space:
550+
- spec-decoding: "none"
551+
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
552+
prefill:
553+
num-worker: 1
554+
tp: 8
555+
ep: 1
556+
dp-attn: false
557+
additional-settings:
558+
- "PREFILL_NODES=1"
559+
decode:
560+
num-worker: 1
561+
tp: 8
562+
ep: 1
563+
dp-attn: false
564+
additional-settings:
565+
- "DECODE_NODES=1"
566+
- "DECODE_MTP_SIZE=0"
567+
568+
- isl: 8192
569+
osl: 1024
570+
search-space:
571+
- spec-decoding: "none"
572+
conc-list: [ 8, 16, 32, 64, 128, 256, 512 ]
573+
prefill:
574+
num-worker: 1
575+
tp: 8
576+
ep: 1
577+
dp-attn: false
578+
additional-settings:
579+
- "PREFILL_NODES=1"
580+
decode:
581+
num-worker: 1
582+
tp: 8
583+
ep: 1
584+
dp-attn: false
585+
additional-settings:
586+
- "DECODE_NODES=1"
587+
- "DECODE_MTP_SIZE=0"
588+
536589
glm5-fp8-mi355x-atom:
537590
image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
538591
model: zai-org/GLM-5-FP8

benchmarks/multi_node/amd_utils/env.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600
5555
export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600
5656

5757
# GLM-5: uses NSA (not MLA), needs fused-decode-MLA disabled + fast loading
58-
if [[ "$MODEL_NAME" == "GLM-5-FP8" ]]; then
58+
if [[ "$MODEL_NAME" == "GLM-5-FP8" || "$MODEL_NAME" == "GLM-5-MXFP4" ]]; then
5959
export SGLANG_ROCM_FUSED_DECODE_MLA=0
6060
export ROCM_QUICK_REDUCE_QUANTIZATION=INT4
6161
export SAFETENSORS_FAST_GPU=1

benchmarks/multi_node/amd_utils/models.yaml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,37 @@ Qwen3.5-397B-A17B-FP8:
192192
chunked_prefill_size: 262144
193193
cuda_graph_bs_range: "1-128"
194194

195+
GLM-5-MXFP4:
196+
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --disaggregation-transfer-backend mori --tool-call-parser glm47 --reasoning-parser glm45 --model-loader-extra-config '{\\\"enable_multithread_load\\\": true, \\\"num_threads\\\": 8}' --nsa-prefill-backend tilelang --nsa-decode-backend tilelang"
197+
mtp_flags: ""
198+
dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
199+
prefill:
200+
mem_fraction_static: 0.8
201+
disable_radix_cache: true
202+
dp:
203+
max_running_requests: 24
204+
chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
205+
cuda_graph_bs: "1 2 3"
206+
no_dp:
207+
max_running_requests: 128
208+
chunked_prefill_size: 262144
209+
cuda_graph_bs_range: "1-128"
210+
decode:
211+
mem_fraction_static: 0.85
212+
prefill_round_robin_balance: true
213+
dp:
214+
max_running_requests: 4096
215+
chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
216+
cuda_graph_bs_range: "1-160"
217+
ep_only:
218+
max_running_requests: 256
219+
chunked_prefill_size: 262144
220+
cuda_graph_bs_range: "1-256"
221+
no_dp:
222+
max_running_requests: 128
223+
chunked_prefill_size: 262144
224+
cuda_graph_bs_range: "1-128"
225+
195226
GLM-5-FP8:
196227
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --load-balance-method round_robin --disaggregation-transfer-backend mori --tool-call-parser glm47 --reasoning-parser glm45 --model-loader-extra-config '{\\\"enable_multithread_load\\\": true, \\\"num_threads\\\": 8}'"
197228
mtp_flags: ""

benchmarks/multi_node/amd_utils/setup_deps.sh

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,11 +104,16 @@ print("[SETUP] Patched: gluon pa_mqa_logits 3D instr_shape for base variant")
104104
# Only install if GLM-5 is the active model (avoid overhead otherwise).
105105
# ---------------------------------------------------------------------------
106106
install_transformers_glm5() {
107-
if [[ "$MODEL_NAME" != "GLM-5-FP8" ]]; then
107+
if [[ "$MODEL_NAME" != "GLM-5-FP8" && "$MODEL_NAME" != "GLM-5-MXFP4" ]]; then
108108
return 0
109109
fi
110110

111-
if python3 -c "from transformers import AutoConfig; AutoConfig.from_pretrained('zai-org/GLM-5-FP8', trust_remote_code=True)" 2>/dev/null; then
111+
_glm5_config_probe="zai-org/GLM-5-FP8"
112+
if [[ "$MODEL_NAME" == "GLM-5-MXFP4" ]]; then
113+
_glm5_config_probe="amd/GLM-5-MXFP4"
114+
fi
115+
116+
if python3 -c "from transformers import AutoConfig; AutoConfig.from_pretrained('${_glm5_config_probe}', trust_remote_code=True)" 2>/dev/null; then
112117
echo "[SETUP] transformers already supports GLM-5 model type"
113118
return 0
114119
fi
glm5_fp4_mi355x_sglang-disagg.sh

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
#!/usr/bin/env bash
#
# Launcher for a prefill/decode-disaggregated SGLang benchmark run.
#
# Flow: load benchmark_lib.sh, hand the required variable names to
# check_env_vars, translate the EP / DP-attention settings into the
# true/false switches exported for the launch scripts, call ./submit.sh in
# benchmarks/multi_node/amd_utils, and print whatever submit.sh wrote to
# stdout (kept in JOB_ID).
#
# Environment passed to check_env_vars:
#   CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH
#   PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN
#   DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN
#   PREFILL_NODES DECODE_NODES RANDOM_RANGE_RATIO
# Also read: GITHUB_WORKSPACE, MODEL_NAME, SLURM_JOB_ID, SLURMD_NODENAME.
#
# Exit status: 1 if the helper library is missing, the amd_utils directory
# cannot be entered, or submit.sh fails.

benchmark_lib="$(dirname "$0")/../benchmark_lib.sh"
if [[ ! -r "$benchmark_lib" ]]; then
    # Without the library check_env_vars does not exist and the run would
    # continue with an unvalidated environment.
    echo "Cannot read $benchmark_lib" >&2
    exit 1
fi
source "$benchmark_lib"

check_env_vars \
    CONC_LIST \
    ISL \
    OSL \
    IMAGE \
    SPEC_DECODING \
    MODEL_PATH \
    PREFILL_NUM_WORKERS \
    PREFILL_TP \
    PREFILL_EP \
    PREFILL_DP_ATTN \
    DECODE_NUM_WORKERS \
    DECODE_TP \
    DECODE_EP \
    DECODE_DP_ATTN \
    PREFILL_NODES \
    DECODE_NODES \
    RANDOM_RANGE_RATIO

if [[ -n "${SLURM_JOB_ID:-}" ]]; then
    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

set -x

# Use upstreamed multi_node scripts (no external clone needed)
cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1

# Set up SGL launch script-specific environment variables
export TIME_LIMIT="08:00:00"
export MODEL_PATH="$MODEL_PATH"
# NOTE(review): MODEL_NAME is exported but is not in the check_env_vars list
# above, so an unset value is exported as an empty string. Confirm the caller
# always provides it.
export MODEL_NAME="$MODEL_NAME"
export CONTAINER_IMAGE="$IMAGE"

# Expert parallelism counts as enabled only when the EP size is not 1.
if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
    export PREFILL_ENABLE_EP=false
else
    export PREFILL_ENABLE_EP=true
fi

# Anything other than the literal string "true" disables DP attention.
if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
    export PREFILL_ENABLE_DP=true
else
    export PREFILL_ENABLE_DP=false
fi

if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
    export DECODE_ENABLE_EP=false
else
    export DECODE_ENABLE_EP=true
fi

if [[ "$DECODE_DP_ATTN" == "true" ]]; then
    export DECODE_ENABLE_DP=true
else
    export DECODE_ENABLE_DP=false
fi

# Submit the job for the requested ISL/OSL.
# submit.sh takes the concurrency list as one 'x'-delimited argument, so the
# spaces in CONC_LIST are replaced with 'x' (e.g. "8 16 32" -> "8x16x32").
# Every argument is quoted so that an empty or whitespace-containing value
# cannot shift the positional arguments submit.sh receives.
if ! JOB_ID=$(bash ./submit.sh \
    "$PREFILL_NODES" \
    "$PREFILL_NUM_WORKERS" \
    "$DECODE_NODES" \
    "$DECODE_NUM_WORKERS" \
    "$ISL" "$OSL" "${CONC_LIST// /x}" inf \
    "$PREFILL_ENABLE_EP" "$PREFILL_ENABLE_DP" \
    "$DECODE_ENABLE_EP" "$DECODE_ENABLE_DP" \
    "$PREFILL_TP" "$DECODE_TP" \
    "$RANDOM_RANGE_RATIO"); then
    echo "Failed to submit job" >&2
    exit 1
fi

echo "$JOB_ID"

perf-changelog.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2307,3 +2307,11 @@
23072307
- "Tune DSv4 FP4 MI355X SGLang runtime envs: enable aiter MHC pre/post, and enable triton swa prepare kernel."
23082308
- "Add --context-length. Add --enable-prefill-delayer for dp config"
23092309
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1300
2310+
2311+
- config-keys:
2312+
- glm5-fp4-mi355x-sglang-disagg
2313+
description:
2314+
- "Add GLM-5 MXFP4 MI355X SGLang PD-disaggregation (mirrors glm5-fp8-mi355x-sglang-disagg)"
2315+
- "Image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260523; model: amd/GLM-5-MXFP4; 1P1D TP8/EP1 dp-attn false; conc [8..512]"
2316+
- "models.yaml GLM-5-MXFP4 entry (NSA tilelang + fp8 KV cache); launcher glm5_fp4_mi355x_sglang-disagg.sh; MoRI conn.py overlay via job.slurm"
2317+
pr-link: XXX

0 commit comments

Comments
 (0)