Skip to content

Commit 5dcddc1

Browse files
committed
Add recipe for GLM 5.1 & 5.2 on GB200 and H100
Signed-off-by: Hollow Man <hollowman@opensuse.org>
1 parent e01d67b commit 5dcddc1

6 files changed

Lines changed: 270 additions & 0 deletions

File tree

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
# ruff: noqa: F401
15+
"""Performance benchmark recipes for GLM models."""
16+
17+
from megatron.bridge.perf_recipes.glm.gb200.glm5 import (
18+
glm51_sft_192gpu_gb200_bf16_config,
19+
glm52_sft_192gpu_gb200_bf16_config,
20+
)
21+
from megatron.bridge.perf_recipes.glm.h100.glm5 import (
22+
glm51_sft_416gpu_h100_bf16_config,
23+
glm52_sft_416gpu_h100_bf16_config,
24+
)
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
# ruff: noqa: F401
15+
"""Common helpers for GLM performance recipes."""
16+
17+
from megatron.bridge import AutoBridge
18+
from megatron.bridge.perf_recipes._common import _benchmark_common, _perf_precision
19+
from megatron.bridge.recipes.common import _sft_common
20+
from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
21+
from megatron.bridge.training.config import ConfigContainer
22+
23+
24+
# Model identifiers; each is passed to AutoBridge.from_hf_pretrained() by
# _glm5_cudnn_sft_base to build the model provider.
GLM51_MODEL_ID = "zai-org/GLM-5.1"
25+
GLM52_MODEL_ID = "zai-org/GLM-5.2"
26+
# Sequence length in tokens (131072 = 128 * 1024). Used for the dataset
# sequence length, the packed sequence size and the model sequence length.
GLM5_LONG_CONTEXT = 131072
27+
28+
29+
def _glm5_cudnn_sft_base(
    model_id: str,
    *,
    tensor_model_parallel_size: int,
    pipeline_model_parallel_size: int,
    context_parallel_size: int,
    expert_model_parallel_size: int,
    global_batch_size: int,
    sequence_parallel: bool,
    num_layers_in_first_pipeline_stage: int | None = None,
    num_layers_in_last_pipeline_stage: int | None = None,
) -> ConfigContainer:
    """Build the shared GLM5 SFT benchmark config that uses the cuDNN DSA backend.

    Starts from ``_sft_common()``, installs a model provider for ``model_id``
    created with ``load_weights=False``, selects the ``"bf16"`` perf precision,
    and then applies the fixed benchmark settings below together with the
    caller-supplied parallelism shape.

    Args:
        model_id: Identifier handed to ``AutoBridge.from_hf_pretrained``.
        tensor_model_parallel_size: Value for ``model.tensor_model_parallel_size``.
        pipeline_model_parallel_size: Value for
            ``model.pipeline_model_parallel_size``.
        context_parallel_size: Value for ``model.context_parallel_size``; twice
            this value is also used as the packed-sequence ``pad_seq_to_mult``.
        expert_model_parallel_size: Value for ``model.expert_model_parallel_size``.
        global_batch_size: Value for ``train.global_batch_size``.
        sequence_parallel: Value for ``model.sequence_parallel``.
        num_layers_in_first_pipeline_stage: Optional value for the model field
            of the same name; ``None`` is stored as-is.
        num_layers_in_last_pipeline_stage: Optional value for the model field
            of the same name; ``None`` is stored as-is.

    Returns:
        The populated ``ConfigContainer``.
    """
    cfg = _sft_common()

    # --- Model provider and precision ------------------------------------
    cfg.model = AutoBridge.from_hf_pretrained(model_id).to_megatron_provider(load_weights=False)
    cfg.mixed_precision = _perf_precision("bf16")

    # --- Tokenizer: null tokenizer with the default null vocab size -------
    tokenizer = cfg.tokenizer
    tokenizer.tokenizer_type = "NullTokenizer"
    tokenizer.tokenizer_model = None
    tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE

    # --- Dataset: every sample padded/packed to the long-context length ---
    dataset = cfg.dataset
    dataset.seq_length = GLM5_LONG_CONTEXT
    dataset.num_workers = 1
    dataset.dataset_kwargs = {"pad_to_max_length": True}
    packing = dataset.packed_sequence_specs
    packing.packed_sequence_size = GLM5_LONG_CONTEXT
    packing.pad_seq_to_mult = context_parallel_size * 2
    packing.tokenizer_model_name = "glm5"

    # --- Parallelism shape -------------------------------------------------
    model = cfg.model
    model.seq_length = GLM5_LONG_CONTEXT
    model.tensor_model_parallel_size = tensor_model_parallel_size
    model.pipeline_model_parallel_size = pipeline_model_parallel_size
    model.virtual_pipeline_model_parallel_size = None
    model.context_parallel_size = context_parallel_size
    model.expert_model_parallel_size = expert_model_parallel_size
    model.expert_tensor_parallel_size = 1
    model.sequence_parallel = sequence_parallel
    model.pipeline_model_parallel_layout = None
    model.account_for_embedding_in_pipeline_split = False
    model.account_for_loss_in_pipeline_split = False
    model.num_layers_in_first_pipeline_stage = num_layers_in_first_pipeline_stage
    model.num_layers_in_last_pipeline_stage = num_layers_in_last_pipeline_stage

    # --- Batch sizes -------------------------------------------------------
    train = cfg.train
    train.global_batch_size = global_batch_size
    train.micro_batch_size = 1

    # --- DDP / optimizer ---------------------------------------------------
    # NOTE(review): grad_reduce_in_fp32 is set to False here and to True again
    # after _benchmark_common(); confirm whether this earlier value is needed.
    ddp = cfg.ddp
    ddp.use_distributed_optimizer = True
    ddp.grad_reduce_in_fp32 = False
    cfg.optimizer.use_distributed_optimizer = True

    # --- Kernel, fusion and MoE settings ------------------------------------
    model.transformer_impl = "transformer_engine"
    model.attention_backend = "auto"
    model.cp_comm_type = "allgather"
    model.gradient_accumulation_fusion = True
    model.moe_permute_fusion = True
    model.moe_grouped_gemm = True
    model.moe_flex_dispatcher_backend = "deepep"
    model.moe_router_dtype = "fp32"
    model.moe_shared_expert_overlap = False
    model.deallocate_pipeline_outputs = True
    model.persist_layer_norm = True
    model.bias_dropout_fusion = True
    model.bias_activation_fusion = True
    model.calculate_per_token_loss = True

    # --- DSA (cuDNN backend) and indexer settings ----------------------------
    model.apply_dsa_kernel_fusion = True
    model.dsa_kernel_backend = "cudnn"
    model.dsa_indexer_n_heads = 32
    model.dsa_indexer_head_dim = 128
    model.dsa_indexer_topk = 2048
    model.dsa_indexer_topk_freq = 4
    model.dsa_indexer_skip_topk_offset = 3
    model.dsa_indexer_rope_interleaved = True
    model.dsa_indexer_rotate_activation = False
    model.dsa_indexer_k_norm_epsilon = 1e-6
    model.dsa_indexer_loss_coeff = 0.001
    model.dsa_indexer_use_sparse_loss = True
    model.mtp_num_layers = 1

    # --- Activation recompute: full, uniform, one layer per group ------------
    model.recompute_granularity = "full"
    model.recompute_method = "uniform"
    model.recompute_num_layers = 1

    # --- CUDA graphs and the TE RNG tracker are both switched off ------------
    model.cuda_graph_impl = "none"
    model.cuda_graph_scope = []
    cfg.rng.te_rng_tracker = False
    model.use_te_rng_tracker = False

    _benchmark_common(cfg, cross_entropy_impl="native")

    # Overrides applied after _benchmark_common(); presumably so they take
    # precedence over whatever it configures -- TODO confirm. Attributes are
    # re-read from cfg rather than through the aliases above.
    cfg.model.apply_rope_fusion = False
    cfg.ddp.grad_reduce_in_fp32 = True
    precision = cfg.mixed_precision
    if not isinstance(precision, str):
        precision.grad_reduce_in_fp32 = True
    return cfg
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""GB200 performance recipes for GLM-5.1 and GLM-5.2 SFT."""
15+
16+
from megatron.bridge.perf_recipes.glm.common import (
17+
GLM51_MODEL_ID,
18+
GLM52_MODEL_ID,
19+
ConfigContainer,
20+
_glm5_cudnn_sft_base,
21+
)
22+
23+
24+
# Context-parallel size for the GB200 shape; forwarded as
# context_parallel_size by _glm5_gb200_cudnn_sft_config.
_GLM5_GB200_CP = 32
25+
26+
27+
def _glm5_gb200_cudnn_sft_config(model_id: str) -> ConfigContainer:
    """Build the 48-node GB200 GLM5 cuDNN SFT benchmark config for ``model_id``.

    The shape is TP=1, PP=6, CP=``_GLM5_GB200_CP`` and EP=32 with a global
    batch size of 56 and sequence parallelism disabled. The first and last
    pipeline stages are assigned 14 and 16 layers respectively.

    Args:
        model_id: Model identifier forwarded to ``_glm5_cudnn_sft_base``.

    Returns:
        The config produced by ``_glm5_cudnn_sft_base`` for this shape.
    """
    shape = {
        "tensor_model_parallel_size": 1,
        "pipeline_model_parallel_size": 6,
        "context_parallel_size": _GLM5_GB200_CP,
        "expert_model_parallel_size": 32,
        "global_batch_size": 56,
        "sequence_parallel": False,
        "num_layers_in_first_pipeline_stage": 14,
        "num_layers_in_last_pipeline_stage": 16,
    }
    return _glm5_cudnn_sft_base(model_id, **shape)
40+
41+
42+
def glm51_sft_192gpu_gb200_bf16_config() -> ConfigContainer:
    """Return the GLM-5.1 SFT benchmark config for 192x GB200.

    BF16, 128K packed sequences, CP=32, cuDNN DSA backend.
    """
    config = _glm5_gb200_cudnn_sft_config(GLM51_MODEL_ID)
    return config
45+
46+
47+
def glm52_sft_192gpu_gb200_bf16_config() -> ConfigContainer:
    """Return the GLM-5.2 SFT benchmark config for 192x GB200.

    BF16, 128K packed sequences, CP=32, cuDNN DSA backend.
    """
    config = _glm5_gb200_cudnn_sft_config(GLM52_MODEL_ID)
    return config
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""H100 performance recipes for GLM-5.1 and GLM-5.2 SFT."""
15+
16+
from megatron.bridge.perf_recipes.glm.common import (
17+
GLM51_MODEL_ID,
18+
GLM52_MODEL_ID,
19+
ConfigContainer,
20+
_glm5_cudnn_sft_base,
21+
)
22+
23+
24+
# Parallelism and batch settings for the H100 shape; each is forwarded to
# _glm5_cudnn_sft_base by _glm5_h100_cudnn_sft_config.
# TP * PP * CP = 4 * 26 * 4 = 416, the GPU count in the public recipe names.
_GLM5_H100_TP = 4
25+
_GLM5_H100_PP = 26
26+
_GLM5_H100_CP = 4
27+
_GLM5_H100_EP = 8
28+
# Global batch size.
_GLM5_H100_GBS = 520
29+
30+
31+
def _glm5_h100_cudnn_sft_config(model_id: str) -> ConfigContainer:
    """Build the 52-node H100 GLM5 cuDNN SFT benchmark config for ``model_id``.

    The shape comes from the module-level ``_GLM5_H100_*`` constants (TP, PP,
    CP, EP and global batch size) with sequence parallelism enabled. The
    first/last pipeline stage layer counts are left at the base defaults.

    Args:
        model_id: Model identifier forwarded to ``_glm5_cudnn_sft_base``.

    Returns:
        The config produced by ``_glm5_cudnn_sft_base`` for this shape.
    """
    shape = {
        "tensor_model_parallel_size": _GLM5_H100_TP,
        "pipeline_model_parallel_size": _GLM5_H100_PP,
        "context_parallel_size": _GLM5_H100_CP,
        "expert_model_parallel_size": _GLM5_H100_EP,
        "global_batch_size": _GLM5_H100_GBS,
        "sequence_parallel": True,
    }
    return _glm5_cudnn_sft_base(model_id, **shape)
42+
43+
44+
def glm51_sft_416gpu_h100_bf16_config() -> ConfigContainer:
    """Return the GLM-5.1 SFT benchmark config for 416x H100.

    BF16, 128K packed sequences, CP=4, cuDNN DSA backend.
    """
    config = _glm5_h100_cudnn_sft_config(GLM51_MODEL_ID)
    return config
47+
48+
49+
def glm52_sft_416gpu_h100_bf16_config() -> ConfigContainer:
    """Return the GLM-5.2 SFT benchmark config for 416x H100.

    BF16, 128K packed sequences, CP=4, cuDNN DSA backend.
    """
    config = _glm5_h100_cudnn_sft_config(GLM52_MODEL_ID)
    return config

0 commit comments

Comments
 (0)