Skip to content

Commit b6f2fa6

Browse files
committed
Support GLM5.2 and recipe for GLM 5.1 & 5.2 on GB200 and H100
Signed-off-by: Hollow Man <hollowman@opensuse.org>
1 parent 56b48b6 commit b6f2fa6

7 files changed

Lines changed: 272 additions & 0 deletions

File tree

src/megatron/bridge/models/glm_moe_dsa/glm5_bridge.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,11 @@ def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> MLAModelProvider
111111
provider.dsa_indexer_head_dim = hf_config.index_head_dim
112112
provider.dsa_indexer_n_heads = hf_config.index_n_heads
113113
provider.dsa_indexer_topk = hf_config.index_topk
114+
provider.dsa_indexer_rope_interleaved = hf_config.indexer_rope_interleave
115+
provider.dsa_indexer_topk_freq = getattr(hf_config, "index_topk_freq", 1)
116+
provider.dsa_indexer_skip_topk_offset = getattr(hf_config, "index_skip_topk_offset", 0)
117+
provider.dsa_indexer_rotate_activation = False
118+
provider.dsa_indexer_k_norm_epsilon = 1e-6
114119
provider.dsa_indexer_loss_coeff = 0.001
115120
provider.dsa_indexer_use_sparse_loss = True
116121

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
# ruff: noqa: F401
15+
"""Performance benchmark recipes for GLM models."""
16+
17+
from megatron.bridge.perf_recipes.glm_moe_dsa.gb200.glm5 import (
18+
glm51_sft_192gpu_gb200_bf16_config,
19+
glm52_sft_192gpu_gb200_bf16_config,
20+
)
21+
from megatron.bridge.perf_recipes.glm_moe_dsa.h100.glm5 import (
22+
glm51_sft_416gpu_h100_bf16_config,
23+
glm52_sft_416gpu_h100_bf16_config,
24+
)
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
# ruff: noqa: F401
15+
"""Common helpers for GLM performance recipes."""
16+
17+
from megatron.bridge import AutoBridge
18+
from megatron.bridge.perf_recipes._common import _benchmark_common, _perf_precision
19+
from megatron.bridge.recipes.common import _sft_common
20+
from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
21+
from megatron.bridge.training.config import ConfigContainer
22+
23+
24+
# 128K (128 * 1024 = 131072) sequence length used by the GLM5 recipes.
GLM5_LONG_CONTEXT = 128 * 1024
25+
26+
27+
def _glm5_cudnn_sft_base(
    model_id: str,
    *,
    tensor_model_parallel_size: int,
    pipeline_model_parallel_size: int,
    context_parallel_size: int,
    expert_model_parallel_size: int,
    global_batch_size: int,
    sequence_parallel: bool,
    num_layers_in_first_pipeline_stage: int | None = None,
    num_layers_in_last_pipeline_stage: int | None = None,
) -> ConfigContainer:
    """Build the shared GLM5 SFT benchmark config that uses the cuDNN DSA backend.

    Starts from ``_sft_common()``, replaces the model with a provider obtained
    from ``AutoBridge`` for ``model_id``, applies the fixed sequence-length,
    parallelism, kernel and recompute settings below, runs
    ``_benchmark_common`` and then re-applies a few overrides.

    Args:
        model_id: Identifier passed to ``AutoBridge.from_hf_pretrained``.
        tensor_model_parallel_size: Value stored on the model provider.
        pipeline_model_parallel_size: Value stored on the model provider.
        context_parallel_size: Value stored on the model provider; twice this
            value is also used as ``pad_seq_to_mult`` for packed sequences.
        expert_model_parallel_size: Value stored on the model provider.
        global_batch_size: Value stored on ``cfg.train``.
        sequence_parallel: Value stored on the model provider.
        num_layers_in_first_pipeline_stage: Optional layer count for the first
            pipeline stage; ``None`` is forwarded unchanged.
        num_layers_in_last_pipeline_stage: Optional layer count for the last
            pipeline stage; ``None`` is forwarded unchanged.

    Returns:
        The populated ``ConfigContainer``.
    """
    cfg = _sft_common()

    # The provider is requested with load_weights=False.
    cfg.model = AutoBridge.from_hf_pretrained(model_id).to_megatron_provider(load_weights=False)
    cfg.mixed_precision = _perf_precision("bf16")

    # Tokenizer: NullTokenizer with the default null-tokenizer vocab size.
    tokenizer = cfg.tokenizer
    tokenizer.tokenizer_type = "NullTokenizer"
    tokenizer.tokenizer_model = None
    tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE

    # Dataset: every sample is padded / packed to the full long-context length.
    dataset = cfg.dataset
    dataset.seq_length = GLM5_LONG_CONTEXT
    dataset.num_workers = 1
    dataset.dataset_kwargs = {"pad_to_max_length": True}
    packing = dataset.packed_sequence_specs
    packing.packed_sequence_size = GLM5_LONG_CONTEXT
    packing.pad_seq_to_mult = context_parallel_size * 2
    packing.tokenizer_model_name = "glm5"

    # Parallel layout supplied by the caller.
    model = cfg.model
    model.seq_length = GLM5_LONG_CONTEXT
    model.tensor_model_parallel_size = tensor_model_parallel_size
    model.pipeline_model_parallel_size = pipeline_model_parallel_size
    model.virtual_pipeline_model_parallel_size = None
    model.context_parallel_size = context_parallel_size
    model.expert_model_parallel_size = expert_model_parallel_size
    model.expert_tensor_parallel_size = 1
    model.sequence_parallel = sequence_parallel
    model.pipeline_model_parallel_layout = None
    model.account_for_embedding_in_pipeline_split = False
    model.account_for_loss_in_pipeline_split = False
    model.num_layers_in_first_pipeline_stage = num_layers_in_first_pipeline_stage
    model.num_layers_in_last_pipeline_stage = num_layers_in_last_pipeline_stage

    training = cfg.train
    training.global_batch_size = global_batch_size
    training.micro_batch_size = 1

    # NOTE(review): grad_reduce_in_fp32 is set to False here and to True again
    # after _benchmark_common() below — confirm the intermediate value matters.
    ddp = cfg.ddp
    ddp.use_distributed_optimizer = True
    ddp.grad_reduce_in_fp32 = False
    cfg.optimizer.use_distributed_optimizer = True

    # Kernel / fusion selection.
    model.transformer_impl = "transformer_engine"
    model.attention_backend = "auto"
    model.cp_comm_type = "allgather"
    model.gradient_accumulation_fusion = True
    model.moe_permute_fusion = True
    model.moe_grouped_gemm = True
    model.moe_flex_dispatcher_backend = "deepep"
    model.moe_router_dtype = "fp32"
    model.moe_shared_expert_overlap = False
    model.deallocate_pipeline_outputs = True
    model.persist_layer_norm = True
    model.bias_dropout_fusion = True
    model.bias_activation_fusion = True
    model.calculate_per_token_loss = True

    # DSA settings.
    # NOTE(review): these fixed indexer values are applied for every model_id
    # and overwrite whatever the provider carried — confirm this is intended
    # for both GLM-5.1 and GLM-5.2.
    model.apply_dsa_kernel_fusion = True
    model.dsa_kernel_backend = "cudnn"
    model.dsa_indexer_n_heads = 32
    model.dsa_indexer_head_dim = 128
    model.dsa_indexer_topk = 2048
    model.dsa_indexer_topk_freq = 4
    model.dsa_indexer_skip_topk_offset = 3
    model.dsa_indexer_rope_interleaved = True
    model.dsa_indexer_rotate_activation = False
    model.dsa_indexer_k_norm_epsilon = 1e-6
    model.dsa_indexer_loss_coeff = 0.001
    model.dsa_indexer_use_sparse_loss = True
    model.mtp_num_layers = 1

    # Full activation recompute, uniform method, one layer per chunk.
    model.recompute_granularity = "full"
    model.recompute_method = "uniform"
    model.recompute_num_layers = 1

    # CUDA graphs and the TE RNG tracker are switched off.
    model.cuda_graph_impl = "none"
    model.cuda_graph_scope = []
    cfg.rng.te_rng_tracker = False
    model.use_te_rng_tracker = False

    _benchmark_common(cfg, cross_entropy_impl="native")

    # Overrides applied after the common benchmark settings; attributes are
    # re-read from cfg in case _benchmark_common replaced any sub-object.
    cfg.model.apply_rope_fusion = False
    cfg.ddp.grad_reduce_in_fp32 = True
    precision = cfg.mixed_precision
    if not isinstance(precision, str):
        precision.grad_reduce_in_fp32 = True
    return cfg
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""GB200 performance recipes for GLM-5.1 and GLM-5.2 SFT."""
15+
16+
from megatron.bridge.perf_recipes.glm_moe_dsa.common import (
17+
ConfigContainer,
18+
_glm5_cudnn_sft_base,
19+
)
20+
21+
22+
# Context-parallel size passed as context_parallel_size by the GB200 recipe helper.
_GLM5_GB200_CP = 32
23+
24+
25+
def _glm5_gb200_cudnn_sft_config(model_id: str) -> ConfigContainer:
    """Build the 48-node GB200 GLM5 cuDNN SFT benchmark config for ``model_id``.

    Forwards a fixed layout to ``_glm5_cudnn_sft_base``: TP=1, PP=6,
    CP=``_GLM5_GB200_CP``, EP=32, global batch size 56, sequence parallelism
    off, with 14 layers in the first pipeline stage and 16 in the last.
    """
    layout = {
        "tensor_model_parallel_size": 1,
        "pipeline_model_parallel_size": 6,
        "context_parallel_size": _GLM5_GB200_CP,
        "expert_model_parallel_size": 32,
        "global_batch_size": 56,
        "sequence_parallel": False,
        "num_layers_in_first_pipeline_stage": 14,
        "num_layers_in_last_pipeline_stage": 16,
    }
    return _glm5_cudnn_sft_base(model_id, **layout)
38+
39+
40+
def glm51_sft_192gpu_gb200_bf16_config() -> ConfigContainer:
    """Return the GLM-5.1 SFT recipe: 192x GB200, BF16, 128K packed THD, CP=32, cuDNN DSA."""
    hf_model_id = "zai-org/GLM-5.1"
    return _glm5_gb200_cudnn_sft_config(hf_model_id)
43+
44+
45+
def glm52_sft_192gpu_gb200_bf16_config() -> ConfigContainer:
    """Return the GLM-5.2 SFT recipe: 192x GB200, BF16, 128K packed THD, CP=32, cuDNN DSA."""
    hf_model_id = "zai-org/GLM-5.2"
    return _glm5_gb200_cudnn_sft_config(hf_model_id)
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""H100 performance recipes for GLM-5.1 and GLM-5.2 SFT."""
15+
16+
from megatron.bridge.perf_recipes.glm_moe_dsa.common import (
17+
ConfigContainer,
18+
_glm5_cudnn_sft_base,
19+
)
20+
21+
22+
# Layout shared by the GLM-5.1 and GLM-5.2 H100 recipes.
# TP * PP * CP = 4 * 26 * 4 = 416, matching the 416-GPU count in the recipe names.
_GLM5_H100_TP = 4
_GLM5_H100_PP = 26
_GLM5_H100_CP = 4
_GLM5_H100_EP = 8
_GLM5_H100_GBS = 520


def _glm5_h100_cudnn_sft_config(model_id: str) -> ConfigContainer:
    """Return the 416-GPU H100 GLM5 cuDNN SFT benchmark shape for ``model_id``.

    Single place where the H100 layout constants are forwarded to
    ``_glm5_cudnn_sft_base``, so the GLM-5.1 and GLM-5.2 recipes cannot drift
    apart. Sequence parallelism is enabled; the first/last pipeline-stage
    layer counts are left at the base helper's defaults.
    """
    return _glm5_cudnn_sft_base(
        model_id,
        tensor_model_parallel_size=_GLM5_H100_TP,
        pipeline_model_parallel_size=_GLM5_H100_PP,
        context_parallel_size=_GLM5_H100_CP,
        expert_model_parallel_size=_GLM5_H100_EP,
        global_batch_size=_GLM5_H100_GBS,
        sequence_parallel=True,
    )


def glm51_sft_416gpu_h100_bf16_config() -> ConfigContainer:
    """GLM-5.1 SFT: 416x H100, BF16, 128K packed THD, CP=4, cuDNN DSA."""
    return _glm5_h100_cudnn_sft_config("zai-org/GLM-5.1")


def glm52_sft_416gpu_h100_bf16_config() -> ConfigContainer:
    """GLM-5.2 SFT: 416x H100, BF16, 128K packed THD, CP=4, cuDNN DSA."""
    return _glm5_h100_cudnn_sft_config("zai-org/GLM-5.2")

0 commit comments

Comments
 (0)