Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/megatron/bridge/models/glm_moe_dsa/glm5_bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,11 @@ def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> MLAModelProvider
provider.dsa_indexer_head_dim = hf_config.index_head_dim
provider.dsa_indexer_n_heads = hf_config.index_n_heads
provider.dsa_indexer_topk = hf_config.index_topk
provider.dsa_indexer_rope_interleaved = hf_config.indexer_rope_interleave
provider.dsa_indexer_topk_freq = getattr(hf_config, "index_topk_freq", 1)
provider.dsa_indexer_skip_topk_offset = getattr(hf_config, "index_skip_topk_offset", 0)
provider.dsa_indexer_rotate_activation = False
provider.dsa_indexer_k_norm_epsilon = 1e-6
provider.dsa_indexer_loss_coeff = 0.001
provider.dsa_indexer_use_sparse_loss = True

Expand Down
24 changes: 24 additions & 0 deletions src/megatron/bridge/perf_recipes/glm_moe_dsa/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ruff: noqa: F401
"""Performance benchmark recipes for GLM models."""

from megatron.bridge.perf_recipes.glm_moe_dsa.gb200.glm5 import (
glm51_sft_192gpu_gb200_bf16_config,
glm52_sft_192gpu_gb200_bf16_config,
)
from megatron.bridge.perf_recipes.glm_moe_dsa.h100.glm5 import (
glm51_sft_416gpu_h100_bf16_config,
glm52_sft_416gpu_h100_bf16_config,
)
118 changes: 118 additions & 0 deletions src/megatron/bridge/perf_recipes/glm_moe_dsa/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ruff: noqa: F401
"""Common helpers for GLM performance recipes."""

from megatron.bridge import AutoBridge
from megatron.bridge.perf_recipes._common import _benchmark_common, _perf_precision
from megatron.bridge.recipes.common import _sft_common
from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
from megatron.bridge.training.config import ConfigContainer


# Long-context length (128 * 1024 = 131072) that the GLM5 SFT base recipe in
# this module applies to the dataset, packed-sequence and model sequence lengths.
GLM5_LONG_CONTEXT = 131072


def _glm5_cudnn_sft_base(
    model_id: str,
    *,
    tensor_model_parallel_size: int,
    pipeline_model_parallel_size: int,
    context_parallel_size: int,
    expert_model_parallel_size: int,
    global_batch_size: int,
    sequence_parallel: bool,
    num_layers_in_first_pipeline_stage: int | None = None,
    num_layers_in_last_pipeline_stage: int | None = None,
) -> ConfigContainer:
    """Build the shared GLM5 cuDNN-DSA SFT benchmark configuration.

    Starts from ``_sft_common()``, swaps in a Megatron provider created from
    the Hugging Face checkpoint ``model_id`` (without loading weights), and
    then applies the benchmark-specific dataset, parallelism, fusion, DSA and
    recompute overrides before handing the config to ``_benchmark_common``.

    Args:
        model_id: Hugging Face model identifier given to
            ``AutoBridge.from_hf_pretrained``.
        tensor_model_parallel_size: Value for ``model.tensor_model_parallel_size``.
        pipeline_model_parallel_size: Value for ``model.pipeline_model_parallel_size``.
        context_parallel_size: Value for ``model.context_parallel_size``; also
            determines ``pad_seq_to_mult`` (twice this value).
        expert_model_parallel_size: Value for ``model.expert_model_parallel_size``.
        global_batch_size: Value for ``train.global_batch_size``.
        sequence_parallel: Value for ``model.sequence_parallel``.
        num_layers_in_first_pipeline_stage: Optional value for the field of the
            same name on the model config.
        num_layers_in_last_pipeline_stage: Optional value for the field of the
            same name on the model config.

    Returns:
        The populated ``ConfigContainer``.
    """
    cfg = _sft_common()

    # Model provider comes from the HF checkpoint metadata; weights are not loaded.
    bridge = AutoBridge.from_hf_pretrained(model_id)
    cfg.model = bridge.to_megatron_provider(load_weights=False)
    cfg.mixed_precision = _perf_precision("bf16")

    # Use the NullTokenizer with the default null vocab size instead of a tokenizer model.
    tokenizer = cfg.tokenizer
    tokenizer.tokenizer_type = "NullTokenizer"
    tokenizer.tokenizer_model = None
    tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE

    # Dataset: long-context sequences, padded to max length.
    dataset = cfg.dataset
    dataset.seq_length = GLM5_LONG_CONTEXT
    dataset.num_workers = 1
    dataset.dataset_kwargs = {"pad_to_max_length": True}

    packing = dataset.packed_sequence_specs
    packing.packed_sequence_size = GLM5_LONG_CONTEXT
    packing.pad_seq_to_mult = context_parallel_size * 2
    packing.tokenizer_model_name = "glm5"

    # Parallelism layout supplied by the caller.
    model = cfg.model
    model.seq_length = GLM5_LONG_CONTEXT
    model.tensor_model_parallel_size = tensor_model_parallel_size
    model.pipeline_model_parallel_size = pipeline_model_parallel_size
    model.virtual_pipeline_model_parallel_size = None
    model.context_parallel_size = context_parallel_size
    model.expert_model_parallel_size = expert_model_parallel_size
    model.expert_tensor_parallel_size = 1
    model.sequence_parallel = sequence_parallel
    model.pipeline_model_parallel_layout = None
    model.account_for_embedding_in_pipeline_split = False
    model.account_for_loss_in_pipeline_split = False
    model.num_layers_in_first_pipeline_stage = num_layers_in_first_pipeline_stage
    model.num_layers_in_last_pipeline_stage = num_layers_in_last_pipeline_stage

    # Batch sizes.
    train = cfg.train
    train.global_batch_size = global_batch_size
    train.micro_batch_size = 1

    # Distributed optimizer on both the DDP and optimizer configs.
    # NOTE: grad_reduce_in_fp32 is set to False here and switched to True
    # after _benchmark_common() at the end of this function.
    ddp = cfg.ddp
    ddp.use_distributed_optimizer = True
    ddp.grad_reduce_in_fp32 = False
    cfg.optimizer.use_distributed_optimizer = True

    # Backend, communication and fusion settings.
    model.transformer_impl = "transformer_engine"
    model.attention_backend = "auto"
    model.cp_comm_type = "allgather"
    model.gradient_accumulation_fusion = True
    model.moe_permute_fusion = True
    model.moe_grouped_gemm = True
    model.moe_flex_dispatcher_backend = "deepep"
    model.moe_router_dtype = "fp32"
    model.moe_shared_expert_overlap = False
    model.deallocate_pipeline_outputs = True
    model.persist_layer_norm = True
    model.bias_dropout_fusion = True
    model.bias_activation_fusion = True
    model.calculate_per_token_loss = True

    # DSA kernel and indexer settings (cuDNN backend), pinned for the benchmark.
    model.apply_dsa_kernel_fusion = True
    model.dsa_kernel_backend = "cudnn"
    model.dsa_indexer_n_heads = 32
    model.dsa_indexer_head_dim = 128
    model.dsa_indexer_topk = 2048
    model.dsa_indexer_topk_freq = 4
    model.dsa_indexer_skip_topk_offset = 3
    model.dsa_indexer_rope_interleaved = True
    model.dsa_indexer_rotate_activation = False
    model.dsa_indexer_k_norm_epsilon = 1e-6
    model.dsa_indexer_loss_coeff = 0.001
    model.dsa_indexer_use_sparse_loss = True
    model.mtp_num_layers = 1

    # Recompute settings: full granularity, uniform method, one layer.
    model.recompute_granularity = "full"
    model.recompute_method = "uniform"
    model.recompute_num_layers = 1

    # CUDA graphs and the TE RNG tracker are disabled.
    model.cuda_graph_impl = "none"
    model.cuda_graph_scope = []
    cfg.rng.te_rng_tracker = False
    model.use_te_rng_tracker = False

    _benchmark_common(cfg, cross_entropy_impl="native")

    # Overrides applied after the shared benchmark helper has run. These go
    # through cfg again rather than the local aliases so they land on whatever
    # objects cfg holds at this point.
    cfg.model.apply_rope_fusion = False
    cfg.ddp.grad_reduce_in_fp32 = True
    precision = cfg.mixed_precision
    if not isinstance(precision, str):
        precision.grad_reduce_in_fp32 = True
    return cfg
13 changes: 13 additions & 0 deletions src/megatron/bridge/perf_recipes/glm_moe_dsa/gb200/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
47 changes: 47 additions & 0 deletions src/megatron/bridge/perf_recipes/glm_moe_dsa/gb200/glm5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""GB200 performance recipes for GLM-5.1 and GLM-5.2 SFT."""

from megatron.bridge.perf_recipes.glm_moe_dsa.common import (
ConfigContainer,
_glm5_cudnn_sft_base,
)


# Context-parallel size for the GB200 GLM5 recipe; passed as
# ``context_parallel_size`` to the shared SFT base config.
_GLM5_GB200_CP = 32


def _glm5_gb200_cudnn_sft_config(model_id: str) -> ConfigContainer:
    """Return the 48-node GB200 GLM5 cuDNN SFT benchmark shape.

    Args:
        model_id: Hugging Face model identifier forwarded to the shared
            GLM5 SFT base config builder.

    Returns:
        The config produced by ``_glm5_cudnn_sft_base`` for TP=1, PP=6,
        CP=``_GLM5_GB200_CP``, EP=32 (TP * PP * CP = 192 ranks).
    """
    # Uneven pipeline split: explicit layer counts for the first and last stage.
    pipeline_split = {
        "num_layers_in_first_pipeline_stage": 14,
        "num_layers_in_last_pipeline_stage": 16,
    }
    cfg = _glm5_cudnn_sft_base(
        model_id,
        tensor_model_parallel_size=1,
        pipeline_model_parallel_size=6,
        context_parallel_size=_GLM5_GB200_CP,
        expert_model_parallel_size=32,
        global_batch_size=56,
        sequence_parallel=False,
        **pipeline_split,
    )
    return cfg


def glm51_sft_192gpu_gb200_bf16_config() -> ConfigContainer:
    """GLM-5.1 SFT: 192x GB200, BF16, 128K packed THD, CP=32, cuDNN DSA.

    Returns:
        The GB200 GLM5 benchmark config built for ``zai-org/GLM-5.1``.
    """
    model_id = "zai-org/GLM-5.1"
    return _glm5_gb200_cudnn_sft_config(model_id)


def glm52_sft_192gpu_gb200_bf16_config() -> ConfigContainer:
    """GLM-5.2 SFT: 192x GB200, BF16, 128K packed THD, CP=32, cuDNN DSA.

    Returns:
        The GB200 GLM5 benchmark config built for ``zai-org/GLM-5.2``.
    """
    model_id = "zai-org/GLM-5.2"
    return _glm5_gb200_cudnn_sft_config(model_id)
13 changes: 13 additions & 0 deletions src/megatron/bridge/perf_recipes/glm_moe_dsa/h100/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
52 changes: 52 additions & 0 deletions src/megatron/bridge/perf_recipes/glm_moe_dsa/h100/glm5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""H100 performance recipes for GLM-5.1 and GLM-5.2 SFT."""

from megatron.bridge.perf_recipes.glm_moe_dsa.common import (
ConfigContainer,
_glm5_cudnn_sft_base,
)


# Parallelism layout and global batch size shared by the H100 GLM5 recipes.
# TP * PP * CP = 4 * 26 * 4 = 416, matching the GPU count in the recipe names.
_GLM5_H100_TP = 4
_GLM5_H100_PP = 26
_GLM5_H100_CP = 4
_GLM5_H100_EP = 8
_GLM5_H100_GBS = 520


def _glm5_h100_cudnn_sft_config(model_id: str) -> ConfigContainer:
    """Return the 416-GPU H100 GLM5 cuDNN SFT benchmark shape.

    Single source of truth for the H100 layout so the per-model recipes
    cannot drift apart.

    Args:
        model_id: Hugging Face model identifier forwarded to the shared
            GLM5 SFT base config builder.

    Returns:
        The config produced by ``_glm5_cudnn_sft_base`` using the module-level
        ``_GLM5_H100_*`` parallelism and batch-size constants, with sequence
        parallelism enabled.
    """
    return _glm5_cudnn_sft_base(
        model_id,
        tensor_model_parallel_size=_GLM5_H100_TP,
        pipeline_model_parallel_size=_GLM5_H100_PP,
        context_parallel_size=_GLM5_H100_CP,
        expert_model_parallel_size=_GLM5_H100_EP,
        global_batch_size=_GLM5_H100_GBS,
        sequence_parallel=True,
    )


def glm51_sft_416gpu_h100_bf16_config() -> ConfigContainer:
    """GLM-5.1 SFT: 416x H100, BF16, 128K packed THD, CP=4, cuDNN DSA."""
    return _glm5_h100_cudnn_sft_config("zai-org/GLM-5.1")


def glm52_sft_416gpu_h100_bf16_config() -> ConfigContainer:
    """GLM-5.2 SFT: 416x H100, BF16, 128K packed THD, CP=4, cuDNN DSA."""
    return _glm5_h100_cudnn_sft_config("zai-org/GLM-5.2")
Loading