NVIDIA-NeMo · HollowMan6 · Jun 26, 2026
diff --git a/src/megatron/bridge/models/glm_moe_dsa/glm5_bridge.py b/src/megatron/bridge/models/glm_moe_dsa/glm5_bridge.py
@@ -111,6 +111,11 @@ def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> MLAModelProvider
         provider.dsa_indexer_head_dim = hf_config.index_head_dim
         provider.dsa_indexer_n_heads = hf_config.index_n_heads
         provider.dsa_indexer_topk = hf_config.index_topk
+        provider.dsa_indexer_rope_interleaved = hf_config.indexer_rope_interleave
+        provider.dsa_indexer_topk_freq = getattr(hf_config, "index_topk_freq", 1)
+        provider.dsa_indexer_skip_topk_offset = getattr(hf_config, "index_skip_topk_offset", 0)
+        provider.dsa_indexer_rotate_activation = False
+        provider.dsa_indexer_k_norm_epsilon = 1e-6
         provider.dsa_indexer_loss_coeff = 0.001
         provider.dsa_indexer_use_sparse_loss = True
 

diff --git a/src/megatron/bridge/perf_recipes/glm_moe_dsa/__init__.py b/src/megatron/bridge/perf_recipes/glm_moe_dsa/__init__.py
@@ -0,0 +1,24 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ruff: noqa: F401
+"""Performance benchmark recipes for GLM models."""
+
+from megatron.bridge.perf_recipes.glm_moe_dsa.gb200.glm5 import (
+    glm51_sft_192gpu_gb200_bf16_config,
+    glm52_sft_192gpu_gb200_bf16_config,
+)
+from megatron.bridge.perf_recipes.glm_moe_dsa.h100.glm5 import (
+    glm51_sft_416gpu_h100_bf16_config,
+    glm52_sft_416gpu_h100_bf16_config,
+)
diff --git a/src/megatron/bridge/perf_recipes/glm_moe_dsa/common.py b/src/megatron/bridge/perf_recipes/glm_moe_dsa/common.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ruff: noqa: F401
+"""Common helpers for GLM performance recipes."""
+
+from megatron.bridge import AutoBridge
+from megatron.bridge.perf_recipes._common import _benchmark_common, _perf_precision
+from megatron.bridge.recipes.common import _sft_common
+from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
+from megatron.bridge.training.config import ConfigContainer
+
+
+GLM5_LONG_CONTEXT = 131072  # 128 * 1024; used as the dataset, packed-sequence and model sequence length
+
+
+def _glm5_cudnn_sft_base(
+    model_id: str,
+    *,
+    tensor_model_parallel_size: int,
+    pipeline_model_parallel_size: int,
+    context_parallel_size: int,
+    expert_model_parallel_size: int,
+    global_batch_size: int,
+    sequence_parallel: bool,
+    num_layers_in_first_pipeline_stage: int | None = None,
+    num_layers_in_last_pipeline_stage: int | None = None,
+) -> ConfigContainer:
+    """Return the GLM5 cuDNN SFT benchmark config for ``model_id`` with the given parallel layout."""
+    cfg = _sft_common()
+    # Model provider comes from the HF model id with load_weights=False; precision preset is "bf16".
+    cfg.model = AutoBridge.from_hf_pretrained(model_id).to_megatron_provider(load_weights=False)
+    cfg.mixed_precision = _perf_precision("bf16")
+    # NullTokenizer with the default null-tokenizer vocab size; no tokenizer model is configured.
+    cfg.tokenizer.tokenizer_type = "NullTokenizer"
+    cfg.tokenizer.tokenizer_model = None
+    cfg.tokenizer.vocab_size = DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
+    # Dataset: seq_length and packed_sequence_size are both GLM5_LONG_CONTEXT, padded to max length.
+    cfg.dataset.seq_length = GLM5_LONG_CONTEXT
+    cfg.dataset.num_workers = 1
+    cfg.dataset.dataset_kwargs = {"pad_to_max_length": True}
+    cfg.dataset.packed_sequence_specs.packed_sequence_size = GLM5_LONG_CONTEXT
+    cfg.dataset.packed_sequence_specs.pad_seq_to_mult = context_parallel_size * 2
+    cfg.dataset.packed_sequence_specs.tokenizer_model_name = "glm5"
+    # Parallel layout: sizes come from the arguments; expert TP is 1, virtual PP and PP layout are None.
+    cfg.model.seq_length = GLM5_LONG_CONTEXT
+    cfg.model.tensor_model_parallel_size = tensor_model_parallel_size
+    cfg.model.pipeline_model_parallel_size = pipeline_model_parallel_size
+    cfg.model.virtual_pipeline_model_parallel_size = None
+    cfg.model.context_parallel_size = context_parallel_size
+    cfg.model.expert_model_parallel_size = expert_model_parallel_size
+    cfg.model.expert_tensor_parallel_size = 1
+    cfg.model.sequence_parallel = sequence_parallel
+    cfg.model.pipeline_model_parallel_layout = None
+    cfg.model.account_for_embedding_in_pipeline_split = False
+    cfg.model.account_for_loss_in_pipeline_split = False
+    cfg.model.num_layers_in_first_pipeline_stage = num_layers_in_first_pipeline_stage
+    cfg.model.num_layers_in_last_pipeline_stage = num_layers_in_last_pipeline_stage
+    # Batch sizes: global batch comes from the argument; micro batch is fixed at 1.
+    cfg.train.global_batch_size = global_batch_size
+    cfg.train.micro_batch_size = 1
+    # Distributed optimizer is enabled on both the DDP and the optimizer configs.
+    cfg.ddp.use_distributed_optimizer = True
+    cfg.ddp.grad_reduce_in_fp32 = False  # reassigned to True after _benchmark_common()
+    cfg.optimizer.use_distributed_optimizer = True
+    # Backend, fusion, MoE and DSA indexer settings are hard-coded here rather than taken from arguments.
+    cfg.model.transformer_impl = "transformer_engine"
+    cfg.model.attention_backend = "auto"
+    cfg.model.cp_comm_type = "allgather"
+    cfg.model.gradient_accumulation_fusion = True
+    cfg.model.moe_permute_fusion = True
+    cfg.model.moe_grouped_gemm = True
+    cfg.model.moe_flex_dispatcher_backend = "deepep"
+    cfg.model.moe_router_dtype = "fp32"
+    cfg.model.moe_shared_expert_overlap = False
+    cfg.model.deallocate_pipeline_outputs = True
+    cfg.model.persist_layer_norm = True
+    cfg.model.bias_dropout_fusion = True
+    cfg.model.bias_activation_fusion = True
+    cfg.model.calculate_per_token_loss = True
+    cfg.model.apply_dsa_kernel_fusion = True
+    cfg.model.dsa_kernel_backend = "cudnn"
+    cfg.model.dsa_indexer_n_heads = 32  # NOTE(review): overwrites any provider-set dsa_indexer_* values; confirm
+    cfg.model.dsa_indexer_head_dim = 128
+    cfg.model.dsa_indexer_topk = 2048
+    cfg.model.dsa_indexer_topk_freq = 4
+    cfg.model.dsa_indexer_skip_topk_offset = 3
+    cfg.model.dsa_indexer_rope_interleaved = True
+    cfg.model.dsa_indexer_rotate_activation = False
+    cfg.model.dsa_indexer_k_norm_epsilon = 1e-6
+    cfg.model.dsa_indexer_loss_coeff = 0.001
+    cfg.model.dsa_indexer_use_sparse_loss = True
+    cfg.model.mtp_num_layers = 1
+    # Recompute: "full" granularity with the "uniform" method and recompute_num_layers=1.
+    cfg.model.recompute_granularity = "full"
+    cfg.model.recompute_method = "uniform"
+    cfg.model.recompute_num_layers = 1
+    # cuda_graph_impl is "none" with an empty scope; the TE RNG tracker is off on both rng and model.
+    cfg.model.cuda_graph_impl = "none"
+    cfg.model.cuda_graph_scope = []
+    cfg.rng.te_rng_tracker = cfg.model.use_te_rng_tracker = False
+    # Shared benchmark settings are applied first; the assignments below run after them.
+    _benchmark_common(cfg, cross_entropy_impl="native")
+    cfg.model.apply_rope_fusion = False
+    cfg.ddp.grad_reduce_in_fp32 = True  # replaces the False assigned before _benchmark_common()
+    if not isinstance(cfg.mixed_precision, str):  # skipped when mixed_precision is a plain string
+        cfg.mixed_precision.grad_reduce_in_fp32 = True
+    return cfg
diff --git a/src/megatron/bridge/perf_recipes/glm_moe_dsa/gb200/__init__.py b/src/megatron/bridge/perf_recipes/glm_moe_dsa/gb200/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/src/megatron/bridge/perf_recipes/glm_moe_dsa/gb200/glm5.py b/src/megatron/bridge/perf_recipes/glm_moe_dsa/gb200/glm5.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GB200 performance recipes for GLM-5.1 and GLM-5.2 SFT."""
+
+from megatron.bridge.perf_recipes.glm_moe_dsa.common import (
+    ConfigContainer,
+    _glm5_cudnn_sft_base,
+)
+
+
+_GLM5_GB200_CP = 32  # context_parallel_size passed to _glm5_cudnn_sft_base by the GB200 recipes
+
+
+def _glm5_gb200_cudnn_sft_config(model_id: str) -> ConfigContainer:
+    """Build the GLM5 cuDNN SFT benchmark config in its 48-node GB200 shape for ``model_id``."""
+    gb200_shape = {
+        "tensor_model_parallel_size": 1,
+        "pipeline_model_parallel_size": 6,
+        "context_parallel_size": _GLM5_GB200_CP,
+        "expert_model_parallel_size": 32,
+        "global_batch_size": 56,
+        "sequence_parallel": False,
+        "num_layers_in_first_pipeline_stage": 14,
+        "num_layers_in_last_pipeline_stage": 16,
+    }
+    return _glm5_cudnn_sft_base(model_id, **gb200_shape)
+
+
+def glm51_sft_192gpu_gb200_bf16_config() -> ConfigContainer:
+    """GLM-5.1 SFT on 192x GB200: BF16, 128K packed THD, CP=32, cuDNN DSA."""
+    return _glm5_gb200_cudnn_sft_config(model_id="zai-org/GLM-5.1")
+
+
+def glm52_sft_192gpu_gb200_bf16_config() -> ConfigContainer:
+    """GLM-5.2 SFT on 192x GB200: BF16, 128K packed THD, CP=32, cuDNN DSA."""
+    return _glm5_gb200_cudnn_sft_config(model_id="zai-org/GLM-5.2")
diff --git a/src/megatron/bridge/perf_recipes/glm_moe_dsa/h100/__init__.py b/src/megatron/bridge/perf_recipes/glm_moe_dsa/h100/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/src/megatron/bridge/perf_recipes/glm_moe_dsa/h100/glm5.py b/src/megatron/bridge/perf_recipes/glm_moe_dsa/h100/glm5.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""H100 performance recipes for GLM-5.1 and GLM-5.2 SFT."""
+
+from megatron.bridge.perf_recipes.glm_moe_dsa.common import (
+    ConfigContainer,
+    _glm5_cudnn_sft_base,
+)
+
+
+_GLM5_H100_TP = 4  # tensor_model_parallel_size
+_GLM5_H100_PP = 26  # pipeline_model_parallel_size
+_GLM5_H100_CP = 4  # context_parallel_size
+_GLM5_H100_EP = 8  # expert_model_parallel_size
+_GLM5_H100_GBS = 520  # global_batch_size
+
+
+def _glm5_h100_cudnn_sft_config(model_id: str) -> ConfigContainer:
+    """Return the 416-GPU H100 GLM5 cuDNN SFT benchmark shape for ``model_id``.
+
+    Shared by the GLM-5.1 and GLM-5.2 recipes so the parallel layout is defined once.
+    """
+    return _glm5_cudnn_sft_base(
+        model_id,
+        tensor_model_parallel_size=_GLM5_H100_TP,
+        pipeline_model_parallel_size=_GLM5_H100_PP,
+        context_parallel_size=_GLM5_H100_CP,
+        expert_model_parallel_size=_GLM5_H100_EP,
+        global_batch_size=_GLM5_H100_GBS,
+        sequence_parallel=True,
+    )
+
+
+def glm51_sft_416gpu_h100_bf16_config() -> ConfigContainer:
+    """GLM-5.1 SFT: 416x H100, BF16, 128K packed THD, CP=4, cuDNN DSA."""
+    return _glm5_h100_cudnn_sft_config("zai-org/GLM-5.1")
+
+
+def glm52_sft_416gpu_h100_bf16_config() -> ConfigContainer:
+    """GLM-5.2 SFT: 416x H100, BF16, 128K packed THD, CP=4, cuDNN DSA."""
+    return _glm5_h100_cudnn_sft_config("zai-org/GLM-5.2")