
Commit 307fe71

Fix QuantSequentialMLP sharded_state_dict (#742)
## What does this PR do?

**Type of change:** Bug fix

**Overview:** These fixes are needed for the Megatron-LM `main` branch due to recent changes in `sharded_state_dict`. Qwen3-30B-A3B PTQ and resume fails: a run with EP=4 cannot load a checkpoint generated with PP=4. `singleton_local_shards` must be added to the metadata; otherwise all experts' `amax` values are packed together, and the TP `replica_id` for `linear_fc1` is currently incorrect.

**Other finding:** This limits configurations to TP=ETP=1 when EP>1; otherwise there is a `sharded_state_dict` access error. There is also a potential blind spot in using the default TP group in `ColumnParallelLinear` and `RowParallelLinear`, since these layers can be part of an MoE where tensor parallelism is controlled by ETP instead. Fixing the parallel_state will need a separate PR.

**Results:** Calibrating with EP=1 gives MMLU = 0.80, and the checkpoint can be resumed with EP=4, TP=1, ETP=1 (TP>1 does not work, as noted above). Calibrating with EP=4, however, gives MMLU = 0.71, which indicates remaining issues with amax sync under EP.

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes/No
- **Did you write any new necessary tests?**: Yes/No
- **Did you add or update any necessary documentation?**: Yes/No
- **Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No

---------

Signed-off-by: Chenhan Yu <chenhany@nvidia.com>
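The fix described in the Overview follows a common override pattern: default the checkpoint `metadata` dict locally, set the flag, then delegate to the parent class. A minimal self-contained sketch of the pattern (the `Base`/`Child` names are hypothetical stand-ins, not Megatron-LM APIs):

```python
class Base:
    def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
        # The parent consults metadata flags when laying out shards.
        packed = not (metadata or {}).get("singleton_local_shards", False)
        return {"layout": "packed" if packed else "singleton"}


class Child(Base):
    def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
        # Default the dict here; callers may legitimately pass None.
        if metadata is None:
            metadata = {}
        # Force per-expert (singleton) shards so each expert's amax is
        # stored separately instead of being packed together.
        metadata["singleton_local_shards"] = True
        return super().sharded_state_dict(prefix, sharded_offsets, metadata)


print(Child().sharded_state_dict())  # singleton layout even with no metadata passed
```

Because the flag is injected into `metadata` before delegating, every layer below the override sees it without any caller changes.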
1 parent 6f18490 commit 307fe71

File tree

2 files changed: +37 −2 lines

modelopt/torch/opt/plugins/megatron.py

Lines changed: 12 additions & 1 deletion
```diff
@@ -155,6 +155,15 @@ def _setup(self):
         pass
 
     def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
+        """Overriding the default to support scalar sharding.
+
+        Note:
+            singleton_local_shards needs to be added to the metadata as well as
+            apply_swiglu_sharded_factory to handle the swiglu case.
+        """
+        if metadata is None:
+            metadata = {}
+        metadata["singleton_local_shards"] = True
         sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata)
         if not self.config.gated_linear_unit:
             return sharded_state_dict
@@ -163,6 +172,8 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
                 re.compile(pattern).match(k) for pattern in self._modelopt_state_keys
            ):
                sharded_state_dict[k] = megatron_mlp.apply_swiglu_sharded_factory(
-                    v, sharded_offsets
+                    v,
+                    sharded_offsets,
+                    metadata["singleton_local_shards"],
                )
         return sharded_state_dict
```
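The second hunk above only rewrites state-dict entries whose keys match one of the modelopt state patterns. A small sketch of that filtering step in isolation (the patterns and keys below are illustrative examples, not the actual `_modelopt_state_keys` contents):

```python
import re

# Illustrative modelopt-style key patterns; the real ones live on the module.
state_key_patterns = [r".*_amax$", r".*_quantizer\..*"]

state_dict = {
    "linear_fc1.weight": 1,
    "linear_fc1.input_quantizer._amax": 2,
    "linear_fc2.output_quantizer._amax": 3,
}

# Keys matching any pattern get special shard handling (e.g. the swiglu factory);
# re.match anchors at the start of the string, so patterns cover the full key.
special = {
    k: v
    for k, v in state_dict.items()
    if any(re.compile(p).match(k) for p in state_key_patterns)
}
print(sorted(special))
```

Only the quantizer `amax` entries are selected; plain weights fall through to the default sharding path.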

modelopt/torch/quantization/plugins/megatron.py

Lines changed: 25 additions & 1 deletion
```diff
@@ -33,6 +33,7 @@
 from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
 from megatron.core.utils import get_tensor_model_parallel_group_if_none
 
+from modelopt.torch.opt.dynamic import DynamicModule
 from modelopt.torch.opt.plugins.megatron import (
     _MegatronMLP,
     ensure_metadata_has_dp_cp_group,
@@ -551,8 +552,16 @@ def forward(self, input, *args, **kwargs):
 
 
 @QuantModuleRegistry.register({megatron_moe.SequentialMLP: "megatron_moe_SequentialMLP"})
-class _MegatronSequentialMLP(_MegatronMLP):
+class _MegatronSequentialMLP(DynamicModule):
     def _setup(self):
+        if (
+            self.config.expert_model_parallel_size > 1
+            and self.config.tensor_model_parallel_size > 1
+        ):
+            raise ValueError(
+                "TP+EP is not supported by QuantSequentialMLP. Set either TP or EP to 1!"
+            )
+
         if not hasattr(self, "parallel_state") or self.parallel_state is None:
             self.parallel_state = ParallelState(
                 mcore_parallel.get_expert_data_parallel_group(),
@@ -592,6 +601,21 @@ def sync_moe_local_experts_amax(self):
             if isinstance(module, TensorQuantizer) and module.amax is not None:
                 module.amax = amax_dict[name].detach().clone().to(module.amax.device)
 
+    def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
+        """Override the default to enable singleton_local_shards.
+
+        Note:
+            singleton_local_shards must be added to the metadata; otherwise, all experts
+            amax are packed to gather and currently the TP replica_id for linear_fc1
+            is incorrect. This limits TP=ETP=1 when EP>1. Otherwise, there will be
+            sharded_state_dict access error.
+        """
+        if metadata is None:
+            metadata = {}
+        metadata["singleton_local_shards"] = True
+        sharded_state_dict = super().sharded_state_dict(prefix, sharded_offsets, metadata)
+        return sharded_state_dict
+
 
 if HAS_TE:
```

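The new `_setup` guard enforces the TP/EP restriction noted in the PR description: when EP>1, TP must be 1 (and vice versa). A runnable sketch of just that check, using a hypothetical `Config` dataclass in place of the Megatron transformer config:

```python
from dataclasses import dataclass


@dataclass
class Config:
    # Hypothetical stand-in for the two Megatron config fields the guard reads.
    expert_model_parallel_size: int = 1
    tensor_model_parallel_size: int = 1


def check_parallelism(config: Config) -> None:
    # Mirrors the guard added in _setup(): TP and EP cannot both exceed 1.
    if (
        config.expert_model_parallel_size > 1
        and config.tensor_model_parallel_size > 1
    ):
        raise ValueError(
            "TP+EP is not supported by QuantSequentialMLP. Set either TP or EP to 1!"
        )


check_parallelism(Config(expert_model_parallel_size=4))  # EP=4, TP=1: allowed
```

Failing fast at setup time surfaces the unsupported TP+EP combination immediately, instead of letting it manifest later as a `sharded_state_dict` access error.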