Commit 6b1f7da

Edwardf0t1 and claude authored and committed
Generic Fused MoE Quantization + Export for transformers 5.0+ (#1187)
## What does this PR do?

Add generic quantization and export support for **fused MoE expert modules** in HuggingFace transformers 5.0+.

In transformers 5.0+, all major MoE models switched from sequential per-expert `nn.ModuleList` to **fused 3D tensor parameters** (`gate_up_proj`, `down_proj`). This breaks ModelOpt's existing per-expert quantization and export pipeline, which assumes iterable expert submodules.

**Affected models (verified against transformers v5.5.0 source):**
- `MixtralExperts` (Mixtral)
- `Qwen2MoeExperts` (Qwen2-MoE)
- `Qwen3MoeExperts` (Qwen3-MoE)
- `Qwen3_5MoeExperts` (Qwen3.5-MoE)
- `DeepseekV3NaiveMoe` (DeepSeek-V3)
- `JambaExperts`, `OlmoeExperts`, and any future model following the same HF standard pattern

**Key insight:** All these models share an identical fused expert structure and forward pattern. A single generic solution replaces N model-specific implementations.

### Context: relationship to PR #975 and PR #1170

- **PR #975** (`kmorabi/bump-transformers-5.0`): Adds experimental transformers 5.0 support but explicitly skips batched MoE experts. This PR fills that gap.
- **PR #1170** (`chenjiel/refactor_qwen35`): Handles only Qwen3.5 using `_QuantFunctionalMixin`. This PR generalizes that approach to all fused MoE models.

### Changes

**Quantization** (`modelopt/torch/quantization/plugins/huggingface.py`):
- `_QuantFusedExperts(_QuantFunctionalMixin)` -- Generic wrapper that intercepts `F.linear` calls and applies per-expert quantization via storage-offset-based expert index recovery. Each expert gets its own weight and input quantizers (`nn.ModuleList`).
- `_is_fused_experts_module()` -- Structural detector: `gate_up_proj` (3D) + `down_proj` (3D) + `num_experts` + `act_fn`.
- `register_fused_experts_on_the_fly()` -- Auto-registration callback, added to `CUSTOM_MODEL_PLUGINS` before `register_sparse_moe_on_the_fly` so explicit registrations (Llama4, GptOss, etc.) take priority.
- `_get_fused_expert_intermediate_dim()` -- Helper for cross-version attribute name resolution (`intermediate_dim` / `intermediate_size` / fallback to shape).

**Export** (`modelopt/torch/export/moe_utils.py`, `unified_export_hf.py`, `layer_utils.py`):
- `_export_fused_experts()` -- Splits fused 3D weights into per-expert 2D projections (`gate_proj`, `up_proj`, `down_proj`), handles amax fallback for uncalibrated experts, proportionally slices per-channel amax, and registers results under the standard `experts.{E}.gate_proj.weight` naming convention.
- Integration in `_process_quantized_modules` and `_export_transformers_checkpoint` to dispatch to `_export_fused_experts` for fused expert modules.
- Structural detection in `get_expert_linear_names` for fused experts. Added `MixtralSparseMoeBlock` to the `gate_proj`/`down_proj`/`up_proj` group (transformers 5.0 naming).

**Tests** (`tests/unit/torch/quantization/plugins/test_fused_experts.py`):
- Synthetic fused expert model matching the exact HF 5.0+ pattern.
- Tests for structural detection, auto-registration, two-level registration (block + expert), quantizer creation, forward pass-through correctness, expert index recovery, and export output structure.

### Two-level registration design

    SparseMoeBlock  --> _QuantSparseMoe    (calibration control, token counting, top_k override)
      .experts      --> _QuantFusedExperts (per-expert F.linear interception + quantization)

`register_fused_experts_on_the_fly` runs first to register the inner expert module; `register_sparse_moe_on_the_fly` then registers the outer block. `_QuantSparseMoe.layer_sync_moe_local_experts_amax` skips fused experts (they are not iterable), as per-expert amax is managed internally by `_QuantFusedExperts`.

### Known limitations

- **`@use_experts_implementation` backends**: The `F.linear` interception only works with `experts_implementation="eager"` (default). `batched_mm` / `grouped_mm` use `torch.bmm` / `torch._grouped_mm` instead and are not intercepted.
- **Storage offset fragility**: Expert index recovery via `storage_offset()` breaks under `.contiguous()`, FSDP2 redistribution, or `torch.compile` materialization. Runtime assertions are included (an illustrative sketch of the recovery follows this description).
- **Toggle state machine**: Assumes exactly 2 `F.linear` calls per expert. Documented in docstrings.
- **Non-standard MoE models**: DBRX, GptOss, Llama4, Step3p5 have different layouts and are already explicitly handled. The generic solution does not attempt to cover these.

### Testing

- [x] Unit tests with synthetic fused expert model: detection, registration, quantization, export
- [x] Verify existing sequential MoE tests still pass (`test_sparse_moe.py`)
- [ ] GPU test with a real MoE model on transformers 5.x

### Before your PR is "Ready for review"

- Is this change backward compatible?: Yes -- existing explicit registrations take priority; sequential MoE models are unaffected.
- Did you write any new necessary tests?: Yes
- Did you update Changelog?: No (pending)

## Summary by CodeRabbit

* **New Features**
  * Added quantization support for fused Mixture-of-Experts (MoE) modules with automatic detection, per-expert quantization handling, and export to per-expert submodules; unified checkpoint export now supports fused MoE experts.
* **Tests**
  * Added end-to-end tests covering fused-experts detection, conversion, forward correctness, expert index recovery, and export.
* **Changelog**
  * Updated release notes to announce fused MoE expert support for Hugging Face exports.

---------

Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
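Below is an illustrative sketch of the two mechanisms described above: structural detection of a fused expert container and storage-offset-based expert index recovery. The function names, shapes, and self-check are placeholders for explanation only; the actual implementations are `_is_fused_experts_module()` and the index-recovery logic inside `_QuantFusedExperts` in `modelopt/torch/quantization/plugins/huggingface.py`.

```python
# Illustrative sketch only; helper names and shapes are hypothetical.
import torch
import torch.nn as nn


def looks_like_fused_experts(module: nn.Module) -> bool:
    """Structural check mirroring the PR description: a fused HF 5.0+ expert
    container carries 3-D gate_up_proj / down_proj parameters plus
    num_experts and act_fn attributes."""
    gate_up = getattr(module, "gate_up_proj", None)
    down = getattr(module, "down_proj", None)
    return (
        isinstance(gate_up, torch.Tensor)
        and gate_up.dim() == 3
        and isinstance(down, torch.Tensor)
        and down.dim() == 3
        and hasattr(module, "num_experts")
        and hasattr(module, "act_fn")
    )


def expert_index_from_view(weight_view: torch.Tensor, fused: torch.Tensor) -> int:
    """Recover the expert index of a per-expert 2-D view of a fused 3-D
    parameter from its storage offset. This is the fragility noted under
    'Known limitations': it breaks if the view is made contiguous or
    re-materialized (FSDP2, torch.compile)."""
    per_expert = fused.stride(0)  # elements per expert slice in a contiguous layout
    offset = weight_view.storage_offset() - fused.storage_offset()
    assert offset % per_expert == 0, "unexpected storage layout"
    return offset // per_expert


# Tiny self-check with a synthetic fused parameter: [num_experts, 2*dim, hidden].
fused = nn.Parameter(torch.randn(4, 6, 8), requires_grad=False)
assert expert_index_from_view(fused[2], fused) == 2
```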
1 parent e861961 commit 6b1f7da

8 files changed

Lines changed: 625 additions & 102 deletions

CHANGELOG.rst

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ Changelog
 
 - [Security] Changed the default of ``weights_only`` to ``True`` in ``torch.load`` for secure checkpoint loading. If you need to load a checkpoint that requires unpickling arbitrary objects, first register the class in ``torch.serialization.add_safe_globals([cls])`` before loading. Added :meth:`safe_save <modelopt.torch.utils.serialization.safe_save>` and :meth:`safe_load <modelopt.torch.utils.serialization.safe_load>` API to save and load checkpoints securely.
 - Bump minimum required PyTorch version to 2.8.
-- [Experimental] Add support for transformers>=5.0. Unified Hugging Face checkpoint export for quantized checkpoints may not work for MoE models with transformers>=5.0 yet.
+- [Experimental] Add support for transformers>=5.0, including generic PTQ and unified HF checkpoint export for fused MoE expert modules (Mixtral, Qwen2-MoE, Qwen3-MoE, Qwen3.5-MoE, DeepSeek-V3, Jamba, OLMoE, etc.).
 - Improve ``megatron_preprocess_data``: add ``--reasoning_content`` support for Nemotron v3 datasets, eliminate intermediate JSONL for HuggingFace datasets, return output file prefixes from the Python API, add gzip input support (``.jsonl.gz``), add ``--strip_newlines`` flag for plain-text pretraining data, add ``--hf_streaming`` for very large datasets (only consumed rows downloaded), and auto-shuffle when ``--hf_max_samples_per_split`` is set to avoid biased sampling.
 
 0.43 (2026-04-09)

modelopt/torch/export/layer_utils.py

Lines changed: 12 additions & 1 deletion
@@ -965,6 +965,12 @@ def module_match_name_list(module, name_list):
         """
         return any(name.lower() in type(module).__name__.lower() for name in name_list)
 
+    # Structural detection: after _export_fused_experts, fused expert modules
+    # have per-expert submodules with gate_proj/up_proj/down_proj.
+    # Also handles models that originally used this naming (Qwen, DeepSeek, etc.).
+    if hasattr(module, "experts") and hasattr(module.experts, "gate_up_proj_weight_quantizers"):
+        return ["gate_up_proj", "down_proj"]
+
     if module_match_name_list(
         module,
         [
@@ -976,12 +982,17 @@ def module_match_name_list(module, name_list):
         ],
     ):
        return ["gate_proj", "down_proj", "up_proj"]
+    elif module_match_name_list(module, ["MixtralSparseMoeBlock"]):
+        # Old-style Mixtral (iterable experts) uses w1/w2/w3.
+        # Fused Mixtral (transformers 5.0+) is already handled by the
+        # structural gate_up_proj_weight_quantizers check above.
+        return ["w1", "w2", "w3"]
     elif module_match_name_list(module, ["MixtralMoeSparseMoeBlock"]):
+        # Older transformers naming for Mixtral
         return ["linear_fc1", "linear_fc2"]
     elif module_match_name_list(module, ["DBRXMoeSparseMoeBlock"]):
         return ["w1_linear", "w2_linear", "v1_linear"]
     elif module_match_name_list(module, ["GptOssMoE"]):
-        # GPT-OSS MoE modules use gate_up_proj and down_proj
         return ["gate_up_proj", "down_proj"]
     else:
         # assuming w1, w2, w3 by default
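The structural branch added above fires before any class-name matching, so an already-converted fused expert container resolves to the fused projection names regardless of its class name. A small illustrative check follows; the toy modules are assumptions rather than real transformers or ModelOpt classes, and it assumes `get_expert_linear_names` is importable from `modelopt.torch.export.layer_utils` as shown in the diff.

```python
# Toy modules used only to exercise the structural dispatch added above.
import torch.nn as nn

from modelopt.torch.export.layer_utils import get_expert_linear_names


class FakeFusedExperts(nn.Module):
    def __init__(self):
        super().__init__()
        # The presence of this attribute marks a _QuantFusedExperts-converted module.
        self.gate_up_proj_weight_quantizers = nn.ModuleList([nn.Identity()])


class FakeMoeBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.experts = FakeFusedExperts()


# Expected per the structural check: fused naming, no class-name match needed.
assert get_expert_linear_names(FakeMoeBlock()) == ["gate_up_proj", "down_proj"]
```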

modelopt/torch/export/moe_utils.py

Lines changed: 128 additions & 0 deletions
@@ -15,11 +15,139 @@
 
 """Utilities for Mixture-of-Experts (MoE) model export."""
 
+import copy
+import warnings
 from pathlib import Path
 
+import torch
 import torch.nn as nn
 
 
+def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
+    """Split fused MoE expert weights and export per-expert quantization scales.
+
+    Works with any module wrapped by ``_QuantFusedExperts`` — i.e. any HF
+    transformers 5.0+ fused expert container that stores ``gate_up_proj`` and
+    ``down_proj`` as 3-D ``nn.Parameter`` tensors with per-expert quantizer
+    ``nn.ModuleList`` s.
+
+    Steps:
+
+    1. Handle amax fallback for uncalibrated expert input quantizers.
+    2. Split fused 3-D weights into per-expert 2-D projections
+       (``gate_proj``, ``up_proj``, ``down_proj``).
+    3. Call ``_export_quantized_weight`` on each projection.
+    4. Register results under the standard naming convention::
+
+        {E}.gate_proj.weight, {E}.gate_proj.weight_scale, ...
+        {E}.up_proj.weight, {E}.up_proj.weight_scale, ...
+        {E}.down_proj.weight, {E}.down_proj.weight_scale, ...
+    """
+    from modelopt.torch.export.unified_export_hf import _export_quantized_weight
+    from modelopt.torch.quantization.plugins.huggingface import _get_fused_expert_intermediate_dim
+
+    n = module.num_experts
+    expert_dim = _get_fused_expert_intermediate_dim(module)
+
+    # 1. Shared input quantizers — one per projection type, shared across all experts.
+    gate_up_input_q = module.gate_up_proj_input_quantizer
+    down_input_q = module.down_proj_input_quantizer
+
+    gate_up = module.gate_up_proj.data
+    down = module.down_proj.data
+
+    # 2-3. Split + export each per-expert projection.
+    fused_dim0 = gate_up.shape[1]  # 2 * expert_dim
+
+    for idx in range(n):
+        expert = nn.Module()
+
+        projections = [
+            ("gate_proj", gate_up[idx, :expert_dim, :], 0, fused_dim0, True),
+            ("up_proj", gate_up[idx, expert_dim:, :], expert_dim, fused_dim0, True),
+            ("down_proj", down[idx], 0, down.shape[1], False),
+        ]
+
+        for proj_name, weight_slice, fused_start, fused_total, is_gate_up in projections:
+            w_quantizer_src = (
+                module.gate_up_proj_weight_quantizers[idx]
+                if is_gate_up
+                else module.down_proj_weight_quantizers[idx]
+            )
+            i_quantizer = gate_up_input_q if is_gate_up else down_input_q
+
+            # gate/up share a weight quantizer — clone so each gets independent amax.
+            w_quantizer = copy.deepcopy(w_quantizer_src) if is_gate_up else w_quantizer_src
+
+            # For per-channel amax (dim >= 1), proportionally slice dim-0
+            # to match the split weight.
+            if (
+                hasattr(w_quantizer, "_amax")
+                and w_quantizer._amax is not None
+                and w_quantizer._amax.dim() >= 1
+            ):
+                amax = w_quantizer._amax
+                amax_dim0 = amax.shape[0]
+                if fused_total % amax_dim0 == 0:
+                    slice_start = fused_start * amax_dim0 // fused_total
+                    slice_end = (fused_start + weight_slice.shape[0]) * amax_dim0 // fused_total
+                    w_quantizer.amax = amax[slice_start:slice_end].contiguous()
+                else:
+                    warnings.warn(
+                        f"Expert {idx} {proj_name}: fused amax dim0 ({amax_dim0}) does not "
+                        f"evenly divide fused_total ({fused_total}). Skipping amax slicing, "
+                        f"which may produce incorrect quantization scales.",
+                        stacklevel=2,
+                    )
+
+            # If the weight quantizer was never calibrated, compute amax from weights.
+            if (
+                hasattr(w_quantizer, "is_enabled")
+                and w_quantizer.is_enabled
+                and (
+                    not hasattr(w_quantizer, "_amax")
+                    or w_quantizer._amax is None
+                    or torch.all(w_quantizer._amax == 0)
+                )
+            ):
+                w_quantizer.amax = weight_slice.abs().amax().to(torch.float32)
+                warnings.warn(
+                    f"Expert {idx} {proj_name} weight quantizer was not calibrated "
+                    f"(amax missing or zero). Using weight-derived amax as fallback. "
+                    f"Consider using more calibration data to activate all experts.",
+                    stacklevel=2,
+                )
+
+            wrapper = nn.Module()
+            wrapper.weight = nn.Parameter(weight_slice.contiguous(), requires_grad=False)
+            wrapper.weight_quantizer = w_quantizer
+            wrapper.input_quantizer = i_quantizer
+
+            _export_quantized_weight(wrapper, dtype)
+
+            proj = nn.Module()
+            proj.weight = wrapper.weight
+            for attr in ("weight_scale", "weight_scale_2", "input_scale"):
+                if hasattr(wrapper, attr):
+                    proj.register_buffer(attr, getattr(wrapper, attr))
+
+            expert.add_module(proj_name, proj)
+
+        module.add_module(str(idx), expert)
+
+    # 4. Remove fused params and quantizer lists — replaced by per-expert submodules
+    for attr in (
+        "gate_up_proj",
+        "down_proj",
+        "gate_up_proj_weight_quantizers",
+        "gate_up_proj_input_quantizer",
+        "down_proj_weight_quantizers",
+        "down_proj_input_quantizer",
+    ):
+        if hasattr(module, attr):
+            delattr(module, attr)
+
+
 def save_expert_token_count_table(model: nn.Module, output_dir: str | Path | None = None):
     """Collect expert_token_count from all quantized MoE layers and save as an HTML table.
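To make the proportional per-channel amax slicing above concrete: for each expert, the fused `gate_up_proj` weight has `2 * expert_dim` output rows, and an amax vector over those rows is split so the gate half and the up half each keep their own channels. A standalone numeric sketch with toy shapes (not library code):

```python
import torch

# Toy shapes; real fused weights are [num_experts, 2 * expert_dim, hidden_size].
expert_dim = 4
fused_total = 2 * expert_dim  # dim-0 rows of one expert's fused gate_up weight
amax = torch.arange(1, fused_total + 1, dtype=torch.float32)  # one amax per output row


def slice_amax(fused_start: int, rows: int) -> torch.Tensor:
    # Same arithmetic as _export_fused_experts: map the row range of the split
    # weight onto the (possibly coarser-grained) amax vector proportionally.
    amax_dim0 = amax.shape[0]
    assert fused_total % amax_dim0 == 0
    start = fused_start * amax_dim0 // fused_total
    end = (fused_start + rows) * amax_dim0 // fused_total
    return amax[start:end].contiguous()


gate_amax = slice_amax(0, expert_dim)         # rows [0, expert_dim)
up_amax = slice_amax(expert_dim, expert_dim)  # rows [expert_dim, 2 * expert_dim)
assert gate_amax.tolist() == [1.0, 2.0, 3.0, 4.0]
assert up_amax.tolist() == [5.0, 6.0, 7.0, 8.0]
```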

modelopt/torch/export/unified_export_hf.py

Lines changed: 10 additions & 0 deletions
@@ -677,6 +677,13 @@ def _process_quantized_modules(
             with fsdp2_aware_weight_update(model, sub_module, reshard=False):
                 for weight_name in ["gate_up_proj", "down_proj"]:
                     _export_quantized_weight(sub_module, dtype, weight_name)
+        elif hasattr(sub_module, "gate_up_proj_weight_quantizers"):
+            # Generic fused MoE experts (_QuantFusedExperts) with per-expert
+            # quantizer ModuleLists. Split into per-expert modules and export.
+            from modelopt.torch.export.moe_utils import _export_fused_experts
+
+            with fsdp2_aware_weight_update(model, sub_module, reshard=False):
+                _export_fused_experts(sub_module, dtype)
 
 
 def _export_transformers_checkpoint(
@@ -721,6 +728,9 @@ def _export_transformers_checkpoint(
                         modules=list(linear_modulelist),
                         quantizer_attrs=["input_quantizer"],
                     )
+                elif hasattr(sub_module.experts, "gate_up_proj_weight_quantizers"):
+                    # _QuantFusedExperts: amax fallback is handled in _export_fused_experts
+                    break
                 elif "QuantGptOssExperts" in type(sub_module.experts).__name__:
                     # Handle GPT-OSS experts specifically
                     # GPT-OSS experts use gate_up_proj and down_proj
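After `_export_fused_experts` has run on a fused container, each expert is exposed as a numbered submodule whose state dict follows the conventional per-expert naming used by the rest of the unified export path. The snippet below only mimics that resulting layout; dtypes, shapes, and scale values are placeholders, and whether `weight_scale_2` is present depends on the quantization format.

```python
import torch
import torch.nn as nn

# Mimic the post-export layout of one fused experts container with 2 experts:
# experts.{E}.{gate_proj,up_proj,down_proj}.{weight,weight_scale,input_scale}
experts = nn.Module()
for idx in range(2):
    expert = nn.Module()
    for proj_name in ("gate_proj", "up_proj", "down_proj"):
        proj = nn.Module()
        proj.weight = nn.Parameter(torch.zeros(4, 8, dtype=torch.float16), requires_grad=False)
        proj.register_buffer("weight_scale", torch.ones(()))
        proj.register_buffer("input_scale", torch.ones(()))
        expert.add_module(proj_name, proj)
    experts.add_module(str(idx), expert)

print(sorted(experts.state_dict().keys())[:3])
# ['0.down_proj.input_scale', '0.down_proj.weight', '0.down_proj.weight_scale']
```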
