Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,16 @@ Changelog
- Add support for ``active_params`` (for MoE models) and ``memory_mb`` constraints in Minitron pruning on top of existing ``params`` constraint. You can also provide multiple constraints. See `examples/pruning/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning>`_ for more details. The underlying utility functions ``mcore_param_count``, ``mcore_memory_footprint_mb``, and ``print_mcore_model_stats`` in ``modelopt.torch.nas.plugins.megatron_model_stats`` are also available for standalone use to compute parameter counts and memory footprints (weights + KV-cache + Mamba state) for any Megatron-Core model.
- Add ``--cast_mxfp4_to_nvfp4`` flag to ``examples/llm_ptq/hf_ptq.py`` for closed-form, bit-exact MXFP4 → NVFP4 weight conversion. Supports the GPT-OSS family (``openai/gpt-oss-20b``, ``openai/gpt-oss-120b``). See `examples/llm_ptq/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_ptq#mxfp4--nvfp4-cast-for-gpt-oss>`__ for usage.
- DeepSeek PTQ (``examples/deepseek/ptq.py``) now defaults to native top-k calibration with post-hoc per-layer peer-max sync of expert ``input_quantizer.amax``; the all-experts path is preserved behind ``--calib_all_experts``.
- Support Megatron-Core checkpoint restore and export for MSE ``NVFP4StaticQuantizer``.
- Add mixed-precision FP8 + NVFP4 export for Megatron-Core: per-layer ``quant_algo`` recorded under ``quantized_layers`` in ``hf_quant_config.json``, PP-aware ``kv_cache_dtype`` gather, fused-QKV exclude split into per-HF-name ``q/k/v_proj`` entries.
- Add Nemotron-3-Super-120B-A12B PTQ recipes ``modelopt_recipes/models/Nemotron-3-Super-120B-A12B/super-nvfp4.yaml`` (MSE-mixed) and ``super-nvfp4-max-calib.yaml`` (max-calib mixed): NVFP4 W4A4 routed experts + FP8 per-tensor shared experts / Mamba in/out_proj + FP8 KV cache.
- Add NVFP4 W4A16 weight-only quantization (``w4a16_nvfp4``): FP4 weights with group_size=16, BF16 activations, no calibration forward pass required. Use ``mtq.W4A16_NVFP4_CFG`` or ``--qformat w4a16_nvfp4`` in ``hf_ptq.py``. vLLM deployment support is in progress.
- Add ``DATASET_COMBOS`` to ``modelopt.torch.utils.dataset_utils`` — single ``--dataset`` tokens that fan out to multiple registered datasets; per-entry ``num_samples`` is split evenly across the members. Initial combos: ``cnn_nemotron_v2_mix`` (``cnn_dailymail`` + ``nemotron-post-training-dataset-v2``, used by ``hf_ptq.py`` when no ``--dataset`` is provided) and ``nemotron-post-training-v3`` (the seven ``nvidia/Nemotron-*`` SFT datasets added in #1498, mirroring the `nemotron-post-training-v3 collection <https://huggingface.co/collections/nvidia/nemotron-post-training-v3>`_). Combo names are listed by ``get_supported_datasets()`` and surfaced in ``--dataset`` help. ``get_dataset_dataloader`` rejects inputs that mix a combo with one of its member datasets (e.g. ``cnn_dailymail,cnn_nemotron_v2_mix``) to avoid double-sampling, and ``get_dataset_samples`` rejects combo names so callers route through the dataloader. ``hf_ptq.py`` default ``--calib_size`` is bumped from ``512`` to ``1024`` so the total calibration sample count under the new default combo matches the previous two-dataset fallback.
- The ``nemotron-sft-agentic-v2`` registered dataset (added in #1498) now uses only the ``search`` split. The previously configured ``interactive_agent`` and ``tool_calling`` splits contain content-level defects (heterogeneous schema and a malformed JSON row, respectively) that cause pyarrow's streaming JSON reader to fail deterministically.

**Bug Fixes**

- In Megatron-Core, only do EP amax sync for routed expert weights if ``sync_expert_weight_amax=True``. Previously, EP amax sync would sync routed expert weights across EP ranks even when ``sync_expert_weight_amax`` was ``False``.
- Fix Megatron-Core HF importer to load fused ``TELayerNormColumnParallelLinear.layer_norm_weight`` from HF for GPT-family models (Qwen3 etc.) under ``--export-default-te-spec``. Importer now prefers per-context keys ``fused_input_layernorm`` / ``fused_pre_mlp_layernorm`` (fallback ``fused_norm`` for Nemotron-H backward compatibility); ``mcore_qwen.py`` provides the new rules. Without this fix, post-prune MMLU sat at chance.

0.44 (2026-05-14)
Expand Down
6 changes: 1 addition & 5 deletions examples/specdec_bench/specdec_bench/datasets/speed.py
Original file line number Diff line number Diff line change
Expand Up @@ -730,11 +730,7 @@ def _load_dataset(self, config_name_or_dataset_path: config_type | str) -> "Data
# Strip HF metadata from the schema to avoid Feature parsing errors
schema = table.schema
if schema.metadata and b"huggingface" in schema.metadata:
new_meta = {
k: v
for k, v in schema.metadata.items()
if k != b"huggingface"
}
new_meta = {k: v for k, v in schema.metadata.items() if k != b"huggingface"}
table = table.replace_schema_metadata(new_meta or None)
dataset = HFDataset(table)
if self.num_samples is not None and self.num_samples < len(dataset):
Expand Down
45 changes: 37 additions & 8 deletions modelopt/torch/export/plugins/hf_checkpoint_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,21 @@

import torch
from huggingface_hub import snapshot_download
from huggingface_hub.errors import LocalEntryNotFoundError
from safetensors.torch import safe_open
from tqdm import tqdm

# Upper-cased ``HF_HUB_OFFLINE`` values that are treated as "offline mode enabled".
_HF_HUB_OFFLINE_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}


def _is_hf_hub_offline() -> bool:
    """Return whether the ``HF_HUB_OFFLINE`` environment variable holds a truthy value.

    The raw value is stripped of surrounding whitespace and upper-cased before it is
    looked up in ``_HF_HUB_OFFLINE_TRUE_VALUES``. An unset variable is read as the empty
    string and therefore counts as "not offline".

    Returns:
        ``True`` if the normalized value is one of the recognized truthy strings,
        ``False`` otherwise.
    """
    raw_setting = os.environ.get("HF_HUB_OFFLINE", "")
    normalized_setting = raw_setting.strip().upper()
    return normalized_setting in _HF_HUB_OFFLINE_TRUE_VALUES


def _copy_python_files(source_dir: Path, save_dir: Path) -> None:
    """Copy the ``*.py`` files found directly inside ``source_dir`` into ``save_dir``.

    The glob is non-recursive, so files in sub-directories are not picked up. Each file
    keeps its own name in the destination and is copied with ``shutil.copy2``.

    Args:
        source_dir: Directory that is searched for ``*.py`` files.
        save_dir: Directory that receives the copies.
    """
    python_files = source_dir.glob("*.py")
    for src_path in python_files:
        destination = save_dir / src_path.name
        shutil.copy2(src_path, destination)


def copy_hf_ckpt_remote_code(
pretrained_model_path: str | os.PathLike, save_directory: str | os.PathLike
Expand All @@ -36,7 +48,10 @@ def copy_hf_ckpt_remote_code(
frameworks.

If ``pretrained_model_path`` is a local directory, Python files are copied directly.
If it's a HF Hub model ID (e.g. ``nvidia/NVIDIA-Nemotron-Nano-12B-v2``), files are downloaded from the Hub.
If it's a HF Hub model ID (e.g. ``nvidia/NVIDIA-Nemotron-Nano-12B-v2``), the Hub
snapshot is resolved first and Python files are copied from that snapshot. When
``HF_HUB_OFFLINE`` is set, the snapshot must already be available in the local
Hugging Face cache.

Args:
pretrained_model_path: Local path to the pretrained model or HuggingFace Hub model ID.
Expand All @@ -47,14 +62,28 @@ def copy_hf_ckpt_remote_code(
save_dir.mkdir(parents=True, exist_ok=True)

if hf_checkpoint_path.is_dir():
for py_file in hf_checkpoint_path.glob("*.py"):
shutil.copy2(py_file, save_dir / py_file.name)
_copy_python_files(hf_checkpoint_path, save_dir)
else:
snapshot_download(
repo_id=str(pretrained_model_path),
local_dir=str(save_dir),
allow_patterns=["*.py"],
)
local_files_only = _is_hf_hub_offline()
try:
source_dir = Path(
snapshot_download(
repo_id=str(pretrained_model_path),
allow_patterns=["*.py"],
local_files_only=local_files_only,
)
)
except LocalEntryNotFoundError as exc:
if local_files_only:
raise RuntimeError(
f"Could not copy Python sidecar files for {pretrained_model_path!r} because "
"HF_HUB_OFFLINE is enabled and the files are not available in the local "
"Hugging Face cache. Populate the cache with the model's *.py files or pass "
"a local pretrained model directory."
) from exc
raise

_copy_python_files(source_dir, save_dir)


def load_multimodal_components(
Expand Down
5 changes: 4 additions & 1 deletion modelopt/torch/export/plugins/mcore_nemotron.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,10 @@
"input_layernorm": NameRemapping("backbone.layers.{}.norm."),
"linear_qkv": QKVSlicing("backbone.layers.{}.mixer."),
"linear_proj": NameRemapping("backbone.layers.{}.mixer.o_proj."),
"core_attention": SelfAttentionScaling("backbone.layers.{}.mixer."),
"core_attention": SelfAttentionScaling(
"backbone.layers.{}.mixer.",
func_kwargs={"k_scale_name": "k_proj.k_scale", "v_scale_name": "v_proj.v_scale"},
),
# MLP
"pre_mlp_layernorm": NameRemapping("backbone.layers.{}.norm."),
"linear_fc1": NameRemapping("backbone.layers.{}.mixer.up_proj."),
Expand Down
20 changes: 18 additions & 2 deletions modelopt/torch/export/quant_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,9 +288,25 @@ def _ensure_weight_quantizer_calibrated(
module_name: Optional module name for better warning messages
"""
if isinstance(weight_quantizer, NVFP4StaticQuantizer):
need_per_block = not hasattr(weight_quantizer, "_amax") or weight_quantizer._amax is None

def _amax_is_invalid(t: torch.Tensor | None) -> bool:
# MCore distcp may register but not fill amax — treat missing/non-finite/negative as recompute.
if t is None:
return True
t = t.detach()
if not torch.is_floating_point(t):
return False
return bool(torch.any(~torch.isfinite(t)).item() or torch.any(t < 0).item())

need_per_block = (
not hasattr(weight_quantizer, "_amax")
or weight_quantizer._amax is None
or _amax_is_invalid(weight_quantizer._amax)
)
need_global = (
not hasattr(weight_quantizer, "_global_amax") or weight_quantizer.global_amax is None
not hasattr(weight_quantizer, "_global_amax")
or weight_quantizer.global_amax is None
or _amax_is_invalid(weight_quantizer.global_amax)
)
if not (need_per_block or need_global):
return
Expand Down
102 changes: 89 additions & 13 deletions modelopt/torch/export/unified_export_megatron.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
QUANTIZATION_FP8_PB_WO,
QUANTIZATION_NONE,
QUANTIZATION_NVFP4,
QUANTIZATION_W4A16_NVFP4,
)
from .plugins.hf_checkpoint_utils import copy_hf_ckpt_remote_code, load_multimodal_components
from .plugins.mcore_common import all_mcore_hf_export_mapping
Expand All @@ -61,6 +62,7 @@
get_weight_block_size,
get_weight_scaling_factor,
get_weight_scaling_factor_2,
process_layer_quant_config,
to_quantized_weight,
)

Expand Down Expand Up @@ -169,6 +171,7 @@ def __init__(
self.all_rules = self._populate_rule_book()
self.rules = self.all_rules[self.arch]
self.exclude_modules = []
self.layer_config_dict = {}

if not hasattr(model, "_modelopt_state"):
return
Expand Down Expand Up @@ -287,6 +290,8 @@ def save_pretrained(
quantization = "FP8"
elif quantization_format == QUANTIZATION_NVFP4:
quantization = "NVFP4"
elif quantization_format == QUANTIZATION_W4A16_NVFP4:
quantization = "W4A16_NVFP4"

# We use the last PP rank and the 1st EP rank to write the config because
# medusa_heads and eagle_module only exist in the last stage.
Expand Down Expand Up @@ -324,22 +329,32 @@ def save_pretrained(
print(f"Successfully loaded {len(mtp_state_dict)} MTP tensors")

combined_exclude_modules = self._gather_exclude_modules()
combined_layer_config_dict = self._gather_layer_config_dict()
# kv_cache_dtype is only set on attention-owning ranks; writer rank may not be one.
gathered_kv_cache_dtype = self._gather_kv_cache_dtype()

if is_last_stage_main_rank and quantization is not None:
if combined_layer_config_dict:
quantization_config = process_layer_quant_config(combined_layer_config_dict)
quantization_config["exclude_modules"] = combined_exclude_modules
else:
quantization_config = {
"quant_algo": quantization,
"exclude_modules": combined_exclude_modules,
}
if quantization in ("NVFP4", "W4A16_NVFP4"): # update block size
quantization_config["group_size"] = 16

if gathered_kv_cache_dtype is not None:
quantization_config["kv_cache_quant_algo"] = gathered_kv_cache_dtype

self._hf_quant_config = {
"producer": {
"name": "modelopt",
"version": __version__,
},
"quantization": {
"quant_algo": quantization,
"exclude_modules": combined_exclude_modules,
},
"quantization": quantization_config,
}
if quantization == "NVFP4": # update block size
self._hf_quant_config["quantization"]["group_size"] = 16
if hasattr(self, "kv_cache_dtype"):
self._hf_quant_config["quantization"]["kv_cache_quant_algo"] = self.kv_cache_dtype
with open(save_directory + "/hf_quant_config.json", "w") as f:
json.dump(self._hf_quant_config, f, indent=4)

Expand All @@ -359,10 +374,11 @@ def save_pretrained(
# Newer versions of VLLM expect config.json with hf_quant_config
config_json_file = save_directory + "/config.json"
if self._hf_quant_config and os.path.exists(config_json_file):
converted_quant_config = convert_hf_quant_config_format(self._hf_quant_config)
with open(config_json_file) as f:
config_dict = json.load(f)
config_dict["quantization_config"] = converted_quant_config
config_dict["quantization_config"] = convert_hf_quant_config_format(
self._hf_quant_config
)
with open(config_json_file, "w") as f:
json.dump(config_dict, f, indent=4)

Expand Down Expand Up @@ -814,9 +830,7 @@ def _get_quantized_state(
name_to_value = {}
qformat: str = self._get_quantization_format(module)
if qformat is None and "norm" not in prefix:
# Add exclude layers for hf_quant_config. Note that if the prefix is not an empty
# string then it usually ends with "." which needs to be removed.
self.exclude_modules.append(prefix.removesuffix("."))
self._record_excluded_module(prefix)
block_size = get_weight_block_size(module)

name_to_value = self._get_weight_bias(module, dtype, name_to_value)
Expand Down Expand Up @@ -861,6 +875,27 @@ def _get_weight_scales(self, quantized_state: dict[str, Any], qformat: str):

return weight_scale, weight_scale_2

def _record_layer_quant_config(self, prefix: str, qformat: str | None, block_size: int):
    """Store the quantization format and block size of one exported HF layer.

    Two entries are written to ``self.layer_config_dict``, keyed by the layer name
    (``prefix`` without a trailing ``"."``) plus ``".quantization"`` and
    ``".awq_block_size"``. Nothing is recorded when the layer is unquantized, when the
    layer name is empty, or when it still contains a ``"{"`` (presumably an unformatted
    ``{}`` placeholder from a rule template).

    Args:
        prefix: HF name prefix of the layer; a single trailing ``"."`` is removed.
        qformat: Quantization format of the layer, or ``None`` / ``QUANTIZATION_NONE``.
        block_size: Block size stored under the ``awq_block_size`` key.
    """
    if qformat in (None, QUANTIZATION_NONE):
        return

    layer_name = prefix.removesuffix(".")
    name_is_usable = bool(layer_name) and "{" not in layer_name
    if not name_is_usable:
        return

    self.layer_config_dict.update(
        {
            f"{layer_name}.quantization": qformat,
            f"{layer_name}.awq_block_size": block_size,
        }
    )

def _record_excluded_module(self, prefix: str):
    """Append an unquantized HF module name to ``self.exclude_modules`` at most once.

    The module name is ``prefix`` with a single trailing ``"."`` removed. Empty names
    and names that still contain a ``"{"`` are ignored, as are names that are already
    in the list.

    Args:
        prefix: HF name prefix of the module to exclude from quantization.
    """
    layer_name = prefix.removesuffix(".")
    name_is_usable = bool(layer_name) and "{" not in layer_name
    if name_is_usable and layer_name not in self.exclude_modules:
        self.exclude_modules.append(layer_name)

def _name_remapping(
self,
module: torch.nn.Module | torch.Tensor,
Expand All @@ -877,6 +912,7 @@ def _name_remapping(
return

name_to_value, qformat, block_size = self._get_quantized_state(module, dtype, prefix=prefix)
self._record_layer_quant_config(prefix, qformat, block_size)

weight = name_to_value.pop("weight")
weight_scale, weight_scale_2 = self._get_weight_scales(name_to_value, qformat)
Expand Down Expand Up @@ -917,6 +953,8 @@ def _gated_mlp_slicing(

gate_proj_prefix = prefix + gate_proj_name + "."
up_proj_prefix = prefix + up_proj_name + "."
self._record_layer_quant_config(gate_proj_prefix, qformat, block_size)
self._record_layer_quant_config(up_proj_prefix, qformat, block_size)

ffn_hidden_size = module.config.ffn_hidden_size
gate_proj_weight = weight[:ffn_hidden_size, :]
Expand Down Expand Up @@ -997,6 +1035,7 @@ def _grouped_mlp_slicing(self, module, prefix, parallel_config=None):

for expert_id in range(num_experts):
expert_prefix = prefix.format(expert_id) + "."
self._record_layer_quant_config(expert_prefix, qformat, block_size)
weight_key = f"weight{expert_id}"

if weight_key not in state_dict:
Expand Down Expand Up @@ -1041,6 +1080,16 @@ def _qkv_slicing(
q_proj_prefix = prefix + q_proj_name + "."
k_proj_prefix = prefix + k_proj_name + "."
v_proj_prefix = prefix + v_proj_name + "."
self._record_layer_quant_config(q_proj_prefix, qformat, block_size)
self._record_layer_quant_config(k_proj_prefix, qformat, block_size)
self._record_layer_quant_config(v_proj_prefix, qformat, block_size)
if qformat in (None, QUANTIZATION_NONE):
# Split fused linear_qkv exclude into per-HF-name q/k/v_proj entries.
fused_prefix = prefix.removesuffix(".")
self.exclude_modules = [m for m in self.exclude_modules if m != fused_prefix]
self._record_excluded_module(q_proj_prefix)
self._record_excluded_module(k_proj_prefix)
self._record_excluded_module(v_proj_prefix)

config = module.config
hidden_size = config.hidden_size
Expand Down Expand Up @@ -1190,6 +1239,7 @@ def _pack_name_remapping(self, module, prefix, layer_type=None):
weight_scale_list.append(weight_scale)
weight_scale_2_list.append(weight_scale_2)
input_scale_list.append(input_scale)
self._record_layer_quant_config(prefix, qformat, block_size)

merged_weight = torch.stack(weight_list, dim=0)

Expand Down Expand Up @@ -1258,6 +1308,7 @@ def _pack_name_remapping_gpt_oss(self, module, prefix, layer_type=None):
weight_scale_2_list.append(weight_scale_2)
input_scale_list.append(input_scale)
bias_list.append(bias)
self._record_layer_quant_config(prefix, qformat, block_size)

merged_weight = torch.stack(weight_list, dim=0)

Expand Down Expand Up @@ -1360,6 +1411,31 @@ def _gather_exclude_modules(self):
combined_exclude_modules.update(modules)
return sorted(combined_exclude_modules)

def _gather_layer_config_dict(self):
    """Collect per-layer quantization metadata from all ranks for hf_quant_config.

    Without an initialized ``torch.distributed`` process group, only the local
    ``self.layer_config_dict`` is used. Otherwise every rank's dict is gathered with
    ``all_gather_object`` and merged in rank order, so a later rank's value replaces an
    earlier one for the same key.

    Returns:
        A new dict holding the merged entries, ordered by key.
    """
    merged = {}
    if torch.distributed.is_initialized():
        per_rank = [None] * torch.distributed.get_world_size()
        torch.distributed.all_gather_object(per_rank, self.layer_config_dict)
        for rank_entries in per_rank:
            # Ranks that recorded nothing contribute an empty dict.
            if rank_entries:
                merged.update(rank_entries)
    else:
        merged.update(self.layer_config_dict)
    return dict(sorted(merged.items()))

def _gather_kv_cache_dtype(self):
    """Return the first non-None ``kv_cache_dtype`` found across ranks.

    Only attention-owning ranks set ``self.kv_cache_dtype``, so a rank without the
    attribute contributes ``None``. Without an initialized ``torch.distributed`` process
    group the local value is returned as is.

    Returns:
        The first non-None value in rank order, or ``None`` if no rank has one.
    """
    local_dtype = getattr(self, "kv_cache_dtype", None)
    if not torch.distributed.is_initialized():
        return local_dtype

    gathered = [None] * torch.distributed.get_world_size()
    torch.distributed.all_gather_object(gathered, local_dtype)
    return next((dtype for dtype in gathered if dtype is not None), None)


def export_mcore_gpt_to_hf(
model: torch.nn.Module,
Expand Down
Loading
Loading