Support export to hf format

ajrasane · ajrasane · commit aa775656ab4c · 2026-02-02T19:27:50.000Z
Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
@@ -394,28 +394,11 @@ def load_model(args: argparse.Namespace):
             attn_implementation=args.attn_implementation,
         )
 
+        # Uncomment this to load the model from a .pt file
+        # model = mto.restore(model, "./qwen3_omni_30b_nvfp4/model.pt")
+        # print("Qwen3Omni model restored from checkpoint")
+
         quant_cfg = QUANT_CFG_CHOICES[args.qformat]
-        # Qwen3 specific quantizer disabling patterns (thinker.model.layers only)
-        if "qkv_disabled" in args.qformat:
-            # Disable q_proj, k_proj, v_proj quantizers
-            for proj in ["q_proj", "k_proj", "v_proj"]:
-                quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = {
-                    "enable": False
-                }
-        if "qkvo_disabled" in args.qformat:
-            # Disable q_proj, k_proj, v_proj, o_proj quantizers
-            for proj in ["o_proj"]:
-                quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = {
-                    "enable": False
-                }
-        if "first_and_last_n_disabled" in args.qformat:
-            # Disable both first N and last N layers
-            total_layers = 48
-            n_layers_to_disable = 4
-            for i in range(n_layers_to_disable):
-                quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False}
-            for i in range(total_layers - n_layers_to_disable, total_layers):
-                quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False}
     else:
         assert args.qformat in QUANT_CFG_CHOICES, (
             f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}"
@@ -637,6 +620,37 @@ def mono_quantize(
             if language_model_lineage is not None:
                 print("Updating full_model with quantized language_model...")
                 language_model_lineage[-2].language_model = language_model
+
+        # Qwen3 specific quantizer disabling patterns (thinker.model.layers only)
+        if "qkv_disabled" in args.qformat:
+            # Disable q_proj, k_proj, v_proj quantizers
+            for proj in ["q_proj", "k_proj", "v_proj"]:
+                quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = {
+                    "enable": False
+                }
+        if "qkvo_disabled" in args.qformat:
+            # Only o_proj is disabled here: "qkvo_disabled" does not contain "qkv_disabled"
+            for proj in ["o_proj"]:
+                quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = {
+                    "enable": False
+                }
+        if "first_and_last_n_disabled" in args.qformat:
+            # Disable both first N and last N layers
+            total_layers = 48
+            n_layers_to_disable = 4
+            for i in range(n_layers_to_disable):
+                quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False}
+            for i in range(total_layers - n_layers_to_disable, total_layers):
+                quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False}
+
+        if not model_is_already_quantized or calibration_only:
+            # Only run single sample for preview
+            calib_batch = next(iter(calib_dataloader))
+            input_ids = calib_batch["input_features" if model_type == "whisper" else "input_ids"][
+                0:1
+            ]
+
+            # Generate preview before quantization
             if is_nemotron_vl_model and tokenizer is not None:
                 generated_ids_before_ptq = run_nemotron_vl_preview(
                     full_model,
@@ -771,11 +785,11 @@ def export_quantized(
     default_padding_side,
     default_pad_token,
 ):
-    if model_type == "qwen3omni":
-        print("Export of Qwen3Omni model is not supported yet. Saving .pt file instead.")
-        os.makedirs(os.path.dirname(args.export_path), exist_ok=True)
-        mto.save(model, args.export_path)
-        return
+    # Uncomment this to save the model as a .pt file
+    # if model_type == "qwen3omni":
+    #     print("Export of Qwen3Omni model is not supported yet. Saving .pt file instead.")
+    #     os.makedirs(os.path.dirname(args.export_path), exist_ok=True)
+    #     mto.save(full_model, f"{args.export_path}/model.pt")
 
     with torch.inference_mode():
         if model_type is None:
@@ -857,6 +871,7 @@ def export_quantized(
             export_hf_checkpoint(
                 full_model,
                 export_dir=export_path,
+                save_modelopt_state=model_type == "qwen3omni",
             )
 
         # Copy custom model files (Python files and JSON configs) if trust_remote_code is used
diff --git a/examples/llm_ptq/run_quantized_qwen3omni.py b/examples/llm_ptq/run_quantized_qwen3omni.py
@@ -16,7 +16,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Script to load and run a quantized Qwen3Omni model from mto checkpoint."""
+"""Script to load and run a quantized Qwen3Omni model exported with export_hf_checkpoint()."""
 
 import argparse
 import time
@@ -27,38 +27,41 @@
 
 import modelopt.torch.opt as mto
 
+# Enable HuggingFace checkpointing for modelopt quantized models
+mto.enable_huggingface_checkpointing()
+
 
 def main(args):
-    print(f"Loading base model from {args.model_path}...")
+    print(f"Loading quantized model from {args.checkpoint_path}...")
     model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
-        args.model_path,
+        args.checkpoint_path,
         torch_dtype="auto",
-        device_map="cuda",
+        device_map="auto",
         attn_implementation="flash_attention_2",
         trust_remote_code=True,
     )
 
-    print(f"Restoring quantized state from {args.checkpoint_path}...")
-    model = mto.restore(model, args.checkpoint_path)
-
     model.disable_talker()
 
     print("Loading processor...")
     processor = Qwen3OmniMoeProcessor.from_pretrained(
-        args.model_path,
+        "Qwen/Qwen3-Omni-30B-A3B-Thinking",
         trust_remote_code=True,
     )
 
     # Build conversation with user prompt
     prompt = args.prompt or "What is the capital of France?"
-    conversation = [{"role": "user", "content": [{"type": "text", "text": f"{prompt}"}]}]
+    conversation = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
     conversations = [conversation]
 
     # Set whether to use audio in video
     use_audio_in_video = True
 
     # Preparation for inference
-    texts = processor.apply_chat_template(conversations, add_generation_prompt=True, tokenize=False)
+    texts = processor.apply_chat_template(
+        conversations, add_generation_prompt=True, tokenize=False, enable_thinking=False
+    )
+    print(f"Texts: {texts}")
     audios, images, videos = process_mm_info(conversations, use_audio_in_video=use_audio_in_video)
 
     inputs = processor(
@@ -99,17 +102,11 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Run quantized Qwen3Omni model")
-    parser.add_argument(
-        "--model_path",
-        type=str,
-        default="Qwen/Qwen3-Omni-30B-A3B-Instruct",
-        help="Path to the base Qwen3Omni model (HF format)",
-    )
     parser.add_argument(
         "--checkpoint_path",
         type=str,
-        default="/home/scratch.arasane_hw/models/qwen3omni_nvfp4_qkv_disabled_text_bs512_calib512.pt",
-        help="Path to the mto.save() quantized checkpoint",
+        required=True,
+        help="Path to the export_hf_checkpoint() quantized checkpoint directory",
     )
     parser.add_argument(
         "--prompt",
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
@@ -300,29 +300,43 @@ def llm_dummy_forward():
                 [1, model.config.num_mel_bins, feature_extractor.nb_max_frames], dtype=model.dtype
             ).to(model.device)
 
-        if getattr(model.config, "is_encoder_decoder", False):
-            # For encoder-decoder models, we need to pass both the encoder and decoder input ids
-            model(fake_input, decoder_input_ids=decoder_fake_input)
-        elif is_vl_model and "nemotron" in model_type:
-            # For Nemotron VL models, try to run optimization on just the language model part
-            language_model_lineage = get_language_model_from_vl(model)
-
-            if language_model_lineage is not None:
-                # Run optimization on just the language model with the same input format as regular LLMs
-                # Use the same fake_input tensor that regular LLMs use
-                language_model = language_model_lineage[-1]
-                print(
-                    f"Running optimization on language model with fake_input shape: {fake_input.shape}"
-                )
-                language_model(fake_input)
+        with set_quantizer_by_cfg_context(model, {"*": {"enable": False}}):
+            if getattr(model.config, "is_encoder_decoder", False):
+                # For encoder-decoder models, we need to pass both the encoder and decoder input ids
+                model(fake_input, decoder_input_ids=decoder_fake_input)
+            elif is_vl_model and "nemotron" in model_type:
+                # For Nemotron VL models, try to run optimization on just the language model part
+                language_model_lineage = get_language_model_from_vl(model)
+
+                if language_model_lineage is not None:
+                    # Run optimization on just the language model with the same input format as regular LLMs
+                    # Use the same fake_input tensor that regular LLMs use
+                    language_model = language_model_lineage[-1]
+                    print(
+                        f"Running optimization on language model with fake_input shape: {fake_input.shape}"
+                    )
+                    language_model(fake_input)
+                else:
+                    raise ValueError(
+                        f"Cannot extract language_model from Nemotron VL model (type: {model_type}). "
+                        "This is required for requantization/resmoothing optimization. "
+                        "Please ensure the model architecture is supported or file an issue."
+                    )
+            elif "qwen3omni" in model_type:
+                # For Qwen3Omni, run on the thinker (language model) component
+                # The model has structure: model.thinker.model.layers.*
+                if hasattr(model, "thinker"):
+                    print(
+                        f"Running optimization on Qwen3Omni thinker with fake_input shape: {fake_input.shape}"
+                    )
+                    model.thinker(fake_input)
+                else:
+                    raise ValueError(
+                        f"Cannot extract thinker from Qwen3Omni model (type: {model_type}). "
+                        "This is required for requantization/resmoothing optimization."
+                    )
             else:
-                raise ValueError(
-                    f"Cannot extract language_model from Nemotron VL model (type: {model_type}). "
-                    "This is required for requantization/resmoothing optimization. "
-                    "Please ensure the model architecture is supported or file an issue."
-                )
-        else:
-            model(fake_input)
+                model(fake_input)
 
     input_to_linear, output_to_layernorm = _collect_shared_input_modules(
         model, llm_dummy_forward, collect_layernorms=True
@@ -380,6 +394,19 @@ def _export_quantized_weight(
     weight_quantizer: TensorQuantizer | SequentialQuantizer = getattr(
         sub_module, quantizer_attrs.weight_quantizer
     )
+
+    # Skip export if weight quantizer is disabled or has no amax (not calibrated)
+    if not _is_enabled_quantizer(weight_quantizer):
+        return
+
+    # Check if weight quantizer has calibrated amax
+    def _has_amax(quantizer):
+        if isinstance(quantizer, SequentialQuantizer):
+            return any(hasattr(q, "_amax") and q._amax is not None for q in quantizer)
+        return hasattr(quantizer, "_amax") and quantizer._amax is not None
+
+    if not _has_amax(weight_quantizer):
+        return
     input_quantizer: TensorQuantizer | SequentialQuantizer | None = getattr(
         sub_module, quantizer_attrs.input_quantizer, None
     )
@@ -543,6 +570,7 @@ def _process_quantized_modules(
     model: nn.Module,
     dtype: torch.dtype,
     is_modelopt_qlora: bool = False,
+    pack_weights: bool = True,
 ) -> None:
     """Process all quantized modules in model, export weights in-place.
 
@@ -555,6 +583,7 @@ def _process_quantized_modules(
         dtype: The data type for weight conversion.
         is_modelopt_qlora: Whether the model is a modelopt-trained QLoRA model.
             If True, modules with base_layer attribute are skipped.
+        pack_weights: Whether to pack quantized weights.
     """
     fsdp_module_to_reshard = None
 
@@ -577,8 +606,9 @@ def _process_quantized_modules(
             sub_module.unpack_weight()
         if get_quantization_format(sub_module) != QUANTIZATION_NONE:
             if is_quantlinear(sub_module):
-                with fsdp2_aware_weight_update(model, sub_module, reshard=False):
-                    _export_quantized_weight(sub_module, dtype)
+                if pack_weights:
+                    with fsdp2_aware_weight_update(model, sub_module, reshard=False):
+                        _export_quantized_weight(sub_module, dtype)
             elif (
                 "Llama4TextExperts" in type(sub_module).__name__
                 or "GptOssExperts" in type(sub_module).__name__
@@ -595,13 +625,18 @@ def _process_quantized_modules(
                     quantizer_attrs=["gate_up_proj_input_quantizer", "down_proj_input_quantizer"],
                 )
                 # Export the quantized weights
-                with fsdp2_aware_weight_update(model, sub_module, reshard=False):
-                    for weight_name in ["gate_up_proj", "down_proj"]:
-                        _export_quantized_weight(sub_module, dtype, weight_name)
+                if pack_weights:
+                    with fsdp2_aware_weight_update(model, sub_module, reshard=False):
+                        for weight_name in ["gate_up_proj", "down_proj"]:
+                            _export_quantized_weight(sub_module, dtype, weight_name)
 
 
-def _export_transformers_checkpoint(
-    model: nn.Module, dtype: torch.dtype | None = None, is_modelopt_qlora: bool = False, **kwargs
+def _export_hf_checkpoint(
+    model: nn.Module,
+    dtype: torch.dtype | None = None,
+    is_modelopt_qlora: bool = False,
+    pack_weights: bool = True,
+    **kwargs,
 ) -> tuple[dict[str, Any], dict[str, Any]]:
     """Exports the torch model to the packed checkpoint with original HF naming.
 
@@ -611,6 +646,7 @@ def _export_transformers_checkpoint(
         model: the full torch model to export. The actual quantized model may be a submodule.
         dtype: the weights data type to export the unquantized layers or the default model data type if None.
         accelerator: the accelerator instance in case of distributed export setup.
+        pack_weights: whether to pack quantized weights (False keeps original shapes for HF reload).
 
     Returns:
         post_state_dict: Dict containing quantized weights
@@ -695,7 +731,7 @@ def _export_transformers_checkpoint(
     quant_config = get_quant_config(model, is_modelopt_qlora=is_modelopt_qlora)
 
     # Process all quantized modules and export weights
-    _process_quantized_modules(model, dtype, is_modelopt_qlora)
+    _process_quantized_modules(model, dtype, is_modelopt_qlora, pack_weights)
 
     if accelerator is not None:
         # Gather state_dict from all ranks
@@ -964,7 +1000,12 @@ def export_hf_checkpoint(
         return
 
     try:
-        post_state_dict, hf_quant_config = _export_transformers_checkpoint(model, dtype)
+        # Packed weights are only for TRT-LLM consumption
+        # Set this to False if you want to save the weights in the original precision
+        pack_weights = True
+        post_state_dict, hf_quant_config = _export_hf_checkpoint(
+            model, dtype, pack_weights=pack_weights
+        )
 
         if hf_quant_config is not None:
             # Save hf_quant_config.json for backward compatibility
@@ -977,6 +1018,16 @@ def export_hf_checkpoint(
         if getattr(model, "hf_quantizer", None) is not None:
             model.hf_quantizer = None
 
+        # Fix generation_config conflicts before saving
+        # Some models have temperature/top_p/top_k set but do_sample=False which causes validation errors
+        if hasattr(model, "generation_config") and model.generation_config is not None:
+            gen_config = model.generation_config
+            if not getattr(gen_config, "do_sample", True):
+                # Remove sampling-related params when do_sample is False
+                for attr in ["temperature", "top_p", "top_k"]:
+                    if hasattr(gen_config, attr):
+                        setattr(gen_config, attr, None)
+
         # Save model
         model.save_pretrained(
             export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state
diff --git a/modelopt/torch/utils/image_processor.py b/modelopt/torch/utils/image_processor.py
@@ -145,13 +145,9 @@ def preprocess_function(self, text: str) -> dict:
             Dictionary with tokenized inputs.
         """
         # Build conversation in Qwen format (text-only)
-        conversation = [
-            {"role": "user", "content": [{"type": "text", "text": "/no_think " + text}]}
-        ]
-
-        # Apply chat template (tokenize=False to get formatted string)
+        conversation = [{"role": "user", "content": [{"type": "text", "text": text}]}]
         formatted_text = self.tokenizer.apply_chat_template(
-            conversation, add_generation_prompt=True, tokenize=False
+            conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False
         )
 
         # Tokenize with the processor (no multimodal inputs)
@@ -212,10 +208,8 @@ def preprocess_function(self, examples):
         content.append({"type": "text", "text": question})
 
         conversation = [{"role": "user", "content": content}]
-
-        # Apply chat template (tokenize=False to get string)
         text = self.tokenizer.apply_chat_template(
-            conversation, add_generation_prompt=True, tokenize=False
+            conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False
         )
 
         # Extract multimodal info using qwen_omni_utils
diff --git a/modelopt/torch/utils/video_dataset_utils.py b/modelopt/torch/utils/video_dataset_utils.py