
Commit 2725797

Support export to hf format
Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
1 parent 06a90cd commit 2725797

5 files changed

Lines changed: 131 additions & 77 deletions

File tree

examples/llm_ptq/hf_ptq.py
examples/llm_ptq/run_quantized_qwen3omni.py
modelopt/torch/export/unified_export_hf.py
modelopt/torch/utils/image_processor.py
modelopt/torch/utils/video_dataset_utils.py

examples/llm_ptq/hf_ptq.py

Lines changed: 32 additions & 26 deletions
@@ -300,28 +300,11 @@ def main(args):
             attn_implementation=args.attn_implementation,
         )
 
+        # Uncomment this to load the model from a .pt file
+        # model = mto.restore(model, "./qwen3_omni_30b_nvfp4/model.pt")
+        # print("Qwen3Omni model restored from checkpoint")
+
         quant_cfg = QUANT_CFG_CHOICES[args.qformat]
-        # Qwen3 specific quantizer disabling patterns (thinker.model.layers only)
-        if "qkv_disabled" in args.qformat:
-            # Disable q_proj, k_proj, v_proj quantizers
-            for proj in ["q_proj", "k_proj", "v_proj"]:
-                quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = {
-                    "enable": False
-                }
-        if "qkvo_disabled" in args.qformat:
-            # Disable q_proj, k_proj, v_proj, o_proj quantizers
-            for proj in ["o_proj"]:
-                quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = {
-                    "enable": False
-                }
-        if "first_and_last_n_disabled" in args.qformat:
-            # Disable both first N and last N layers
-            total_layers = 48
-            n_layers_to_disable = 4
-            for i in range(n_layers_to_disable):
-                quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False}
-            for i in range(total_layers - n_layers_to_disable, total_layers):
-                quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False}
     else:
         assert args.qformat in QUANT_CFG_CHOICES, (
             f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}"

@@ -606,6 +589,28 @@ def main(args):
         quant_cfg["quant_cfg"]["*radio*"] = {"enable": False}
         quant_cfg["quant_cfg"]["*visual*"] = {"enable": False}
 
+    # Qwen3 specific quantizer disabling patterns (thinker.model.layers only)
+    if "qkv_disabled" in args.qformat:
+        # Disable q_proj, k_proj, v_proj quantizers
+        for proj in ["q_proj", "k_proj", "v_proj"]:
+            quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = {
+                "enable": False
+            }
+    if "qkvo_disabled" in args.qformat:
+        # Disable q_proj, k_proj, v_proj, o_proj quantizers
+        for proj in ["o_proj"]:
+            quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = {
+                "enable": False
+            }
+    if "first_and_last_n_disabled" in args.qformat:
+        # Disable both first N and last N layers
+        total_layers = 48
+        n_layers_to_disable = 4
+        for i in range(n_layers_to_disable):
+            quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False}
+        for i in range(total_layers - n_layers_to_disable, total_layers):
+            quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False}
+
     if not model_is_already_quantized or calibration_only:
         # Only run single sample for preview
         calib_batch = next(iter(calib_dataloader))

@@ -745,11 +750,11 @@ def output_decode(generated_ids, input_shape):
         assert model_type != "dbrx", f"Does not support export {model_type} without quantizaton"
         print(f"qformat: {args.qformat}. No quantization applied, export {device} model")
 
-        if model_type == "qwen3omni":
-            print("Export of Qwen3Omni model is not supported yet. Saving .pt file instead.")
-            os.makedirs(os.path.dirname(args.export_path), exist_ok=True)
-            mto.save(model, args.export_path)
-            return
+        # Uncomment this to save the model as a .pt file
+        # if model_type == "qwen3omni":
+        #     print("Export of Qwen3Omni model is not supported yet. Saving .pt file instead.")
+        #     os.makedirs(os.path.dirname(args.export_path), exist_ok=True)
+        #     mto.save(model, f"{args.export_path}/model.pt")
 
     with torch.inference_mode():
         if model_type is None:

@@ -828,6 +833,7 @@ def output_decode(generated_ids, input_shape):
             export_hf_checkpoint(
                 full_model,
                 export_dir=export_path,
+                save_modelopt_state=model_type == "qwen3omni",
             )
 
             # Copy custom model files (Python files and JSON configs) if trust_remote_code is used
examples/llm_ptq/run_quantized_qwen3omni.py

Lines changed: 15 additions & 18 deletions
@@ -16,7 +16,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Script to load and run a quantized Qwen3Omni model from mto checkpoint."""
+"""Script to load and run a quantized Qwen3Omni model from export_hf_checkpoint."""
 
 import argparse
 import time

@@ -27,38 +27,41 @@
 
 import modelopt.torch.opt as mto
 
+# Enable HuggingFace checkpointing for modelopt quantized models
+mto.enable_huggingface_checkpointing()
+
 
 def main(args):
-    print(f"Loading base model from {args.model_path}...")
+    print(f"Loading quantized model from {args.checkpoint_path}...")
     model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
-        args.model_path,
+        args.checkpoint_path,
         torch_dtype="auto",
-        device_map="cuda",
+        device_map="auto",
         attn_implementation="flash_attention_2",
         trust_remote_code=True,
     )
 
-    print(f"Restoring quantized state from {args.checkpoint_path}...")
-    model = mto.restore(model, args.checkpoint_path)
-
     model.disable_talker()
 
     print("Loading processor...")
     processor = Qwen3OmniMoeProcessor.from_pretrained(
-        args.model_path,
+        "Qwen/Qwen3-Omni-30B-A3B-Thinking",
         trust_remote_code=True,
     )
 
     # Build conversation with user prompt
     prompt = args.prompt or "What is the capital of France?"
-    conversation = [{"role": "user", "content": [{"type": "text", "text": f"{prompt}"}]}]
+    conversation = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
     conversations = [conversation]
 
     # Set whether to use audio in video
     use_audio_in_video = True
 
     # Preparation for inference
-    texts = processor.apply_chat_template(conversations, add_generation_prompt=True, tokenize=False)
+    texts = processor.apply_chat_template(
+        conversations, add_generation_prompt=True, tokenize=False, enable_thinking=False
+    )
+    print(f"Texts: {texts}")
     audios, images, videos = process_mm_info(conversations, use_audio_in_video=use_audio_in_video)
 
     inputs = processor(

@@ -99,17 +102,11 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Run quantized Qwen3Omni model")
-    parser.add_argument(
-        "--model_path",
-        type=str,
-        default="Qwen/Qwen3-Omni-30B-A3B-Instruct",
-        help="Path to the base Qwen3Omni model (HF format)",
-    )
     parser.add_argument(
         "--checkpoint_path",
         type=str,
-        default="/home/scratch.arasane_hw/models/qwen3omni_nvfp4_qkv_disabled_text_bs512_calib512.pt",
-        help="Path to the mto.save() quantized checkpoint",
+        required=True,
+        help="Path to the export_hf_checkpoint() quantized checkpoint directory",
    )
     parser.add_argument(
         "--prompt",
modelopt/torch/export/unified_export_hf.py

Lines changed: 55 additions & 7 deletions
@@ -177,6 +177,19 @@ def _output_hook(module, input, output):
                 "This is required for requantization/resmoothing optimization. "
                 "Please ensure the model architecture is supported or file an issue."
             )
+        elif "qwen3omni" in model_type:
+            # For Qwen3Omni, run on the thinker (language model) component
+            # The model has structure: model.thinker.model.layers.*
+            if hasattr(model, "thinker"):
+                print(
+                    f"Running optimization on Qwen3Omni thinker with fake_input shape: {fake_input.shape}"
+                )
+                model.thinker(fake_input)
+            else:
+                raise ValueError(
+                    f"Cannot extract thinker from Qwen3Omni model (type: {model_type}). "
+                    "This is required for requantization/resmoothing optimization."
+                )
         else:
             model(fake_input)
 

@@ -248,6 +261,19 @@ def _export_quantized_weight(
     weight_quantizer: TensorQuantizer | SequentialQuantizer = getattr(
         sub_module, quantizer_attrs.weight_quantizer
     )
+
+    # Skip export if weight quantizer is disabled or has no amax (not calibrated)
+    if not _is_enabled_quantizer(weight_quantizer):
+        return
+
+    # Check if weight quantizer has calibrated amax
+    def _has_amax(quantizer):
+        if isinstance(quantizer, SequentialQuantizer):
+            return any(hasattr(q, "_amax") and q._amax is not None for q in quantizer)
+        return hasattr(quantizer, "_amax") and quantizer._amax is not None
+
+    if not _has_amax(weight_quantizer):
+        return
     input_quantizer: TensorQuantizer | SequentialQuantizer | None = getattr(
         sub_module, quantizer_attrs.input_quantizer, None
     )

@@ -392,7 +418,11 @@ def _export_quantized_weight(
 
 
 def _export_hf_checkpoint(
-    model: nn.Module, dtype: torch.dtype | None = None, is_modelopt_qlora: bool = False, **kwargs
+    model: nn.Module,
+    dtype: torch.dtype | None = None,
+    is_modelopt_qlora: bool = False,
+    pack_weights: bool = True,
+    **kwargs,
 ) -> tuple[dict[str, Any], dict[str, Any]]:
     """Exports the torch model to the packed checkpoint with original HF naming.
 

@@ -402,6 +432,7 @@ def _export_hf_checkpoint(
         model: the full torch model to export. The actual quantized model may be a submodule.
         dtype: the weights data type to export the unquantized layers or the default model data type if None.
         accelerator: the accelerator instance in case of distributed export setup.
+        pack_weights: whether to pack quantized weights (False keeps original shapes for HF reload).
 
     Returns:
         post_state_dict: Dict containing quantized weights

@@ -518,8 +549,9 @@ def _export_hf_checkpoint(
 
             if get_quantization_format(sub_module) != QUANTIZATION_NONE:
                 if is_quantlinear(sub_module):
-                    with fsdp2_aware_weight_update(model, sub_module, reshard=False):
-                        _export_quantized_weight(sub_module, dtype)
+                    if pack_weights:
+                        with fsdp2_aware_weight_update(model, sub_module, reshard=False):
+                            _export_quantized_weight(sub_module, dtype)
                 elif (
                     "Llama4TextExperts" in type(sub_module).__name__
                     or "GptOssExperts" in type(sub_module).__name__

@@ -536,9 +568,10 @@ def _export_hf_checkpoint(
                         quantizer_attrs=["gate_up_proj_input_quantizer", "down_proj_input_quantizer"],
                     )
                     # Export the quantized weights
-                    with fsdp2_aware_weight_update(model, sub_module, reshard=False):
-                        for weight_name in ["gate_up_proj", "down_proj"]:
-                            _export_quantized_weight(sub_module, dtype, weight_name)
+                    if pack_weights:
+                        with fsdp2_aware_weight_update(model, sub_module, reshard=False):
+                            for weight_name in ["gate_up_proj", "down_proj"]:
+                                _export_quantized_weight(sub_module, dtype, weight_name)
 
     if accelerator is not None:
         # Gather state_dict from all ranks

@@ -579,7 +612,12 @@ def export_hf_checkpoint(
         return
 
     try:
-        post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype)
+        # Packed weights are only for TRT-LLM consumption
+        # Set this to true if you want to save the weights in the original precision
+        pack_weights = True
+        post_state_dict, hf_quant_config = _export_hf_checkpoint(
+            model, dtype, pack_weights=pack_weights
+        )
 
         if hf_quant_config is not None:
             # Save hf_quant_config.json for backward compatibility

@@ -588,6 +626,16 @@ def export_hf_checkpoint(
 
             hf_quant_config = convert_hf_quant_config_format(hf_quant_config)
 
+        # Fix generation_config conflicts before saving
+        # Some models have temperature/top_p/top_k set but do_sample=False which causes validation errors
+        if hasattr(model, "generation_config") and model.generation_config is not None:
+            gen_config = model.generation_config
+            if not getattr(gen_config, "do_sample", True):
+                # Remove sampling-related params when do_sample is False
+                for attr in ["temperature", "top_p", "top_k"]:
+                    if hasattr(gen_config, attr):
+                        setattr(gen_config, attr, None)
+
         # Save model
         model.save_pretrained(
             export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state
modelopt/torch/utils/image_processor.py

Lines changed: 3 additions & 9 deletions
@@ -145,13 +145,9 @@ def preprocess_function(self, text: str) -> dict:
             Dictionary with tokenized inputs.
         """
         # Build conversation in Qwen format (text-only)
-        conversation = [
-            {"role": "user", "content": [{"type": "text", "text": "/no_think " + text}]}
-        ]
-
-        # Apply chat template (tokenize=False to get formatted string)
+        conversation = [{"role": "user", "content": [{"type": "text", "text": text}]}]
         formatted_text = self.tokenizer.apply_chat_template(
-            conversation, add_generation_prompt=True, tokenize=False
+            conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False
         )
 
         # Tokenize with the processor (no multimodal inputs)

@@ -212,10 +208,8 @@ def preprocess_function(self, examples):
             content.append({"type": "text", "text": question})
 
         conversation = [{"role": "user", "content": content}]
-
-        # Apply chat template (tokenize=False to get string)
         text = self.tokenizer.apply_chat_template(
-            conversation, add_generation_prompt=True, tokenize=False
+            conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False
         )
 
         # Extract multimodal info using qwen_omni_utils
modelopt/torch/utils/video_dataset_utils.py

Lines changed: 26 additions & 17 deletions
@@ -114,32 +114,41 @@ def get_video_dataset_dataloader(
 
     processed_dataset = None
 
-    # Try to load from cache
+    # Try to load from cache (use torch.save/load to avoid Arrow 32-bit offset overflow)
     if cache_dir is not None:
-        from datasets import load_from_disk
-
-        cache_path = os.path.join(cache_dir, f"{dataset_name}_n{num_samples}_processed")
+        cache_path = os.path.join(cache_dir, f"{dataset_name}_n{num_samples}_processed.pt")
         if os.path.exists(cache_path):
             try:
-                processed_dataset = load_from_disk(cache_path)
+                from datasets import Dataset
+
+                processed_samples = torch.load(cache_path, weights_only=False)
+                processed_dataset = Dataset.from_list(processed_samples)
                 print(f"Loaded processed dataset from cache: {cache_path}")
             except Exception as e:
                 print(f"Failed to load cache from {cache_path}: {e}. Reprocessing...")
                 processed_dataset = None
 
     # Process dataset if not loaded from cache
     if processed_dataset is None:
+        from datasets import Dataset
+
         dataset = _get_video_dataset(dataset_name, num_samples=num_samples)
-        # Apply the preprocessing function to the dataset
-        processed_dataset = dataset.map(
-            processor.preprocess_function, batched=False, remove_columns=dataset.column_names
-        )
 
-    # Save to cache if cache_dir is provided
+        # Process samples manually to avoid Arrow 32-bit offset overflow
+        # (dataset.map() uses Arrow internally which can't handle large nested lists)
+        processed_samples = []
+        for i, sample in enumerate(dataset):
+            processed = processor.preprocess_function(sample)
+            processed_samples.append(processed)
+            if (i + 1) % 10 == 0:
+                print(f"Processed {i + 1}/{len(dataset)} samples...")
+
+        processed_dataset = Dataset.from_list(processed_samples)
+
+    # Save to cache using torch.save to avoid Arrow 32-bit offset overflow
     if cache_dir is not None:
         os.makedirs(cache_dir, exist_ok=True)
-        # Use num_shards=1 to avoid off-by-one sharding bug with complex nested structures
-        processed_dataset.save_to_disk(cache_path, num_shards=1)
+        torch.save(processed_samples, cache_path)
         print(f"Saved processed dataset to cache: {cache_path}")
 
     # Create DataLoader with the custom collate function

@@ -204,9 +213,11 @@ def preprocess_function(self, examples):
             metadata = examples["json"]
             # Try to get a meaningful question from metadata
             category = metadata.get("content_fine_category", "")
-            question = f"/no_think Describe what is happening in this video in detail. Category hint: {category}"
+            question = (
+                f"Describe what is happening in this video in detail. Category hint: {category}"
+            )
         else:
-            question = examples.get("question", "/no_think Describe this video in detail.")
+            question = examples.get("question", "Describe this video in detail.")
 
         # Build conversation in Qwen format
         content = []

@@ -226,10 +237,8 @@ def preprocess_function(self, examples):
             content.append({"type": "text", "text": question})
 
         conversation = [{"role": "user", "content": content}]
-
-        # Apply chat template (tokenize=False to get string)
         text = self.tokenizer.apply_chat_template(
-            conversation, add_generation_prompt=True, tokenize=False
+            conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False
        )
 
         # Extract multimodal info using qwen_omni_utils
0 commit comments
