1818import random
1919import time
2020import warnings
21+ from collections import namedtuple
2122from typing import Any
2223
2324import numpy as np
3536 is_enc_dec ,
3637 is_nemotron_vl ,
3738 load_mtp_weights ,
38- patch_config_for_unified_export ,
3939 run_nemotron_vl_preview ,
4040)
4141from torch .utils .data import DataLoader
@@ -735,9 +735,6 @@ def export_quantized(
735735 extra_state_dict = mtp_state_dict ,
736736 )
737737
738- # Exclude non-quantized modules in config.json and hf_quant_config.json
739- patch_config_for_unified_export (model_type , export_path )
740-
741738 # Restore default padding and export the tokenizer as well.
742739 if tokenizer is not None :
743740 tokenizer .padding_side = default_padding_side
@@ -757,6 +754,23 @@ def export_quantized(
757754 )
758755
759756
# Lightweight result record returned by pre_quantize(); keeps the historical
# 3-tuple unpacking (preview_input_ids, generated_ids_before_ptq, calib_batch)
# working while letting new callers access fields by name.
PreQuantizeResult = namedtuple(
    "PreQuantizeResult",
    [
        "preview_input_ids",
        "generated_ids_before_ptq",
        "calib_batch",
    ],
)
760+
761+
762+ def _qwen3omni_generate (model , calib_batch ):
763+ """Run Qwen3Omni generate and unpack the result.
764+
765+ Qwen3Omni returns a (text_ids, audio) tuple; text_ids may have a .sequences attribute.
766+ """
767+ result = model .generate (** calib_batch , return_audio = False , thinker_max_new_tokens = 100 )
768+ if isinstance (result , tuple ):
769+ text_ids , _ = result
770+ return text_ids .sequences if hasattr (text_ids , "sequences" ) else text_ids
771+ return result
772+
773+
760774def pre_quantize (
761775 args : argparse .Namespace ,
762776 full_model : torch .nn .Module ,
@@ -799,20 +813,15 @@ def pre_quantize(
799813 allow_fallback = False ,
800814 )
801815 elif model_type == "qwen3omni" :
802- # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences
803- # Pass full batch with all multimodal inputs
804- result = full_model .generate (** calib_batch , return_audio = False , thinker_max_new_tokens = 100 )
805- if isinstance (result , tuple ):
806- text_ids , _ = result
807- generated_ids_before_ptq = (
808- text_ids .sequences if hasattr (text_ids , "sequences" ) else text_ids
809- )
810- else :
811- generated_ids_before_ptq = result
816+ # Use only a single sample for preview generation to avoid OOM
817+ single_sample = {
818+ k : v [0 :1 ] if isinstance (v , torch .Tensor ) else v for k , v in calib_batch .items ()
819+ }
820+ generated_ids_before_ptq = _qwen3omni_generate (full_model , single_sample )
812821 else :
813822 generated_ids_before_ptq = full_model .generate (preview_input_ids , max_new_tokens = 100 )
814823
815- return preview_input_ids , generated_ids_before_ptq , calib_batch
824+ return PreQuantizeResult ( preview_input_ids , generated_ids_before_ptq , calib_batch )
816825
817826
818827def post_quantize (
@@ -861,25 +870,23 @@ def post_quantize(
861870 """
862871
863872 if args .verbose :
864- mtq .print_quant_summary (full_model , save_path = args .quant_summary_path )
865- save_expert_token_count_table (full_model , args .export_path )
873+ try :
874+ mtq .print_quant_summary (full_model , save_path = args .quant_summary_path )
875+ save_expert_token_count_table (full_model , args .export_path )
876+ except Exception as e :
877+ print (f"Warning: Failed to print quant summary: { e } " )
866878
867879 # Run some samples
868880 torch .cuda .empty_cache ()
869881 generated_ids_after_ptq = None
870882 if generated_ids_before_ptq is None :
871883 pass
872- elif model_type == "qwen3omni" :
873- # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences
874- # Pass full batch with all multimodal inputs
875- result = full_model .generate (** calib_batch , return_audio = False , thinker_max_new_tokens = 100 )
876- if isinstance (result , tuple ):
877- text_ids , _ = result
878- generated_ids_after_ptq = (
879- text_ids .sequences if hasattr (text_ids , "sequences" ) else text_ids
880- )
881- else :
882- generated_ids_after_ptq = result
884+ elif model_type == "qwen3omni" and calib_batch is not None :
885+ # Use only a single sample for preview generation to avoid OOM
886+ single_sample = {
887+ k : v [0 :1 ] if isinstance (v , torch .Tensor ) else v for k , v in calib_batch .items ()
888+ }
889+ generated_ids_after_ptq = _qwen3omni_generate (full_model , single_sample )
883890 elif model_type != "llama4" and not is_nemotron_vl_model :
884891 # Our fake quantizer may not be fully compatible with torch.compile.
885892 generated_ids_after_ptq = full_model .generate (preview_input_ids , max_new_tokens = 100 )
0 commit comments