Fix indentation issues caused by rebase and simplify Nemotron-Parse handling in hf_ptq.py

Edwardf0t1 · Edwardf0t1 · commit 10f1140f585e · 2026-02-09T22:48:19.000-08:00
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
@@ -66,7 +66,6 @@
 )
 from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor
 from modelopt.torch.utils.memory_monitor import launch_memory_monitor
-from modelopt.torch.utils.nemotron_vlm_dataset_utils import get_nemotron_vlm_dataset_dataloader
 from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader
 from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader
 
@@ -142,7 +141,6 @@ def make_calib_dataloader(
     tokenizer: PreTrainedTokenizerBase | None,
     device: torch.device,
     model_type: str | None,
-    full_model: torch.nn.Module | None = None,
 ) -> tuple[DataLoader, str | None]:
     calib_dataloader = None
     first_text_speech_dataset = None
@@ -525,12 +523,6 @@ def mono_quantize(
             "Consider reducing calib_size to reduce calibration time.\n####\n"
         )
 
-    # Check if this is Nemotron-Parse
-    config = full_model.config
-    architectures = getattr(config, "architectures", [])
-    is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
-    original_forward = None  # Track original forward method if we wrap it
-
     # For Nemotron VL models, disable quantization of vision components
     if is_nemotron_vl_model:
         print("Disabling quantization for vision components in Nemotron VL model")
@@ -569,15 +561,8 @@ def mono_quantize(
         else:
             language_model = mtq.quantize(language_model, quant_cfg, forward_loop=calibrate_loop)
 
-        # Restore original forward method if we wrapped it for Nemotron-Parse
-        if is_nemotron_parse and original_forward is not None:
-            print("Restoring original forward method after calibration")
-            language_model.forward = original_forward
-            original_forward = None
-
-        # For VL models (except Nemotron-Parse), update full_model to use the quantized language model
-        # For Nemotron-Parse, language_model IS full_model, so no update needed
-        if is_nemotron_vl_model and language_model is not full_model:
+        # For VL models, update full_model to use the quantized language model
+        if is_nemotron_vl_model:
             language_model_lineage = get_language_model_from_vl(full_model)
             if language_model_lineage is not None:
                 print("Updating full_model with quantized language_model...")
@@ -717,20 +702,10 @@ def pre_quantize(
     post-quantize generation.
 
     """
-    # Check if this is Nemotron-Parse (encoder-decoder model)
-    config = full_model.config
-    architectures = getattr(config, "architectures", [])
-    is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
-
     # Only run single sample for preview
-    # For Nemotron-Parse, use decoder_input_ids instead of input_ids
-    sample_batch = next(iter(calib_dataloader))
-    if is_nemotron_parse and "decoder_input_ids" in sample_batch:
-        preview_input_ids = sample_batch["decoder_input_ids"][0:1]
-    elif model_type == "whisper":
-        preview_input_ids = sample_batch["input_features"][0:1]
-    else:
-        preview_input_ids = sample_batch["input_ids"][0:1]
+    preview_input_ids = next(iter(calib_dataloader))[
+        "input_features" if model_type == "whisper" else "input_ids"
+    ][0:1]
 
     # Generate preview before quantization
     if model_type == "deepseek":
@@ -901,7 +876,7 @@ def quantize_main(
     print(f"Use calib batch_size {args.batch_size}")
 
     calib_dataloader, first_text_speech_dataset = make_calib_dataloader(
-        args, language_model, processor, tokenizer, device, model_type, full_model
+        args, language_model, processor, tokenizer, device, model_type
     )
 
     # Detect if this is a Nemotron VL model using architecture-based detection
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
@@ -148,13 +148,13 @@ def _collect_shared_input_modules(
     def _input_hook(module, input, output):
         """Update dictionary with list of all modules that share the same input."""
         if len(input) > 0 and isinstance(input[0], torch.Tensor):
-        # TODO: Handle DBRX MoE case
-        input_to_linear[input[0]].append(module)
+            # TODO: Handle DBRX MoE case
+            input_to_linear[input[0]].append(module)
 
     def _output_hook(module, input, output):
         """Update dictionary with mapping of layernorms and their outputs."""
         if output_to_layernorm is not None and isinstance(output, torch.Tensor):
-        output_to_layernorm[output] = module
+            output_to_layernorm[output] = module
 
     handles = []
 
@@ -323,29 +323,29 @@ def llm_dummy_forward():
         if is_vl_model and ("nemotron" in model_type or is_nemotron_parse):
             # For Nemotron VL models (including Nemotron-Parse), run optimization on just the
             # language model/decoder. This avoids needing pixel_values for the vision encoder.
-                language_model_lineage = get_language_model_from_vl(model)
+            language_model_lineage = get_language_model_from_vl(model)
 
-                if language_model_lineage is not None:
-                    language_model = language_model_lineage[-1]
-                    print(
-                        f"Running optimization on language model with fake_input shape: {fake_input.shape}"
-                    )
-                    # For Nemotron-Parse decoder, force use_cache=False to avoid tuple index errors
-                    if is_nemotron_parse:
-                        language_model(fake_input, use_cache=False)
-                    else:
-                        language_model(fake_input)
+            if language_model_lineage is not None:
+                language_model = language_model_lineage[-1]
+                print(
+                    f"Running optimization on language model with fake_input shape: {fake_input.shape}"
+                )
+                # For Nemotron-Parse decoder, force use_cache=False to avoid tuple index errors
+                if is_nemotron_parse:
+                    language_model(fake_input, use_cache=False)
                 else:
-                    raise ValueError(
-                        f"Cannot extract language_model from Nemotron VL model (type: {model_type}). "
-                        "This is required for requantization/resmoothing optimization. "
-                        "Please ensure the model architecture is supported or file an issue."
-                    )
+                    language_model(fake_input)
+            else:
+                raise ValueError(
+                    f"Cannot extract language_model from Nemotron VL model (type: {model_type}). "
+                    "This is required for requantization/resmoothing optimization. "
+                    "Please ensure the model architecture is supported or file an issue."
+                )
         elif getattr(model.config, "is_encoder_decoder", False):
             # For other encoder-decoder models (non-VL), pass both encoder and decoder input ids
             model(fake_input, decoder_input_ids=decoder_fake_input)
-            else:
-                model(fake_input)
+        else:
+            model(fake_input)
 
     input_to_linear, output_to_layernorm = _collect_shared_input_modules(
         model, llm_dummy_forward, collect_layernorms=True
@@ -440,19 +440,14 @@ def _export_quantized_weight(
                 weight_scaling_factor,
             )
 
-        sub_module.register_buffer(
-            quantizer_attrs.weight_scale,
-            weight_scaling_factor,
-        )
-
         if hasattr(input_quantizer, "_amax") or (
             input_quantizer is not None
             and hasattr(input_quantizer, "amax")
             and input_quantizer.amax is not None
         ):
             assert input_quantizer is not None
             if hasattr(input_quantizer, "_amax") and input_quantizer._amax is not None:
-            input_quantizer._amax = input_quantizer._amax.to(torch.float32)
+                input_quantizer._amax = input_quantizer._amax.to(torch.float32)
 
             sub_module.register_buffer(
                 quantizer_attrs.input_scale,
@@ -468,7 +463,7 @@ def _export_quantized_weight(
         ):
             assert output_quantizer is not None
             if hasattr(output_quantizer, "_amax") and output_quantizer._amax is not None:
-            output_quantizer._amax = output_quantizer._amax.to(torch.float32)
+                output_quantizer._amax = output_quantizer._amax.to(torch.float32)
     else:
         # Register weight_scale and input_scale
         if quantization_format == QUANTIZATION_FP8_PB_REAL:
@@ -485,7 +480,7 @@ def _export_quantized_weight(
             )
             sub_module.register_buffer(quantizer_attrs.weight_scale, e8m0_scale)
             if hasattr(weight_quantizer, "_scale") and weight_quantizer._scale is not None:
-            del weight_quantizer._scale
+                del weight_quantizer._scale
         else:
             sub_module.register_buffer(
                 quantizer_attrs.weight_scale, get_weight_scaling_factor(sub_module, weight_name)