Skip to content

Commit 52eee84

Browse files
committed
add image-text data calibration support
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
1 parent e1bd013 commit 52eee84

3 files changed

Lines changed: 222 additions & 72 deletions

File tree

examples/llm_ptq/example_utils.py

Lines changed: 78 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -221,9 +221,33 @@ def get_tokenizer(ckpt_path, trust_remote_code=False, **kwargs) -> PreTrainedTok
221221
if "vila" in ckpt_path.lower():
222222
ckpt_path += "/llm"
223223

224-
tokenizer = AutoTokenizer.from_pretrained(
225-
ckpt_path, trust_remote_code=trust_remote_code, **kwargs
226-
)
224+
# Suppress verbose tokenizer output (e.g., printing all special tokens)
225+
import contextlib
226+
import io
227+
import logging
228+
import os
229+
230+
# Save current settings
231+
old_verbosity = os.environ.get("TOKENIZERS_PARALLELISM", None)
232+
transformers_log_level = logging.getLogger("transformers").level
233+
234+
# Suppress output
235+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
236+
logging.getLogger("transformers").setLevel(logging.ERROR)
237+
238+
# Also capture stdout to suppress verbose tokenizer printing
239+
with contextlib.redirect_stdout(io.StringIO()):
240+
try:
241+
tokenizer = AutoTokenizer.from_pretrained(
242+
ckpt_path, trust_remote_code=trust_remote_code, **kwargs
243+
)
244+
finally:
245+
# Restore original settings
246+
if old_verbosity is not None:
247+
os.environ["TOKENIZERS_PARALLELISM"] = old_verbosity
248+
else:
249+
os.environ.pop("TOKENIZERS_PARALLELISM", None)
250+
logging.getLogger("transformers").setLevel(transformers_log_level)
227251

228252
# can't set attribute 'pad_token' for "<unk>"
229253
# We skip this step for Nemo models
@@ -279,10 +303,23 @@ def get_processor(
279303
# Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse)
280304
# This will only work if the model has a processor config
281305
try:
282-
processor = AutoProcessor.from_pretrained(
283-
ckpt_path,
284-
**model_kwargs,
285-
)
306+
import contextlib
307+
import io
308+
import logging
309+
310+
# Suppress verbose output from processor/tokenizer loading
311+
transformers_log_level = logging.getLogger("transformers").level
312+
logging.getLogger("transformers").setLevel(logging.ERROR)
313+
314+
with contextlib.redirect_stdout(io.StringIO()):
315+
processor = AutoProcessor.from_pretrained(
316+
ckpt_path,
317+
**model_kwargs,
318+
)
319+
320+
# Restore logging
321+
logging.getLogger("transformers").setLevel(transformers_log_level)
322+
286323
print(f"Loaded AutoProcessor for model type: {model_type}")
287324
return processor
288325
except Exception as e:
@@ -330,12 +367,26 @@ def get_model(
330367
# Load config once and handle VL model detection
331368
try:
332369
hf_config = AutoConfig.from_pretrained(ckpt_path, **config_kwargs)
370+
371+
# Check specifically for Nemotron-Parse
372+
architectures = getattr(hf_config, "architectures", [])
373+
is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
374+
333375
if is_nemotron_vl(hf_config):
334-
print(
335-
"Detected Nemotron VL model from config. "
336-
"Disabling automatic device mapping for compatibility."
337-
)
338-
device_map = None
376+
if is_nemotron_parse:
377+
# Nemotron-Parse works fine with device_map="auto"
378+
# Keep device_map="auto" to ensure proper device placement
379+
print(
380+
"Detected Nemotron-Parse model from config. "
381+
"Using automatic device mapping."
382+
)
383+
else:
384+
# For other Nemotron VL models, disable device_map for compatibility
385+
print(
386+
"Detected Nemotron VL model from config. "
387+
"Disabling automatic device mapping for compatibility."
388+
)
389+
device_map = None
339390
except Exception as e:
340391
print(f"Error: Could not load config from {ckpt_path}: {e}")
341392
raise RuntimeError(f"Failed to load model configuration from {ckpt_path}") from e
@@ -433,6 +484,21 @@ def get_model(
433484
print(f"Moving model to {device} device...")
434485
model = model.to(device)
435486

487+
# For Nemotron-Parse, ensure the encoder (including RADIO) is fully on device
488+
# The RADIO encoder has buffers that might not be properly moved even with device_map="auto"
489+
# This is because custom RADIO modules might not fully support accelerate's device_map
490+
if device != "cpu" and hasattr(model, "encoder"):
491+
# Check if encoder has any buffers on CPU
492+
cpu_buffers = []
493+
for name, buffer in model.encoder.named_buffers():
494+
if buffer.device.type == "cpu":
495+
cpu_buffers.append(name)
496+
497+
if cpu_buffers:
498+
print(f"Found {len(cpu_buffers)} encoder buffers on CPU. Moving encoder to {device}...")
499+
model.encoder = model.encoder.to(device)
500+
print(f"Encoder moved to {device}")
501+
436502
if device == "cuda" and not is_model_on_gpu(model):
437503
print("Warning: Some parameters are not on a GPU. Calibration can be slow or hit OOM")
438504

examples/llm_ptq/hf_ptq.py

Lines changed: 139 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
)
6565
from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor
6666
from modelopt.torch.utils.memory_monitor import launch_memory_monitor
67+
from modelopt.torch.utils.nemotron_vlm_dataset_utils import get_nemotron_vlm_dataset_dataloader
6768
from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader
6869
from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader
6970

@@ -173,9 +174,50 @@ def make_calib_dataloader(
173174
tokenizer: PreTrainedTokenizerBase | None,
174175
device: torch.device,
175176
model_type: str | None,
177+
full_model: torch.nn.Module | None = None,
176178
) -> tuple[DataLoader, str | None]:
177179
calib_dataloader = None
178180
first_text_speech_dataset = None
181+
182+
# Check if this is Nemotron-Parse - use image-text data for better calibration
183+
if full_model is not None:
184+
config = full_model.config
185+
architectures = getattr(config, "architectures", [])
186+
is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
187+
188+
if is_nemotron_parse and processor is not None:
189+
print(
190+
"✓ Detected Nemotron-Parse model. Using image-text dataset for calibration "
191+
"to provide realistic visual embeddings to the decoder."
192+
)
193+
194+
# Override dataset to use image-text dataset if not specified
195+
supported_datasets = ["nemotron_vlm_v2", "chartqa", "scienceqa"]
196+
if not args.dataset or args.dataset[0] not in supported_datasets:
197+
print(
198+
f"[INFO] Dataset '{args.dataset}' is not a supported image-text dataset. "
199+
f"Automatically using 'nemotron_vlm_v2' for Nemotron-Parse calibration."
200+
)
201+
dataset_to_use = "nemotron_vlm_v2"
202+
else:
203+
dataset_to_use = args.dataset[0]
204+
205+
# Nemotron-Parse needs single dataset for now
206+
if len(args.calib_size) > 1:
207+
print(f"[INFO] Using first calib_size value: {args.calib_size[0]}")
208+
calib_size_to_use = args.calib_size[0]
209+
else:
210+
calib_size_to_use = args.calib_size[0] if args.calib_size else 512
211+
212+
calib_dataloader = get_nemotron_vlm_dataset_dataloader(
213+
dataset_name=dataset_to_use,
214+
processor=processor,
215+
batch_size=args.batch_size,
216+
num_samples=calib_size_to_use,
217+
device=device, # Move data to model's device
218+
)
219+
return calib_dataloader, first_text_speech_dataset
220+
179221
if model_type == "mllama":
180222
assert processor is not None and isinstance(processor, MllamaImageProcessor), (
181223
"The MllamaImageProcessor must be set."
@@ -377,18 +419,35 @@ def load_model(args: argparse.Namespace):
377419
trust_remote_code=args.trust_remote_code,
378420
)
379421
else:
422+
# Check if this is a Nemotron VL model that needs a processor
423+
# Do this BEFORE setting default datasets so we can use image-text data for Nemotron-Parse
424+
is_nemotron_vl_model = is_nemotron_vl(full_model)
425+
426+
# Check specifically for Nemotron-Parse to set appropriate dataset defaults
427+
config = full_model.config
428+
architectures = getattr(config, "architectures", [])
429+
is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
430+
380431
if args.dataset is None:
381-
args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"]
382-
warnings.warn(
383-
"No dataset specified. Defaulting to cnn_dailymail and nemotron-post-training-dataset-v2."
384-
)
432+
if is_nemotron_parse:
433+
# For Nemotron-Parse, default to Nemotron VLM Dataset v2
434+
args.dataset = ["nemotron_vlm_v2"]
435+
print(
436+
"No dataset specified. Defaulting to 'nemotron_vlm_v2' for Nemotron-Parse "
437+
"(NVIDIA's image-text dataset for better calibration)."
438+
)
439+
else:
440+
# For other models, use text-only datasets
441+
args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"]
442+
warnings.warn(
443+
"No dataset specified. Defaulting to cnn_dailymail and nemotron-post-training-dataset-v2."
444+
)
445+
385446
# Adjust calib_size to match dataset length by extending or truncating as needed
386447
args.calib_size = (args.calib_size + [args.calib_size[-1]] * len(args.dataset))[
387448
: len(args.dataset)
388449
]
389450

390-
# Check if this is a Nemotron VL model that needs a processor
391-
is_nemotron_vl_model = is_nemotron_vl(full_model)
392451
if is_nemotron_vl_model:
393452
# Load processor for Nemotron VL models (like Nemotron-Parse)
394453
processor = get_processor(
@@ -404,26 +463,41 @@ def load_model(args: argparse.Namespace):
404463
# Left padding usually provides better calibration result.
405464
tokenizer.padding_side = "left"
406465

407-
# We only quantize the language model for VLMs other than the type supported above.
408-
language_model_lineage = get_language_model_from_vl(full_model)
409-
if language_model_lineage is not None:
410-
language_model = language_model_lineage.pop(-1)
411-
ancestors = language_model_lineage
412-
# Apply disabled quant to all modules that are not part of language_model so we can exclude them during
413-
# HF export.
414-
disabled_quant_cfg = {
415-
"quant_cfg": {"default": {"enable": False}},
416-
"algorithm": "max",
417-
}
418-
419-
memo = set(ancestors) | {language_model}
420-
for ancestor in ancestors:
421-
for _, module in ancestor.named_children():
422-
if module not in memo:
423-
mtq.quantize(module, disabled_quant_cfg, forward_loop=None)
424-
memo.add(module)
425-
426-
model_type = get_model_type(language_model)
466+
# Check if this is Nemotron-Parse
467+
config = full_model.config
468+
architectures = getattr(config, "architectures", [])
469+
is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
470+
471+
# For Nemotron-Parse, DON'T extract the decoder
472+
# We want to calibrate the full model so the decoder sees realistic visual embeddings
473+
# The vision encoder won't be quantized (disabled via quant_cfg in mono_quantize)
474+
if is_nemotron_parse:
475+
print(
476+
"Nemotron-Parse detected: Keeping full encoder-decoder model for calibration "
477+
"with image-text data. Vision encoder will be disabled from quantization."
478+
)
479+
# language_model = full_model (already set above)
480+
else:
481+
# For other VLMs, extract the language model for quantization
482+
language_model_lineage = get_language_model_from_vl(full_model)
483+
if language_model_lineage is not None:
484+
language_model = language_model_lineage.pop(-1)
485+
ancestors = language_model_lineage
486+
# Apply disabled quant to all modules that are not part of language_model so we can exclude them during
487+
# HF export.
488+
disabled_quant_cfg = {
489+
"quant_cfg": {"default": {"enable": False}},
490+
"algorithm": "max",
491+
}
492+
493+
memo = set(ancestors) | {language_model}
494+
for ancestor in ancestors:
495+
for _, module in ancestor.named_children():
496+
if module not in memo:
497+
mtq.quantize(module, disabled_quant_cfg, forward_loop=None)
498+
memo.add(module)
499+
500+
model_type = get_model_type(language_model)
427501

428502
if model_type == "phi4mm":
429503
warnings.warn("Please set the default input_mode to InputMode.LANGUAGE before quantizing.")
@@ -494,14 +568,23 @@ def mono_quantize(
494568
"Consider reducing calib_size to reduce calibration time.\n####\n"
495569
)
496570

571+
# Check if this is Nemotron-Parse
572+
config = full_model.config
573+
architectures = getattr(config, "architectures", [])
574+
is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
575+
original_forward = None # Track original forward method if we wrap it
576+
497577
# For Nemotron VL models, disable quantization of vision components
498578
if is_nemotron_vl_model:
499579
print("Disabling quantization for vision components in Nemotron VL model")
500580
quant_cfg["quant_cfg"]["*vision*"] = {"enable": False}
501581
quant_cfg["quant_cfg"]["*image*"] = {"enable": False}
502-
# Also disable radio model components specifically
582+
# Also disable radio model components specifically (for Nemotron-Parse)
503583
quant_cfg["quant_cfg"]["*radio*"] = {"enable": False}
504584
quant_cfg["quant_cfg"]["*visual*"] = {"enable": False}
585+
quant_cfg["quant_cfg"]["*encoder*"] = {"enable": False} # Disable encoder
586+
quant_cfg["quant_cfg"]["*model_encoder*"] = {"enable": False} # Nemotron-Parse specific
587+
print("Quantization will only be applied to the decoder (text generation) component")
505588

506589
if not model_is_already_quantized or calibration_only:
507590
if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only":
@@ -513,9 +596,25 @@ def mono_quantize(
513596

514597
if not use_calibration:
515598
warnings.warn("Dynamic quantization. Calibration skipped.")
516-
calibrate_loop = (
517-
create_forward_loop(dataloader=calib_dataloader) if use_calibration else None
518-
)
599+
600+
# Create calibration loop
601+
if use_calibration:
602+
if is_nemotron_parse:
603+
# For Nemotron-Parse, wrap the model to force use_cache=False
604+
print("Wrapping Nemotron-Parse model for calibration (use_cache=False)")
605+
original_forward = language_model.forward
606+
607+
def wrapped_forward(*args, **kwargs):
608+
kwargs["use_cache"] = False
609+
return original_forward(*args, **kwargs)
610+
611+
# Temporarily replace forward method
612+
language_model.forward = wrapped_forward
613+
calibrate_loop = create_forward_loop(dataloader=calib_dataloader)
614+
else:
615+
calibrate_loop = create_forward_loop(dataloader=calib_dataloader)
616+
else:
617+
calibrate_loop = None
519618

520619
if calibration_only:
521620
language_model = mtq.calibrate(
@@ -524,8 +623,15 @@ def mono_quantize(
524623
else:
525624
language_model = mtq.quantize(language_model, quant_cfg, forward_loop=calibrate_loop)
526625

527-
# For VL models, update full_model to use the quantized language model
528-
if is_nemotron_vl_model:
626+
# Restore original forward method if we wrapped it for Nemotron-Parse
627+
if is_nemotron_parse and original_forward is not None:
628+
print("Restoring original forward method after calibration")
629+
language_model.forward = original_forward
630+
original_forward = None
631+
632+
# For VL models (except Nemotron-Parse), update full_model to use the quantized language model
633+
# For Nemotron-Parse, language_model IS full_model, so no update needed
634+
if is_nemotron_vl_model and language_model is not full_model:
529635
language_model_lineage = get_language_model_from_vl(full_model)
530636
if language_model_lineage is not None:
531637
print("Updating full_model with quantized language_model...")
@@ -828,38 +934,12 @@ def quantize_main(
828934
print(f"Use calib batch_size {args.batch_size}")
829935

830936
calib_dataloader, first_text_speech_dataset = make_calib_dataloader(
831-
args, language_model, processor, tokenizer, device, model_type
937+
args, language_model, processor, tokenizer, device, model_type, full_model
832938
)
833939

834940
# Detect if this is a Nemotron VL model using architecture-based detection
835941
is_nemotron_vl_model = is_nemotron_vl(full_model)
836942

837-
# For Nemotron-Parse, wrap the text-only dataloader to add dummy images
838-
# Nemotron-Parse is an encoder-decoder model that requires pixel_values
839-
if is_nemotron_vl_model and processor is not None:
840-
config = full_model.config
841-
architectures = getattr(config, "architectures", [])
842-
is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
843-
844-
if is_nemotron_parse:
845-
# Check if we're quantizing just the decoder or the full model
846-
decoder_only = language_model is not full_model
847-
848-
if decoder_only:
849-
print(
850-
"Calibration will use text-only inputs for Nemotron-Parse decoder. "
851-
"Vision encoder is excluded from quantization."
852-
)
853-
else:
854-
print(
855-
"Wrapping calibration dataloader for Nemotron-Parse to add dummy images. "
856-
"Nemotron-Parse requires pixel_values for full model calibration."
857-
)
858-
859-
calib_dataloader = create_nemotron_parse_calib_wrapper(
860-
calib_dataloader, processor, device, decoder_only=decoder_only
861-
)
862-
863943
preview_input_ids, generated_ids_before_ptq = pre_quantize(
864944
args, full_model, model_type, tokenizer, calib_dataloader, is_nemotron_vl_model
865945
)

0 commit comments

Comments (0)