Make image-text calibration the default for VLMs; further simplify

Edwardf0t1 · Edwardf0t1 · commit 3dd8758653cd · 2026-02-12T16:17:23.000-08:00
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
@@ -68,39 +68,26 @@ def run_nemotron_vl_preview(
     """
     from vlm_utils import run_text_only_generation, run_vl_preview_generation
 
-    # Check if this is Nemotron-Parse (encoder-decoder model that requires images)
-    config = full_model.config
-    architectures = getattr(config, "architectures", [])
-    is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
+    print(f"Running text-only preview generation for Nemotron VL model ({stage_name})...")
+    question = tokenizer.decode(input_ids[0], skip_special_tokens=True)
+    generation_config = {
+        "max_new_tokens": 100,
+        "do_sample": False,
+        "eos_token_id": tokenizer.eos_token_id,
+    }
+
+    # Try text-only generation (may fail for encoder-decoder models like Nemotron-Parse)
+    text_response = run_text_only_generation(
+        full_model, tokenizer, question, generation_config, pyt_ckpt_path
+    )
 
     generated_ids = None
-
-    if not is_nemotron_parse:
-        # Only try text-only generation for models that support it (not Nemotron-Parse)
-        print(f"Running text-only preview generation for Nemotron VL model ({stage_name})...")
-        question = tokenizer.decode(input_ids[0], skip_special_tokens=True)
-        generation_config = {
-            "max_new_tokens": 100,
-            "do_sample": False,
-            "eos_token_id": tokenizer.eos_token_id,
-        }
-
-        # Try text-only generation
-        text_response = run_text_only_generation(
-            full_model, tokenizer, question, generation_config, pyt_ckpt_path
-        )
-
-        if text_response is not None:
-            print(f"✅ Text-only generation successful: {text_response[:100]}...")
-            generated_ids = text_response
-        elif allow_fallback:
-            print("Text-only generation failed, falling back to standard generate...")
-            generated_ids = full_model.generate(input_ids, max_new_tokens=100)
-    else:
-        print(
-            f"Skipping text-only generation for Nemotron-Parse ({stage_name}) - "
-            "this encoder-decoder model requires images for all operations."
-        )
+    if text_response is not None:
+        print(f"✅ Text-only generation successful: {text_response[:100]}...")
+        generated_ids = text_response
+    elif allow_fallback:
+        print("Text-only generation failed, falling back to standard generate...")
+        generated_ids = full_model.generate(input_ids, max_new_tokens=100)
 
     # Run additional VL test with images
     print(f"Running additional VL test with images ({stage_name})...")
@@ -111,10 +98,6 @@ def run_nemotron_vl_preview(
 
 def _is_multimodal_config(config):
     """Check if a config indicates a multimodal model (config-only version of is_multimodal_model)."""
-    # Check for Nemotron-Parse encoder-decoder architecture
-    architectures = getattr(config, "architectures", [])
-    is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
-
     return (
         hasattr(config, "vision_config")  # Standard vision config (e.g., Qwen2.5-VL)
         or getattr(config, "model_type", "") == "phi4mm"  # Phi-4 multimodal
@@ -123,7 +106,10 @@ def _is_multimodal_config(config):
         or (
             hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer")
         )  # Image embedding layers
-        or is_nemotron_parse  # Nemotron-Parse conditional generation model
+        or getattr(config, "is_encoder_decoder", False)  # Encoder-decoder VL models
+        or any(  # Architecture-based detection for custom VL models (e.g., Nemotron-Parse)
+            "conditionalgeneration" in arch.lower() for arch in getattr(config, "architectures", [])
+        )
     )
 
 
@@ -176,9 +162,20 @@ def calibrate_loop(_model):
         )
         allowed_keys = set(forward_params.keys())
 
+        # Check if model is encoder-decoder (needs decoder_input_ids instead of input_ids)
+        is_enc_dec = getattr(full_model.config, "is_encoder_decoder", False)
+
         full_model.eval()
         with torch.no_grad():
             for batch in calib_dataloader:
+                # For encoder-decoder models, rename input_ids → decoder_input_ids
+                # and disable KV caching to avoid tuple index errors in decoder layers
+                if is_enc_dec and "input_ids" in batch and "pixel_values" in batch:
+                    batch["decoder_input_ids"] = batch.pop("input_ids")
+                    if "attention_mask" in batch:
+                        batch["decoder_attention_mask"] = batch.pop("attention_mask")
+                    batch["use_cache"] = False
+
                 # Filter batch to only include parameters the model accepts
                 if accepts_kwargs:
                     call_kwargs = batch
@@ -190,10 +187,8 @@ def calibrate_loop(_model):
                 # Use safe_nemotron_vl_forward for Nemotron Nano VL (embedding-injection style)
                 # For other VLMs (like Nemotron-Parse), use standard forward
                 if hasattr(full_model, "img_context_token_id"):
-                    # Nemotron Nano VL style
                     safe_nemotron_vl_forward(full_model, call_kwargs)
                 else:
-                    # Standard encoder-decoder or other VLM architectures
                     full_model(**call_kwargs)
 
     return calibrate_loop
@@ -276,20 +271,9 @@ def get_tokenizer(ckpt_path, trust_remote_code=False, **kwargs) -> PreTrainedTok
     if "vila" in ckpt_path.lower():
         ckpt_path += "/llm"
 
-    # Some custom tokenizers (e.g., Nemotron-Parse) print verbose output when loading.
-    # Only suppress stdout for trust_remote_code models where custom tokenizer code may be noisy.
-    if trust_remote_code:
-        import contextlib
-        import io
-
-        with contextlib.redirect_stdout(io.StringIO()):
-            tokenizer = AutoTokenizer.from_pretrained(
-                ckpt_path, trust_remote_code=trust_remote_code, **kwargs
-            )
-    else:
-        tokenizer = AutoTokenizer.from_pretrained(
-            ckpt_path, trust_remote_code=trust_remote_code, **kwargs
-        )
+    tokenizer = AutoTokenizer.from_pretrained(
+        ckpt_path, trust_remote_code=trust_remote_code, **kwargs
+    )
 
     # can't set attribute 'pad_token' for "<unk>"
     # We skip this step for Nemo models
@@ -342,18 +326,9 @@ def get_processor(
 
         return MllamaImageProcessor(processor, device)
     else:
-        # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse).
-        # Suppress stdout for trust_remote_code models where custom processor code may be noisy.
-        import contextlib
-        import io
-
+        # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse)
         try:
-            if model_kwargs.get("trust_remote_code", False):
-                with contextlib.redirect_stdout(io.StringIO()):
-                    processor = AutoProcessor.from_pretrained(ckpt_path, **model_kwargs)
-            else:
-                processor = AutoProcessor.from_pretrained(ckpt_path, **model_kwargs)
-
+            processor = AutoProcessor.from_pretrained(ckpt_path, **model_kwargs)
             print(f"Loaded AutoProcessor for model type: {model_type}")
             return processor
         except Exception as e:
@@ -493,22 +468,12 @@ def get_model(
     try:
         hf_config = AutoConfig.from_pretrained(ckpt_path, **config_kwargs)
 
-        # Check specifically for Nemotron-Parse
-        architectures = getattr(hf_config, "architectures", [])
-        is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
-
         if is_nemotron_vl(hf_config):
-            if is_nemotron_parse:
-                # Nemotron-Parse works fine with device_map="auto"
-                # Keep device_map="auto" to ensure proper device placement
-                print("Detected Nemotron-Parse model from config. Using automatic device mapping.")
-            else:
-                # For other Nemotron VL models, disable device_map for compatibility
-                print(
-                    "Detected Nemotron VL model from config. "
-                    "Disabling automatic device mapping for compatibility."
-                )
-                device_map = None
+            print(
+                "Detected Nemotron VL model from config. "
+                "Disabling automatic device mapping for compatibility."
+            )
+            device_map = None
     except Exception as e:
         print(f"Error: Could not load config from {ckpt_path}: {e}")
         raise RuntimeError(f"Failed to load model configuration from {ckpt_path}") from e
@@ -564,13 +529,17 @@ def get_model(
                 if not hasattr(transformers, architecture):
                     warnings.warn(
                         f"Architecture {architecture} not found in transformers: {transformers.__version__}. "
-                        "Falling back to AutoModel."
+                        "Falling back to AutoModelForCausalLM (or AutoModel for non-causal architectures)."
                     )
                 assert trust_remote_code, (
                     "Please set trust_remote_code to True if you want to use this architecture"
                 )
 
-                auto_model_module = AutoModel
+                # Use AutoModelForCausalLM for causal LMs, AutoModel for encoder-decoder models
+                if getattr(hf_config, "is_encoder_decoder", False):
+                    auto_model_module = AutoModel
+                else:
+                    auto_model_module = AutoModelForCausalLM
                 from_config = auto_model_module.from_config
             else:
                 auto_model_module = getattr(transformers, architecture)
@@ -617,21 +586,6 @@ def get_model(
         print(f"Moving model to {device} device...")
         model = model.to(device)
 
-    # For Nemotron-Parse, ensure the encoder (including RADIO) is fully on device
-    # The RADIO encoder has buffers that might not be properly moved even with device_map="auto"
-    # This is because custom RADIO modules might not fully support accelerate's device_map
-    if device != "cpu" and hasattr(model, "encoder"):
-        # Check if encoder has any buffers on CPU
-        cpu_buffers = []
-        for name, buffer in model.encoder.named_buffers():
-            if buffer.device.type == "cpu":
-                cpu_buffers.append(name)
-
-        if cpu_buffers:
-            print(f"Found {len(cpu_buffers)} encoder buffers on CPU. Moving encoder to {device}...")
-            model.encoder = model.encoder.to(device)
-            print(f"Encoder moved to {device}")
-
     if device == "cuda" and not is_model_on_gpu(model):
         print("Warning: Some parameters are not on a GPU. Calibration can be slow or hit OOM")
 
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
@@ -361,6 +361,12 @@ def load_model(args: argparse.Namespace):
     default_pad_token = None
 
     is_nemotron_vl_model = is_nemotron_vl(full_model)
+
+    # Default to image-text calibration for VLM models
+    if is_nemotron_vl_model and not args.calib_with_images:
+        print("Nemotron VL model detected. Enabling image-text calibration by default.")
+        args.calib_with_images = True
+
     if model_type == "mllama":
         processor = get_processor(
             args.pyt_ckpt_path,
@@ -689,7 +695,7 @@ def pre_quantize(
             preview_input_ids,
             args.pyt_ckpt_path,
             "before quantization",
-            allow_fallback=True,
+            allow_fallback=False,
         )
     else:
         # Standard generation for non-Nemotron VL models
diff --git a/examples/llm_ptq/vlm_utils.py b/examples/llm_ptq/vlm_utils.py
@@ -18,7 +18,7 @@
 import os
 
 from PIL import Image
-from transformers import AutoImageProcessor, AutoProcessor, GenerationConfig
+from transformers import AutoImageProcessor, AutoProcessor
 
 
 def run_vl_preview_generation(model, tokenizer, model_path, stage_name):
@@ -73,34 +73,13 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name):
             print("   Skipping VL preview generation.")
             return None
 
-        # Check if this is Nemotron-Parse early to set up proper generation config
-        config = model.config
-        architectures = getattr(config, "architectures", [])
-        is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
-
         # Generate response
         question = "Describe this image briefly."  # Updated for single image
-
-        # Use model's GenerationConfig for Nemotron-Parse, dict for others
-        if is_nemotron_parse:
-            try:
-                generation_config = GenerationConfig.from_pretrained(
-                    model_path, trust_remote_code=True
-                )
-                print("Using Nemotron-Parse GenerationConfig from model")
-            except Exception as e:
-                print(f"Warning: Could not load GenerationConfig: {e}, using defaults")
-                generation_config = {
-                    "max_new_tokens": 50,
-                    "do_sample": False,
-                    "eos_token_id": tokenizer.eos_token_id,
-                }
-        else:
-            generation_config = {
-                "max_new_tokens": 50,
-                "do_sample": False,
-                "eos_token_id": tokenizer.eos_token_id,
-            }
+        generation_config = {
+            "max_new_tokens": 50,
+            "do_sample": False,
+            "eos_token_id": tokenizer.eos_token_id,
+        }
 
         print(f"Generating VL response ({stage_name})...")
 
@@ -126,14 +105,8 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name):
         else:
             processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
 
-            # is_nemotron_parse was already computed above
-            if is_nemotron_parse:
-                # Nemotron-Parse uses a specific task prompt format
-                # See: https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1#usage-example
-                prompt = "</s><s><predict_bbox><predict_classes><output_markdown>"
-                print(f"Using Nemotron-Parse task prompt: {prompt}")
-            else:
-                # Other VL models use chat templates
+            # Use chat template if available, otherwise fall back to default task prompt
+            if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None:
                 messages = [
                     {"role": "system", "content": "/no_think"},
                     {
@@ -150,11 +123,13 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name):
                         ],
                     },
                 ]
-
-                # Apply chat template
                 prompt = tokenizer.apply_chat_template(
                     messages, tokenize=False, add_generation_prompt=True
                 )
+            else:
+                # For models without chat templates (e.g., encoder-decoder VL models),
+                # prefix the question with the tokenizer's BOS token (if any) as a minimal prompt
+                prompt = (tokenizer.bos_token or "") + question
 
             # Process inputs using the processor with single image
             inputs = processor(
@@ -175,22 +150,12 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name):
                 )
 
             # Generate response using model.generate
-            if isinstance(generation_config, GenerationConfig):
-                # For Nemotron-Parse with GenerationConfig object
-                generated_ids = model.generate(
-                    pixel_values=inputs.pixel_values,
-                    input_ids=inputs.input_ids,
-                    attention_mask=inputs.attention_mask,
-                    generation_config=generation_config,
-                )
-            else:
-                # For other models with dict generation config
-                generated_ids = model.generate(
-                    pixel_values=inputs.pixel_values,
-                    input_ids=inputs.input_ids,
-                    attention_mask=inputs.attention_mask,
-                    **generation_config,
-                )
+            generated_ids = model.generate(
+                pixel_values=inputs.pixel_values,
+                input_ids=inputs.input_ids,
+                attention_mask=inputs.attention_mask,
+                **generation_config,
+            )
 
             # Decode the response (trim input tokens like in the working example)
             if generated_ids is None:
@@ -199,20 +164,13 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name):
             generated_ids_trimmed = [
                 out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
             ]
-
-            # For Nemotron-Parse, use tokenizer.batch_decode instead of processor.batch_decode
-            if is_nemotron_parse and hasattr(tokenizer, "batch_decode"):
-                output_text = tokenizer.batch_decode(
-                    generated_ids_trimmed,
-                    skip_special_tokens=True,
-                    clean_up_tokenization_spaces=False,
-                )
-            else:
-                output_text = processor.batch_decode(
-                    generated_ids_trimmed,
-                    skip_special_tokens=True,
-                    clean_up_tokenization_spaces=False,
-                )
+            # Use processor.batch_decode if available, otherwise fall back to tokenizer
+            decoder = processor if hasattr(processor, "batch_decode") else tokenizer
+            output_text = decoder.batch_decode(
+                generated_ids_trimmed,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=False,
+            )
 
             if output_text is None or len(output_text) == 0:
                 raise ValueError("Decoding returned empty output")
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py