Skip to content

Commit 3dd8758

Browse files
committed
make image-text calib default for VLMs, further simplify
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
1 parent e0e28cb commit 3dd8758

4 files changed

Lines changed: 103 additions & 226 deletions

File tree

examples/llm_ptq/example_utils.py

Lines changed: 49 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -68,39 +68,26 @@ def run_nemotron_vl_preview(
6868
"""
6969
from vlm_utils import run_text_only_generation, run_vl_preview_generation
7070

71-
# Check if this is Nemotron-Parse (encoder-decoder model that requires images)
72-
config = full_model.config
73-
architectures = getattr(config, "architectures", [])
74-
is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
71+
print(f"Running text-only preview generation for Nemotron VL model ({stage_name})...")
72+
question = tokenizer.decode(input_ids[0], skip_special_tokens=True)
73+
generation_config = {
74+
"max_new_tokens": 100,
75+
"do_sample": False,
76+
"eos_token_id": tokenizer.eos_token_id,
77+
}
78+
79+
# Try text-only generation (may fail for encoder-decoder models like Nemotron-Parse)
80+
text_response = run_text_only_generation(
81+
full_model, tokenizer, question, generation_config, pyt_ckpt_path
82+
)
7583

7684
generated_ids = None
77-
78-
if not is_nemotron_parse:
79-
# Only try text-only generation for models that support it (not Nemotron-Parse)
80-
print(f"Running text-only preview generation for Nemotron VL model ({stage_name})...")
81-
question = tokenizer.decode(input_ids[0], skip_special_tokens=True)
82-
generation_config = {
83-
"max_new_tokens": 100,
84-
"do_sample": False,
85-
"eos_token_id": tokenizer.eos_token_id,
86-
}
87-
88-
# Try text-only generation
89-
text_response = run_text_only_generation(
90-
full_model, tokenizer, question, generation_config, pyt_ckpt_path
91-
)
92-
93-
if text_response is not None:
94-
print(f"✅ Text-only generation successful: {text_response[:100]}...")
95-
generated_ids = text_response
96-
elif allow_fallback:
97-
print("Text-only generation failed, falling back to standard generate...")
98-
generated_ids = full_model.generate(input_ids, max_new_tokens=100)
99-
else:
100-
print(
101-
f"Skipping text-only generation for Nemotron-Parse ({stage_name}) - "
102-
"this encoder-decoder model requires images for all operations."
103-
)
85+
if text_response is not None:
86+
print(f"✅ Text-only generation successful: {text_response[:100]}...")
87+
generated_ids = text_response
88+
elif allow_fallback:
89+
print("Text-only generation failed, falling back to standard generate...")
90+
generated_ids = full_model.generate(input_ids, max_new_tokens=100)
10491

10592
# Run additional VL test with images
10693
print(f"Running additional VL test with images ({stage_name})...")
@@ -111,10 +98,6 @@ def run_nemotron_vl_preview(
11198

11299
def _is_multimodal_config(config):
113100
"""Check if a config indicates a multimodal model (config-only version of is_multimodal_model)."""
114-
# Check for Nemotron-Parse encoder-decoder architecture
115-
architectures = getattr(config, "architectures", [])
116-
is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
117-
118101
return (
119102
hasattr(config, "vision_config") # Standard vision config (e.g., Qwen2.5-VL)
120103
or getattr(config, "model_type", "") == "phi4mm" # Phi-4 multimodal
@@ -123,7 +106,10 @@ def _is_multimodal_config(config):
123106
or (
124107
hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer")
125108
) # Image embedding layers
126-
or is_nemotron_parse # Nemotron-Parse conditional generation model
109+
or getattr(config, "is_encoder_decoder", False) # Encoder-decoder VL models
110+
or any( # Architecture-based detection for custom VL models (e.g., Nemotron-Parse)
111+
"conditionalgeneration" in arch.lower() for arch in getattr(config, "architectures", [])
112+
)
127113
)
128114

129115

@@ -176,9 +162,20 @@ def calibrate_loop(_model):
176162
)
177163
allowed_keys = set(forward_params.keys())
178164

165+
# Check if model is encoder-decoder (needs decoder_input_ids instead of input_ids)
166+
is_enc_dec = getattr(full_model.config, "is_encoder_decoder", False)
167+
179168
full_model.eval()
180169
with torch.no_grad():
181170
for batch in calib_dataloader:
171+
# For encoder-decoder models, rename input_ids → decoder_input_ids
172+
# and disable KV caching to avoid tuple index errors in decoder layers
173+
if is_enc_dec and "input_ids" in batch and "pixel_values" in batch:
174+
batch["decoder_input_ids"] = batch.pop("input_ids")
175+
if "attention_mask" in batch:
176+
batch["decoder_attention_mask"] = batch.pop("attention_mask")
177+
batch["use_cache"] = False
178+
182179
# Filter batch to only include parameters the model accepts
183180
if accepts_kwargs:
184181
call_kwargs = batch
@@ -190,10 +187,8 @@ def calibrate_loop(_model):
190187
# Use safe_nemotron_vl_forward for Nemotron Nano VL (embedding-injection style)
191188
# For other VLMs (like Nemotron-Parse), use standard forward
192189
if hasattr(full_model, "img_context_token_id"):
193-
# Nemotron Nano VL style
194190
safe_nemotron_vl_forward(full_model, call_kwargs)
195191
else:
196-
# Standard encoder-decoder or other VLM architectures
197192
full_model(**call_kwargs)
198193

199194
return calibrate_loop
@@ -276,20 +271,9 @@ def get_tokenizer(ckpt_path, trust_remote_code=False, **kwargs) -> PreTrainedTok
276271
if "vila" in ckpt_path.lower():
277272
ckpt_path += "/llm"
278273

279-
# Some custom tokenizers (e.g., Nemotron-Parse) print verbose output when loading.
280-
# Only suppress stdout for trust_remote_code models where custom tokenizer code may be noisy.
281-
if trust_remote_code:
282-
import contextlib
283-
import io
284-
285-
with contextlib.redirect_stdout(io.StringIO()):
286-
tokenizer = AutoTokenizer.from_pretrained(
287-
ckpt_path, trust_remote_code=trust_remote_code, **kwargs
288-
)
289-
else:
290-
tokenizer = AutoTokenizer.from_pretrained(
291-
ckpt_path, trust_remote_code=trust_remote_code, **kwargs
292-
)
274+
tokenizer = AutoTokenizer.from_pretrained(
275+
ckpt_path, trust_remote_code=trust_remote_code, **kwargs
276+
)
293277

294278
# can't set attribute 'pad_token' for "<unk>"
295279
# We skip this step for Nemo models
@@ -342,18 +326,9 @@ def get_processor(
342326

343327
return MllamaImageProcessor(processor, device)
344328
else:
345-
# Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse).
346-
# Suppress stdout for trust_remote_code models where custom processor code may be noisy.
347-
import contextlib
348-
import io
349-
329+
# Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse)
350330
try:
351-
if model_kwargs.get("trust_remote_code", False):
352-
with contextlib.redirect_stdout(io.StringIO()):
353-
processor = AutoProcessor.from_pretrained(ckpt_path, **model_kwargs)
354-
else:
355-
processor = AutoProcessor.from_pretrained(ckpt_path, **model_kwargs)
356-
331+
processor = AutoProcessor.from_pretrained(ckpt_path, **model_kwargs)
357332
print(f"Loaded AutoProcessor for model type: {model_type}")
358333
return processor
359334
except Exception as e:
@@ -493,22 +468,12 @@ def get_model(
493468
try:
494469
hf_config = AutoConfig.from_pretrained(ckpt_path, **config_kwargs)
495470

496-
# Check specifically for Nemotron-Parse
497-
architectures = getattr(hf_config, "architectures", [])
498-
is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
499-
500471
if is_nemotron_vl(hf_config):
501-
if is_nemotron_parse:
502-
# Nemotron-Parse works fine with device_map="auto"
503-
# Keep device_map="auto" to ensure proper device placement
504-
print("Detected Nemotron-Parse model from config. Using automatic device mapping.")
505-
else:
506-
# For other Nemotron VL models, disable device_map for compatibility
507-
print(
508-
"Detected Nemotron VL model from config. "
509-
"Disabling automatic device mapping for compatibility."
510-
)
511-
device_map = None
472+
print(
473+
"Detected Nemotron VL model from config. "
474+
"Disabling automatic device mapping for compatibility."
475+
)
476+
device_map = None
512477
except Exception as e:
513478
print(f"Error: Could not load config from {ckpt_path}: {e}")
514479
raise RuntimeError(f"Failed to load model configuration from {ckpt_path}") from e
@@ -564,13 +529,17 @@ def get_model(
564529
if not hasattr(transformers, architecture):
565530
warnings.warn(
566531
f"Architecture {architecture} not found in transformers: {transformers.__version__}. "
567-
"Falling back to AutoModel."
532+
"Falling back to AutoModelForCausalLM (or AutoModel for non-causal architectures)."
568533
)
569534
assert trust_remote_code, (
570535
"Please set trust_remote_code to True if you want to use this architecture"
571536
)
572537

573-
auto_model_module = AutoModel
538+
# Use AutoModelForCausalLM for causal LMs, AutoModel for encoder-decoder models
539+
if getattr(hf_config, "is_encoder_decoder", False):
540+
auto_model_module = AutoModel
541+
else:
542+
auto_model_module = AutoModelForCausalLM
574543
from_config = auto_model_module.from_config
575544
else:
576545
auto_model_module = getattr(transformers, architecture)
@@ -617,21 +586,6 @@ def get_model(
617586
print(f"Moving model to {device} device...")
618587
model = model.to(device)
619588

620-
# For Nemotron-Parse, ensure the encoder (including RADIO) is fully on device
621-
# The RADIO encoder has buffers that might not be properly moved even with device_map="auto"
622-
# This is because custom RADIO modules might not fully support accelerate's device_map
623-
if device != "cpu" and hasattr(model, "encoder"):
624-
# Check if encoder has any buffers on CPU
625-
cpu_buffers = []
626-
for name, buffer in model.encoder.named_buffers():
627-
if buffer.device.type == "cpu":
628-
cpu_buffers.append(name)
629-
630-
if cpu_buffers:
631-
print(f"Found {len(cpu_buffers)} encoder buffers on CPU. Moving encoder to {device}...")
632-
model.encoder = model.encoder.to(device)
633-
print(f"Encoder moved to {device}")
634-
635589
if device == "cuda" and not is_model_on_gpu(model):
636590
print("Warning: Some parameters are not on a GPU. Calibration can be slow or hit OOM")
637591

examples/llm_ptq/hf_ptq.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,12 @@ def load_model(args: argparse.Namespace):
361361
default_pad_token = None
362362

363363
is_nemotron_vl_model = is_nemotron_vl(full_model)
364+
365+
# Default to image-text calibration for Nemotron VL models
366+
if is_nemotron_vl_model and not args.calib_with_images:
367+
print("Nemotron VL model detected. Enabling image-text calibration by default.")
368+
args.calib_with_images = True
369+
364370
if model_type == "mllama":
365371
processor = get_processor(
366372
args.pyt_ckpt_path,
@@ -689,7 +695,7 @@ def pre_quantize(
689695
preview_input_ids,
690696
args.pyt_ckpt_path,
691697
"before quantization",
692-
allow_fallback=True,
698+
allow_fallback=False,
693699
)
694700
else:
695701
# Standard generation for non-Nemotron VL models

examples/llm_ptq/vlm_utils.py

Lines changed: 25 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
import os
1919

2020
from PIL import Image
21-
from transformers import AutoImageProcessor, AutoProcessor, GenerationConfig
21+
from transformers import AutoImageProcessor, AutoProcessor
2222

2323

2424
def run_vl_preview_generation(model, tokenizer, model_path, stage_name):
@@ -73,34 +73,13 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name):
7373
print(" Skipping VL preview generation.")
7474
return None
7575

76-
# Check if this is Nemotron-Parse early to set up proper generation config
77-
config = model.config
78-
architectures = getattr(config, "architectures", [])
79-
is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures)
80-
8176
# Generate response
8277
question = "Describe this image briefly." # Updated for single image
83-
84-
# Use model's GenerationConfig for Nemotron-Parse, dict for others
85-
if is_nemotron_parse:
86-
try:
87-
generation_config = GenerationConfig.from_pretrained(
88-
model_path, trust_remote_code=True
89-
)
90-
print("Using Nemotron-Parse GenerationConfig from model")
91-
except Exception as e:
92-
print(f"Warning: Could not load GenerationConfig: {e}, using defaults")
93-
generation_config = {
94-
"max_new_tokens": 50,
95-
"do_sample": False,
96-
"eos_token_id": tokenizer.eos_token_id,
97-
}
98-
else:
99-
generation_config = {
100-
"max_new_tokens": 50,
101-
"do_sample": False,
102-
"eos_token_id": tokenizer.eos_token_id,
103-
}
78+
generation_config = {
79+
"max_new_tokens": 50,
80+
"do_sample": False,
81+
"eos_token_id": tokenizer.eos_token_id,
82+
}
10483

10584
print(f"Generating VL response ({stage_name})...")
10685

@@ -126,14 +105,8 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name):
126105
else:
127106
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
128107

129-
# is_nemotron_parse was already computed above
130-
if is_nemotron_parse:
131-
# Nemotron-Parse uses a specific task prompt format
132-
# See: https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1#usage-example
133-
prompt = "</s><s><predict_bbox><predict_classes><output_markdown>"
134-
print(f"Using Nemotron-Parse task prompt: {prompt}")
135-
else:
136-
# Other VL models use chat templates
108+
# Use chat template if available, otherwise fall back to a minimal BOS-prefixed prompt
109+
if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None:
137110
messages = [
138111
{"role": "system", "content": "/no_think"},
139112
{
@@ -150,11 +123,13 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name):
150123
],
151124
},
152125
]
153-
154-
# Apply chat template
155126
prompt = tokenizer.apply_chat_template(
156127
messages, tokenize=False, add_generation_prompt=True
157128
)
129+
else:
130+
# For models without chat templates (e.g., encoder-decoder VL models),
131+
# prefix the question with the tokenizer's BOS token (if set) as a minimal prompt
132+
prompt = (tokenizer.bos_token or "") + question
158133

159134
# Process inputs using the processor with single image
160135
inputs = processor(
@@ -175,22 +150,12 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name):
175150
)
176151

177152
# Generate response using model.generate
178-
if isinstance(generation_config, GenerationConfig):
179-
# For Nemotron-Parse with GenerationConfig object
180-
generated_ids = model.generate(
181-
pixel_values=inputs.pixel_values,
182-
input_ids=inputs.input_ids,
183-
attention_mask=inputs.attention_mask,
184-
generation_config=generation_config,
185-
)
186-
else:
187-
# For other models with dict generation config
188-
generated_ids = model.generate(
189-
pixel_values=inputs.pixel_values,
190-
input_ids=inputs.input_ids,
191-
attention_mask=inputs.attention_mask,
192-
**generation_config,
193-
)
153+
generated_ids = model.generate(
154+
pixel_values=inputs.pixel_values,
155+
input_ids=inputs.input_ids,
156+
attention_mask=inputs.attention_mask,
157+
**generation_config,
158+
)
194159

195160
# Decode the response (trim input tokens like in the working example)
196161
if generated_ids is None:
@@ -199,20 +164,13 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name):
199164
generated_ids_trimmed = [
200165
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
201166
]
202-
203-
# For Nemotron-Parse, use tokenizer.batch_decode instead of processor.batch_decode
204-
if is_nemotron_parse and hasattr(tokenizer, "batch_decode"):
205-
output_text = tokenizer.batch_decode(
206-
generated_ids_trimmed,
207-
skip_special_tokens=True,
208-
clean_up_tokenization_spaces=False,
209-
)
210-
else:
211-
output_text = processor.batch_decode(
212-
generated_ids_trimmed,
213-
skip_special_tokens=True,
214-
clean_up_tokenization_spaces=False,
215-
)
167+
# Use processor.batch_decode if available, otherwise fall back to tokenizer
168+
decoder = processor if hasattr(processor, "batch_decode") else tokenizer
169+
output_text = decoder.batch_decode(
170+
generated_ids_trimmed,
171+
skip_special_tokens=True,
172+
clean_up_tokenization_spaces=False,
173+
)
216174

217175
if output_text is None or len(output_text) == 0:
218176
raise ValueError("Decoding returned empty output")

0 commit comments

Comments
 (0)