@@ -68,39 +68,26 @@ def run_nemotron_vl_preview(
6868 """
6969 from vlm_utils import run_text_only_generation , run_vl_preview_generation
7070
71- # Check if this is Nemotron-Parse (encoder-decoder model that requires images)
72- config = full_model .config
73- architectures = getattr (config , "architectures" , [])
74- is_nemotron_parse = any ("nemotronparse" in arch .lower () for arch in architectures )
71+ print (f"Running text-only preview generation for Nemotron VL model ({ stage_name } )..." )
72+ question = tokenizer .decode (input_ids [0 ], skip_special_tokens = True )
73+ generation_config = {
74+ "max_new_tokens" : 100 ,
75+ "do_sample" : False ,
76+ "eos_token_id" : tokenizer .eos_token_id ,
77+ }
78+
79+ # Try text-only generation (may fail for encoder-decoder models like Nemotron-Parse)
80+ text_response = run_text_only_generation (
81+ full_model , tokenizer , question , generation_config , pyt_ckpt_path
82+ )
7583
7684 generated_ids = None
77-
78- if not is_nemotron_parse :
79- # Only try text-only generation for models that support it (not Nemotron-Parse)
80- print (f"Running text-only preview generation for Nemotron VL model ({ stage_name } )..." )
81- question = tokenizer .decode (input_ids [0 ], skip_special_tokens = True )
82- generation_config = {
83- "max_new_tokens" : 100 ,
84- "do_sample" : False ,
85- "eos_token_id" : tokenizer .eos_token_id ,
86- }
87-
88- # Try text-only generation
89- text_response = run_text_only_generation (
90- full_model , tokenizer , question , generation_config , pyt_ckpt_path
91- )
92-
93- if text_response is not None :
94- print (f"✅ Text-only generation successful: { text_response [:100 ]} ..." )
95- generated_ids = text_response
96- elif allow_fallback :
97- print ("Text-only generation failed, falling back to standard generate..." )
98- generated_ids = full_model .generate (input_ids , max_new_tokens = 100 )
99- else :
100- print (
101- f"Skipping text-only generation for Nemotron-Parse ({ stage_name } ) - "
102- "this encoder-decoder model requires images for all operations."
103- )
85+ if text_response is not None :
86+ print (f"✅ Text-only generation successful: { text_response [:100 ]} ..." )
87+ generated_ids = text_response
88+ elif allow_fallback :
89+ print ("Text-only generation failed, falling back to standard generate..." )
90+ generated_ids = full_model .generate (input_ids , max_new_tokens = 100 )
10491
10592 # Run additional VL test with images
10693 print (f"Running additional VL test with images ({ stage_name } )..." )
@@ -111,10 +98,6 @@ def run_nemotron_vl_preview(
11198
11299def _is_multimodal_config (config ):
113100 """Check if a config indicates a multimodal model (config-only version of is_multimodal_model)."""
114- # Check for Nemotron-Parse encoder-decoder architecture
115- architectures = getattr (config , "architectures" , [])
116- is_nemotron_parse = any ("nemotronparse" in arch .lower () for arch in architectures )
117-
118101 return (
119102 hasattr (config , "vision_config" ) # Standard vision config (e.g., Qwen2.5-VL)
120103 or getattr (config , "model_type" , "" ) == "phi4mm" # Phi-4 multimodal
@@ -123,7 +106,10 @@ def _is_multimodal_config(config):
123106 or (
124107 hasattr (config , "embd_layer" ) and hasattr (config .embd_layer , "image_embd_layer" )
125108 ) # Image embedding layers
126- or is_nemotron_parse # Nemotron-Parse conditional generation model
109+ or getattr (config , "is_encoder_decoder" , False ) # Encoder-decoder VL models
110+ or any ( # Architecture-based detection for custom VL models (e.g., Nemotron-Parse)
111+ "conditionalgeneration" in arch .lower () for arch in getattr (config , "architectures" , [])
112+ )
127113 )
128114
129115
@@ -176,9 +162,20 @@ def calibrate_loop(_model):
176162 )
177163 allowed_keys = set (forward_params .keys ())
178164
165+ # Check if model is encoder-decoder (needs decoder_input_ids instead of input_ids)
166+ is_enc_dec = getattr (full_model .config , "is_encoder_decoder" , False )
167+
179168 full_model .eval ()
180169 with torch .no_grad ():
181170 for batch in calib_dataloader :
171+ # For encoder-decoder models, rename input_ids → decoder_input_ids
172+ # and disable KV caching to avoid tuple index errors in decoder layers
173+ if is_enc_dec and "input_ids" in batch and "pixel_values" in batch :
174+ batch ["decoder_input_ids" ] = batch .pop ("input_ids" )
175+ if "attention_mask" in batch :
176+ batch ["decoder_attention_mask" ] = batch .pop ("attention_mask" )
177+ batch ["use_cache" ] = False
178+
182179 # Filter batch to only include parameters the model accepts
183180 if accepts_kwargs :
184181 call_kwargs = batch
@@ -190,10 +187,8 @@ def calibrate_loop(_model):
190187 # Use safe_nemotron_vl_forward for Nemotron Nano VL (embedding-injection style)
191188 # For other VLMs (like Nemotron-Parse), use standard forward
192189 if hasattr (full_model , "img_context_token_id" ):
193- # Nemotron Nano VL style
194190 safe_nemotron_vl_forward (full_model , call_kwargs )
195191 else :
196- # Standard encoder-decoder or other VLM architectures
197192 full_model (** call_kwargs )
198193
199194 return calibrate_loop
@@ -276,20 +271,9 @@ def get_tokenizer(ckpt_path, trust_remote_code=False, **kwargs) -> PreTrainedTok
276271 if "vila" in ckpt_path .lower ():
277272 ckpt_path += "/llm"
278273
279- # Some custom tokenizers (e.g., Nemotron-Parse) print verbose output when loading.
280- # Only suppress stdout for trust_remote_code models where custom tokenizer code may be noisy.
281- if trust_remote_code :
282- import contextlib
283- import io
284-
285- with contextlib .redirect_stdout (io .StringIO ()):
286- tokenizer = AutoTokenizer .from_pretrained (
287- ckpt_path , trust_remote_code = trust_remote_code , ** kwargs
288- )
289- else :
290- tokenizer = AutoTokenizer .from_pretrained (
291- ckpt_path , trust_remote_code = trust_remote_code , ** kwargs
292- )
274+ tokenizer = AutoTokenizer .from_pretrained (
275+ ckpt_path , trust_remote_code = trust_remote_code , ** kwargs
276+ )
293277
294278 # can't set attribute 'pad_token' for "<unk>"
295279 # We skip this step for Nemo models
@@ -342,18 +326,9 @@ def get_processor(
342326
343327 return MllamaImageProcessor (processor , device )
344328 else :
345- # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse).
346- # Suppress stdout for trust_remote_code models where custom processor code may be noisy.
347- import contextlib
348- import io
349-
329+ # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse)
350330 try :
351- if model_kwargs .get ("trust_remote_code" , False ):
352- with contextlib .redirect_stdout (io .StringIO ()):
353- processor = AutoProcessor .from_pretrained (ckpt_path , ** model_kwargs )
354- else :
355- processor = AutoProcessor .from_pretrained (ckpt_path , ** model_kwargs )
356-
331+ processor = AutoProcessor .from_pretrained (ckpt_path , ** model_kwargs )
357332 print (f"Loaded AutoProcessor for model type: { model_type } " )
358333 return processor
359334 except Exception as e :
@@ -493,22 +468,12 @@ def get_model(
493468 try :
494469 hf_config = AutoConfig .from_pretrained (ckpt_path , ** config_kwargs )
495470
496- # Check specifically for Nemotron-Parse
497- architectures = getattr (hf_config , "architectures" , [])
498- is_nemotron_parse = any ("nemotronparse" in arch .lower () for arch in architectures )
499-
500471 if is_nemotron_vl (hf_config ):
501- if is_nemotron_parse :
502- # Nemotron-Parse works fine with device_map="auto"
503- # Keep device_map="auto" to ensure proper device placement
504- print ("Detected Nemotron-Parse model from config. Using automatic device mapping." )
505- else :
506- # For other Nemotron VL models, disable device_map for compatibility
507- print (
508- "Detected Nemotron VL model from config. "
509- "Disabling automatic device mapping for compatibility."
510- )
511- device_map = None
472+ print (
473+ "Detected Nemotron VL model from config. "
474+ "Disabling automatic device mapping for compatibility."
475+ )
476+ device_map = None
512477 except Exception as e :
513478 print (f"Error: Could not load config from { ckpt_path } : { e } " )
514479 raise RuntimeError (f"Failed to load model configuration from { ckpt_path } " ) from e
@@ -564,13 +529,17 @@ def get_model(
564529 if not hasattr (transformers , architecture ):
565530 warnings .warn (
566531 f"Architecture { architecture } not found in transformers: { transformers .__version__ } . "
567- "Falling back to AutoModel."
532+ "Falling back to AutoModelForCausalLM (or AutoModel for non-causal architectures) ."
568533 )
569534 assert trust_remote_code , (
570535 "Please set trust_remote_code to True if you want to use this architecture"
571536 )
572537
573- auto_model_module = AutoModel
538+ # Use AutoModelForCausalLM for causal LMs, AutoModel for encoder-decoder models
539+ if getattr (hf_config , "is_encoder_decoder" , False ):
540+ auto_model_module = AutoModel
541+ else :
542+ auto_model_module = AutoModelForCausalLM
574543 from_config = auto_model_module .from_config
575544 else :
576545 auto_model_module = getattr (transformers , architecture )
@@ -617,21 +586,6 @@ def get_model(
617586 print (f"Moving model to { device } device..." )
618587 model = model .to (device )
619588
620- # For Nemotron-Parse, ensure the encoder (including RADIO) is fully on device
621- # The RADIO encoder has buffers that might not be properly moved even with device_map="auto"
622- # This is because custom RADIO modules might not fully support accelerate's device_map
623- if device != "cpu" and hasattr (model , "encoder" ):
624- # Check if encoder has any buffers on CPU
625- cpu_buffers = []
626- for name , buffer in model .encoder .named_buffers ():
627- if buffer .device .type == "cpu" :
628- cpu_buffers .append (name )
629-
630- if cpu_buffers :
631- print (f"Found { len (cpu_buffers )} encoder buffers on CPU. Moving encoder to { device } ..." )
632- model .encoder = model .encoder .to (device )
633- print (f"Encoder moved to { device } " )
634-
635589 if device == "cuda" and not is_model_on_gpu (model ):
636590 print ("Warning: Some parameters are not on a GPU. Calibration can be slow or hit OOM" )
637591
0 commit comments