1414# limitations under the License.
1515
1616import argparse
17- import contextlib
18- import os
1917import random
2018import time
2119import warnings
104102 "mxfp8" : mtq .MXFP8_DEFAULT_CFG ,
105103 "qwen3_nvfp4_qkv_disabled" : mtq .NVFP4_DEFAULT_CFG ,
106104 "qwen3_nvfp4_qkvo_disabled" : mtq .NVFP4_DEFAULT_CFG ,
107- "qwen3_first_and_last_n_disabled " : mtq .NVFP4_DEFAULT_CFG ,
105+ "qwen3_nvfp4_first_and_last_n_disabled " : mtq .NVFP4_DEFAULT_CFG ,
108106}
109107
110108KV_QUANT_CFG_CHOICES = {
@@ -199,9 +197,6 @@ def make_calib_dataloader(
199197 num_samples = args .calib_size [0 ],
200198 )
201199 elif model_type == "qwen3omni" :
202- assert len (args .calib_size ) == 1 , (
203- "qwen3omni only supports one dataset for calibration, can extend this in the future"
204- )
205200 assert processor is not None , "The processor must be set for qwen3omni model."
206201 dataset_name = args .dataset [0 ] if args .dataset else "cnn_dailymail"
207202 # Check if using video dataset (e.g., finevideo)
@@ -394,10 +389,6 @@ def load_model(args: argparse.Namespace):
394389 attn_implementation = args .attn_implementation ,
395390 )
396391
397- # Uncomment this to load the model from a .pt file
398- # model = mto.restore(model, "./qwen3_omni_30b_nvfp4/model.pt")
399- # print("Qwen3Omni model restored from checkpoint")
400-
401392 quant_cfg = QUANT_CFG_CHOICES [args .qformat ]
402393 else :
403394 assert args .qformat in QUANT_CFG_CHOICES , (
@@ -425,18 +416,13 @@ def load_model(args: argparse.Namespace):
425416 calibration_only = True
426417
427418 model_type = get_model_type (full_model )
428- if model_type == "qwen3omni" and os . environ . get ( "DISABLE_TALKER" , "0" ) == "1" :
419+ if model_type == "qwen3omni" :
429420 print ("Disabling talker for Qwen3Omni model" )
430421 full_model .disable_talker ()
431422
432423 device = full_model .device
433424 if hasattr (full_model , "model" ):
434425 device = full_model .model .device
435- # For multi-GPU models with device_map="auto", model.device may return 'meta' or 'cpu'
436- # since parameters are distributed. Force cuda:0 for input tensors.
437- if device is None or str (device ) in ("meta" , "cpu" ):
438- device = "cuda"
439- print (f"Overriding device to { device } " )
440426
441427 processor = None
442428 tokenizer = None
@@ -620,158 +606,6 @@ def mono_quantize(
620606 if language_model_lineage is not None :
621607 print ("Updating full_model with quantized language_model..." )
622608 language_model_lineage [- 2 ].language_model = language_model
623-
624- # Qwen3 specific quantizer disabling patterns (thinker.model.layers only)
625- if "qkv_disabled" in args .qformat :
626- # Disable q_proj, k_proj, v_proj quantizers
627- for proj in ["q_proj" , "k_proj" , "v_proj" ]:
628- quant_cfg ["quant_cfg" ][f"*thinker.model.layers.*.self_attn.{ proj } *" ] = {
629- "enable" : False
630- }
631- if "qkvo_disabled" in args .qformat :
632- # Disable q_proj, k_proj, v_proj, o_proj quantizers
633- for proj in ["o_proj" ]:
634- quant_cfg ["quant_cfg" ][f"*thinker.model.layers.*.self_attn.{ proj } *" ] = {
635- "enable" : False
636- }
637- if "first_and_last_n_disabled" in args .qformat :
638- # Disable both first N and last N layers
639- total_layers = 48
640- n_layers_to_disable = 4
641- for i in range (n_layers_to_disable ):
642- quant_cfg ["quant_cfg" ][f"*thinker.model.layers.{ i } .*" ] = {"enable" : False }
643- for i in range (total_layers - n_layers_to_disable , total_layers ):
644- quant_cfg ["quant_cfg" ][f"*thinker.model.layers.{ i } .*" ] = {"enable" : False }
645-
646- if not model_is_already_quantized or calibration_only :
647- # Only run single sample for preview
648- calib_batch = next (iter (calib_dataloader ))
649- input_ids = calib_batch ["input_features" if model_type == "whisper" else "input_ids" ][
650- 0 :1
651- ]
652-
653- # Generate preview before quantization
654- if is_nemotron_vl_model and tokenizer is not None :
655- generated_ids_before_ptq = run_nemotron_vl_preview (
656- full_model ,
657- tokenizer ,
658- input_ids ,
659- args .pyt_ckpt_path ,
660- "before quantization" ,
661- allow_fallback = True ,
662- )
663- elif model_type == "qwen3omni" :
664- # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences
665- # Pass full batch with all multimodal inputs
666- result = full_model .generate (** calib_batch , max_new_tokens = 100 )
667- if isinstance (result , tuple ):
668- text_ids , _ = result
669- generated_ids_before_ptq = (
670- text_ids .sequences if hasattr (text_ids , "sequences" ) else text_ids
671- )
672- else :
673- generated_ids_before_ptq = result
674- else :
675- # Standard generation for non-Nemotron VL models
676- generated_ids_before_ptq = full_model .generate (input_ids , max_new_tokens = 100 )
677- if model_type == "gptoss" and args .qformat == "nvfp4_mlp_only" :
678- print ("Applying nvfp4 quantization (MoE only) for gpt-oss" )
679-
680- # quantize the model
681- model = quantize_model (model , quant_cfg , args , calib_dataloader , calibration_only )
682-
683- # For VL models, update full_model to use the quantized language model
684- if is_nemotron_vl_model :
685- language_model_lineage = get_language_model_from_vl (full_model )
686- if language_model_lineage is not None :
687- print ("Updating full_model with quantized language_model..." )
688- language_model_lineage [- 2 ].language_model = model
689-
690- if args .verbose :
691- with open ("./quant_summary.txt" , "w" ) as f , contextlib .redirect_stdout (f ):
692- mtq .print_quant_summary (full_model )
693-
694- # Run some samples
695- torch .cuda .empty_cache ()
696- generated_ids_after_ptq = None
697- if model_type == "qwen3omni" :
698- # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences
699- # Pass full batch with all multimodal inputs
700- result = full_model .generate (** calib_batch , max_new_tokens = 100 )
701- if isinstance (result , tuple ):
702- text_ids , _ = result
703- generated_ids_after_ptq = (
704- text_ids .sequences if hasattr (text_ids , "sequences" ) else text_ids
705- )
706- else :
707- generated_ids_after_ptq = result
708- elif model_type != "llama4" and not is_nemotron_vl_model :
709- # Our fake quantizer may not be fully compatible with torch.compile.
710- generated_ids_after_ptq = full_model .generate (input_ids , max_new_tokens = 100 )
711- elif is_nemotron_vl_model and tokenizer is not None :
712- generated_ids_after_ptq = run_nemotron_vl_preview (
713- full_model ,
714- tokenizer ,
715- input_ids ,
716- args .pyt_ckpt_path ,
717- "after quantization" ,
718- allow_fallback = False ,
719- )
720- else :
721- warnings .warn (
722- "Llama4 Maverick generation after quantization has a bug. Skipping generation sample."
723- )
724-
725- def input_decode (input_ids ):
726- # BaseImageProcessor covers MllamaImageProcessor and Qwen3OmniImageProcessor
727- if processor is not None and isinstance (processor , BaseImageProcessor ):
728- return processor .tokenizer .batch_decode (input_ids )
729- elif processor is not None and isinstance (processor , WhisperProcessor ):
730- return first_text
731- elif tokenizer is not None :
732- return tokenizer .batch_decode (input_ids )
733- else :
734- raise ValueError ("The processor or tokenizer must be set" )
735-
736- def output_decode (generated_ids , input_shape ):
737- if is_enc_dec (model_type ):
738- if processor is not None and isinstance (processor , WhisperProcessor ):
739- return processor .tokenizer .batch_decode (
740- generated_ids , skip_special_tokens = True
741- )[0 ]
742- elif tokenizer is not None :
743- return tokenizer .batch_decode (generated_ids , skip_special_tokens = True )
744- elif processor is not None and isinstance (processor , MllamaImageProcessor ):
745- return processor .tokenizer .batch_decode (generated_ids [:, input_shape :])
746- elif processor is not None and isinstance (processor , Qwen3OmniImageProcessor ):
747- return processor .tokenizer .batch_decode (
748- generated_ids [:, input_shape :],
749- skip_special_tokens = True ,
750- clean_up_tokenization_spaces = False ,
751- )
752- elif tokenizer is not None :
753- return tokenizer .batch_decode (generated_ids [:, input_shape :])
754- else :
755- raise ValueError ("The processor or tokenizer must be set" )
756-
757- if generated_ids_after_ptq is not None :
758- print ("--------" )
759- if is_nemotron_vl_model :
760- # For Nemotron VL models, generated_ids are text strings from model.chat()
761- print ("Nemotron VL model text-only generation results:" )
762- print (f"Text response before quantization: { generated_ids_before_ptq } " )
763- print ("--------" )
764- print (f"Text response after quantization: { generated_ids_after_ptq } " )
765- print ("--------" )
766- print ("Note: Additional VL tests with images were run separately above" )
767- else :
768- # For regular LLMs, generated_ids are token tensors that need decoding
769- print (f"example test input: { input_decode (input_ids )} " )
770- print ("--------" )
771- print (
772- f"example outputs before ptq: { output_decode (generated_ids_before_ptq , input_ids .shape [1 ])} "
773- )
774- print ("--------" )
775609 else :
776610 warnings .warn ("Skipping quantization: model is already quantized." )
777611
@@ -785,12 +619,6 @@ def export_quantized(
785619 default_padding_side ,
786620 default_pad_token ,
787621):
788- # Uncomment this to save the model as a .pt file
789- # if model_type == "qwen3omni":
790- # print("Export of Qwen3Omni model is not supported yet. Saving .pt file instead.")
791- # os.makedirs(os.path.dirname(args.export_path), exist_ok=True)
792- # mto.save(full_model, f"{args.export_path}/model.pt")
793-
794622 with torch .inference_mode ():
795623 if model_type is None :
796624 print (f"Unknown model type { type (language_model ).__name__ } . Continue exporting..." )
0 commit comments