From 7dca9d431d806d6d11f848c0e71f546864b52c19 Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Thu, 11 Dec 2025 06:02:06 +0000 Subject: [PATCH 01/19] Add support for Qwen3-Omni-30B-A3B-Thinking --- examples/llm_ptq/example_utils.py | 19 +++- examples/llm_ptq/hf_ptq.py | 57 ++++++++++- modelopt/torch/export/model_utils.py | 1 + modelopt/torch/utils/dataset_utils.py | 47 +++++++-- modelopt/torch/utils/image_processor.py | 116 ++++++++++++++++++++++ modelopt/torch/utils/vlm_dataset_utils.py | 4 +- 6 files changed, 228 insertions(+), 16 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index aad29fc97c..26f8a75099 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -42,7 +42,11 @@ snapshot_download = None import modelopt.torch.quantization as mtq -from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor +from modelopt.torch.utils.image_processor import ( + BaseImageProcessor, + MllamaImageProcessor, + Qwen3OmniImageProcessor, +) SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"] @@ -310,6 +314,19 @@ def get_processor( ) return MllamaImageProcessor(processor, device) + elif model_type == "qwen3omni": + processor = AutoProcessor.from_pretrained( + ckpt_path, + padding_side="left", + **model_kwargs, + ) + if processor.tokenizer.pad_token is None: + processor.tokenizer.pad_token = processor.tokenizer.eos_token + assert processor.tokenizer.pad_token is not None, ( + f"Pad token for {ckpt_path} cannot be set!" 
+ ) + + return Qwen3OmniImageProcessor(processor, device) return None diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index e32d0dae84..84db44bc27 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -63,7 +63,11 @@ get_max_batch_size, get_supported_datasets, ) -from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor +from modelopt.torch.utils.image_processor import ( + BaseImageProcessor, + MllamaImageProcessor, + Qwen3OmniImageProcessor, +) from modelopt.torch.utils.memory_monitor import launch_memory_monitor from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader @@ -179,6 +183,19 @@ def make_calib_dataloader( batch_size=args.batch_size, num_samples=args.calib_size[0], ) + elif model_type == "qwen3omni": + assert processor is not None and isinstance(processor, Qwen3OmniImageProcessor), ( + "The Qwen3OmniImageProcessor must be set." + ) + assert len(args.calib_size) == 1, ( + "qwen3omni only supports one dataset for calibration, can extend this in the future" + ) + calib_dataloader = get_vlm_dataset_dataloader( + dataset_name=args.dataset[0] if args.dataset else "scienceqa", + processor=processor, + batch_size=args.batch_size, + num_samples=args.calib_size[0], + ) elif model_type == "whisper": assert processor is not None and isinstance(processor, WhisperProcessor), ( "The AutoProcessor must be set." @@ -349,10 +366,16 @@ def load_model(args: argparse.Namespace): calibration_only = True model_type = get_model_type(full_model) + if model_type == "qwen3omni": + full_model.disable_talker() device = full_model.device if hasattr(full_model, "model"): device = full_model.model.device + # For multi-GPU models with device_map="auto", model.device may return 'meta' or 'cpu' + # since parameters are distributed. Force cuda:0 for input tensors. 
+ if device is None or str(device) in ("meta", "cpu"): + device = "cuda" processor = None tokenizer = None language_model = full_model @@ -360,7 +383,8 @@ def load_model(args: argparse.Namespace): default_pad_token = None is_nemotron_vl_model = is_nemotron_vl(full_model) - if model_type == "mllama": + + if model_type in ["mllama", "qwen3omni"]: processor = get_processor( args.pyt_ckpt_path, model_type, @@ -679,6 +703,16 @@ def pre_quantize( "before quantization", allow_fallback=True, ) + elif model_type == "qwen3omni": + # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences + result = full_model.generate(preview_input_ids, max_new_tokens=100) + if isinstance(result, tuple): + text_ids, _ = result + generated_ids_before_ptq = ( + text_ids.sequences if hasattr(text_ids, "sequences") else text_ids + ) + else: + generated_ids_before_ptq = result else: # Standard generation for non-Nemotron VL models generated_ids_before_ptq = full_model.generate(preview_input_ids, max_new_tokens=100) @@ -715,6 +749,16 @@ def post_quantize( generated_ids_after_ptq = None if generated_ids_before_ptq is None: pass + elif model_type == "qwen3omni": + # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences + result = full_model.generate(preview_input_ids, max_new_tokens=100) + if isinstance(result, tuple): + text_ids, _ = result + generated_ids_after_ptq = ( + text_ids.sequences if hasattr(text_ids, "sequences") else text_ids + ) + else: + generated_ids_after_ptq = result elif model_type != "llama4" and not is_nemotron_vl_model: # Our fake quantizer may not be fully compatible with torch.compile. 
generated_ids_after_ptq = full_model.generate(preview_input_ids, max_new_tokens=100) @@ -733,7 +777,8 @@ def post_quantize( ) def input_decode(input_ids): - if processor is not None and isinstance(processor, MllamaImageProcessor): + # BaseImageProcessor covers MllamaImageProcessor and Qwen3OmniImageProcessor + if processor is not None and isinstance(processor, BaseImageProcessor): return processor.tokenizer.batch_decode(input_ids) elif processor is not None and isinstance(processor, WhisperProcessor): return first_text_speech_dataset @@ -750,6 +795,12 @@ def output_decode(generated_ids, input_shape): return tokenizer.batch_decode(generated_ids, skip_special_tokens=True) elif processor is not None and isinstance(processor, MllamaImageProcessor): return processor.tokenizer.batch_decode(generated_ids[:, input_shape:]) + elif processor is not None and isinstance(processor, Qwen3OmniImageProcessor): + return processor.tokenizer.batch_decode( + generated_ids[:, input_shape:], + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) elif tokenizer is not None: return tokenizer.batch_decode(generated_ids[:, input_shape:]) else: diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py index 5a24429ad7..4e08f3dccb 100755 --- a/modelopt/torch/export/model_utils.py +++ b/modelopt/torch/export/model_utils.py @@ -31,6 +31,7 @@ "ChatGLM": "chatglm", "Qwen3Moe": "qwen3moe", "Qwen3Next": "qwen3next", + "Qwen3OmniMoeForConditionalGeneration": "qwen3omni", "QWen": "qwen", "RecurrentGemma": "recurrentgemma", "Gemma3": "gemma3", diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py index 042e74ba5b..60ead12078 100644 --- a/modelopt/torch/utils/dataset_utils.py +++ b/modelopt/torch/utils/dataset_utils.py @@ -283,7 +283,8 @@ def _get_free_gpu_mem(): free_mem_before, max_allocated_before = _get_free_gpu_mem() is_enc_dec = model_type_is_enc_dec(model) - infer_method = model.generate if is_enc_dec else 
model.forward + requires_generate = _model_requires_generate(model) + infer_method = model.generate if (is_enc_dec or requires_generate) else model.forward if sample_input_single_batch is None: sample_input_single_batch = ( @@ -349,11 +350,15 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None): Returns: The maximum batch size that worked successfully """ - assert all(torch.is_tensor(data) or data is None for data in batch_data.values()), ( - "batch_data values must be tensors" + # Separate tensor values from scalar parameters (like max_new_tokens) + tensor_data = {k: v for k, v in batch_data.items() if torch.is_tensor(v) or v is None} + scalar_data = {k: v for k, v in batch_data.items() if not torch.is_tensor(v) and v is not None} + + assert all(torch.is_tensor(data) or data is None for data in tensor_data.values()), ( + "tensor_data values must be tensors" ) # Get the batch size of current data - batch_size = batch_data[next(iter(batch_data.keys()))].shape[0] + batch_size = tensor_data[next(iter(tensor_data.keys()))].shape[0] # If we know a smaller batch size works, preemptively split if max_working_batch_size is not None and batch_size > max_working_batch_size: @@ -361,11 +366,13 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None): for i in range(0, batch_size, max_working_batch_size): end_idx = min(i + max_working_batch_size, batch_size) split_data = {} - for key in batch_data: - if batch_data[key] is None: + for key in tensor_data: + if tensor_data[key] is None: split_data[key] = None else: - split_data[key] = batch_data[key][i:end_idx, ...] + split_data[key] = tensor_data[key][i:end_idx, ...] 
+ # Add back scalar data (non-tensor params like max_new_tokens) + split_data.update(scalar_data) max_working_batch_size = _process_batch( split_data, infer_method, max_working_batch_size @@ -392,8 +399,11 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None): # Split the batch in half mid = (batch_size + 1) // 2 warn(f"CUDA out of memory with batch size {batch_size}, trying with batch size {mid}") - split_data_1 = {key: batch_data[key][:mid, ...] for key in batch_data} - split_data_2 = {key: batch_data[key][mid:, ...] for key in batch_data} + split_data_1 = {key: tensor_data[key][:mid, ...] for key in tensor_data} + split_data_2 = {key: tensor_data[key][mid:, ...] for key in tensor_data} + # Add back scalar data (non-tensor params like max_new_tokens) + split_data_1.update(scalar_data) + split_data_2.update(scalar_data) # Recursively process each half and track max working batch size max_working_batch_size = _process_batch(split_data_1, infer_method) @@ -412,10 +422,15 @@ def _forward_loop(model: torch.nn.Module, dataloader: DataLoader) -> None: """ with torch.no_grad(): is_enc_dec = model_type_is_enc_dec(model) - infer_method = model.generate if is_enc_dec else model.forward + requires_generate = _model_requires_generate(model) + use_generate = is_enc_dec or requires_generate + infer_method = model.generate if use_generate else model.forward max_working_batch_size = None # Initialize max working batch size as None for _, data in enumerate(tqdm(dataloader)): + # For generate(), add max_new_tokens to prevent indefinite generation during calibration + if use_generate: + data["max_new_tokens"] = 1 # Process batch and update max working batch size max_working_batch_size = _process_batch(data, infer_method, max_working_batch_size) @@ -493,3 +508,15 @@ def create_forward_loop( def model_type_is_enc_dec(model): enc_dec_model_list = ["t5", "bart", "whisper"] return any(model_name in model.__class__.__name__.lower() for model_name in 
enc_dec_model_list) + + +def _model_requires_generate(model): + """Check if model requires generate() instead of forward() for calibration. + + Some conditional generation models (like Qwen3-Omni) don't have a standard + forward(input_ids, ...) signature and need to use generate() for calibration. + """ + # Models that require generate() for calibration instead of forward() + generate_model_list = ["qwen3omni"] + model_name = model.__class__.__name__.lower() + return any(name in model_name for name in generate_model_list) diff --git a/modelopt/torch/utils/image_processor.py b/modelopt/torch/utils/image_processor.py index 6374642e3d..258209188f 100644 --- a/modelopt/torch/utils/image_processor.py +++ b/modelopt/torch/utils/image_processor.py @@ -25,6 +25,9 @@ class BaseImageProcessor: def __init__(self, tokenizer, device="cuda"): """Constructor.""" self.tokenizer = tokenizer + # Handle invalid device values that can come from multi-GPU models with device_map="auto" + if device is None or str(device) in ("auto", "meta", "cpu"): + device = "cuda" self.device = device def __call__(self, **kwargs): @@ -110,3 +113,116 @@ def collate_function(self, batch): ).to(self.device) return batch[0] + + +class Qwen3OmniImageProcessor(BaseImageProcessor): + """Image processor for Qwen3-Omni multimodal model.""" + + def __init__(self, tokenizer, device="auto", use_audio_in_video=False): + """Constructor.""" + super().__init__(tokenizer, device) + self.use_audio_in_video = use_audio_in_video + # Try to import qwen_omni_utils for multimodal processing + try: + from qwen_omni_utils import process_mm_info + + self.process_mm_info = process_mm_info + except ImportError: + raise ImportError( + "qwen_omni_utils is required for Qwen3OmniImageProcessor. 
" + "Please install it from https://github.com/QwenLM/Qwen3-Omni" + ) + + def preprocess_function(self, examples): + """Preprocess function for Qwen3-Omni.""" + question = examples.get("question", "Describe this image.") + + # Build conversation in Qwen format + content = [] + if examples.get("image") is not None: + content.append({"type": "image", "image": examples["image"]}) + if examples.get("audio") is not None: + content.append({"type": "audio", "audio": examples["audio"]}) + if examples.get("video") is not None: + content.append({"type": "video", "video": examples["video"]}) + content.append({"type": "text", "text": question}) + + conversation = [{"role": "user", "content": content}] + + # Apply chat template (tokenize=False to get string) + text = self.tokenizer.apply_chat_template( + conversation, add_generation_prompt=True, tokenize=False + ) + + # Extract multimodal info using qwen_omni_utils + audios, images, videos = self.process_mm_info( + conversation, use_audio_in_video=self.use_audio_in_video + ) + + # Process inputs with the processor + values = self.tokenizer( + text=text, + audio=audios, + images=images, + videos=videos, + return_tensors="pt", + padding=True, + use_audio_in_video=self.use_audio_in_video, + ) + + # Define all possible keys to ensure consistent schema for Arrow serialization + all_keys = [ + "input_ids", + "attention_mask", + "pixel_values", + "image_grid_thw", + "audio_features", + "audio_feature_lens", + "video_grid_thw", + ] + + # Convert tensors to lists for Arrow serialization compatibility + # Tensor conversion back happens in collate_function + result = dict.fromkeys(all_keys) # Initialize all keys to None + for key, val in values.items(): + if val is not None and hasattr(val, "tolist"): + result[key] = val.tolist() + elif val is not None: + result[key] = val + + return result + + def collate_function(self, batch): + """Collate function to process inputs during data loading.""" + result = {} + + # Take first item from batch 
(batch_size handling) + first = batch[0] + + # Convert lists to tensors and move to device + if "input_ids" in first and first["input_ids"] is not None: + result["input_ids"] = torch.LongTensor(first["input_ids"]).to(self.device) + if "attention_mask" in first and first["attention_mask"] is not None: + result["attention_mask"] = torch.LongTensor(first["attention_mask"]).to(self.device) + + # Handle pixel values for images + if first.get("pixel_values") is not None: + result["pixel_values"] = torch.tensor(first["pixel_values"]).to(self.device) + + # Handle image grid thw (tile height width info) + if first.get("image_grid_thw") is not None: + result["image_grid_thw"] = torch.LongTensor(first["image_grid_thw"]).to(self.device) + + # Handle audio features if present + if first.get("audio_feature_lens") is not None: + result["audio_feature_lens"] = torch.LongTensor(first["audio_feature_lens"]).to( + self.device + ) + if first.get("audio_features") is not None: + result["audio_features"] = torch.tensor(first["audio_features"]).to(self.device) + + # Handle video features if present + if first.get("video_grid_thw") is not None: + result["video_grid_thw"] = torch.LongTensor(first["video_grid_thw"]).to(self.device) + + return result diff --git a/modelopt/torch/utils/vlm_dataset_utils.py b/modelopt/torch/utils/vlm_dataset_utils.py index 3f07c57715..196d452ebc 100644 --- a/modelopt/torch/utils/vlm_dataset_utils.py +++ b/modelopt/torch/utils/vlm_dataset_utils.py @@ -30,7 +30,7 @@ import torch from torch.utils.data import DataLoader -from .image_processor import MllamaImageProcessor +from .image_processor import BaseImageProcessor, MllamaImageProcessor from .nemotron_vlm_dataset_utils import NemotronTarPlusJsonlIterable, list_repo_files_cached # Use dict to store the config for each dataset. 
@@ -331,7 +331,7 @@ def get_supported_vlm_datasets() -> list[str]: def get_vlm_dataset_dataloader( dataset_name: str = "scienceqa", - processor: Any = None, + processor: BaseImageProcessor | Any = None, batch_size: int = 1, num_samples: int = 512, device: str | torch.device | None = None, From f6ac2d3f69180e6df4ca60ba61cca8aa48d53d75 Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Sat, 13 Dec 2025 01:37:15 +0000 Subject: [PATCH 02/19] Add the finevideo dataset for calibration Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/hf_ptq.py | 57 +++- modelopt/torch/utils/__init__.py | 1 + modelopt/torch/utils/video_dataset_utils.py | 292 ++++++++++++++++++++ 3 files changed, 336 insertions(+), 14 deletions(-) create mode 100644 modelopt/torch/utils/video_dataset_utils.py diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 84db44bc27..ca124e012b 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -70,6 +70,11 @@ ) from modelopt.torch.utils.memory_monitor import launch_memory_monitor from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader +from modelopt.torch.utils.video_dataset_utils import ( + Qwen3OmniVideoProcessor, + get_supported_video_datasets, + get_video_dataset_dataloader, +) from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader RAND_SEED = 1234 @@ -184,18 +189,37 @@ def make_calib_dataloader( num_samples=args.calib_size[0], ) elif model_type == "qwen3omni": - assert processor is not None and isinstance(processor, Qwen3OmniImageProcessor), ( - "The Qwen3OmniImageProcessor must be set." 
- ) assert len(args.calib_size) == 1, ( "qwen3omni only supports one dataset for calibration, can extend this in the future" ) - calib_dataloader = get_vlm_dataset_dataloader( - dataset_name=args.dataset[0] if args.dataset else "scienceqa", - processor=processor, - batch_size=args.batch_size, - num_samples=args.calib_size[0], - ) + assert processor is not None, "The processor must be set for qwen3omni model." + dataset_name = args.dataset[0] if args.dataset else "scienceqa" + # Check if using video dataset (e.g., finevideo) + if dataset_name in get_supported_video_datasets(): + video_processor = Qwen3OmniVideoProcessor( + processor.tokenizer if hasattr(processor, "tokenizer") else processor, + device=device, + dtype=language_model.dtype, + use_audio_in_video=True, + ) + calib_dataloader = get_video_dataset_dataloader( + dataset_name=dataset_name, + processor=video_processor, + batch_size=args.batch_size, + num_samples=args.calib_size[0], + ) + else: + assert isinstance(processor, Qwen3OmniImageProcessor), ( + "The Qwen3OmniImageProcessor must be set." + ) + # Set the dtype for proper tensor conversion in collate_function + processor.dtype = language_model.dtype + calib_dataloader = get_vlm_dataset_dataloader( + dataset_name=dataset_name, + processor=processor, + batch_size=args.batch_size, + num_samples=args.calib_size[0], + ) elif model_type == "whisper": assert processor is not None and isinstance(processor, WhisperProcessor), ( "The AutoProcessor must be set." 
@@ -686,7 +710,8 @@ def pre_quantize( """ # Only run single sample for preview - preview_input_ids = next(iter(calib_dataloader))[ + calib_batch = next(iter(calib_dataloader)) + preview_input_ids = calib_batch[ "input_features" if model_type == "whisper" else "input_ids" ][0:1] @@ -705,7 +730,8 @@ def pre_quantize( ) elif model_type == "qwen3omni": # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences - result = full_model.generate(preview_input_ids, max_new_tokens=100) + # Pass full batch with all multimodal inputs + result = full_model.generate(**calib_batch, max_new_tokens=100) if isinstance(result, tuple): text_ids, _ = result generated_ids_before_ptq = ( @@ -719,7 +745,7 @@ def pre_quantize( if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only": print("Applying nvfp4 quantization (MoE only) for gpt-oss") - return preview_input_ids, generated_ids_before_ptq + return preview_input_ids, generated_ids_before_ptq, calib_batch def post_quantize( @@ -732,6 +758,7 @@ def post_quantize( generated_ids_before_ptq, is_nemotron_vl_model, first_text_speech_dataset, + calib_batch: dict | None = None, ): """ Processing after the quantization. 
@@ -751,7 +778,8 @@ def post_quantize( pass elif model_type == "qwen3omni": # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences - result = full_model.generate(preview_input_ids, max_new_tokens=100) + # Pass full batch with all multimodal inputs + result = full_model.generate(**calib_batch, max_new_tokens=100) if isinstance(result, tuple): text_ids, _ = result generated_ids_after_ptq = ( @@ -882,7 +910,7 @@ def quantize_main( # Detect if this is a Nemotron VL model using architecture-based detection is_nemotron_vl_model = is_nemotron_vl(full_model) - preview_input_ids, generated_ids_before_ptq = pre_quantize( + preview_input_ids, generated_ids_before_ptq, calib_batch = pre_quantize( args, full_model, model_type, tokenizer, calib_dataloader, is_nemotron_vl_model ) @@ -954,6 +982,7 @@ def quantize_main( generated_ids_before_ptq, is_nemotron_vl_model, first_text_speech_dataset, + calib_batch, ) export_quantized( args, diff --git a/modelopt/torch/utils/__init__.py b/modelopt/torch/utils/__init__.py index 3ae385ac66..b909609c45 100644 --- a/modelopt/torch/utils/__init__.py +++ b/modelopt/torch/utils/__init__.py @@ -26,4 +26,5 @@ from .perf import * from .regex import * from .tensor import * +from .video_dataset_utils import * from .vlm_dataset_utils import * diff --git a/modelopt/torch/utils/video_dataset_utils.py b/modelopt/torch/utils/video_dataset_utils.py new file mode 100644 index 0000000000..6ae5c2d2ab --- /dev/null +++ b/modelopt/torch/utils/video_dataset_utils.py @@ -0,0 +1,292 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility functions for getting samples and forward loop function for video datasets.""" + +import os +import tempfile +from typing import Any + +import torch +from torch.utils.data import DataLoader + +from .image_processor import BaseImageProcessor + +# Use dict to store the config for each dataset. +SUPPORTED_VIDEO_DATASET_CONFIG: dict[str, dict[str, Any]] = { + "finevideo": { + "config": {"path": "HuggingFaceFV/finevideo", "split": "train", "streaming": True} + }, +} + +__all__ = [ + "Qwen3OmniVideoProcessor", + "get_supported_video_datasets", + "get_video_dataset_dataloader", +] + + +def _get_video_dataset(dataset_name: str, num_samples: int): + """Load a portion of train dataset with the dataset name and a given size. + + Args: + dataset_name: Name of the dataset to load. + num_samples: Number of samples to load from the dataset. + + Returns: + A hugging face Dataset. + """ + if dataset_name in SUPPORTED_VIDEO_DATASET_CONFIG: + from datasets import Dataset, load_dataset + + config = SUPPORTED_VIDEO_DATASET_CONFIG[dataset_name]["config"] + is_streaming = config.get("streaming", False) + + dataset = load_dataset(**config) + + if is_streaming: + # For streaming datasets, use take() and convert to list then Dataset + samples = list(dataset.take(num_samples)) + return Dataset.from_list(samples) + else: + return dataset.select(range(num_samples)) + else: + raise NotImplementedError( + f"dataset {dataset_name} is not supported. Please use one of the following:" + f" {get_supported_video_datasets()}." 
+ ) + + +def get_supported_video_datasets() -> list[str]: + """Retrieves a list of video datasets supported. + + Returns: + A list of strings, where each string is the name of a supported dataset. + + Example usage: + + .. code-block:: python + + from modelopt.torch.utils import get_supported_video_datasets + + print("Supported video datasets:", get_supported_video_datasets()) + """ + return list(SUPPORTED_VIDEO_DATASET_CONFIG.keys()) + + +def get_video_dataset_dataloader( + dataset_name: str = "finevideo", + processor: "Qwen3OmniVideoProcessor" = None, + batch_size: int = 1, + num_samples: int = 512, +) -> DataLoader: + """Get a dataloader with the dataset name and processor of the target model. + + Args: + dataset_name: Name of the dataset to load. + processor: Processor used for encoding video and text data. + batch_size: Batch size of the returned dataloader. + num_samples: Number of samples from the dataset. + + Returns: + An instance of dataloader. + """ + assert processor is not None, "Please provide a valid processor." + + dataset = _get_video_dataset(dataset_name, num_samples=num_samples) + # Apply the preprocessing function to the dataset + processed_dataset = dataset.map( + processor.preprocess_function, batched=False, remove_columns=dataset.column_names + ) + + # Create DataLoader with the custom collate function + return DataLoader( + processed_dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=processor.collate_function, + ) + + +class Qwen3OmniVideoProcessor(BaseImageProcessor): + """Video processor for Qwen3-Omni multimodal model with finevideo dataset support.""" + + def __init__(self, tokenizer, device="cuda", dtype=None, use_audio_in_video=True): + """Constructor. + + Args: + tokenizer: The Qwen3OmniMoeProcessor for tokenizing and processing inputs. + device: Device to move tensors to. + dtype: dtype for float tensors (e.g., torch.bfloat16). If None, uses default. 
+ use_audio_in_video: Whether to extract and use audio from video files. + """ + super().__init__(tokenizer, device) + self.dtype = dtype + self.use_audio_in_video = use_audio_in_video + self._temp_dir = tempfile.mkdtemp(prefix="qwen3omni_video_") + self._video_counter = 0 + # Try to import qwen_omni_utils for multimodal processing + try: + from qwen_omni_utils import process_mm_info + + self.process_mm_info = process_mm_info + except ImportError: + raise ImportError( + "qwen_omni_utils is required for Qwen3OmniVideoProcessor. " + "Please install it from https://github.com/QwenLM/Qwen3-Omni" + ) + + def _save_video_bytes_to_file(self, video_bytes: bytes) -> str: + """Save video bytes to a temporary file and return the path. + + Args: + video_bytes: Raw video bytes (e.g., from finevideo's 'mp4' field). + + Returns: + Path to the temporary video file. + """ + video_path = os.path.join(self._temp_dir, f"video_{self._video_counter}.mp4") + self._video_counter += 1 + with open(video_path, "wb") as f: + f.write(video_bytes) + return video_path + + def preprocess_function(self, examples): + """Preprocess function for Qwen3-Omni with video support. + + Handles both standard video paths and raw video bytes (finevideo format). + """ + # Get question/prompt - finevideo has metadata in 'json' field + if "json" in examples and examples["json"] is not None: + metadata = examples["json"] + # Try to get a meaningful question from metadata + category = metadata.get("content_fine_category", "") + question = f"/no_think Describe what is happening in this video in detail. 
Category hint: {category}" + else: + question = examples.get("question", "/no_think Describe this video in detail.") + + # Build conversation in Qwen format + content = [] + + # Handle video - check for raw bytes (finevideo format) or path + video_path = None + if examples.get("mp4") is not None: + # finevideo format: raw video bytes in 'mp4' field + video_path = self._save_video_bytes_to_file(examples["mp4"]) + elif examples.get("video") is not None: + # Standard format: video path or URL + video_path = examples["video"] + + if video_path is not None: + content.append({"type": "video", "video": video_path}) + + content.append({"type": "text", "text": question}) + + conversation = [{"role": "user", "content": content}] + + # Apply chat template (tokenize=False to get string) + text = self.tokenizer.apply_chat_template( + conversation, add_generation_prompt=True, tokenize=False + ) + + # Extract multimodal info using qwen_omni_utils + audios, images, videos = self.process_mm_info( + conversation, use_audio_in_video=self.use_audio_in_video + ) + + # Process inputs with the processor + values = self.tokenizer( + text=text, + audio=audios, + images=images, + videos=videos, + return_tensors="pt", + padding=True, + use_audio_in_video=self.use_audio_in_video, + ) + # Define all possible keys to ensure consistent schema for Arrow serialization + all_keys = [ + "input_ids", + "attention_mask", + "pixel_values_videos", + "video_grid_thw", + "video_second_per_grid", + "feature_attention_mask", + "input_features", + ] + + # Convert tensors to lists for Arrow serialization compatibility + # Tensor conversion back happens in collate_function + result = dict.fromkeys(all_keys) # Initialize all keys to None + for key, val in values.items(): + if val is not None and hasattr(val, "tolist"): + result[key] = val.tolist() + elif val is not None: + result[key] = val + + return result + + def collate_function(self, batch): + """Collate function to process inputs during data loading.""" + 
result = {} + + # Take first item from batch (batch_size handling) + first = batch[0] + + # Convert lists to tensors and move to device + if first.get("input_ids") is not None: + result["input_ids"] = torch.LongTensor(first["input_ids"]).to(self.device) + if first.get("attention_mask") is not None: + result["attention_mask"] = torch.LongTensor(first["attention_mask"]).to(self.device) + + # Handle pixel values for video frames + if first.get("pixel_values_videos") is not None: + pv = torch.tensor(first["pixel_values_videos"]) + if self.dtype is not None: + pv = pv.to(self.dtype) + result["pixel_values_videos"] = pv.to(self.device) + + # Handle video grid thw (tile height width info) + if first.get("video_grid_thw") is not None: + result["video_grid_thw"] = torch.LongTensor(first["video_grid_thw"]).to(self.device) + + # Handle video second per grid (temporal info for rope) + if first.get("video_second_per_grid") is not None: + result["video_second_per_grid"] = torch.tensor(first["video_second_per_grid"]).to( + self.device + ) + + # Handle audio features if present + if first.get("feature_attention_mask") is not None: + result["feature_attention_mask"] = torch.LongTensor(first["feature_attention_mask"]).to( + self.device + ) + if first.get("input_features") is not None: + inp_feat = torch.tensor(first["input_features"]) + if self.dtype is not None: + inp_feat = inp_feat.to(self.dtype) + result["input_features"] = inp_feat.to(self.device) + + # Pass use_audio_in_video flag to model.generate() for Qwen3Omni + result["use_audio_in_video"] = self.use_audio_in_video + + return result + + def cleanup(self): + """Clean up temporary video files.""" + import shutil + + if os.path.exists(self._temp_dir): + shutil.rmtree(self._temp_dir) From 616cc1b442110345915fa76ba5f19972dd83605d Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Tue, 16 Dec 2025 20:26:07 +0000 Subject: [PATCH 03/19] Add option to disable talker Signed-off-by: ajrasane 
<131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/hf_ptq.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index ca124e012b..0065a0c4f4 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -14,6 +14,7 @@ # limitations under the License. import argparse +import os import random import time import warnings @@ -390,7 +391,7 @@ def load_model(args: argparse.Namespace): calibration_only = True model_type = get_model_type(full_model) - if model_type == "qwen3omni": + if model_type == "qwen3omni" and os.environ.get("DISABLE_TALKER", "0") == "1": full_model.disable_talker() device = full_model.device @@ -711,9 +712,9 @@ def pre_quantize( """ # Only run single sample for preview calib_batch = next(iter(calib_dataloader)) - preview_input_ids = calib_batch[ - "input_features" if model_type == "whisper" else "input_ids" - ][0:1] + preview_input_ids = calib_batch["input_features" if model_type == "whisper" else "input_ids"][ + 0:1 + ] # Generate preview before quantization if model_type == "deepseek": From cf3bbb8205a4ee57b422c3c6400ef5ee085a23bf Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Tue, 16 Dec 2025 21:57:05 +0000 Subject: [PATCH 04/19] Add quantization configs for the model Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/hf_ptq.py | 60 +++++++++++++++++++++++++-- modelopt/torch/utils/dataset_utils.py | 2 +- 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 0065a0c4f4..f6cfba6ff1 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -76,7 +76,10 @@ get_supported_video_datasets, get_video_dataset_dataloader, ) -from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader +from modelopt.torch.utils.vlm_dataset_utils import ( + 
get_supported_vlm_datasets, + get_vlm_dataset_dataloader, +) RAND_SEED = 1234 @@ -96,6 +99,11 @@ "nvfp4_mlp_only": mtq.NVFP4_MLP_ONLY_CFG, "nvfp4_svdquant": mtq.NVFP4_SVDQUANT_DEFAULT_CFG, "mxfp8": mtq.MXFP8_DEFAULT_CFG, + "qwen3_nvfp4_qkv_disabled": mtq.NVFP4_DEFAULT_CFG, + "qwen3_nvfp4_qkvo_disabled": mtq.NVFP4_DEFAULT_CFG, + "qwen3_nvfp4_first_n_disabled": mtq.NVFP4_DEFAULT_CFG, + "qwen3_nvfp4_last_n_disabled": mtq.NVFP4_DEFAULT_CFG, + "qwen3_first_and_last_n_disabled": mtq.NVFP4_DEFAULT_CFG, } KV_QUANT_CFG_CHOICES = { @@ -194,7 +202,7 @@ def make_calib_dataloader( "qwen3omni only supports one dataset for calibration, can extend this in the future" ) assert processor is not None, "The processor must be set for qwen3omni model." - dataset_name = args.dataset[0] if args.dataset else "scienceqa" + dataset_name = args.dataset[0] if args.dataset else "cnn_dailymail" # Check if using video dataset (e.g., finevideo) if dataset_name in get_supported_video_datasets(): video_processor = Qwen3OmniVideoProcessor( @@ -209,7 +217,7 @@ def make_calib_dataloader( batch_size=args.batch_size, num_samples=args.calib_size[0], ) - else: + elif dataset_name in get_supported_vlm_datasets(): assert isinstance(processor, Qwen3OmniImageProcessor), ( "The Qwen3OmniImageProcessor must be set." ) @@ -221,6 +229,17 @@ def make_calib_dataloader( batch_size=args.batch_size, num_samples=args.calib_size[0], ) + else: + # Text-only datasets (e.g., cnn_dailymail) + qwen3omni_tokenizer = processor.tokenizer.tokenizer + calib_dataloader = get_dataset_dataloader( + dataset_name=dataset_name, + tokenizer=qwen3omni_tokenizer, + batch_size=args.batch_size, + num_samples=args.calib_size[0], + device=device, + ) + print(f"Selected dataset for calibration: {dataset_name}") elif model_type == "whisper": assert processor is not None and isinstance(processor, WhisperProcessor), ( "The AutoProcessor must be set." 
@@ -371,6 +390,40 @@ def load_model(args: argparse.Namespace): f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}" ) quant_cfg = QUANT_CFG_CHOICES[args.qformat] + + # Qwen3 specific quantizer disabling patterns (thinker.model.layers only) + if args.qformat == "qwen3_nvfp4_qkv_disabled": + # Disable q_proj, k_proj, v_proj quantizers + for proj in ["q_proj", "k_proj", "v_proj"]: + quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = { + "enable": False + } + elif args.qformat == "qwen3_nvfp4_qkvo_disabled": + # Disable q_proj, k_proj, v_proj, o_proj quantizers + for proj in ["q_proj", "k_proj", "v_proj", "o_proj"]: + quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = { + "enable": False + } + elif args.qformat == "qwen3_nvfp4_first_n_disabled": + # Disable first N layers (e.g., layers 0-7) + n_layers_to_disable = 8 + for i in range(n_layers_to_disable): + quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False} + elif args.qformat == "qwen3_nvfp4_last_n_disabled": + # Disable last N layers (e.g., layers 40-47 for 48 total layers) + total_layers = 48 + n_layers_to_disable = 8 + for i in range(total_layers - n_layers_to_disable, total_layers): + quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False} + elif args.qformat == "qwen3_first_and_last_n_disabled": + # Disable both first N and last N layers + total_layers = 48 + n_layers_to_disable = 4 + for i in range(n_layers_to_disable): + quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False} + for i in range(total_layers - n_layers_to_disable, total_layers): + quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False} + if args.kv_cache_qformat != "none": quant_cfg = mtq.utils.update_quant_cfg_with_kv_cache_quant( quant_cfg, @@ -392,6 +445,7 @@ def load_model(args: argparse.Namespace): model_type = get_model_type(full_model) if model_type == "qwen3omni" and 
os.environ.get("DISABLE_TALKER", "0") == "1": + print("Disabling talker for Qwen3Omni model") full_model.disable_talker() device = full_model.device diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py index 60ead12078..feedea12cd 100644 --- a/modelopt/torch/utils/dataset_utils.py +++ b/modelopt/torch/utils/dataset_utils.py @@ -74,7 +74,7 @@ }, "cnn_dailymail": { "config": {"path": "abisee/cnn_dailymail", "name": "3.0.0", "split": ["train"]}, - "preprocess": lambda sample: sample["article"], + "preprocess": lambda sample: "/no_think " + sample["article"], }, "pile": { "config": {"path": "monology/pile-uncopyrighted", "name": "v1.0", "split": ["train"]}, From c700f911e323b0dad1d6eb878dfe49b747da660b Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Tue, 16 Dec 2025 23:44:58 +0000 Subject: [PATCH 05/19] Register Qwen3 thinker and talker sparse moe blocks in quant module Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- .../torch/quantization/plugins/huggingface.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index a29d7c7549..ab0cba9430 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -796,6 +796,23 @@ def unpack_weight(self): except ImportError: pass +try: + from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import ( + Qwen3OmniMoeTalkerTextSparseMoeBlock, + Qwen3OmniMoeThinkerTextSparseMoeBlock, + ) + + if Qwen3OmniMoeTalkerTextSparseMoeBlock not in QuantModuleRegistry: + QuantModuleRegistry.register( + {Qwen3OmniMoeTalkerTextSparseMoeBlock: "hf.Qwen3OmniMoeTalkerTextSparseMoeBlock"} + )(_QuantSparseMoe) + if Qwen3OmniMoeThinkerTextSparseMoeBlock not in QuantModuleRegistry: + QuantModuleRegistry.register( + {Qwen3OmniMoeThinkerTextSparseMoeBlock: 
"hf.Qwen3OmniMoeThinkerTextSparseMoeBlock"} + )(_QuantSparseMoe) +except ImportError: + pass + class _QuantGptOssExperts(_QuantFunctionalMixin): """Quantized wrapper for `transformers.GptOssExperts`. From 445ff6a97ce1042f31c2664fd554a4d6d81c9f5d Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Tue, 16 Dec 2025 23:52:54 +0000 Subject: [PATCH 06/19] remove first_n and last_n configs Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/hf_ptq.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index f6cfba6ff1..c4d83e12df 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -101,8 +101,6 @@ "mxfp8": mtq.MXFP8_DEFAULT_CFG, "qwen3_nvfp4_qkv_disabled": mtq.NVFP4_DEFAULT_CFG, "qwen3_nvfp4_qkvo_disabled": mtq.NVFP4_DEFAULT_CFG, - "qwen3_nvfp4_first_n_disabled": mtq.NVFP4_DEFAULT_CFG, - "qwen3_nvfp4_last_n_disabled": mtq.NVFP4_DEFAULT_CFG, "qwen3_first_and_last_n_disabled": mtq.NVFP4_DEFAULT_CFG, } @@ -404,17 +402,6 @@ def load_model(args: argparse.Namespace): quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = { "enable": False } - elif args.qformat == "qwen3_nvfp4_first_n_disabled": - # Disable first N layers (e.g., layers 0-7) - n_layers_to_disable = 8 - for i in range(n_layers_to_disable): - quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False} - elif args.qformat == "qwen3_nvfp4_last_n_disabled": - # Disable last N layers (e.g., layers 40-47 for 48 total layers) - total_layers = 48 - n_layers_to_disable = 8 - for i in range(total_layers - n_layers_to_disable, total_layers): - quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False} elif args.qformat == "qwen3_first_and_last_n_disabled": # Disable both first N and last N layers total_layers = 48 From de01666dd2916089ddba47e1f15e1b6a69b7865e Mon Sep 17 00:00:00 2001 From: ajrasane 
<131806219+ajrasane@users.noreply.github.com> Date: Wed, 17 Dec 2025 00:02:35 +0000 Subject: [PATCH 07/19] Update quantization modes to stack on top of one another Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/hf_ptq.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index c4d83e12df..8458dbb86e 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -390,19 +390,19 @@ def load_model(args: argparse.Namespace): quant_cfg = QUANT_CFG_CHOICES[args.qformat] # Qwen3 specific quantizer disabling patterns (thinker.model.layers only) - if args.qformat == "qwen3_nvfp4_qkv_disabled": + if "qkv_disabled" in args.qformat: # Disable q_proj, k_proj, v_proj quantizers for proj in ["q_proj", "k_proj", "v_proj"]: quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = { "enable": False } - elif args.qformat == "qwen3_nvfp4_qkvo_disabled": + if "qkvo_disabled" in args.qformat: # Disable q_proj, k_proj, v_proj, o_proj quantizers - for proj in ["q_proj", "k_proj", "v_proj", "o_proj"]: + for proj in ["o_proj"]: quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = { "enable": False } - elif args.qformat == "qwen3_first_and_last_n_disabled": + if "first_and_last_n_disabled" in args.qformat: # Disable both first N and last N layers total_layers = 48 n_layers_to_disable = 4 From 7ef534a505bea0a3d19cc5e5058029a7013e051f Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Wed, 17 Dec 2025 03:37:37 +0000 Subject: [PATCH 08/19] Add a text processor for text datasets Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/hf_ptq.py | 15 ++++- modelopt/torch/utils/dataset_utils.py | 83 +++++++++++++++++++++++++ modelopt/torch/utils/image_processor.py | 64 +++++++++++++++++++ 3 files changed, 159 insertions(+), 3 deletions(-) diff --git 
a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 8458dbb86e..1e8d9e2ca0 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -62,12 +62,14 @@ create_forward_loop, get_dataset_dataloader, get_max_batch_size, + get_qwen3omni_text_dataloader, get_supported_datasets, ) from modelopt.torch.utils.image_processor import ( BaseImageProcessor, MllamaImageProcessor, Qwen3OmniImageProcessor, + Qwen3OmniTextProcessor, ) from modelopt.torch.utils.memory_monitor import launch_memory_monitor from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader @@ -229,12 +231,19 @@ def make_calib_dataloader( ) else: # Text-only datasets (e.g., cnn_dailymail) - qwen3omni_tokenizer = processor.tokenizer.tokenizer - calib_dataloader = get_dataset_dataloader( + # Use Qwen3OmniTextProcessor to apply proper conversation template + # See: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Thinking + text_processor = Qwen3OmniTextProcessor( + processor=processor.tokenizer, # Pass the underlying HF processor + device=device, + dtype=language_model.dtype, + ) + calib_dataloader = get_qwen3omni_text_dataloader( dataset_name=dataset_name, - tokenizer=qwen3omni_tokenizer, + processor=text_processor, batch_size=args.batch_size, num_samples=args.calib_size[0], + max_sample_length=args.calib_seq, device=device, ) print(f"Selected dataset for calibration: {dataset_name}") diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py index feedea12cd..5e7bdc0780 100644 --- a/modelopt/torch/utils/dataset_utils.py +++ b/modelopt/torch/utils/dataset_utils.py @@ -98,6 +98,7 @@ "create_forward_loop", "get_dataset_dataloader", "get_max_batch_size", + "get_qwen3omni_text_dataloader", "get_supported_datasets", ] @@ -243,6 +244,88 @@ def get_dataset_dataloader( return calib_dataloader +def get_qwen3omni_text_dataloader( + dataset_name: str | list[str] = "cnn_dailymail", + processor=None, + batch_size: int = 1, + num_samples: int | 
list[int] = 512, + max_sample_length: int = 512, + device: str | None = None, +) -> DataLoader: + """Get a text-only dataloader for Qwen3-Omni with proper conversation template applied. + + This function applies the Qwen3-Omni chat template to text samples before tokenization, + which is required for proper calibration of Qwen3-Omni models with text-only datasets. + + See: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Thinking + + Args: + dataset_name: Name of the dataset(s) to load. + processor: Qwen3OmniTextProcessor instance wrapping the Qwen3OmniMoeProcessor. + batch_size: Batch size of the returned dataloader. + num_samples: Number of samples from the dataset. + max_sample_length: Maximum length of a sample (for truncation). + device: Target device for the returned dataloader. + + Returns: + A DataLoader with properly formatted inputs for Qwen3-Omni. + """ + assert processor is not None, "Please provide a Qwen3OmniTextProcessor." + + if isinstance(num_samples, int): + num_samples = [num_samples] + + if isinstance(dataset_name, str): + dataset_name = [dataset_name] + + assert len(dataset_name) == len(num_samples), ( + "dataset_name and num_samples must be the same length" + ) + + # Get raw text samples + all_samples = [] + for ds_name, num_sample in zip(dataset_name, num_samples): + samples = _get_dataset_samples(ds_name, num_sample) + all_samples.extend(samples) + + # Preprocess each sample with the conversation template + processed_samples = [] + for text in all_samples: + # Apply conversation template and tokenize + values = processor.preprocess_function(text) + + # Convert to lists for dataset compatibility + sample_dict = {} + for key, val in values.items(): + if val is not None and hasattr(val, "tolist"): + sample_dict[key] = val.tolist() + elif val is not None: + sample_dict[key] = val + processed_samples.append(sample_dict) + + # Create dataset + class _Qwen3OmniTextDataset(torch.utils.data.Dataset): + def __init__(self, samples): + self.samples = 
samples + + def __getitem__(self, idx): + return self.samples[idx] + + def __len__(self): + return len(self.samples) + + dataset = _Qwen3OmniTextDataset(processed_samples) + + calib_dataloader = DataLoader( + dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=processor.collate_function, + ) + + return calib_dataloader + + def get_supported_datasets() -> list[str]: """Retrieves a list of datasets supported. diff --git a/modelopt/torch/utils/image_processor.py b/modelopt/torch/utils/image_processor.py index 258209188f..84e2b29b3b 100644 --- a/modelopt/torch/utils/image_processor.py +++ b/modelopt/torch/utils/image_processor.py @@ -115,6 +115,70 @@ def collate_function(self, batch): return batch[0] +class Qwen3OmniTextProcessor(BaseImageProcessor): + """Text-only processor for Qwen3-Omni that applies proper conversation template. + + This processor wraps raw text in the Qwen3-Omni conversation format and applies + the chat template before tokenization. Use this for text-only calibration datasets. + + See: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Thinking + """ + + def __init__(self, processor, device="auto", dtype=None): + """Constructor. + + Args: + processor: The Qwen3OmniMoeProcessor (from AutoProcessor.from_pretrained). + device: Device to move tensors to. + dtype: dtype for float tensors (e.g., torch.bfloat16). If None, uses default. + """ + super().__init__(processor, device) + self.dtype = dtype + + def preprocess_function(self, text: str) -> dict: + """Preprocess a single text sample by applying conversation template. + + Args: + text: Raw text string from dataset. + + Returns: + Dictionary with tokenized inputs. 
+ """ + # Build conversation in Qwen format (text-only) + conversation = [ + {"role": "user", "content": [{"type": "text", "text": "/no_think " + text}]} + ] + + # Apply chat template (tokenize=False to get formatted string) + formatted_text = self.tokenizer.apply_chat_template( + conversation, add_generation_prompt=True, tokenize=False + ) + + # Tokenize with the processor (no multimodal inputs) + values = self.tokenizer( + text=formatted_text, + audio=None, + images=None, + videos=None, + return_tensors="pt", + padding=True, + ) + + return values + + def collate_function(self, batch): + """Collate function to process text inputs during data loading.""" + result = {} + first = batch[0] + + if "input_ids" in first and first["input_ids"] is not None: + result["input_ids"] = torch.LongTensor(first["input_ids"]).to(self.device) + if "attention_mask" in first and first["attention_mask"] is not None: + result["attention_mask"] = torch.LongTensor(first["attention_mask"]).to(self.device) + + return result + + class Qwen3OmniImageProcessor(BaseImageProcessor): """Image processor for Qwen3-Omni multimodal model.""" From 7aa5aedd2779928614b2cdb280113c36a338ce5b Mon Sep 17 00:00:00 2001 From: Chenjie Luo <108829653+cjluo-nv@users.noreply.github.com> Date: Tue, 16 Dec 2025 22:52:56 -0800 Subject: [PATCH 09/19] Disable Qwen3OmniMoe class registration Comment out import and registration of Qwen3OmniMoe classes. 
Signed-off-by: Chenjie Luo <108829653+cjluo-nv@users.noreply.github.com> --- .../torch/quantization/plugins/huggingface.py | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index ab0cba9430..f7b65fd3df 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -796,22 +796,23 @@ def unpack_weight(self): except ImportError: pass -try: - from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import ( - Qwen3OmniMoeTalkerTextSparseMoeBlock, - Qwen3OmniMoeThinkerTextSparseMoeBlock, - ) - - if Qwen3OmniMoeTalkerTextSparseMoeBlock not in QuantModuleRegistry: - QuantModuleRegistry.register( - {Qwen3OmniMoeTalkerTextSparseMoeBlock: "hf.Qwen3OmniMoeTalkerTextSparseMoeBlock"} - )(_QuantSparseMoe) - if Qwen3OmniMoeThinkerTextSparseMoeBlock not in QuantModuleRegistry: - QuantModuleRegistry.register( - {Qwen3OmniMoeThinkerTextSparseMoeBlock: "hf.Qwen3OmniMoeThinkerTextSparseMoeBlock"} - )(_QuantSparseMoe) -except ImportError: - pass +# Uncomment to forward tokens to all MoE experts for full calibration. 
+# try: +# from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import ( +# Qwen3OmniMoeTalkerTextSparseMoeBlock, +# Qwen3OmniMoeThinkerTextSparseMoeBlock, +# ) +# +# if Qwen3OmniMoeTalkerTextSparseMoeBlock not in QuantModuleRegistry: +# QuantModuleRegistry.register( +# {Qwen3OmniMoeTalkerTextSparseMoeBlock: "hf.Qwen3OmniMoeTalkerTextSparseMoeBlock"} +# )(_QuantSparseMoe) +# if Qwen3OmniMoeThinkerTextSparseMoeBlock not in QuantModuleRegistry: +# QuantModuleRegistry.register( +# {Qwen3OmniMoeThinkerTextSparseMoeBlock: "hf.Qwen3OmniMoeThinkerTextSparseMoeBlock"} +# )(_QuantSparseMoe) +# except ImportError: +# pass class _QuantGptOssExperts(_QuantFunctionalMixin): From fdad81a85640ad1a8f1fc8658e8c7309cca1c7c3 Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Wed, 17 Dec 2025 08:22:08 +0000 Subject: [PATCH 10/19] Update logic to disable quantizers Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/example_utils.py | 15 ++++ examples/llm_ptq/hf_ptq.py | 138 ++++++++++++++++++++++++++++-- 2 files changed, 148 insertions(+), 5 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 26f8a75099..f7343b4f17 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -244,6 +244,21 @@ def build_quant_cfg( quant_cfg["quant_cfg"]["*self_attn.q*"] = {"enable": False} quant_cfg["quant_cfg"]["*self_attn.kv*"] = {"enable": False} + # Qwen3 specific quantizer disabling patterns (thinker.model.layers only) + if "qkv_disabled" in qformat: + quant_cfg = copy.deepcopy(quant_cfg) # Don't modify global config + for proj in ["q_proj", "k_proj", "v_proj"]: + quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = { + "enable": False + } + if "qkvo_disabled" in qformat: + if "qkv_disabled" not in qformat: # Avoid double deepcopy + quant_cfg = copy.deepcopy(quant_cfg) + for proj in ["o_proj"]: + 
quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = { + "enable": False + } + return quant_cfg diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 1e8d9e2ca0..163c006ae5 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -14,6 +14,7 @@ # limitations under the License. import argparse +import contextlib import os import random import time @@ -392,12 +393,8 @@ def load_model(args: argparse.Namespace): use_seq_device_map=args.use_seq_device_map, attn_implementation=args.attn_implementation, ) - else: - assert args.qformat in QUANT_CFG_CHOICES, ( - f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}" - ) - quant_cfg = QUANT_CFG_CHOICES[args.qformat] + quant_cfg = QUANT_CFG_CHOICES[args.qformat] # Qwen3 specific quantizer disabling patterns (thinker.model.layers only) if "qkv_disabled" in args.qformat: # Disable q_proj, k_proj, v_proj quantizers @@ -419,6 +416,11 @@ def load_model(args: argparse.Namespace): quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False} for i in range(total_layers - n_layers_to_disable, total_layers): quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False} + else: + assert args.qformat in QUANT_CFG_CHOICES, ( + f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}" + ) + quant_cfg = QUANT_CFG_CHOICES[args.qformat] if args.kv_cache_qformat != "none": quant_cfg = mtq.utils.update_quant_cfg_with_kv_cache_quant( @@ -451,6 +453,8 @@ def load_model(args: argparse.Namespace): # since parameters are distributed. Force cuda:0 for input tensors. 
if device is None or str(device) in ("meta", "cpu"): device = "cuda" + print(f"Overriding device to {device}") + processor = None tokenizer = None language_model = full_model @@ -633,7 +637,127 @@ def mono_quantize( if language_model_lineage is not None: print("Updating full_model with quantized language_model...") language_model_lineage[-2].language_model = language_model + if is_nemotron_vl_model and tokenizer is not None: + generated_ids_before_ptq = run_nemotron_vl_preview( + full_model, + tokenizer, + input_ids, + args.pyt_ckpt_path, + "before quantization", + allow_fallback=True, + ) + elif model_type == "qwen3omni": + # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences + # Pass full batch with all multimodal inputs + result = full_model.generate(**calib_batch, max_new_tokens=100) + if isinstance(result, tuple): + text_ids, _ = result + generated_ids_before_ptq = ( + text_ids.sequences if hasattr(text_ids, "sequences") else text_ids + ) + else: + generated_ids_before_ptq = result + else: + # Standard generation for non-Nemotron VL models + generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100) + if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only": + print("Applying nvfp4 quantization (MoE only) for gpt-oss") + + # quantize the model + model = quantize_model(model, quant_cfg, args, calib_dataloader, calibration_only) + + # For VL models, update full_model to use the quantized language model + if is_nemotron_vl_model: + language_model_lineage = get_language_model_from_vl(full_model) + if language_model_lineage is not None: + print("Updating full_model with quantized language_model...") + language_model_lineage[-2].language_model = model + + if args.verbose: + with open("./quant_summary.txt", "w") as f, contextlib.redirect_stdout(f): + mtq.print_quant_summary(full_model) + + # Run some samples + torch.cuda.empty_cache() + generated_ids_after_ptq = None + if model_type == "qwen3omni": + # Qwen3Omni returns 
(text_ids, audio) tuple; text_ids has .sequences + # Pass full batch with all multimodal inputs + result = full_model.generate(**calib_batch, max_new_tokens=100) + if isinstance(result, tuple): + text_ids, _ = result + generated_ids_after_ptq = ( + text_ids.sequences if hasattr(text_ids, "sequences") else text_ids + ) + else: + generated_ids_after_ptq = result + elif model_type != "llama4" and not is_nemotron_vl_model: + # Our fake quantizer may not be fully compatible with torch.compile. + generated_ids_after_ptq = full_model.generate(input_ids, max_new_tokens=100) + elif is_nemotron_vl_model and tokenizer is not None: + generated_ids_after_ptq = run_nemotron_vl_preview( + full_model, + tokenizer, + input_ids, + args.pyt_ckpt_path, + "after quantization", + allow_fallback=False, + ) + else: + warnings.warn( + "Llama4 Maverick generation after quantization has a bug. Skipping generation sample." + ) + def input_decode(input_ids): + # BaseImageProcessor covers MllamaImageProcessor and Qwen3OmniImageProcessor + if processor is not None and isinstance(processor, BaseImageProcessor): + return processor.tokenizer.batch_decode(input_ids) + elif processor is not None and isinstance(processor, WhisperProcessor): + return first_text + elif tokenizer is not None: + return tokenizer.batch_decode(input_ids) + else: + raise ValueError("The processor or tokenizer must be set") + + def output_decode(generated_ids, input_shape): + if is_enc_dec(model_type): + if processor is not None and isinstance(processor, WhisperProcessor): + return processor.tokenizer.batch_decode( + generated_ids, skip_special_tokens=True + )[0] + elif tokenizer is not None: + return tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + elif processor is not None and isinstance(processor, MllamaImageProcessor): + return processor.tokenizer.batch_decode(generated_ids[:, input_shape:]) + elif processor is not None and isinstance(processor, Qwen3OmniImageProcessor): + return 
processor.tokenizer.batch_decode( + generated_ids[:, input_shape:], + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) + elif tokenizer is not None: + return tokenizer.batch_decode(generated_ids[:, input_shape:]) + else: + raise ValueError("The processor or tokenizer must be set") + + if generated_ids_after_ptq is not None: + print("--------") + if is_nemotron_vl_model: + # For Nemotron VL models, generated_ids are text strings from model.chat() + print("Nemotron VL model text-only generation results:") + print(f"Text response before quantization: {generated_ids_before_ptq}") + print("--------") + print(f"Text response after quantization: {generated_ids_after_ptq}") + print("--------") + print("Note: Additional VL tests with images were run separately above") + else: + # For regular LLMs, generated_ids are token tensors that need decoding + print(f"example test input: {input_decode(input_ids)}") + print("--------") + print( + f"example outputs before ptq: {output_decode(generated_ids_before_ptq, input_ids.shape[1])}" + ) + print("--------") else: warnings.warn("Skipping quantization: model is already quantized.") @@ -647,6 +771,10 @@ def export_quantized( default_padding_side, default_pad_token, ): + if model_type == "qwen3omni": + print("Export of Qwen3Omni model is not supported yet") + return + with torch.inference_mode(): if model_type is None: print(f"Unknown model type {type(language_model).__name__}. 
Continue exporting...") From 8a4cfac0ea9dde0cf1fdf2008f399e382a3e1e49 Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Thu, 22 Jan 2026 00:17:39 +0000 Subject: [PATCH 11/19] Add option to save the quantized checkpoint --- examples/llm_ptq/hf_ptq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 163c006ae5..b73bde96de 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -772,7 +772,9 @@ def export_quantized( default_pad_token, ): if model_type == "qwen3omni": - print("Export of Qwen3Omni model is not supported yet") + print("Export of Qwen3Omni model is not supported yet. Saving .pt file instead.") + os.makedirs(os.path.dirname(args.export_path), exist_ok=True) + mto.save(model, args.export_path) return with torch.inference_mode(): From e8d9b0e7cc8b5d992e27c72aabfb530872f6b0f4 Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Thu, 22 Jan 2026 00:42:15 +0000 Subject: [PATCH 12/19] Add a script to load and run the qwen3omni quantized checkpoint --- examples/llm_ptq/run_quantized_qwen3omni.py | 128 ++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 examples/llm_ptq/run_quantized_qwen3omni.py diff --git a/examples/llm_ptq/run_quantized_qwen3omni.py b/examples/llm_ptq/run_quantized_qwen3omni.py new file mode 100644 index 0000000000..5d7860bc20 --- /dev/null +++ b/examples/llm_ptq/run_quantized_qwen3omni.py @@ -0,0 +1,128 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Script to load and run a quantized Qwen3Omni model from mto checkpoint.""" + +import argparse +import time + +import torch +from qwen_omni_utils import process_mm_info +from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor + +import modelopt.torch.opt as mto + + +def main(args): + print(f"Loading base model from {args.model_path}...") + model = Qwen3OmniMoeForConditionalGeneration.from_pretrained( + args.model_path, + torch_dtype="auto", + device_map="cuda", + attn_implementation="flash_attention_2", + trust_remote_code=True, + ) + + print(f"Restoring quantized state from {args.checkpoint_path}...") + model = mto.restore(model, args.checkpoint_path) + + model.disable_talker() + + print("Loading processor...") + processor = Qwen3OmniMoeProcessor.from_pretrained( + args.model_path, + trust_remote_code=True, + ) + + # Build conversation with user prompt + prompt = args.prompt or "What is the capital of France?" 
+ conversation = [{"role": "user", "content": [{"type": "text", "text": f"{prompt}"}]}] + conversations = [conversation] + + # Set whether to use audio in video + use_audio_in_video = True + + # Preparation for inference + texts = processor.apply_chat_template(conversations, add_generation_prompt=True, tokenize=False) + audios, images, videos = process_mm_info(conversations, use_audio_in_video=use_audio_in_video) + + inputs = processor( + text=texts, + audio=audios, + images=images, + videos=videos, + return_tensors="pt", + padding=True, + use_audio_in_video=use_audio_in_video, + ) + inputs = inputs.to(model.device).to(model.dtype) + + print(f"\nPrompt: {prompt}") + print("Generating...") + + start_time = time.time() + with torch.no_grad(): + text_ids, _ = model.generate( + **inputs, + thinker_return_dict_in_generate=True, + use_audio_in_video=use_audio_in_video, + max_new_tokens=args.max_new_tokens, + return_audio=False, + ) + end_time = time.time() + print(f"Time taken for generation: {end_time - start_time:.2f} seconds") + + # Decode the generated tokens + generated_text = processor.batch_decode( + text_ids.sequences[:, inputs["input_ids"].shape[1] :], + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) + + print(f"\nGenerated: {generated_text[0]}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run quantized Qwen3Omni model") + parser.add_argument( + "--model_path", + type=str, + default="Qwen/Qwen3-Omni-30B-A3B-Instruct", + help="Path to the base Qwen3Omni model (HF format)", + ) + parser.add_argument( + "--checkpoint_path", + type=str, + default="/home/scratch.arasane_hw/models/qwen3omni_nvfp4_qkv_disabled_text_bs512_calib512.pt", + help="Path to the mto.save() quantized checkpoint", + ) + parser.add_argument( + "--prompt", + type=str, + default=None, + help="Text prompt for generation", + ) + parser.add_argument( + "--max_new_tokens", + type=int, + default=512, + help="Maximum new tokens to generate", + ) + 
+ args = parser.parse_args() + main(args) From e3337a042957924afe1a00cd45ca6f24f80fd422 Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Mon, 26 Jan 2026 20:13:00 +0000 Subject: [PATCH 13/19] Create a script to cache the processed dataset Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/generate_video_dataset.py | 112 ++++++++++++++++++++ modelopt/torch/utils/video_dataset_utils.py | 41 ++++++- 2 files changed, 148 insertions(+), 5 deletions(-) create mode 100644 examples/llm_ptq/generate_video_dataset.py diff --git a/examples/llm_ptq/generate_video_dataset.py b/examples/llm_ptq/generate_video_dataset.py new file mode 100644 index 0000000000..2f8d6bbf80 --- /dev/null +++ b/examples/llm_ptq/generate_video_dataset.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Script to pre-generate processed video dataset for Qwen3-Omni quantization.""" + +import argparse +import os + +import torch +from transformers import AutoProcessor + +from modelopt.torch.utils.video_dataset_utils import ( + Qwen3OmniVideoProcessor, + get_video_dataset_dataloader, +) + + +def main(): + parser = argparse.ArgumentParser(description="Generate processed video dataset cache") + parser.add_argument( + "--model-name", + type=str, + default="Qwen/Qwen3-Omni-30B-A3B-Thinking", + help="Model name or path for loading the processor", + ) + parser.add_argument( + "--dataset-name", + type=str, + default="finevideo", + help="Name of the video dataset to process", + ) + parser.add_argument( + "--num-samples", + type=int, + default=512, + help="Number of samples to process", + ) + parser.add_argument( + "--cache-dir", + type=str, + required=True, + help="Directory to save the processed dataset cache", + ) + parser.add_argument( + "--dtype", + type=str, + default="bfloat16", + choices=["float16", "bfloat16", "float32"], + help="Data type for processing", + ) + parser.add_argument( + "--no-audio", + action="store_true", + help="Disable audio extraction from videos", + ) + args = parser.parse_args() + + use_audio = not args.no_audio + + # Set dtype + dtype_map = { + "float16": torch.float16, + "bfloat16": torch.bfloat16, + "float32": torch.float32, + } + dtype = dtype_map[args.dtype] + + print(f"Loading processor from {args.model_name}...") + hf_processor = AutoProcessor.from_pretrained(args.model_name, trust_remote_code=True) + + print(f"Creating Qwen3OmniVideoProcessor (use_audio={use_audio}, dtype={args.dtype})...") + processor = Qwen3OmniVideoProcessor( + tokenizer=hf_processor, + device="cuda" if torch.cuda.is_available() else "cpu", + dtype=dtype, + use_audio_in_video=use_audio, + ) + + print(f"Processing {args.num_samples} samples from {args.dataset_name}...") + print(f"Cache directory: {args.cache_dir}") + + # This will process and save to cache + _ = 
get_video_dataset_dataloader( + dataset_name=args.dataset_name, + processor=processor, + batch_size=1, + num_samples=args.num_samples, + cache_dir=args.cache_dir, + ) + + # Cleanup temp files + processor.cleanup() + + cache_path = os.path.join(args.cache_dir, f"{args.dataset_name}_n{args.num_samples}_processed") + print(f"\nDone! Processed dataset saved to: {cache_path}") + + +if __name__ == "__main__": + main() diff --git a/modelopt/torch/utils/video_dataset_utils.py b/modelopt/torch/utils/video_dataset_utils.py index 6ae5c2d2ab..e3cc8f1909 100644 --- a/modelopt/torch/utils/video_dataset_utils.py +++ b/modelopt/torch/utils/video_dataset_utils.py @@ -91,6 +91,7 @@ def get_video_dataset_dataloader( processor: "Qwen3OmniVideoProcessor" = None, batch_size: int = 1, num_samples: int = 512, + cache_dir: str | None = None, ) -> DataLoader: """Get a dataloader with the dataset name and processor of the target model. @@ -99,17 +100,47 @@ def get_video_dataset_dataloader( processor: Processor used for encoding video and text data. batch_size: Batch size of the returned dataloader. num_samples: Number of samples from the dataset. + cache_dir: Directory to cache the processed dataset. Defaults to a temp directory. + If the cache exists, it will be loaded instead of reprocessing. Returns: An instance of dataloader. """ assert processor is not None, "Please provide a valid processor." 
- dataset = _get_video_dataset(dataset_name, num_samples=num_samples) - # Apply the preprocessing function to the dataset - processed_dataset = dataset.map( - processor.preprocess_function, batched=False, remove_columns=dataset.column_names - ) + # Default cache_dir to temp directory + if cache_dir is None: + cache_dir = os.path.join(tempfile.gettempdir(), "modelopt_video_dataset_cache") + + processed_dataset = None + + # Try to load from cache + if cache_dir is not None: + from datasets import load_from_disk + + cache_path = os.path.join(cache_dir, f"{dataset_name}_n{num_samples}_processed") + if os.path.exists(cache_path): + try: + processed_dataset = load_from_disk(cache_path) + print(f"Loaded processed dataset from cache: {cache_path}") + except Exception as e: + print(f"Failed to load cache from {cache_path}: {e}. Reprocessing...") + processed_dataset = None + + # Process dataset if not loaded from cache + if processed_dataset is None: + dataset = _get_video_dataset(dataset_name, num_samples=num_samples) + # Apply the preprocessing function to the dataset + processed_dataset = dataset.map( + processor.preprocess_function, batched=False, remove_columns=dataset.column_names + ) + + # Save to cache if cache_dir is provided + if cache_dir is not None: + os.makedirs(cache_dir, exist_ok=True) + # Use num_shards=1 to avoid off-by-one sharding bug with complex nested structures + processed_dataset.save_to_disk(cache_path, num_shards=1) + print(f"Saved processed dataset to cache: {cache_path}") # Create DataLoader with the custom collate function return DataLoader( From aa775656ab4c27b410c389ca4e706ac91cc4886d Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Wed, 28 Jan 2026 08:20:48 +0000 Subject: [PATCH 14/19] Support export to hf format Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/hf_ptq.py | 67 +++++++----- examples/llm_ptq/run_quantized_qwen3omni.py | 33 +++--- 
modelopt/torch/export/unified_export_hf.py | 113 ++++++++++++++------ modelopt/torch/utils/image_processor.py | 12 +-- modelopt/torch/utils/video_dataset_utils.py | 43 +++++--- 5 files changed, 167 insertions(+), 101 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index b73bde96de..c9df249e39 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -394,28 +394,11 @@ def load_model(args: argparse.Namespace): attn_implementation=args.attn_implementation, ) + # Uncomment this to load the model from a .pt file + # model = mto.restore(model, "./qwen3_omni_30b_nvfp4/model.pt") + # print("Qwen3Omni model restored from checkpoint") + quant_cfg = QUANT_CFG_CHOICES[args.qformat] - # Qwen3 specific quantizer disabling patterns (thinker.model.layers only) - if "qkv_disabled" in args.qformat: - # Disable q_proj, k_proj, v_proj quantizers - for proj in ["q_proj", "k_proj", "v_proj"]: - quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = { - "enable": False - } - if "qkvo_disabled" in args.qformat: - # Disable q_proj, k_proj, v_proj, o_proj quantizers - for proj in ["o_proj"]: - quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = { - "enable": False - } - if "first_and_last_n_disabled" in args.qformat: - # Disable both first N and last N layers - total_layers = 48 - n_layers_to_disable = 4 - for i in range(n_layers_to_disable): - quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False} - for i in range(total_layers - n_layers_to_disable, total_layers): - quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False} else: assert args.qformat in QUANT_CFG_CHOICES, ( f"Quantization format is not supported for low memory mode. 
Supported formats: {QUANT_CFG_CHOICES.keys()}" @@ -637,6 +620,37 @@ def mono_quantize( if language_model_lineage is not None: print("Updating full_model with quantized language_model...") language_model_lineage[-2].language_model = language_model + + # Qwen3 specific quantizer disabling patterns (thinker.model.layers only) + if "qkv_disabled" in args.qformat: + # Disable q_proj, k_proj, v_proj quantizers + for proj in ["q_proj", "k_proj", "v_proj"]: + quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = { + "enable": False + } + if "qkvo_disabled" in args.qformat: + # Disable q_proj, k_proj, v_proj, o_proj quantizers + for proj in ["o_proj"]: + quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = { + "enable": False + } + if "first_and_last_n_disabled" in args.qformat: + # Disable both first N and last N layers + total_layers = 48 + n_layers_to_disable = 4 + for i in range(n_layers_to_disable): + quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False} + for i in range(total_layers - n_layers_to_disable, total_layers): + quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False} + + if not model_is_already_quantized or calibration_only: + # Only run single sample for preview + calib_batch = next(iter(calib_dataloader)) + input_ids = calib_batch["input_features" if model_type == "whisper" else "input_ids"][ + 0:1 + ] + + # Generate preview before quantization if is_nemotron_vl_model and tokenizer is not None: generated_ids_before_ptq = run_nemotron_vl_preview( full_model, @@ -771,11 +785,11 @@ def export_quantized( default_padding_side, default_pad_token, ): - if model_type == "qwen3omni": - print("Export of Qwen3Omni model is not supported yet. 
Saving .pt file instead.") - os.makedirs(os.path.dirname(args.export_path), exist_ok=True) - mto.save(model, args.export_path) - return + # Uncomment this to save the model as a .pt file + # if model_type == "qwen3omni": + # print("Export of Qwen3Omni model is not supported yet. Saving .pt file instead.") + # os.makedirs(os.path.dirname(args.export_path), exist_ok=True) + # mto.save(full_model, f"{args.export_path}/model.pt") with torch.inference_mode(): if model_type is None: @@ -857,6 +871,7 @@ def export_quantized( export_hf_checkpoint( full_model, export_dir=export_path, + save_modelopt_state=model_type == "qwen3omni", ) # Copy custom model files (Python files and JSON configs) if trust_remote_code is used diff --git a/examples/llm_ptq/run_quantized_qwen3omni.py b/examples/llm_ptq/run_quantized_qwen3omni.py index 5d7860bc20..1d9c7629c4 100644 --- a/examples/llm_ptq/run_quantized_qwen3omni.py +++ b/examples/llm_ptq/run_quantized_qwen3omni.py @@ -16,7 +16,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -"""Script to load and run a quantized Qwen3Omni model from mto checkpoint.""" +"""Script to load and run a quantized Qwen3Omni model from export_hf_checkpoint.""" import argparse import time @@ -27,38 +27,41 @@ import modelopt.torch.opt as mto +# Enable HuggingFace checkpointing for modelopt quantized models +mto.enable_huggingface_checkpointing() + def main(args): - print(f"Loading base model from {args.model_path}...") + print(f"Loading quantized model from {args.checkpoint_path}...") model = Qwen3OmniMoeForConditionalGeneration.from_pretrained( - args.model_path, + args.checkpoint_path, torch_dtype="auto", - device_map="cuda", + device_map="auto", attn_implementation="flash_attention_2", trust_remote_code=True, ) - print(f"Restoring quantized state from {args.checkpoint_path}...") - model = mto.restore(model, args.checkpoint_path) - model.disable_talker() print("Loading processor...") processor = Qwen3OmniMoeProcessor.from_pretrained( - args.model_path, + "Qwen/Qwen3-Omni-30B-A3B-Thinking", trust_remote_code=True, ) # Build conversation with user prompt prompt = args.prompt or "What is the capital of France?" 
- conversation = [{"role": "user", "content": [{"type": "text", "text": f"{prompt}"}]}] + conversation = [{"role": "user", "content": [{"type": "text", "text": prompt}]}] conversations = [conversation] # Set whether to use audio in video use_audio_in_video = True # Preparation for inference - texts = processor.apply_chat_template(conversations, add_generation_prompt=True, tokenize=False) + texts = processor.apply_chat_template( + conversations, add_generation_prompt=True, tokenize=False, enable_thinking=False + ) + print(f"Texts: {texts}") audios, images, videos = process_mm_info(conversations, use_audio_in_video=use_audio_in_video) inputs = processor( @@ -99,17 +102,11 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Run quantized Qwen3Omni model") - parser.add_argument( - "--model_path", - type=str, - default="Qwen/Qwen3-Omni-30B-A3B-Instruct", - help="Path to the base Qwen3Omni model (HF format)", - ) parser.add_argument( "--checkpoint_path", type=str, - default="/home/scratch.arasane_hw/models/qwen3omni_nvfp4_qkv_disabled_text_bs512_calib512.pt", - help="Path to the mto.save() quantized checkpoint", + required=True, + help="Path to the export_hf_checkpoint() quantized checkpoint directory", ) parser.add_argument( "--prompt", diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 011af533dd..f531967b69 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -300,29 +300,43 @@ def llm_dummy_forward(): [1, model.config.num_mel_bins, feature_extractor.nb_max_frames], dtype=model.dtype ).to(model.device) - if getattr(model.config, "is_encoder_decoder", False): - # For encoder-decoder models, we need to pass both the encoder and decoder input ids - model(fake_input, decoder_input_ids=decoder_fake_input) - elif is_vl_model and "nemotron" in model_type: - # For Nemotron VL models, try to run optimization on just the language 
model part - language_model_lineage = get_language_model_from_vl(model) - - if language_model_lineage is not None: - # Run optimization on just the language model with the same input format as regular LLMs - # Use the same fake_input tensor that regular LLMs use - language_model = language_model_lineage[-1] - print( - f"Running optimization on language model with fake_input shape: {fake_input.shape}" - ) - language_model(fake_input) + with set_quantizer_by_cfg_context(model, {"*": {"enable": False}}): + if getattr(model.config, "is_encoder_decoder", False): + # For encoder-decoder models, we need to pass both the encoder and decoder input ids + model(fake_input, decoder_input_ids=decoder_fake_input) + elif is_vl_model and "nemotron" in model_type: + # For Nemotron VL models, try to run optimization on just the language model part + language_model_lineage = get_language_model_from_vl(model) + + if language_model_lineage is not None: + # Run optimization on just the language model with the same input format as regular LLMs + # Use the same fake_input tensor that regular LLMs use + language_model = language_model_lineage[-1] + print( + f"Running optimization on language model with fake_input shape: {fake_input.shape}" + ) + language_model(fake_input) + else: + raise ValueError( + f"Cannot extract language_model from Nemotron VL model (type: {model_type}). " + "This is required for requantization/resmoothing optimization. " + "Please ensure the model architecture is supported or file an issue." + ) + elif "qwen3omni" in model_type: + # For Qwen3Omni, run on the thinker (language model) component + # The model has structure: model.thinker.model.layers.* + if hasattr(model, "thinker"): + print( + f"Running optimization on Qwen3Omni thinker with fake_input shape: {fake_input.shape}" + ) + model.thinker(fake_input) + else: + raise ValueError( + f"Cannot extract thinker from Qwen3Omni model (type: {model_type}). 
" + "This is required for requantization/resmoothing optimization." + ) else: - raise ValueError( - f"Cannot extract language_model from Nemotron VL model (type: {model_type}). " - "This is required for requantization/resmoothing optimization. " - "Please ensure the model architecture is supported or file an issue." - ) - else: - model(fake_input) + model(fake_input) input_to_linear, output_to_layernorm = _collect_shared_input_modules( model, llm_dummy_forward, collect_layernorms=True @@ -380,6 +394,19 @@ def _export_quantized_weight( weight_quantizer: TensorQuantizer | SequentialQuantizer = getattr( sub_module, quantizer_attrs.weight_quantizer ) + + # Skip export if weight quantizer is disabled or has no amax (not calibrated) + if not _is_enabled_quantizer(weight_quantizer): + return + + # Check if weight quantizer has calibrated amax + def _has_amax(quantizer): + if isinstance(quantizer, SequentialQuantizer): + return any(hasattr(q, "_amax") and q._amax is not None for q in quantizer) + return hasattr(quantizer, "_amax") and quantizer._amax is not None + + if not _has_amax(weight_quantizer): + return input_quantizer: TensorQuantizer | SequentialQuantizer | None = getattr( sub_module, quantizer_attrs.input_quantizer, None ) @@ -543,6 +570,7 @@ def _process_quantized_modules( model: nn.Module, dtype: torch.dtype, is_modelopt_qlora: bool = False, + pack_weights: bool = True, ) -> None: """Process all quantized modules in model, export weights in-place. @@ -555,6 +583,7 @@ def _process_quantized_modules( dtype: The data type for weight conversion. is_modelopt_qlora: Whether the model is a modelopt-trained QLoRA model. If True, modules with base_layer attribute are skipped. + pack_weights: Whether to pack quantized weights. 
""" fsdp_module_to_reshard = None @@ -577,8 +606,9 @@ def _process_quantized_modules( sub_module.unpack_weight() if get_quantization_format(sub_module) != QUANTIZATION_NONE: if is_quantlinear(sub_module): - with fsdp2_aware_weight_update(model, sub_module, reshard=False): - _export_quantized_weight(sub_module, dtype) + if pack_weights: + with fsdp2_aware_weight_update(model, sub_module, reshard=False): + _export_quantized_weight(sub_module, dtype) elif ( "Llama4TextExperts" in type(sub_module).__name__ or "GptOssExperts" in type(sub_module).__name__ @@ -595,13 +625,18 @@ def _process_quantized_modules( quantizer_attrs=["gate_up_proj_input_quantizer", "down_proj_input_quantizer"], ) # Export the quantized weights - with fsdp2_aware_weight_update(model, sub_module, reshard=False): - for weight_name in ["gate_up_proj", "down_proj"]: - _export_quantized_weight(sub_module, dtype, weight_name) + if pack_weights: + with fsdp2_aware_weight_update(model, sub_module, reshard=False): + for weight_name in ["gate_up_proj", "down_proj"]: + _export_quantized_weight(sub_module, dtype, weight_name) -def _export_transformers_checkpoint( - model: nn.Module, dtype: torch.dtype | None = None, is_modelopt_qlora: bool = False, **kwargs +def _export_hf_checkpoint( + model: nn.Module, + dtype: torch.dtype | None = None, + is_modelopt_qlora: bool = False, + pack_weights: bool = True, + **kwargs, ) -> tuple[dict[str, Any], dict[str, Any]]: """Exports the torch model to the packed checkpoint with original HF naming. @@ -611,6 +646,7 @@ def _export_transformers_checkpoint( model: the full torch model to export. The actual quantized model may be a submodule. dtype: the weights data type to export the unquantized layers or the default model data type if None. accelerator: the accelerator instance in case of distributed export setup. + pack_weights: whether to pack quantized weights (False keeps original shapes for HF reload). 
Returns: post_state_dict: Dict containing quantized weights @@ -695,7 +731,7 @@ def _export_transformers_checkpoint( quant_config = get_quant_config(model, is_modelopt_qlora=is_modelopt_qlora) # Process all quantized modules and export weights - _process_quantized_modules(model, dtype, is_modelopt_qlora) + _process_quantized_modules(model, dtype, is_modelopt_qlora, pack_weights) if accelerator is not None: # Gather state_dict from all ranks @@ -964,7 +1000,12 @@ def export_hf_checkpoint( return try: - post_state_dict, hf_quant_config = _export_transformers_checkpoint(model, dtype) + # Packed weights are only for TRT-LLM consumption + # Set this to true if you want to save the weights in the original precision + pack_weights = True + post_state_dict, hf_quant_config = _export_hf_checkpoint( + model, dtype, pack_weights=pack_weights + ) if hf_quant_config is not None: # Save hf_quant_config.json for backward compatibility @@ -977,6 +1018,16 @@ def export_hf_checkpoint( if getattr(model, "hf_quantizer", None) is not None: model.hf_quantizer = None + # Fix generation_config conflicts before saving + # Some models have temperature/top_p/top_k set but do_sample=False which causes validation errors + if hasattr(model, "generation_config") and model.generation_config is not None: + gen_config = model.generation_config + if not getattr(gen_config, "do_sample", True): + # Remove sampling-related params when do_sample is False + for attr in ["temperature", "top_p", "top_k"]: + if hasattr(gen_config, attr): + setattr(gen_config, attr, None) + # Save model model.save_pretrained( export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state diff --git a/modelopt/torch/utils/image_processor.py b/modelopt/torch/utils/image_processor.py index 84e2b29b3b..264f9d3649 100644 --- a/modelopt/torch/utils/image_processor.py +++ b/modelopt/torch/utils/image_processor.py @@ -145,13 +145,9 @@ def preprocess_function(self, text: str) -> dict: Dictionary with tokenized 
inputs. """ # Build conversation in Qwen format (text-only) - conversation = [ - {"role": "user", "content": [{"type": "text", "text": "/no_think " + text}]} - ] - - # Apply chat template (tokenize=False to get formatted string) + conversation = [{"role": "user", "content": [{"type": "text", "text": text}]}] formatted_text = self.tokenizer.apply_chat_template( - conversation, add_generation_prompt=True, tokenize=False + conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False ) # Tokenize with the processor (no multimodal inputs) @@ -212,10 +208,8 @@ def preprocess_function(self, examples): content.append({"type": "text", "text": question}) conversation = [{"role": "user", "content": content}] - - # Apply chat template (tokenize=False to get string) text = self.tokenizer.apply_chat_template( - conversation, add_generation_prompt=True, tokenize=False + conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False ) # Extract multimodal info using qwen_omni_utils diff --git a/modelopt/torch/utils/video_dataset_utils.py b/modelopt/torch/utils/video_dataset_utils.py index e3cc8f1909..e022d7e24f 100644 --- a/modelopt/torch/utils/video_dataset_utils.py +++ b/modelopt/torch/utils/video_dataset_utils.py @@ -114,14 +114,15 @@ def get_video_dataset_dataloader( processed_dataset = None - # Try to load from cache + # Try to load from cache (use torch.save/load to avoid Arrow 32-bit offset overflow) if cache_dir is not None: - from datasets import load_from_disk - - cache_path = os.path.join(cache_dir, f"{dataset_name}_n{num_samples}_processed") + cache_path = os.path.join(cache_dir, f"{dataset_name}_n{num_samples}_processed.pt") if os.path.exists(cache_path): try: - processed_dataset = load_from_disk(cache_path) + from datasets import Dataset + + processed_samples = torch.load(cache_path, weights_only=False) + processed_dataset = Dataset.from_list(processed_samples) print(f"Loaded processed dataset from cache: {cache_path}") except 
Exception as e: print(f"Failed to load cache from {cache_path}: {e}. Reprocessing...") @@ -129,17 +130,25 @@ def get_video_dataset_dataloader( # Process dataset if not loaded from cache if processed_dataset is None: + from datasets import Dataset + dataset = _get_video_dataset(dataset_name, num_samples=num_samples) - # Apply the preprocessing function to the dataset - processed_dataset = dataset.map( - processor.preprocess_function, batched=False, remove_columns=dataset.column_names - ) - # Save to cache if cache_dir is provided + # Process samples manually to avoid Arrow 32-bit offset overflow + # (dataset.map() uses Arrow internally which can't handle large nested lists) + processed_samples = [] + for i, sample in enumerate(dataset): + processed = processor.preprocess_function(sample) + processed_samples.append(processed) + if (i + 1) % 10 == 0: + print(f"Processed {i + 1}/{len(dataset)} samples...") + + processed_dataset = Dataset.from_list(processed_samples) + + # Save to cache using torch.save to avoid Arrow 32-bit offset overflow if cache_dir is not None: os.makedirs(cache_dir, exist_ok=True) - # Use num_shards=1 to avoid off-by-one sharding bug with complex nested structures - processed_dataset.save_to_disk(cache_path, num_shards=1) + torch.save(processed_samples, cache_path) print(f"Saved processed dataset to cache: {cache_path}") # Create DataLoader with the custom collate function @@ -204,9 +213,11 @@ def preprocess_function(self, examples): metadata = examples["json"] # Try to get a meaningful question from metadata category = metadata.get("content_fine_category", "") - question = f"/no_think Describe what is happening in this video in detail. Category hint: {category}" + question = ( + f"Describe what is happening in this video in detail. 
Category hint: {category}" + ) else: - question = examples.get("question", "/no_think Describe this video in detail.") + question = examples.get("question", "Describe this video in detail.") # Build conversation in Qwen format content = [] @@ -226,10 +237,8 @@ def preprocess_function(self, examples): content.append({"type": "text", "text": question}) conversation = [{"role": "user", "content": content}] - - # Apply chat template (tokenize=False to get string) text = self.tokenizer.apply_chat_template( - conversation, add_generation_prompt=True, tokenize=False + conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False ) # Extract multimodal info using qwen_omni_utils From 4f92fbfed72e58c30302d3d656050aa1343fbecf Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Mon, 2 Feb 2026 19:54:54 +0000 Subject: [PATCH 15/19] restore configs Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/example_utils.py | 20 ++- examples/llm_ptq/hf_ptq.py | 176 +-------------------- modelopt/torch/export/unified_export_hf.py | 4 +- modelopt/torch/utils/dataset_utils.py | 23 ++- modelopt/torch/utils/image_processor.py | 3 - 5 files changed, 27 insertions(+), 199 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index f7343b4f17..03e0adbd67 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -244,21 +244,27 @@ def build_quant_cfg( quant_cfg["quant_cfg"]["*self_attn.q*"] = {"enable": False} quant_cfg["quant_cfg"]["*self_attn.kv*"] = {"enable": False} - # Qwen3 specific quantizer disabling patterns (thinker.model.layers only) - if "qkv_disabled" in qformat: - quant_cfg = copy.deepcopy(quant_cfg) # Don't modify global config + if model_type == "qwen3omni": + if qformat == "qwen3_nvfp4_qkv_disabled": for proj in ["q_proj", "k_proj", "v_proj"]: 
quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = { "enable": False } - if "qkvo_disabled" in qformat: - if "qkv_disabled" not in qformat: # Avoid double deepcopy - quant_cfg = copy.deepcopy(quant_cfg) - for proj in ["o_proj"]: + elif qformat == "qwen3_nvfp4_qkvo_disabled": + for proj in ["q_proj", "k_proj", "v_proj", "o_proj"]: quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = { "enable": False } + elif qformat == "qwen3_nvfp4_first_and_last_n_disabled": + # Disable both first N and last N layers + total_layers = 48 + n_layers_to_disable = 4 + for i in range(n_layers_to_disable): + quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False} + for i in range(total_layers - n_layers_to_disable, total_layers): + quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False} + return quant_cfg diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index c9df249e39..9136fcce55 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -14,8 +14,6 @@ # limitations under the License. import argparse -import contextlib -import os import random import time import warnings @@ -104,7 +102,7 @@ "mxfp8": mtq.MXFP8_DEFAULT_CFG, "qwen3_nvfp4_qkv_disabled": mtq.NVFP4_DEFAULT_CFG, "qwen3_nvfp4_qkvo_disabled": mtq.NVFP4_DEFAULT_CFG, - "qwen3_first_and_last_n_disabled": mtq.NVFP4_DEFAULT_CFG, + "qwen3_nvfp4_first_and_last_n_disabled": mtq.NVFP4_DEFAULT_CFG, } KV_QUANT_CFG_CHOICES = { @@ -199,9 +197,6 @@ def make_calib_dataloader( num_samples=args.calib_size[0], ) elif model_type == "qwen3omni": - assert len(args.calib_size) == 1, ( - "qwen3omni only supports one dataset for calibration, can extend this in the future" - ) assert processor is not None, "The processor must be set for qwen3omni model." 
dataset_name = args.dataset[0] if args.dataset else "cnn_dailymail" # Check if using video dataset (e.g., finevideo) @@ -394,10 +389,6 @@ def load_model(args: argparse.Namespace): attn_implementation=args.attn_implementation, ) - # Uncomment this to load the model from a .pt file - # model = mto.restore(model, "./qwen3_omni_30b_nvfp4/model.pt") - # print("Qwen3Omni model restored from checkpoint") - quant_cfg = QUANT_CFG_CHOICES[args.qformat] else: assert args.qformat in QUANT_CFG_CHOICES, ( @@ -425,18 +416,13 @@ def load_model(args: argparse.Namespace): calibration_only = True model_type = get_model_type(full_model) - if model_type == "qwen3omni" and os.environ.get("DISABLE_TALKER", "0") == "1": + if model_type == "qwen3omni": print("Disabling talker for Qwen3Omni model") full_model.disable_talker() device = full_model.device if hasattr(full_model, "model"): device = full_model.model.device - # For multi-GPU models with device_map="auto", model.device may return 'meta' or 'cpu' - # since parameters are distributed. Force cuda:0 for input tensors. 
- if device is None or str(device) in ("meta", "cpu"): - device = "cuda" - print(f"Overriding device to {device}") processor = None tokenizer = None @@ -620,158 +606,6 @@ def mono_quantize( if language_model_lineage is not None: print("Updating full_model with quantized language_model...") language_model_lineage[-2].language_model = language_model - - # Qwen3 specific quantizer disabling patterns (thinker.model.layers only) - if "qkv_disabled" in args.qformat: - # Disable q_proj, k_proj, v_proj quantizers - for proj in ["q_proj", "k_proj", "v_proj"]: - quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = { - "enable": False - } - if "qkvo_disabled" in args.qformat: - # Disable q_proj, k_proj, v_proj, o_proj quantizers - for proj in ["o_proj"]: - quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = { - "enable": False - } - if "first_and_last_n_disabled" in args.qformat: - # Disable both first N and last N layers - total_layers = 48 - n_layers_to_disable = 4 - for i in range(n_layers_to_disable): - quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False} - for i in range(total_layers - n_layers_to_disable, total_layers): - quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False} - - if not model_is_already_quantized or calibration_only: - # Only run single sample for preview - calib_batch = next(iter(calib_dataloader)) - input_ids = calib_batch["input_features" if model_type == "whisper" else "input_ids"][ - 0:1 - ] - - # Generate preview before quantization - if is_nemotron_vl_model and tokenizer is not None: - generated_ids_before_ptq = run_nemotron_vl_preview( - full_model, - tokenizer, - input_ids, - args.pyt_ckpt_path, - "before quantization", - allow_fallback=True, - ) - elif model_type == "qwen3omni": - # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences - # Pass full batch with all multimodal inputs - result = full_model.generate(**calib_batch, max_new_tokens=100) - if 
isinstance(result, tuple): - text_ids, _ = result - generated_ids_before_ptq = ( - text_ids.sequences if hasattr(text_ids, "sequences") else text_ids - ) - else: - generated_ids_before_ptq = result - else: - # Standard generation for non-Nemotron VL models - generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100) - if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only": - print("Applying nvfp4 quantization (MoE only) for gpt-oss") - - # quantize the model - model = quantize_model(model, quant_cfg, args, calib_dataloader, calibration_only) - - # For VL models, update full_model to use the quantized language model - if is_nemotron_vl_model: - language_model_lineage = get_language_model_from_vl(full_model) - if language_model_lineage is not None: - print("Updating full_model with quantized language_model...") - language_model_lineage[-2].language_model = model - - if args.verbose: - with open("./quant_summary.txt", "w") as f, contextlib.redirect_stdout(f): - mtq.print_quant_summary(full_model) - - # Run some samples - torch.cuda.empty_cache() - generated_ids_after_ptq = None - if model_type == "qwen3omni": - # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences - # Pass full batch with all multimodal inputs - result = full_model.generate(**calib_batch, max_new_tokens=100) - if isinstance(result, tuple): - text_ids, _ = result - generated_ids_after_ptq = ( - text_ids.sequences if hasattr(text_ids, "sequences") else text_ids - ) - else: - generated_ids_after_ptq = result - elif model_type != "llama4" and not is_nemotron_vl_model: - # Our fake quantizer may not be fully compatible with torch.compile. 
- generated_ids_after_ptq = full_model.generate(input_ids, max_new_tokens=100) - elif is_nemotron_vl_model and tokenizer is not None: - generated_ids_after_ptq = run_nemotron_vl_preview( - full_model, - tokenizer, - input_ids, - args.pyt_ckpt_path, - "after quantization", - allow_fallback=False, - ) - else: - warnings.warn( - "Llama4 Maverick generation after quantization has a bug. Skipping generation sample." - ) - - def input_decode(input_ids): - # BaseImageProcessor covers MllamaImageProcessor and Qwen3OmniImageProcessor - if processor is not None and isinstance(processor, BaseImageProcessor): - return processor.tokenizer.batch_decode(input_ids) - elif processor is not None and isinstance(processor, WhisperProcessor): - return first_text - elif tokenizer is not None: - return tokenizer.batch_decode(input_ids) - else: - raise ValueError("The processor or tokenizer must be set") - - def output_decode(generated_ids, input_shape): - if is_enc_dec(model_type): - if processor is not None and isinstance(processor, WhisperProcessor): - return processor.tokenizer.batch_decode( - generated_ids, skip_special_tokens=True - )[0] - elif tokenizer is not None: - return tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - elif processor is not None and isinstance(processor, MllamaImageProcessor): - return processor.tokenizer.batch_decode(generated_ids[:, input_shape:]) - elif processor is not None and isinstance(processor, Qwen3OmniImageProcessor): - return processor.tokenizer.batch_decode( - generated_ids[:, input_shape:], - skip_special_tokens=True, - clean_up_tokenization_spaces=False, - ) - elif tokenizer is not None: - return tokenizer.batch_decode(generated_ids[:, input_shape:]) - else: - raise ValueError("The processor or tokenizer must be set") - - if generated_ids_after_ptq is not None: - print("--------") - if is_nemotron_vl_model: - # For Nemotron VL models, generated_ids are text strings from model.chat() - print("Nemotron VL model text-only generation 
results:") - print(f"Text response before quantization: {generated_ids_before_ptq}") - print("--------") - print(f"Text response after quantization: {generated_ids_after_ptq}") - print("--------") - print("Note: Additional VL tests with images were run separately above") - else: - # For regular LLMs, generated_ids are token tensors that need decoding - print(f"example test input: {input_decode(input_ids)}") - print("--------") - print( - f"example outputs before ptq: {output_decode(generated_ids_before_ptq, input_ids.shape[1])}" - ) - print("--------") else: warnings.warn("Skipping quantization: model is already quantized.") @@ -785,12 +619,6 @@ def export_quantized( default_padding_side, default_pad_token, ): - # Uncomment this to save the model as a .pt file - # if model_type == "qwen3omni": - # print("Export of Qwen3Omni model is not supported yet. Saving .pt file instead.") - # os.makedirs(os.path.dirname(args.export_path), exist_ok=True) - # mto.save(full_model, f"{args.export_path}/model.pt") - with torch.inference_mode(): if model_type is None: print(f"Unknown model type {type(language_model).__name__}. 
Continue exporting...") diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index f531967b69..6136ae39df 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -631,7 +631,7 @@ def _process_quantized_modules( _export_quantized_weight(sub_module, dtype, weight_name) -def _export_hf_checkpoint( +def _export_transformers_checkpoint( model: nn.Module, dtype: torch.dtype | None = None, is_modelopt_qlora: bool = False, @@ -1003,7 +1003,7 @@ def export_hf_checkpoint( # Packed weights are only for TRT-LLM consumption # Set this to true if you want to save the weights in the original precision pack_weights = True - post_state_dict, hf_quant_config = _export_hf_checkpoint( + post_state_dict, hf_quant_config = _export_transformers_checkpoint( model, dtype, pack_weights=pack_weights ) diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py index 5e7bdc0780..e68ee4f998 100644 --- a/modelopt/torch/utils/dataset_utils.py +++ b/modelopt/torch/utils/dataset_utils.py @@ -74,7 +74,7 @@ }, "cnn_dailymail": { "config": {"path": "abisee/cnn_dailymail", "name": "3.0.0", "split": ["train"]}, - "preprocess": lambda sample: "/no_think " + sample["article"], + "preprocess": lambda sample: sample["article"], }, "pile": { "config": {"path": "monology/pile-uncopyrighted", "name": "v1.0", "split": ["train"]}, @@ -365,9 +365,8 @@ def _get_free_gpu_mem(): torch.cuda.empty_cache() free_mem_before, max_allocated_before = _get_free_gpu_mem() - is_enc_dec = model_type_is_enc_dec(model) - requires_generate = _model_requires_generate(model) - infer_method = model.generate if (is_enc_dec or requires_generate) else model.forward + use_generate = _should_use_generate(model) + infer_method = model.generate if use_generate else model.forward if sample_input_single_batch is None: sample_input_single_batch = ( @@ -504,9 +503,7 @@ def _forward_loop(model: torch.nn.Module, 
dataloader: DataLoader) -> None: dataloader: DataLoader containing the batched input data """ with torch.no_grad(): - is_enc_dec = model_type_is_enc_dec(model) - requires_generate = _model_requires_generate(model) - use_generate = is_enc_dec or requires_generate + use_generate = _should_use_generate(model) infer_method = model.generate if use_generate else model.forward max_working_batch_size = None # Initialize max working batch size as None @@ -593,13 +590,13 @@ def model_type_is_enc_dec(model): return any(model_name in model.__class__.__name__.lower() for model_name in enc_dec_model_list) -def _model_requires_generate(model): - """Check if model requires generate() instead of forward() for calibration. +def _should_use_generate(model): + """Check if model should use generate() instead of forward() for calibration. - Some conditional generation models (like Qwen3-Omni) don't have a standard - forward(input_ids, ...) signature and need to use generate() for calibration. + Returns True for: + - Encoder-decoder models (t5, bart, whisper) + - Conditional generation models that don't support standard forward() (qwen3omni) """ - # Models that require generate() for calibration instead of forward() generate_model_list = ["qwen3omni"] model_name = model.__class__.__name__.lower() - return any(name in model_name for name in generate_model_list) + return model_type_is_enc_dec(model) or any(name in model_name for name in generate_model_list) diff --git a/modelopt/torch/utils/image_processor.py b/modelopt/torch/utils/image_processor.py index 264f9d3649..07deca7fc4 100644 --- a/modelopt/torch/utils/image_processor.py +++ b/modelopt/torch/utils/image_processor.py @@ -25,9 +25,6 @@ class BaseImageProcessor: def __init__(self, tokenizer, device="cuda"): """Constructor.""" self.tokenizer = tokenizer - # Handle invalid device values that can come from multi-GPU models with device_map="auto" - if device is None or str(device) in ("auto", "meta", "cpu"): - device = "cuda" 
self.device = device def __call__(self, **kwargs): From 3f12551148ec3c527bd2c0166b49e99bbb2459ff Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Tue, 3 Feb 2026 00:00:29 +0000 Subject: [PATCH 16/19] Added script to run with vllm Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/run_qwen_vllm.py | 134 ++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 examples/llm_ptq/run_qwen_vllm.py diff --git a/examples/llm_ptq/run_qwen_vllm.py b/examples/llm_ptq/run_qwen_vllm.py new file mode 100644 index 0000000000..2e16e01608 --- /dev/null +++ b/examples/llm_ptq/run_qwen_vllm.py @@ -0,0 +1,134 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Qwen3-Omni-30B-A3B text inference with vLLM. 
+
+Usage:
+    python run_qwen_vllm.py
+    python run_qwen_vllm.py --model /path/to/model --tp 4
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import shutil
+
+from huggingface_hub import snapshot_download
+from transformers import Qwen3OmniMoeProcessor
+from vllm import LLM, SamplingParams
+
+MODEL_ID = "Qwen/Qwen3-Omni-30B-A3B-Thinking"
+
+# Files needed for tokenizer/processor that vLLM loads from model path
+TOKENIZER_FILES = [
+    "vocab.json",
+    "merges.txt",
+    "tokenizer.json",
+    "tokenizer_config.json",
+    "special_tokens_map.json",
+    "preprocessor_config.json",
+    "chat_template.json",
+]
+
+
+def ensure_tokenizer_files(model_path: str, source_model_id: str) -> None:
+    """Copy tokenizer files from HF model to local quantized model dir if missing."""
+    if not os.path.isdir(model_path):
+        return  # Not a local path, nothing to do
+
+    # Check if tokenizer files are missing
+    missing_files = [f for f in TOKENIZER_FILES if not os.path.exists(os.path.join(model_path, f))]
+    if not missing_files:
+        return
+
+    print(f"Copying missing tokenizer files from {source_model_id}...")
+    # Download only tokenizer files from HF
+    cache_dir = snapshot_download(
+        source_model_id,
+        allow_patterns=TOKENIZER_FILES,
+    )
+
+    for fname in TOKENIZER_FILES:
+        src = os.path.join(cache_dir, fname)
+        dst = os.path.join(model_path, fname)
+        if os.path.exists(src) and not os.path.exists(dst):
+            shutil.copy2(src, dst)
+            print(f"  Copied {fname}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Run Qwen3-Omni text inference with vLLM")
+    parser.add_argument("--model", default=MODEL_ID, help="Model ID or path")
+    parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size")
+    parser.add_argument("--max-model-len", type=int, default=32768, help="Max model length")
+
+    args = parser.parse_args()
+
+    # Load processor for chat template
+    processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_ID)
+
+    # Text-only conversations
+    
conversations = [ + [ + { + "role": "user", + "content": [{"type": "text", "text": "What are the key features of Qwen3-Omni?"}], + } + ], + ] + + # Apply chat template with thinking disabled + texts = processor.apply_chat_template( + conversations, + add_generation_prompt=True, + tokenize=False, + enable_thinking=False, + ) + + # Process multimodal info (returns empty for text-only) + # audios, images, videos = process_mm_info(conversations, use_audio_in_video=False) + + # Ensure tokenizer files exist in local model dir (vLLM loads processor from model path) + ensure_tokenizer_files(args.model, MODEL_ID) + + print(f"Loading model: {args.model}") + llm = LLM( + model=args.model, + tokenizer=MODEL_ID, # Always use original tokenizer from HF + tensor_parallel_size=args.tp, + max_model_len=args.max_model_len, + trust_remote_code=True, + # Disable talker (audio generation) - text output only + # enable_talker=False, + ) + + sampling_params = SamplingParams( + temperature=0.7, + top_p=0.9, + max_tokens=512, + ) + + print("Running inference...") + outputs = llm.generate(texts, sampling_params) + + for output in outputs: + generated_text = output.outputs[0].text + print("-" * 80) + print(f"Generated: {generated_text}") + + +if __name__ == "__main__": + main() From a2ec8f3d479b1c6529f5dcc5b97ff91020eec4f0 Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Tue, 3 Feb 2026 07:53:39 +0000 Subject: [PATCH 17/19] Disable audio tower and visual encoder quantization Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/hf_ptq.py | 9 ++++ examples/llm_ptq/run_quantized_qwen3omni.py | 47 ++++++++++++++++----- examples/llm_ptq/run_qwen_vllm.py | 23 +++++++++- 3 files changed, 66 insertions(+), 13 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 9136fcce55..235053bec1 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -574,6 +574,15 @@ def 
mono_quantize( quant_cfg["quant_cfg"]["*radio*"] = {"enable": False} quant_cfg["quant_cfg"]["*visual*"] = {"enable": False} + # For Qwen3Omni models, disable quantization of conv layers + if model_type == "qwen3omni": + print( + "Disabling quantization for conv layers, audio tower and visual encoder in Qwen3Omni model" + ) + quant_cfg["quant_cfg"]["*conv*"] = {"enable": False} + quant_cfg["quant_cfg"]["*audio_tower*"] = {"enable": False} + quant_cfg["quant_cfg"]["*visual*"] = {"enable": False} + if not model_is_already_quantized or calibration_only: if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only": print("Applying nvfp4 quantization (MoE only) for gpt-oss") diff --git a/examples/llm_ptq/run_quantized_qwen3omni.py b/examples/llm_ptq/run_quantized_qwen3omni.py index 1d9c7629c4..b11f8d37cc 100644 --- a/examples/llm_ptq/run_quantized_qwen3omni.py +++ b/examples/llm_ptq/run_quantized_qwen3omni.py @@ -16,7 +16,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -"""Script to load and run a quantized Qwen3Omni model from export_hf_checkpoint.""" +"""Script to load and run a quantized Qwen3Omni model from export_hf_checkpoint or mto.save().""" import argparse import time @@ -32,14 +32,28 @@ def main(args): - print(f"Loading quantized model from {args.checkpoint_path}...") - model = Qwen3OmniMoeForConditionalGeneration.from_pretrained( - args.checkpoint_path, - torch_dtype="auto", - device_map="auto", - attn_implementation="flash_attention_2", - trust_remote_code=True, - ) + if args.pt_checkpoint_path: + # Load base model first, then restore quantization state from mto.save() checkpoint + print("Loading base model from Qwen/Qwen3-Omni-30B-A3B-Thinking...") + model = Qwen3OmniMoeForConditionalGeneration.from_pretrained( + "Qwen/Qwen3-Omni-30B-A3B-Thinking", + torch_dtype="auto", + device_map="auto", + attn_implementation="flash_attention_2", + trust_remote_code=True, + ) + print(f"Restoring quantization state from {args.pt_checkpoint_path}...") + model = mto.restore(model, args.pt_checkpoint_path) + else: + # Load from HF checkpoint exported with export_hf_checkpoint() + print(f"Loading quantized model from {args.hf_checkpoint_path}...") + model = Qwen3OmniMoeForConditionalGeneration.from_pretrained( + args.hf_checkpoint_path, + torch_dtype="auto", + device_map="auto", + attn_implementation="flash_attention_2", + trust_remote_code=True, + ) model.disable_talker() @@ -103,11 +117,17 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Run quantized Qwen3Omni model") parser.add_argument( - "--checkpoint_path", + "--hf_checkpoint_path", type=str, - required=True, + default=None, help="Path to the export_hf_checkpoint() quantized checkpoint directory", ) + parser.add_argument( + "--pt_checkpoint_path", + type=str, + default=None, + help="Path to the mto.save() checkpoint file", + ) parser.add_argument( "--prompt", type=str, @@ -122,4 +142,9 @@ def 
main(args): ) args = parser.parse_args() + + # Validate arguments + if not args.hf_checkpoint_path and not args.pt_checkpoint_path: + parser.error("Either --hf_checkpoint_path or --pt_checkpoint_path must be provided") + main(args) diff --git a/examples/llm_ptq/run_qwen_vllm.py b/examples/llm_ptq/run_qwen_vllm.py index 2e16e01608..f5f775d4d9 100644 --- a/examples/llm_ptq/run_qwen_vllm.py +++ b/examples/llm_ptq/run_qwen_vllm.py @@ -26,12 +26,32 @@ import os import shutil +# import vllm.model_executor.parameter as vllm_param from huggingface_hub import snapshot_download from transformers import Qwen3OmniMoeProcessor from vllm import LLM, SamplingParams MODEL_ID = "Qwen/Qwen3-Omni-30B-A3B-Thinking" + +# # Debug patch to identify which weights cause shape mismatch +# def _patch_weight_loader_for_debug(): +# """Monkey-patch vLLM weight loader to print debug info on shape mismatch.""" +# original_load_column_parallel = vllm_param.ModelWeightParameter.load_column_parallel_weight + +# def debug_load_column_parallel(self, loaded_weight): +# print(f"Loading param: {getattr(self, 'name', getattr(self, '_name', repr(self)))}") +# print(f" Parameter shape (expected): {self.data.shape}") +# print(f" Loaded weight shape (got): {loaded_weight.shape}") + +# return original_load_column_parallel(self, loaded_weight) + +# vllm_param.ModelWeightParameter.load_column_parallel_weight = debug_load_column_parallel +# print("DEBUG: Patched vLLM weight loader to print shape mismatch info") + + +# _patch_weight_loader_for_debug() + # Files needed for tokenizer/processor that vLLM loads from model path TOKENIZER_FILES = [ "vocab.json", @@ -111,8 +131,7 @@ def main(): tensor_parallel_size=args.tp, max_model_len=args.max_model_len, trust_remote_code=True, - # Disable talker (audio generation) - text output only - # enable_talker=False, + quantization="modelopt_fp4", ) sampling_params = SamplingParams( From 0b1d9cafff18e945631bcb983285808e6c2605d8 Mon Sep 17 00:00:00 2001 From: ajrasane 
<131806219+ajrasane@users.noreply.github.com> Date: Wed, 4 Feb 2026 21:55:02 +0000 Subject: [PATCH 18/19] Add a flag to save the quant summary Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/hf_ptq.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 235053bec1..4f3a8af28a 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -14,7 +14,9 @@ # limitations under the License. import argparse +import io import random +import sys import time import warnings from typing import Any @@ -802,7 +804,20 @@ def post_quantize( """ if args.verbose: - mtq.print_quant_summary(full_model) + if args.quant_summary_path: + # Capture the summary output to a file + old_stdout = sys.stdout + sys.stdout = buffer = io.StringIO() + try: + mtq.print_quant_summary(full_model) + finally: + sys.stdout = old_stdout + summary = buffer.getvalue() + with open(args.quant_summary_path, "w") as f: + f.write(summary) + print(f"Quantization summary saved to {args.quant_summary_path}") + else: + mtq.print_quant_summary(full_model) # Run some samples torch.cuda.empty_cache() @@ -1196,6 +1211,15 @@ def parse_args() -> argparse.Namespace: "(sensitivity scores, costs, etc.). Only used when auto_quantize_bits is specified." ), ) + parser.add_argument( + "--quant_summary_path", + type=str, + default=None, + help=( + "Path to save the quantization summary. If not specified, summary is printed to stdout. " + "Requires --verbose to be enabled (default: True)." 
+ ), + ) return parser.parse_args() From 690620f775a390129542dbe0b860bf113bc574f7 Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Wed, 4 Feb 2026 23:36:35 +0000 Subject: [PATCH 19/19] Forward tokens to all experts Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- .../torch/quantization/plugins/huggingface.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index f7b65fd3df..54b98052f1 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -797,22 +797,22 @@ def unpack_weight(self): pass # Uncomment to forward tokens to all MoE experts for full calibration. -# try: -# from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import ( -# Qwen3OmniMoeTalkerTextSparseMoeBlock, -# Qwen3OmniMoeThinkerTextSparseMoeBlock, -# ) -# -# if Qwen3OmniMoeTalkerTextSparseMoeBlock not in QuantModuleRegistry: -# QuantModuleRegistry.register( -# {Qwen3OmniMoeTalkerTextSparseMoeBlock: "hf.Qwen3OmniMoeTalkerTextSparseMoeBlock"} -# )(_QuantSparseMoe) -# if Qwen3OmniMoeThinkerTextSparseMoeBlock not in QuantModuleRegistry: -# QuantModuleRegistry.register( -# {Qwen3OmniMoeThinkerTextSparseMoeBlock: "hf.Qwen3OmniMoeThinkerTextSparseMoeBlock"} -# )(_QuantSparseMoe) -# except ImportError: -# pass +try: + from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import ( + Qwen3OmniMoeTalkerTextSparseMoeBlock, + Qwen3OmniMoeThinkerTextSparseMoeBlock, + ) + + if Qwen3OmniMoeTalkerTextSparseMoeBlock not in QuantModuleRegistry: + QuantModuleRegistry.register( + {Qwen3OmniMoeTalkerTextSparseMoeBlock: "hf.Qwen3OmniMoeTalkerTextSparseMoeBlock"} + )(_QuantSparseMoe) + if Qwen3OmniMoeThinkerTextSparseMoeBlock not in QuantModuleRegistry: + QuantModuleRegistry.register( + {Qwen3OmniMoeThinkerTextSparseMoeBlock: 
"hf.Qwen3OmniMoeThinkerTextSparseMoeBlock"} + )(_QuantSparseMoe) +except ImportError: + pass class _QuantGptOssExperts(_QuantFunctionalMixin):