diff --git a/examples/llm_eval/run_lm_eval_vllm.sh b/examples/llm_eval/run_lm_eval_vllm.sh old mode 100644 new mode 100755 index ef94a66d14..18c52995c9 --- a/examples/llm_eval/run_lm_eval_vllm.sh +++ b/examples/llm_eval/run_lm_eval_vllm.sh @@ -19,12 +19,13 @@ # Script to run lm-evaluation-harness against a running vLLM OpenAI-compatible server. # # Usage: -# bash run_lm_eval_vllm.sh [port] [task] +# bash run_lm_eval_vllm.sh [port] [task] [host] # # Arguments: # : The name of the model being served (e.g., Qwen/Qwen3-30B-A3B). Used for the 'model' argument in lm_eval. # [port]: The port the vLLM server is listening on (default: 8000). # [task]: The lm_eval task(s) to run (default: mmlu). +# [host]: The IP address or hostname of the vLLM server (default: localhost). # # Example: # # Start vLLM server first (in another terminal): @@ -35,6 +36,9 @@ # # # Run for a different task, e.g., hellaswag: # bash run_lm_eval_vllm.sh Qwen/Qwen3-30B-A3B 8000 hellaswag +# +# # Run against a remote server: +# bash run_lm_eval_vllm.sh Qwen/Qwen3-30B-A3B 8000 mmlu 10.78.17.40 # --- set -e @@ -42,16 +46,17 @@ set -x # --- Argument Parsing --- if [ -z "$1" ]; then - echo "Usage: $0 [port] [task]" + echo "Usage: $0 [port] [task] [host]" exit 1 fi MODEL_NAME=$1 PORT=${2:-8000} # Default port is 8000 if not provided TASK=${3:-mmlu} # Default task is mmlu if not provided +HOST=${4:-localhost} # Default host is localhost if not provided # --- Environment Setup --- export OPENAI_API_KEY="local" # Not strictly required for local, but good practice -BASE_URL="http://localhost:${PORT}/v1" +BASE_URL="http://${HOST}:${PORT}/v1" COMPLETIONS_URL="${BASE_URL}/completions" # --- Evaluation --- diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 58eb676111..a39acf4c73 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -45,12 +45,134 @@ except ImportError: snapshot_download = None -from modelopt.torch.utils.image_processor import 
BaseImageProcessor, MllamaImageProcessor +from modelopt.torch.export.model_utils import match_model_type_by_name +from modelopt.torch.utils.dataset_utils import get_dataset_dataloader +from modelopt.torch.utils.image_processor import ( + BaseImageProcessor, + MllamaImageProcessor, + Qwen3OmniImageProcessor, +) +from modelopt.torch.utils.video_dataset_utils import ( + Qwen3OmniVideoProcessor, + get_supported_video_datasets, + get_video_dataset_dataloader, +) +from modelopt.torch.utils.vlm_dataset_utils import ( + get_supported_vlm_datasets, + get_vlm_dataset_dataloader, +) logger = logging.getLogger(__name__) SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"] +# Files needed for tokenizer/processor that vLLM loads from model path +TOKENIZER_FILES = [ + "vocab.json", + "merges.txt", + "tokenizer.json", + "tokenizer_config.json", + "special_tokens_map.json", + "preprocessor_config.json", + "chat_template.json", +] + + +def get_model_type_from_config(model_path: str) -> str | None: + """Get model type from the config.json file. + + Args: + model_path: Path to the model directory or HuggingFace model ID. + + Returns: + Model type string (e.g., 'qwen3omni', 'llama', 'gpt') or None if not found. 
+ """ + config_path = os.path.join(model_path, "config.json") + if not os.path.exists(config_path): + return None + + with open(config_path) as f: + config = json.load(f) + + # Check architectures field first + for arch in config.get("architectures", []): + result = match_model_type_by_name(arch) + if result is not None: + return result + + # Fallback to model_type field + return match_model_type_by_name(config.get("model_type", "")) + + +def get_sampling_params_from_config(model_path: str) -> dict: + """Extract sampling params from generation_config.json if present.""" + gen_config_path = Path(model_path) / "generation_config.json" + if not gen_config_path.exists(): + return {} + + gen_config = json.loads(gen_config_path.read_text()) + + params = {k: gen_config[k] for k in ("temperature", "top_p", "top_k") if k in gen_config} + + for key in ("max_new_tokens", "max_length"): + if key in gen_config: + params["max_tokens"] = gen_config[key] + break + + return params + + +def get_quantization_format(model_path: str) -> str | None: + """Get quantization format from the model config. + + Args: + model_path: Path to the model directory. + + Returns: + vLLM quantization string ('modelopt', 'modelopt_fp4') or None if not quantized. 
+ """ + hf_quant_config_path = os.path.join(model_path, "hf_quant_config.json") + if os.path.exists(hf_quant_config_path): + with open(hf_quant_config_path) as f: + quant_config = json.load(f) + quant_algo = quant_config.get("quantization", {}).get("quant_algo", "") + if "NVFP4" in quant_algo: + return "modelopt_fp4" + + return None + + +def ensure_tokenizer_files(model_path: str, source_model_id: str) -> None: + """Copy tokenizer files from HF model to local quantized model dir if missing.""" + if not os.path.isdir(model_path): + return # Not a local path, nothing to do + + # Check if tokenizer files are missing + missing_files = [f for f in TOKENIZER_FILES if not os.path.exists(os.path.join(model_path, f))] + if not missing_files: + return + + if snapshot_download is None: + print("Warning: huggingface_hub not installed, cannot download tokenizer files") + return + + print(f"Copying missing tokenizer files from {source_model_id}...") + # Download only tokenizer files from HF + if os.path.isdir(source_model_id): + cache_dir = source_model_id + else: + cache_dir = snapshot_download( + source_model_id, + allow_patterns=TOKENIZER_FILES, + ) + + for fname in TOKENIZER_FILES: + src = os.path.join(cache_dir, fname) + dst = os.path.join(model_path, fname) + if os.path.exists(src) and not os.path.exists(dst): + shutil.copy2(src, dst) + print(f" Copied {fname}") + def run_nemotron_vl_preview( full_model, tokenizer, input_ids, pyt_ckpt_path, stage_name, allow_fallback=False @@ -241,9 +363,45 @@ def build_quant_cfg( quant_cfg["quant_cfg"]["*image*"] = {"enable": False} quant_cfg["quant_cfg"]["*vision*"] = {"enable": False} + if model_type in ["qwen3moe", "qwen3next"] and qformat == "nvfp4": + # Disable the attention projection layers to retain accuracy + quant_cfg["quant_cfg"]["model*.*attn*in_proj*"] = {"enable": False} + quant_cfg["quant_cfg"]["model*.*attn*q_proj*"] = {"enable": False} + quant_cfg["quant_cfg"]["model*.*attn*k_proj*"] = {"enable": False} + 
quant_cfg["quant_cfg"]["model*.*attn*v_proj*"] = {"enable": False} + + if model_type == "deepseek": + # Disable MLA quantization for accuracy. + quant_cfg["quant_cfg"]["*self_attn.q*"] = {"enable": False} + quant_cfg["quant_cfg"]["*self_attn.kv*"] = {"enable": False} + + if model_type == "qwen3omni": + print( + "Disabling quantization for conv layers, audio tower and visual encoder in Qwen3Omni model" + ) + quant_cfg["quant_cfg"]["*conv*"] = {"enable": False} + quant_cfg["quant_cfg"]["*audio_tower*"] = {"enable": False} + quant_cfg["quant_cfg"]["*visual*"] = {"enable": False} + return quant_cfg +def get_generation_kwargs(model_type: str) -> dict[str, Any]: + """Get model-specific generation kwargs for calibration. + + Args: + model_type: The model type string. + + Returns: + Dictionary of generation kwargs for the model. + """ + generation_kwargs = {} + if model_type == "qwen3omni": + generation_kwargs["return_audio"] = False + generation_kwargs["thinker_max_new_tokens"] = 1 + return generation_kwargs + + def is_speculative(hf_config): """Check if the model architecture is a speculative model.""" return hf_config.architectures and any( @@ -284,7 +442,7 @@ def get_processor( if attn_implementation is not None: model_kwargs["attn_implementation"] = attn_implementation - if model_type == "whisper": + if model_type in ("whisper", "mllama", "qwen3omni"): processor = AutoProcessor.from_pretrained( ckpt_path, padding_side="left", @@ -296,20 +454,11 @@ def get_processor( f"Pad token for {ckpt_path} cannot be set!" 
) + if model_type == "mllama": + return MllamaImageProcessor(processor, device) + elif model_type == "qwen3omni": + return Qwen3OmniImageProcessor(processor, device) return processor - elif model_type == "mllama": - processor = AutoProcessor.from_pretrained( - ckpt_path, - padding_side="left", - **model_kwargs, - ) - if processor.tokenizer.pad_token is None: - processor.tokenizer.pad_token = processor.tokenizer.eos_token - assert processor.tokenizer.pad_token is not None, ( - f"Pad token for {ckpt_path} cannot be set!" - ) - - return MllamaImageProcessor(processor, device) else: # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse) try: @@ -838,3 +987,86 @@ def copy_custom_model_files(source_path: str, export_path: str, trust_remote_cod print(f"Successfully copied {len(copied_files)} custom model files to {export_path}") else: print("No custom model files found to copy") + + +def get_qwen3omni_dataloader( + dataset_name: str | list[str] | None, + processor: Qwen3OmniImageProcessor | None, + tokenizer, + batch_size: int, + num_samples: int | list[int], + device: torch.device, + model_dtype: torch.dtype, + include_labels: bool = False, +): + """Create a calibration dataloader for Qwen3Omni models. + + Handles video, VLM, and text-only dataset configurations. + + Args: + dataset_name: Name of the dataset(s) to use for calibration. + processor: The Qwen3OmniImageProcessor for multimodal inputs. + tokenizer: The tokenizer for text-only fallback. + batch_size: Batch size for the dataloader. + num_samples: Number of samples to use (int or list for multi-dataset). + device: Target device for tensors. + model_dtype: Model dtype for proper tensor conversion. + include_labels: Whether to include labels (for gradient-based auto_quantize). + + Returns: + DataLoader for calibration. 
+ """ + if dataset_name is None: + dataset_name = ["cnn_dailymail", "nemotron-post-training-dataset-v2"] + num_samples = [512, 512] + + if processor is not None: + # Normalize single-element list to str for supported-dataset lookups + if isinstance(dataset_name, list) and len(dataset_name) == 1: + dataset_name = dataset_name[0] + if dataset_name in get_supported_video_datasets(): + assert isinstance(dataset_name, str) + video_processor = Qwen3OmniVideoProcessor( + processor.tokenizer if hasattr(processor, "tokenizer") else processor, + device=device, + dtype=model_dtype, + use_audio_in_video=True, + ) + calib_dataloader = get_video_dataset_dataloader( + dataset_name=dataset_name, + processor=video_processor, + batch_size=batch_size, + num_samples=num_samples if isinstance(num_samples, int) else num_samples[0], + ) + elif dataset_name in get_supported_vlm_datasets(): + assert isinstance(dataset_name, str) + assert isinstance(processor, Qwen3OmniImageProcessor), ( + "The Qwen3OmniImageProcessor must be set." + ) + # Set dtype for proper tensor conversion in collate_function. + # Processor is created before model_dtype is known, so we set it here. + processor.dtype = model_dtype + calib_dataloader = get_vlm_dataset_dataloader( + dataset_name=dataset_name, + processor=processor, + batch_size=batch_size, + num_samples=num_samples if isinstance(num_samples, int) else num_samples[0], + ) + else: + raise ValueError( + f"Dataset '{dataset_name}' not supported for Qwen3Omni with processor. 
" + f"Supported video datasets: {get_supported_video_datasets()}, " + f"Supported VLM datasets: {get_supported_vlm_datasets()}" + ) + else: + # Text-only fallback + calib_dataloader = get_dataset_dataloader( + dataset_name=dataset_name if isinstance(dataset_name, list) else [dataset_name], + tokenizer=tokenizer, + batch_size=batch_size, + num_samples=num_samples if isinstance(num_samples, list) else [num_samples], + device=device, + include_labels=include_labels, + ) + + return calib_dataloader diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index b81dc60c01..2d441f4b35 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -18,6 +18,7 @@ import random import time import warnings +from collections import namedtuple from typing import Any import numpy as np @@ -27,8 +28,10 @@ build_quant_cfg, copy_custom_model_files, create_vlm_calibration_loop, + get_generation_kwargs, get_model, get_processor, + get_qwen3omni_dataloader, get_tokenizer, is_enc_dec, is_nemotron_vl, @@ -70,7 +73,11 @@ get_max_batch_size, get_supported_datasets, ) -from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor +from modelopt.torch.utils.image_processor import ( + BaseImageProcessor, + MllamaImageProcessor, + Qwen3OmniImageProcessor, +) from modelopt.torch.utils.memory_monitor import launch_memory_monitor from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader @@ -208,6 +215,21 @@ def make_calib_dataloader( batch_size=args.batch_size, num_samples=args.calib_size[0], ) + elif model_type == "qwen3omni": + # Labels are only needed for gradient-based auto_quantize + include_labels = ( + args.auto_quantize_bits is not None and args.auto_quantize_method == "gradient" + ) + calib_dataloader = get_qwen3omni_dataloader( + dataset_name=args.dataset[0] if args.dataset else None, + processor=processor, + tokenizer=tokenizer, 
+ batch_size=args.batch_size, + num_samples=args.calib_size[0] if processor else args.calib_size, + device=device, + model_dtype=language_model.dtype, + include_labels=include_labels, + ) elif model_type == "whisper": assert processor is not None and isinstance(processor, WhisperProcessor), ( "The AutoProcessor must be set." @@ -408,7 +430,7 @@ def load_model(args: argparse.Namespace): print("Nemotron VL model detected. Enabling image-text calibration by default.") args.calib_with_images = True - if model_type == "mllama": + if model_type in ["mllama", "qwen3omni"]: processor = get_processor( args.pyt_ckpt_path, model_type, @@ -416,6 +438,14 @@ def load_model(args: argparse.Namespace): trust_remote_code=args.trust_remote_code, attn_implementation=args.attn_implementation, ) + if model_type == "qwen3omni": + print("Disabling talker for Qwen3Omni model") + full_model.disable_talker() + language_model = full_model.thinker.model + tokenizer = processor.tokenizer.tokenizer + processor = None + default_padding_side = tokenizer.padding_side + default_pad_token = tokenizer.pad_token elif model_type == "whisper": processor = get_processor( args.pyt_ckpt_path, @@ -555,6 +585,9 @@ def mono_quantize( quant_cfg["quant_cfg"]["*model_encoder*"] = {"enable": False} # Nemotron-Parse specific print("Quantization will only be applied to the decoder (text generation) component") + # Get model-specific generation kwargs (e.g., for Qwen3Omni) + generation_kwargs = get_generation_kwargs(model_type) + if not model_is_already_quantized or calibration_only: # quantize the model @@ -569,7 +602,9 @@ def mono_quantize( if args.calib_with_images and is_nemotron_vl_model: calibrate_loop = create_vlm_calibration_loop(full_model, calib_dataloader) else: - calibrate_loop = create_forward_loop(dataloader=calib_dataloader) + calibrate_loop = create_forward_loop( + dataloader=calib_dataloader, generation_kwargs=generation_kwargs + ) if calibration_only: language_model = mtq.calibrate( @@ -719,6 
+754,23 @@ def export_quantized( ) +PreQuantizeResult = namedtuple( + "PreQuantizeResult", ["preview_input_ids", "generated_ids_before_ptq", "calib_batch"] +) + + +def _qwen3omni_generate(model, calib_batch): + """Run Qwen3Omni generate and unpack the result. + + Qwen3Omni returns a (text_ids, audio) tuple; text_ids may have a .sequences attribute. + """ + result = model.generate(**calib_batch, return_audio=False, thinker_max_new_tokens=100) + if isinstance(result, tuple): + text_ids, _ = result + return text_ids.sequences if hasattr(text_ids, "sequences") else text_ids + return result + + def pre_quantize( args: argparse.Namespace, full_model: torch.nn.Module, @@ -735,9 +787,10 @@ def pre_quantize( """ # Only run single sample for preview - preview_input_ids = next(iter(calib_dataloader))[ - "input_features" if model_type == "whisper" else "input_ids" - ][0:1] + calib_batch = next(iter(calib_dataloader)) + preview_input_ids = calib_batch["input_features" if model_type == "whisper" else "input_ids"][ + 0:1 + ] # Generate preview before quantization if args.skip_generate: @@ -759,10 +812,16 @@ def pre_quantize( "before quantization", allow_fallback=False, ) + elif model_type == "qwen3omni": + # Use only a single sample for preview generation to avoid OOM + single_sample = { + k: v[0:1] if isinstance(v, torch.Tensor) else v for k, v in calib_batch.items() + } + generated_ids_before_ptq = _qwen3omni_generate(full_model, single_sample) else: generated_ids_before_ptq = full_model.generate(preview_input_ids, max_new_tokens=100) - return preview_input_ids, generated_ids_before_ptq + return PreQuantizeResult(preview_input_ids, generated_ids_before_ptq, calib_batch) def post_quantize( @@ -775,28 +834,59 @@ def post_quantize( generated_ids_before_ptq, is_nemotron_vl_model, first_text_speech_dataset, + calib_batch: dict | None = None, ): - """ - Processing after the quantization. + """Processing after the quantization. 
- Currently we run one round of generation using the quantized model for a sample prompt, - and compare it with pre-quantize generation. + Runs one round of generation using the quantized model for a sample prompt and + compares it with the pre-quantize generation from ``pre_quantize()``. + + Args: + args: Parsed CLI arguments. Used for ``verbose``, ``quant_summary_path``, + ``export_path``, ``pyt_ckpt_path``, and ``skip_generate`` flags. + full_model: The quantized model to run post-quantization generation on. + model_type: Model architecture identifier (e.g. ``"qwen3omni"``, ``"whisper"``, + ``"llama4"``, ``"deepseek"``). Controls model-specific generation and + decoding paths. ``None`` for generic models. + tokenizer: HF tokenizer for decoding generated token ids. May be ``None`` when + a ``processor`` is used instead (e.g. vision-language or speech models). + processor: HF image/audio processor for multimodal models. Used for decoding + outputs from vision-language (Mllama, Qwen3Omni) and speech (Whisper) + models. ``None`` for text-only models. + preview_input_ids: Input token ids (single sample) produced by ``pre_quantize()`` + for the preview generation comparison. + generated_ids_before_ptq: Generation output from ``pre_quantize()`` to compare + against post-quantization output. ``None`` if generation was skipped. + is_nemotron_vl_model: Whether the model is a Nemotron VL model, which uses + ``model.chat()`` and returns text strings instead of token tensors. + first_text_speech_dataset: Text transcript of the first speech sample, used as + the display input for Whisper models since their ``input_ids`` are + mel-spectrogram features rather than decodable tokens. + calib_batch: Full calibration batch dict from ``pre_quantize``. Required for + multimodal models (e.g. Qwen3Omni) whose ``generate()`` needs the complete + input dict (audio features, attention masks, etc.) rather than just + ``input_ids``. For text-only models this is unused and may be ``None``. 
""" if args.verbose: try: - mtq.print_quant_summary(full_model, args.export_path) + mtq.print_quant_summary(full_model, save_path=args.quant_summary_path) save_expert_token_count_table(full_model, args.export_path) except Exception as e: - print(f"Error saving quant summary: {e}") - print("Continuing with generation...") + print(f"Warning: Failed to print quant summary: {e}") # Run some samples torch.cuda.empty_cache() generated_ids_after_ptq = None if generated_ids_before_ptq is None: pass + elif model_type == "qwen3omni" and calib_batch is not None: + # Use only a single sample for preview generation to avoid OOM + single_sample = { + k: v[0:1] if isinstance(v, torch.Tensor) else v for k, v in calib_batch.items() + } + generated_ids_after_ptq = _qwen3omni_generate(full_model, single_sample) elif model_type != "llama4" and not is_nemotron_vl_model: # Our fake quantizer may not be fully compatible with torch.compile. generated_ids_after_ptq = full_model.generate(preview_input_ids, max_new_tokens=100) @@ -815,12 +905,13 @@ def post_quantize( ) def input_decode(input_ids): - if processor is not None and isinstance(processor, MllamaImageProcessor): - return processor.tokenizer.batch_decode(input_ids) + # BaseImageProcessor covers MllamaImageProcessor and Qwen3OmniImageProcessor + if processor is not None and isinstance(processor, BaseImageProcessor): + return processor.tokenizer.batch_decode(input_ids, skip_special_tokens=True) elif processor is not None and isinstance(processor, WhisperProcessor): return first_text_speech_dataset elif tokenizer is not None: - return tokenizer.batch_decode(input_ids) + return tokenizer.batch_decode(input_ids, skip_special_tokens=True) else: raise ValueError("The processor or tokenizer must be set") @@ -832,6 +923,12 @@ def output_decode(generated_ids, input_shape): return tokenizer.batch_decode(generated_ids, skip_special_tokens=True) elif processor is not None and isinstance(processor, MllamaImageProcessor): return 
processor.tokenizer.batch_decode(generated_ids[:, input_shape:]) + elif processor is not None and isinstance(processor, Qwen3OmniImageProcessor): + return processor.tokenizer.batch_decode( + generated_ids[:, input_shape:], + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) elif tokenizer is not None: return tokenizer.batch_decode(generated_ids[:, input_shape:]) else: @@ -919,7 +1016,7 @@ def quantize_main( # Detect if this is a Nemotron VL model using architecture-based detection is_nemotron_vl_model = is_nemotron_vl(full_model) - preview_input_ids, generated_ids_before_ptq = pre_quantize( + preview_input_ids, generated_ids_before_ptq, calib_batch = pre_quantize( args, full_model, model_type, tokenizer, calib_dataloader, is_nemotron_vl_model ) @@ -1014,6 +1111,7 @@ def quantize_main( generated_ids_before_ptq, is_nemotron_vl_model, first_text_speech_dataset, + calib_batch, ) export_quantized( args, @@ -1238,6 +1336,15 @@ def parse_args() -> argparse.Namespace: help="Export as vLLM fake-quant checkpoint (produces vllm_fq_modelopt_state.pth " "for use with vllm_serve_fakequant.py).", ) + parser.add_argument( + "--quant_summary_path", + type=str, + default=None, + help=( + "Path to save the quantization summary. If not specified, summary is printed to stdout. " + "Requires --verbose to be enabled (default: True)." + ), + ) args = parser.parse_args() if args.moe_calib_experts_ratio is not None and not (0.0 < args.moe_calib_experts_ratio <= 1.0): diff --git a/examples/llm_ptq/run_vllm.py b/examples/llm_ptq/run_vllm.py new file mode 100644 index 0000000000..60cfcb2cd1 --- /dev/null +++ b/examples/llm_ptq/run_vllm.py @@ -0,0 +1,145 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unified HF checkpoint inference with vLLM. + +Usage: + python run_vllm.py --model /path/to/quantized/model + python run_vllm.py --model /path/to/model --tp 4 +""" + +from __future__ import annotations + +import argparse + +from example_utils import ( + ensure_tokenizer_files, + get_model_type_from_config, + get_quantization_format, + get_sampling_params_from_config, +) +from transformers import AutoConfig, AutoProcessor +from vllm import LLM, SamplingParams + + +def main(): + parser = argparse.ArgumentParser(description="Run unified hf checkpoint inference with vLLM") + parser.add_argument("--model", type=str, required=True, help="Model ID or path") + parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size") + parser.add_argument( + "--max-model-len", + type=int, + default=None, + help="Max model length (auto-detected from config if not specified)", + ) + parser.add_argument("--prompt", type=str, default="What in Nvidia?", help="Text prompt") + parser.add_argument( + "--tokenizer", type=str, default=None, help="Tokenizer ID or path (defaults to model path)" + ) + parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature") + parser.add_argument("--top-p", type=float, default=0.9, help="Top-p sampling") + parser.add_argument("--top-k", type=int, default=-1, help="Top-k sampling (-1 to disable)") + parser.add_argument("--max-tokens", type=int, default=512, help="Max tokens to generate") + parser.add_argument( + "--trust-remote-code", + action="store_true", + default=False, + help="Trust remote code from 
HuggingFace model repos", + ) + + args = parser.parse_args() + + # Detect model type from config + model_type = get_model_type_from_config(args.model) + print(f"Detected model type: {model_type}") + + # Detect quantization format + quantization = get_quantization_format(args.model) + print(f"Detected quantization: {quantization}") + + # Get max_model_len from config if not specified + if args.max_model_len is None: + config = AutoConfig.from_pretrained(args.model, trust_remote_code=args.trust_remote_code) + args.max_model_len = getattr(config, "max_position_embeddings", 4096) + print(f"Using max_model_len from config: {args.max_model_len}") + + # Determine tokenizer source + tokenizer_id = args.tokenizer or args.model + + # Load processor for chat template + processor = AutoProcessor.from_pretrained( + tokenizer_id, trust_remote_code=args.trust_remote_code + ) + + # Text-only conversations + conversations = [ + [ + { + "role": "user", + "content": [{"type": "text", "text": args.prompt}], + } + ], + ] + + # Apply chat template + apply_chat_kwargs = { + "add_generation_prompt": True, + "tokenize": False, + } + # Qwen3Omni-specific: disable thinking mode + if model_type == "qwen3omni": + apply_chat_kwargs["enable_thinking"] = False + + texts = processor.apply_chat_template(conversations, **apply_chat_kwargs) + + # Ensure tokenizer files exist in local model dir (vLLM loads processor from model path) + if args.tokenizer: + ensure_tokenizer_files(args.model, args.tokenizer) + + print(f"Loading model: {args.model}") + llm = LLM( + model=args.model, + tokenizer=tokenizer_id, + tensor_parallel_size=args.tp, + max_model_len=args.max_model_len, + trust_remote_code=args.trust_remote_code, + quantization=quantization, + enforce_eager=True, + ) + + # Get sampling params from config, with CLI/defaults as fallback + config_params = get_sampling_params_from_config(args.model) + sampling_kwargs = { + "temperature": config_params.get("temperature", args.temperature), + "top_p": 
config_params.get("top_p", args.top_p), + "max_tokens": config_params.get("max_tokens", args.max_tokens), + } + top_k = config_params.get("top_k", args.top_k) + if top_k > 0: + sampling_kwargs["top_k"] = top_k + print(f"Sampling params: {sampling_kwargs}") + sampling_params = SamplingParams(**sampling_kwargs) + + print("Running inference...") + outputs = llm.generate(texts, sampling_params) + + for output in outputs: + generated_text = output.outputs[0].text + print("-" * 80) + print(f"Generated: {generated_text}") + + +if __name__ == "__main__": + main() diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py index 9a2cd4b2f0..641204d4f7 100755 --- a/modelopt/torch/export/layer_utils.py +++ b/modelopt/torch/export/layer_utils.py @@ -972,6 +972,7 @@ def module_match_name_list(module, name_list): "Qwen3MoeSparseMoeBlock", "Qwen3NextSparseMoeBlock", "Qwen3_5MoeSparseMoeBlock", + "Qwen3OmniMoeThinkerTextSparseMoeBlock", "DeepseekMoE", ], ): diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py index 3bd72d9de9..17798d0837 100755 --- a/modelopt/torch/export/model_utils.py +++ b/modelopt/torch/export/model_utils.py @@ -31,6 +31,7 @@ "ChatGLM": "chatglm", "Qwen3Moe": "qwen3moe", "Qwen3Next": "qwen3next", + "Qwen3OmniMoeForConditionalGeneration": "qwen3omni", "QWen": "qwen", "RecurrentGemma": "recurrentgemma", "Gemma3": "gemma3", @@ -42,6 +43,7 @@ "Phi4MMForCausalLM": "phi4mm", "phi": "phi", "TLGv4ForCausalLM": "phi", + "NemotronH": "nemotron_h", "MixtralForCausalLM": "llama", "ArcticForCausalLM": "llama", "StarCoder": "gpt", @@ -51,12 +53,11 @@ "GLM": "glm", "InternLM2ForCausalLM": "internlm", "ExaoneForCausalLM": "exaone", - "NemotronH": "nemotron_h", "Nemotron": "gpt", "Deepseek": "deepseek", "Whisper": "whisper", - "gptoss": "gptoss", "MiniMax": "minimax", + "gptoss": "gptoss", } __doc__ = f"""Utility functions for model type detection and classification. 
@@ -66,17 +67,35 @@ {MODEL_NAME_TO_TYPE=} """ -__all__ = ["get_language_model_from_vl", "get_model_type", "is_multimodal_model"] +__all__ = [ + "get_language_model_from_vl", + "get_model_type", + "is_multimodal_model", + "match_model_type_by_name", +] -def get_model_type(model): - """Try get the model type from the model name. If not found, return None.""" +def match_model_type_by_name(name: str) -> str | None: + """Match a model type from MODEL_NAME_TO_TYPE by case-insensitive substring match. + + Args: + name: String to match against (e.g. class name, architecture string, model_type field). + + Returns: + Matched model type string, or None. + """ + name_lower = name.lower() for k, v in MODEL_NAME_TO_TYPE.items(): - if k.lower() in type(model).__name__.lower(): + if k.lower() in name_lower: return v return None +def get_model_type(model): + """Try get the model type from the model name. If not found, return None.""" + return match_model_type_by_name(type(model).__name__) + + def is_multimodal_model(model): """Check if a model is a Vision-Language Model (VLM) or multimodal model. @@ -149,6 +168,9 @@ def get_language_model_from_vl(model) -> list[nn.Module] | None: if hasattr(model, "language_model"): return [model, model.language_model] + if hasattr(model, "thinker"): + return [model, model.thinker] + # Pattern 3: For encoder-decoder VL models (e.g., Nemotron-Parse), the decoder is the language model. # Only match if the model is detected as multimodal to avoid matching non-VLM encoder-decoder # models like T5, Bart, Whisper which also have .decoder. 
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 14a12bcdf3..c60469a587 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -87,7 +87,7 @@ QUANTIZATION_W4A8_AWQ, QUANTIZATION_W4A8_NVFP4_FP8, ) -from .model_utils import get_language_model_from_vl, is_multimodal_model +from .model_utils import get_language_model_from_vl, get_model_type, is_multimodal_model from .plugins import SpeculativeDecodingExporter, has_spec_opt from .quant_utils import ( fuse_prequant_layernorm, @@ -357,9 +357,11 @@ def llm_dummy_forward(): [1, model.config.num_mel_bins, feature_extractor.nb_max_frames], dtype=model.dtype ).to(model.device) - if is_vl_model and "nemotron" in model_type: - # For Nemotron VL models, run optimization on just the language model/decoder. - # This avoids needing pixel_values for the vision encoder. + if getattr(model.config, "is_encoder_decoder", False): + # For encoder-decoder models, we need to pass both the encoder and decoder input ids + model(fake_input, decoder_input_ids=decoder_fake_input) + elif (is_vl_model and "nemotron" in model_type) or model_type.startswith("qwen3omni"): + # For Nemotron VL models, try to run optimization on just the language model part language_model_lineage = get_language_model_from_vl(model) if language_model_lineage is not None: @@ -371,7 +373,7 @@ def llm_dummy_forward(): language_model(fake_input, use_cache=False) else: raise ValueError( - f"Cannot extract language_model from Nemotron VL model (type: {model_type}). " + f"Cannot extract language_model from VL model (type: {model_type}). " "This is required for requantization/resmoothing optimization. " "Please ensure the model architecture is supported or file an issue." 
) @@ -779,6 +781,16 @@ def _export_transformers_checkpoint( exclude_modules.append(pattern) print(f"Adding MTP layer to quantization_config ignore: {pattern}") + # Add model-specific non-quantized module exclusions + _model_type_exclusions = { + "qwen3omni": ["thinker.audio_tower*", "thinker.visual*", "thinker.lm_head"], + } + model_type = get_model_type(model) + for pattern in _model_type_exclusions.get(model_type, []): + exclude_modules = quant_config["quantization"].setdefault("exclude_modules", []) + if pattern not in exclude_modules: + exclude_modules.append(pattern) + # Safety net: sync any gate/up weight quantizer amaxes that # requantize_resmooth_fused_llm_layers did not reach (e.g. experts not # activated during the dummy forward, or non-standard expert naming). @@ -1181,6 +1193,21 @@ def export_hf_checkpoint( if getattr(model, "hf_quantizer", None) is not None: model.hf_quantizer = None + # Fix generation_config conflicts before saving + # Some models have temperature/top_p/top_k set but do_sample=False which causes validation errors + # Restore the original value after save to avoid mutating the caller's model. + _gen_config_restore = None + if hasattr(model, "generation_config") and model.generation_config is not None: + gen_config = model.generation_config + if not getattr(gen_config, "do_sample", True): + # Enable sampling if sampling params are present + if any( + getattr(gen_config, attr, None) is not None + for attr in ["temperature", "top_p", "top_k"] + ): + _gen_config_restore = gen_config.do_sample + gen_config.do_sample = True + # Save model # Temporarily disable revert_weight_conversion if available — it doesn't handle # quantized state dicts (scalar scale tensors have 0 dimensions, causing IndexError). 
@@ -1197,6 +1224,8 @@ def export_hf_checkpoint( ) finally: _unpatch_revert_weight_conversion(_patches) + if _gen_config_restore is not None: + model.generation_config.do_sample = _gen_config_restore original_config = f"{export_dir}/config.json" config_data = {} diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py index 4aa1ff46b4..a2dae3fbe9 100644 --- a/modelopt/torch/quantization/model_quant.py +++ b/modelopt/torch/quantization/model_quant.py @@ -17,7 +17,6 @@ import fnmatch import inspect -import os import warnings from collections.abc import Callable, Iterable from typing import Any @@ -583,22 +582,28 @@ def enable_quantizer(model: nn.Module, wildcard_or_filter_func: str | Callable): @atomic_print -def print_quant_summary(model: nn.Module, output_dir: str | None = None): - """Print summary of all quantizer modules in the model.""" - lines = [ - f"{name:80} {mod}" - for name, mod in model.named_modules() - if isinstance(mod, TensorQuantizer) - ] - lines.append(f"{len(lines)} TensorQuantizers found in model") - - if output_dir: - path = os.path.join(output_dir, ".quant_summary.txt") - with open(path, "w", encoding="utf-8") as f: - f.write("\n".join(lines) + "\n") - print(f"\033[1mQuant summary saved to {path}\033[0m") +def print_quant_summary(model: nn.Module, save_path: str | None = None): + """Print summary of all quantizer modules in the model. + + Args: + model: The model to summarize. + save_path: Optional path to save the summary to a file. If None, prints to stdout. 
+ """ + lines = [] + count = 0 + for name, mod in model.named_modules(): + if isinstance(mod, TensorQuantizer): + lines.append(f"{name:80} {mod}") + count += 1 + lines.append(f"{count} TensorQuantizers found in model") + + summary = "\n".join(lines) + if save_path: + with open(save_path, "w") as f: + f.write(summary) + print(f"Quantization summary saved to {save_path}") else: - print("\n".join(lines)) + print(summary) def fold_weight(model: nn.Module, keep_attrs: bool = False): diff --git a/modelopt/torch/utils/__init__.py b/modelopt/torch/utils/__init__.py index f026e747a8..354212d56e 100644 --- a/modelopt/torch/utils/__init__.py +++ b/modelopt/torch/utils/__init__.py @@ -27,4 +27,5 @@ from .regex import * from .robust_json import * from .tensor import * +from .video_dataset_utils import * from .vlm_dataset_utils import * diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py index 00cdff8877..f5a64054fe 100644 --- a/modelopt/torch/utils/dataset_utils.py +++ b/modelopt/torch/utils/dataset_utils.py @@ -112,6 +112,7 @@ "get_dataset_samples", "get_jsonl_text_samples", "get_max_batch_size", + "get_qwen3omni_text_dataloader", "get_supported_datasets", ] @@ -211,6 +212,88 @@ def _auto_preprocess_sample( ) +def _load_text_samples(dataset_name, num_samples, **kwargs): + """Normalize inputs and load raw text samples from one or more datasets. + + Args: + dataset_name: Single name or list of names. + num_samples: Single count or list of counts (must match dataset_name length). + **kwargs: Forwarded to get_dataset_samples(). + + Returns: + List of raw text strings. 
+ """ + if isinstance(num_samples, int): + num_samples = [num_samples] + if isinstance(dataset_name, str): + dataset_name = [dataset_name] + assert len(dataset_name) == len(num_samples), ( + "dataset_name and num_samples must be the same length" + ) + all_samples = [] + for ds_name, num_sample in zip(dataset_name, num_samples): + samples = get_dataset_samples(ds_name, num_sample, **kwargs) + all_samples.extend(samples) + return all_samples + + +class _ListDataset(torch.utils.data.Dataset): + """Simple dataset wrapping a list of dicts.""" + + def __init__(self, samples): + self.samples = samples + + def __getitem__(self, idx): + return self.samples[idx] + + def __len__(self): + return len(self.samples) + + +def get_qwen3omni_text_dataloader( + dataset_name: str | list[str] = "cnn_dailymail", + processor=None, + batch_size: int = 1, + num_samples: int | list[int] = 512, +) -> DataLoader: + """Get a text-only dataloader for Qwen3-Omni with proper conversation template applied. + + This function applies the Qwen3-Omni chat template to text samples before tokenization, + which is required for proper calibration of Qwen3-Omni models with text-only datasets. + + See: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Thinking + + Args: + dataset_name: Name of the dataset(s) to load. + processor: Qwen3OmniTextProcessor instance wrapping the Qwen3OmniMoeProcessor. + batch_size: Batch size of the returned dataloader. + num_samples: Number of samples from the dataset. + + Returns: + A DataLoader with properly formatted inputs for Qwen3-Omni. + """ + assert processor is not None, "Please provide a Qwen3OmniTextProcessor." 
+ + all_samples = _load_text_samples(dataset_name, num_samples) + + # Preprocess each sample with the conversation template and convert to lists + from .image_processor import _Qwen3OmniProcessorMixin + + processed_samples = [] + for text in all_samples: + values = processor.preprocess_function(text) + processed_samples.append( + _Qwen3OmniProcessorMixin._serialize_for_arrow(values, list(values.keys())) + ) + + return DataLoader( + _ListDataset(processed_samples), + batch_size=batch_size, + shuffle=False, + collate_fn=processor.collate_function, + ) + + def get_dataset_samples( dataset_name: str, num_samples: int, @@ -367,23 +450,13 @@ def get_dataset_dataloader( "Tokenizer with the right padding_side may impact calibration accuracy. Recommend set to left" ) - if isinstance(num_samples, int): - num_samples = [num_samples] - - if isinstance(dataset_name, str): - dataset_name = [dataset_name] - - assert len(dataset_name) == len(num_samples), ( - "dataset_name and num_samples must be the same length" + all_samples = _load_text_samples( + dataset_name, + num_samples, + apply_chat_template=apply_chat_template, + tokenizer=tokenizer, ) - all_samples = [] - for ds_name, num_sample in zip(dataset_name, num_samples): - samples = get_dataset_samples( - ds_name, num_sample, apply_chat_template=apply_chat_template, tokenizer=tokenizer - ) - all_samples.extend(samples) - batch_encoded = tokenizer( all_samples, return_tensors="pt", @@ -452,8 +525,8 @@ def _get_free_gpu_mem(): torch.cuda.empty_cache() free_mem_before, max_allocated_before = _get_free_gpu_mem() - is_enc_dec = model_type_is_enc_dec(model) - infer_method = model.generate if is_enc_dec else model.forward + use_generate = _should_use_generate(model) + infer_method = model.generate if use_generate else model.forward if sample_input_single_batch is None: sample_input_single_batch = ( @@ -508,22 +581,29 @@ def _get_free_gpu_mem(): return 512 -def _process_batch(batch_data, infer_method, max_working_batch_size=None): +def 
_process_batch(batch_data, infer_method, generation_kwargs=None, max_working_batch_size=None): """Process a batch of data through the model's inference method. Args: batch_data: Dictionary containing the batch data infer_method: Model's inference method (either forward or generate) + generation_kwargs: Keyword arguments to pass to the model.generate() method. max_working_batch_size: Maximum batch size known to work without OOM Returns: The maximum batch size that worked successfully """ - assert all(torch.is_tensor(data) or data is None for data in batch_data.values()), ( - "batch_data values must be tensors" + if generation_kwargs is None: + generation_kwargs = {} + # Separate tensor values from scalar parameters (like max_new_tokens) + tensor_data = {k: v for k, v in batch_data.items() if torch.is_tensor(v) or v is None} + scalar_data = {k: v for k, v in batch_data.items() if not torch.is_tensor(v) and v is not None} + + assert all(torch.is_tensor(data) or data is None for data in tensor_data.values()), ( + "tensor_data values must be tensors" ) - # Get the batch size of current data - batch_size = batch_data[next(iter(batch_data.keys()))].shape[0] + # Get the batch size from the first non-None tensor value + batch_size = next(v for v in tensor_data.values() if v is not None).shape[0] # If we know a smaller batch size works, preemptively split if max_working_batch_size is not None and batch_size > max_working_batch_size: @@ -531,11 +611,13 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None): for i in range(0, batch_size, max_working_batch_size): end_idx = min(i + max_working_batch_size, batch_size) split_data = {} - for key in batch_data: - if batch_data[key] is None: + for key in tensor_data: + if tensor_data[key] is None: split_data[key] = None else: - split_data[key] = batch_data[key][i:end_idx, ...] + split_data[key] = tensor_data[key][i:end_idx, ...] 
+ # Add back scalar data (non-tensor params like max_new_tokens) + split_data.update(scalar_data) max_working_batch_size = _process_batch( split_data, infer_method, max_working_batch_size @@ -545,7 +627,7 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None): # Try processing with current batch size try: - infer_method(**batch_data) + infer_method(**batch_data, **generation_kwargs) return ( batch_size if max_working_batch_size is None @@ -562,8 +644,11 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None): # Split the batch in half mid = (batch_size + 1) // 2 warn(f"CUDA out of memory with batch size {batch_size}, trying with batch size {mid}") - split_data_1 = {key: batch_data[key][:mid, ...] for key in batch_data} - split_data_2 = {key: batch_data[key][mid:, ...] for key in batch_data} + split_data_1 = {key: tensor_data[key][:mid, ...] for key in tensor_data} + split_data_2 = {key: tensor_data[key][mid:, ...] for key in tensor_data} + # Add back scalar data (non-tensor params like max_new_tokens) + split_data_1.update(scalar_data) + split_data_2.update(scalar_data) # Recursively process each half and track max working batch size max_working_batch_size = _process_batch(split_data_1, infer_method) @@ -573,21 +658,28 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None): return max_working_batch_size -def _forward_loop(model: torch.nn.Module, dataloader: DataLoader) -> None: +def _forward_loop( + model: torch.nn.Module, dataloader: DataLoader, generation_kwargs: dict | None = None +) -> None: """Runs forward passes through the model using data from the dataloader. Args: model: The PyTorch model to run inference on dataloader: DataLoader containing the batched input data + generation_kwargs: Keyword arguments to pass to the model.generate() method. 
""" + if generation_kwargs is None: + generation_kwargs = {} with torch.no_grad(): - is_enc_dec = model_type_is_enc_dec(model) - infer_method = model.generate if is_enc_dec else model.forward + use_generate = _should_use_generate(model) + infer_method = model.generate if use_generate else model.forward max_working_batch_size = None # Initialize max working batch size as None for _, data in enumerate(tqdm(dataloader)): # Process batch and update max working batch size - max_working_batch_size = _process_batch(data, infer_method, max_working_batch_size) + max_working_batch_size = _process_batch( + data, infer_method, generation_kwargs, max_working_batch_size + ) def create_forward_loop( @@ -600,6 +692,7 @@ def create_forward_loop( device: str | None = None, include_labels: bool = False, dataloader: DataLoader | None = None, + generation_kwargs: dict | None = None, ) -> Callable: """Creates and returns a forward loop function configured for a specific model, dataset, and tokenizer. @@ -618,7 +711,7 @@ def create_forward_loop( device: Target device for the returned dataloader. include_labels: Whether to include labels in the dataloader. dataloader: If provided, use the provided dataloader instead. - + generation_kwargs: Keyword arguments to pass to the model.generate() method. Example usage for quantization: .. code-block:: python @@ -641,6 +734,8 @@ def create_forward_loop( A forward loop function that can be called with no arguments. When called, this function iterates over the dataset specified by `dataset_name`. """ + if generation_kwargs is None: + generation_kwargs = {} if dataloader is None: if batch_size == 0: # We let the system to determine the max data batch for each forward. 
@@ -657,7 +752,7 @@ def create_forward_loop( include_labels=include_labels, ) - return lambda model: _forward_loop(model, dataloader) + return lambda model: _forward_loop(model, dataloader, generation_kwargs) def model_type_is_enc_dec(model): @@ -753,3 +848,18 @@ def download_hf_dataset_as_jsonl( jsonl_paths.append(jsonl_file_path) return jsonl_paths + + +def _should_use_generate(model): + """Check if model should use generate() instead of forward() for calibration. + + Returns True for: + - Encoder-decoder models (t5, bart, whisper) + - Conditional generation models that don't support standard forward() (qwen3omni) + """ + generate_model_list = ["qwen3omni"] + model_name = model.__class__.__name__.lower() + needs_generate = model_type_is_enc_dec(model) or any( + name in model_name for name in generate_model_list + ) + return needs_generate and hasattr(model, "generate") diff --git a/modelopt/torch/utils/image_processor.py b/modelopt/torch/utils/image_processor.py index 6374642e3d..2f226e41c5 100644 --- a/modelopt/torch/utils/image_processor.py +++ b/modelopt/torch/utils/image_processor.py @@ -16,6 +16,8 @@ # Adapted from tensorrt_llm/quantization/image_processing.py """Utility classes for image processing.""" +from typing import Any + import torch @@ -39,6 +41,33 @@ def collate_function(self, examples): """Collate function to process images during data loading.""" raise NotImplementedError("Each image processor must implement its own collate method") + def _collate_first_item(self, batch, long_keys=(), float_keys=(), dtype=None): + """Shared collate helper: validates batch_size=1, converts lists to tensors. + + Args: + batch: List of sample dicts from the DataLoader. + long_keys: Keys to convert via torch.LongTensor. + float_keys: Keys to convert via torch.tensor with optional dtype cast. + dtype: Optional dtype for float_keys tensors. + + Returns: + Dict of tensors moved to self.device. 
+ """ + if len(batch) != 1: + raise ValueError(f"{type(self).__name__} currently supports batch_size=1 only.") + first = batch[0] + result = {} + for key in long_keys: + if first.get(key) is not None: + result[key] = torch.LongTensor(first[key]).to(self.device) + for key in float_keys: + if first.get(key) is not None: + t = torch.tensor(first[key]) + if dtype is not None: + t = t.to(dtype) + result[key] = t.to(self.device) + return result + # A light Encapsulation for Huggingface MllamaImageProcessor @@ -110,3 +139,173 @@ def collate_function(self, batch): ).to(self.device) return batch[0] + + +class Qwen3OmniTextProcessor(BaseImageProcessor): + """Text-only processor for Qwen3-Omni that applies proper conversation template. + + This processor wraps raw text in the Qwen3-Omni conversation format and applies + the chat template before tokenization. Use this for text-only calibration datasets. + + See: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Thinking + """ + + def __init__(self, processor, device="auto", dtype=None): + """Constructor. + + Args: + processor: The Qwen3OmniMoeProcessor (from AutoProcessor.from_pretrained). + device: Device to move tensors to. + dtype: dtype for float tensors (e.g., torch.bfloat16). If None, uses default. + """ + super().__init__(processor, device) + self.dtype = dtype + + def preprocess_function(self, text: str) -> dict: + """Preprocess a single text sample by applying conversation template. + + Args: + text: Raw text string from dataset. + + Returns: + Dictionary with tokenized inputs. 
+ """ + # Build conversation in Qwen format (text-only) + conversation = [{"role": "user", "content": [{"type": "text", "text": text}]}] + formatted_text = self.tokenizer.apply_chat_template( + conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False + ) + + # Tokenize with the processor (no multimodal inputs) + values = self.tokenizer( + text=formatted_text, + audio=None, + images=None, + videos=None, + return_tensors="pt", + padding=True, + ) + + return values + + def collate_function(self, batch): + """Collate function to process text inputs during data loading.""" + return self._collate_first_item( + batch, + long_keys=("input_ids", "attention_mask"), + ) + + +class _Qwen3OmniProcessorMixin: + """Shared preprocessing logic for Qwen3-Omni image/video processors.""" + + tokenizer: Any + process_mm_info: Any + use_audio_in_video: Any + + def _tokenize_conversation(self, conversation): + """Tokenize a Qwen3-Omni conversation and return processor outputs. + + Args: + conversation: List of conversation dicts in Qwen format. + + Returns: + Processor output dict with tensors. + """ + text = self.tokenizer.apply_chat_template( + conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False + ) + audios, images, videos = self.process_mm_info( + conversation, use_audio_in_video=self.use_audio_in_video + ) + return self.tokenizer( + text=text, + audio=audios, + images=images, + videos=videos, + return_tensors="pt", + padding=True, + use_audio_in_video=self.use_audio_in_video, + ) + + @staticmethod + def _serialize_for_arrow(values, all_keys): + """Convert processor outputs to lists for Arrow serialization. + + Args: + values: Processor output dict (may contain tensors). + all_keys: List of keys to include in the result (ensures consistent schema). + + Returns: + Dict with all_keys initialized to None, populated from values. 
+ """ + result = dict.fromkeys(all_keys) + for key, val in values.items(): + if val is not None and hasattr(val, "tolist"): + result[key] = val.tolist() + elif val is not None: + result[key] = val + return result + + +class Qwen3OmniImageProcessor(_Qwen3OmniProcessorMixin, BaseImageProcessor): + """Image processor for Qwen3-Omni multimodal model.""" + + _ALL_KEYS = [ + "input_ids", + "attention_mask", + "pixel_values", + "image_grid_thw", + "audio_features", + "audio_feature_lens", + "video_grid_thw", + ] + + def __init__(self, tokenizer, device="auto", dtype=None, use_audio_in_video=False): + """Constructor.""" + super().__init__(tokenizer, device) + self.dtype = dtype + self.use_audio_in_video = use_audio_in_video + # Try to import qwen_omni_utils for multimodal processing + try: + from qwen_omni_utils import process_mm_info + + self.process_mm_info = process_mm_info + except ImportError: + raise ImportError( + "qwen_omni_utils is required for Qwen3OmniImageProcessor. " + "Please install it from https://github.com/QwenLM/Qwen3-Omni" + ) + + def preprocess_function(self, examples): + """Preprocess function for Qwen3-Omni.""" + question = examples.get("question", "Describe this image.") + + # Build conversation in Qwen format + content = [] + if examples.get("image") is not None: + content.append({"type": "image", "image": examples["image"]}) + if examples.get("audio") is not None: + content.append({"type": "audio", "audio": examples["audio"]}) + if examples.get("video") is not None: + content.append({"type": "video", "video": examples["video"]}) + content.append({"type": "text", "text": question}) + + conversation = [{"role": "user", "content": content}] + values = self._tokenize_conversation(conversation) + return self._serialize_for_arrow(values, self._ALL_KEYS) + + def collate_function(self, batch): + """Collate function to process inputs during data loading.""" + return self._collate_first_item( + batch, + long_keys=( + "input_ids", + "attention_mask", + 
"image_grid_thw", + "audio_feature_lens", + "video_grid_thw", + ), + float_keys=("pixel_values", "audio_features"), + dtype=self.dtype, + ) diff --git a/modelopt/torch/utils/video_dataset_utils.py b/modelopt/torch/utils/video_dataset_utils.py new file mode 100644 index 0000000000..d8b02b7ee1 --- /dev/null +++ b/modelopt/torch/utils/video_dataset_utils.py @@ -0,0 +1,277 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility functions for getting samples and forward loop function for video datasets.""" + +import os +import tempfile +from typing import Any + +import torch +from torch.utils.data import DataLoader + +from .image_processor import BaseImageProcessor, _Qwen3OmniProcessorMixin + +# Use dict to store the config for each dataset. +SUPPORTED_VIDEO_DATASET_CONFIG: dict[str, dict[str, Any]] = { + "finevideo": { + "config": {"path": "HuggingFaceFV/finevideo", "split": "train", "streaming": True} + }, +} + +__all__ = [ + "Qwen3OmniVideoProcessor", + "get_supported_video_datasets", + "get_video_dataset_dataloader", +] + + +def _get_video_dataset(dataset_name: str, num_samples: int): + """Load a portion of train dataset with the dataset name and a given size. + + Args: + dataset_name: Name of the dataset to load. + num_samples: Number of samples to load from the dataset. + + Returns: + A hugging face Dataset. 
+ """ + if dataset_name in SUPPORTED_VIDEO_DATASET_CONFIG: + from datasets import Dataset, load_dataset + + config = SUPPORTED_VIDEO_DATASET_CONFIG[dataset_name]["config"] + is_streaming = config.get("streaming", False) + + dataset = load_dataset(**config) + + if is_streaming: + # For streaming datasets, use take() and convert to list then Dataset + samples = list(dataset.take(num_samples)) + return Dataset.from_list(samples) + else: + return dataset.select(range(num_samples)) + else: + raise NotImplementedError( + f"dataset {dataset_name} is not supported. Please use one of the following:" + f" {get_supported_video_datasets()}." + ) + + +def get_supported_video_datasets() -> list[str]: + """Retrieves a list of video datasets supported. + + Returns: + A list of strings, where each string is the name of a supported dataset. + + Example usage: + + .. code-block:: python + + from modelopt.torch.utils import get_supported_video_datasets + + print("Supported video datasets:", get_supported_video_datasets()) + """ + return list(SUPPORTED_VIDEO_DATASET_CONFIG.keys()) + + +def get_video_dataset_dataloader( + dataset_name: str = "finevideo", + processor: "Qwen3OmniVideoProcessor" = None, + batch_size: int = 1, + num_samples: int = 512, + cache_dir: str | None = None, +) -> DataLoader: + """Get a dataloader with the dataset name and processor of the target model. + + Args: + dataset_name: Name of the dataset to load. + processor: Processor used for encoding video and text data. + batch_size: Batch size of the returned dataloader. + num_samples: Number of samples from the dataset. + cache_dir: Directory to cache the processed dataset. Defaults to a temp directory. + If the cache exists, it will be loaded instead of reprocessing. + + Returns: + An instance of dataloader. + """ + assert processor is not None, "Please provide a valid processor." 
+ + # Default cache_dir to temp directory + if cache_dir is None: + cache_dir = os.path.join(tempfile.gettempdir(), "modelopt_video_dataset_cache") + + processed_dataset = None + + # Try to load from cache (use torch.save/load to avoid Arrow 32-bit offset overflow) + if cache_dir is not None: + cache_path = os.path.join(cache_dir, f"{dataset_name}_n{num_samples}_processed.pt") + if os.path.exists(cache_path): + try: + from datasets import Dataset + + # weights_only=False assumes the cache file was written by the torch.save call below (trusted cache_dir) + processed_samples = torch.load(cache_path, weights_only=False) + processed_dataset = Dataset.from_list(processed_samples) + print(f"Loaded processed dataset from cache: {cache_path}") + except Exception as e: + print(f"Failed to load cache from {cache_path}: {e}. Reprocessing...") + processed_dataset = None + + # Process dataset if not loaded from cache + if processed_dataset is None: + from datasets import Dataset + + dataset = _get_video_dataset(dataset_name, num_samples=num_samples) + + # Process samples manually to avoid Arrow 32-bit offset overflow + # (dataset.map() uses Arrow internally which can't handle large nested lists) + processed_samples = [] + for i, sample in enumerate(dataset): + processed = processor.preprocess_function(sample) + processed_samples.append(processed) + if (i + 1) % 10 == 0: + print(f"Processed {i + 1}/{len(dataset)} samples...") + + processed_dataset = Dataset.from_list(processed_samples) + + # Save to cache using torch.save to avoid Arrow 32-bit offset overflow + if cache_dir is not None: + os.makedirs(cache_dir, exist_ok=True) + torch.save(processed_samples, cache_path) + print(f"Saved processed dataset to cache: {cache_path}") + + # Create DataLoader with the custom collate function + return DataLoader( + processed_dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=processor.collate_function, + ) + + +class Qwen3OmniVideoProcessor(_Qwen3OmniProcessorMixin, BaseImageProcessor): + """Video
processor for Qwen3-Omni multimodal model with finevideo dataset support.""" + + def __init__(self, tokenizer, device="cuda", dtype=None, use_audio_in_video=True): + """Constructor. + + Args: + tokenizer: The Qwen3OmniMoeProcessor for tokenizing and processing inputs. + device: Device to move tensors to. + dtype: dtype for float tensors (e.g., torch.bfloat16). If None, uses default. + use_audio_in_video: Whether to extract and use audio from video files. + """ + super().__init__(tokenizer, device) + self.dtype = dtype + self.use_audio_in_video = use_audio_in_video + self._temp_dir = tempfile.mkdtemp(prefix="qwen3omni_video_") + self._video_counter = 0 + # Try to import qwen_omni_utils for multimodal processing + try: + from qwen_omni_utils import process_mm_info + + self.process_mm_info = process_mm_info + except ImportError: + raise ImportError( + "qwen_omni_utils is required for Qwen3OmniVideoProcessor. " + "Please install it from https://github.com/QwenLM/Qwen3-Omni" + ) + + def _save_video_bytes_to_file(self, video_bytes: bytes) -> str: + """Save video bytes to a temporary file and return the path. + + Args: + video_bytes: Raw video bytes (e.g., from finevideo's 'mp4' field). + + Returns: + Path to the temporary video file. + """ + video_path = os.path.join(self._temp_dir, f"video_{self._video_counter}.mp4") + self._video_counter += 1 + with open(video_path, "wb") as f: + f.write(video_bytes) + return video_path + + _ALL_KEYS = [ + "input_ids", + "attention_mask", + "pixel_values_videos", + "video_grid_thw", + "video_second_per_grid", + "feature_attention_mask", + "input_features", + ] + + def preprocess_function(self, examples): + """Preprocess function for Qwen3-Omni with video support. + + Handles both standard video paths and raw video bytes (finevideo format). 
+ """ + # Get question/prompt - finevideo has metadata in 'json' field + if "json" in examples and examples["json"] is not None: + metadata = examples["json"] + category = metadata.get("content_fine_category", "") + question = ( + f"Describe what is happening in this video in detail. Category hint: {category}" + ) + else: + question = examples.get("question", "Describe this video in detail.") + + # Build conversation in Qwen format + content = [] + + # Handle video - check for raw bytes (finevideo format) or path + video_path = None + if examples.get("mp4") is not None: + video_path = self._save_video_bytes_to_file(examples["mp4"]) + elif examples.get("video") is not None: + video_path = examples["video"] + + if video_path is not None: + content.append({"type": "video", "video": video_path}) + + content.append({"type": "text", "text": question}) + + conversation = [{"role": "user", "content": content}] + values = self._tokenize_conversation(conversation) + return self._serialize_for_arrow(values, self._ALL_KEYS) + + def collate_function(self, batch): + """Collate function to process inputs during data loading.""" + result = self._collate_first_item( + batch, + long_keys=( + "input_ids", + "attention_mask", + "video_grid_thw", + "feature_attention_mask", + ), + float_keys=("pixel_values_videos", "video_second_per_grid", "input_features"), + dtype=self.dtype, + ) + # Pass use_audio_in_video flag to model.generate() for Qwen3Omni + result["use_audio_in_video"] = self.use_audio_in_video + return result + + def cleanup(self): + """Clean up temporary video files.""" + import shutil + + if os.path.exists(self._temp_dir): + shutil.rmtree(self._temp_dir) + + def __del__(self): + """Ensure temporary files are cleaned up when the processor is garbage collected.""" + self.cleanup()