diff --git a/examples/llm_eval/run_lm_eval_vllm.sh b/examples/llm_eval/run_lm_eval_vllm.sh
old mode 100644
new mode 100755
index ef94a66d14..18c52995c9
--- a/examples/llm_eval/run_lm_eval_vllm.sh
+++ b/examples/llm_eval/run_lm_eval_vllm.sh
@@ -19,12 +19,13 @@
 # Script to run lm-evaluation-harness against a running vLLM OpenAI-compatible server.
 #
 # Usage:
-#   bash run_lm_eval_vllm.sh <model_name> [port] [task]
+#   bash run_lm_eval_vllm.sh <model_name> [port] [task] [host]
 #
 # Arguments:
 #   <model_name>: The name of the model being served (e.g., Qwen/Qwen3-30B-A3B). Used for the 'model' argument in lm_eval.
 #   [port]:       The port the vLLM server is listening on (default: 8000).
 #   [task]:       The lm_eval task(s) to run (default: mmlu).
+#   [host]:       The IP address or hostname of the vLLM server (default: localhost).
 #
 # Example:
 #   # Start vLLM server first (in another terminal):
@@ -35,6 +36,9 @@
 #
 #   # Run for a different task, e.g., hellaswag:
 #   bash run_lm_eval_vllm.sh Qwen/Qwen3-30B-A3B 8000 hellaswag
+#
+#   # Run against a remote server:
+#   bash run_lm_eval_vllm.sh Qwen/Qwen3-30B-A3B 8000 mmlu 10.78.17.40
 # ---
 
 set -e
@@ -42,16 +46,17 @@ set -x
 
 # --- Argument Parsing ---
 if [ -z "$1" ]; then
-  echo "Usage: $0 <model_name> [port] [task]"
+  echo "Usage: $0 <model_name> [port] [task] [host]"
   exit 1
 fi
 MODEL_NAME=$1
 PORT=${2:-8000}       # Default port is 8000 if not provided
 TASK=${3:-mmlu}       # Default task is mmlu if not provided
+HOST=${4:-localhost}  # Default host is localhost if not provided
 
 # --- Environment Setup ---
 export OPENAI_API_KEY="local" # Not strictly required for local, but good practice
-BASE_URL="http://localhost:${PORT}/v1"
+BASE_URL="http://${HOST}:${PORT}/v1"
 COMPLETIONS_URL="${BASE_URL}/completions"
 
 # --- Evaluation ---
diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
index 58eb676111..a39acf4c73 100755
--- a/examples/llm_ptq/example_utils.py
+++ b/examples/llm_ptq/example_utils.py
@@ -45,12 +45,134 @@
 except ImportError:
     snapshot_download = None
 
-from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor
+from modelopt.torch.export.model_utils import match_model_type_by_name
+from modelopt.torch.utils.dataset_utils import get_dataset_dataloader
+from modelopt.torch.utils.image_processor import (
+    BaseImageProcessor,
+    MllamaImageProcessor,
+    Qwen3OmniImageProcessor,
+)
+from modelopt.torch.utils.video_dataset_utils import (
+    Qwen3OmniVideoProcessor,
+    get_supported_video_datasets,
+    get_video_dataset_dataloader,
+)
+from modelopt.torch.utils.vlm_dataset_utils import (
+    get_supported_vlm_datasets,
+    get_vlm_dataset_dataloader,
+)
 
 logger = logging.getLogger(__name__)
 
 SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"]
 
+# Files needed for tokenizer/processor that vLLM loads from model path
+TOKENIZER_FILES = [
+    "vocab.json",
+    "merges.txt",
+    "tokenizer.json",
+    "tokenizer_config.json",
+    "special_tokens_map.json",
+    "preprocessor_config.json",
+    "chat_template.json",
+]
+
+
+def get_model_type_from_config(model_path: str) -> str | None:
+    """Get model type from the config.json file.
+
+    Args:
+        model_path: Path to a local model directory (non-local HF model IDs yield None).
+
+    Returns:
+        Model type string (e.g., 'qwen3omni', 'llama', 'gpt') or None if not found.
+    """
+    config_path = os.path.join(model_path, "config.json")
+    if not os.path.exists(config_path):
+        return None
+
+    with open(config_path) as f:
+        config = json.load(f)
+
+    # Check architectures field first
+    for arch in config.get("architectures", []):
+        result = match_model_type_by_name(arch)
+        if result is not None:
+            return result
+
+    # Fallback to model_type field
+    return match_model_type_by_name(config.get("model_type", ""))
+
+
+def get_sampling_params_from_config(model_path: str) -> dict:
+    """Extract sampling params from generation_config.json if present."""
+    gen_config_path = Path(model_path) / "generation_config.json"
+    if not gen_config_path.exists():
+        return {}
+
+    gen_config = json.loads(gen_config_path.read_text())
+
+    params = {k: gen_config[k] for k in ("temperature", "top_p", "top_k") if k in gen_config}
+
+    for key in ("max_new_tokens", "max_length"):
+        if key in gen_config:
+            params["max_tokens"] = gen_config[key]
+            break
+
+    return params
+
+
+def get_quantization_format(model_path: str) -> str | None:
+    """Get quantization format from the model config.
+
+    Args:
+        model_path: Path to the model directory.
+
+    Returns:
+        vLLM quantization string ('modelopt_fp4' for NVFP4 checkpoints) or None otherwise.
+    """
+    hf_quant_config_path = os.path.join(model_path, "hf_quant_config.json")
+    if os.path.exists(hf_quant_config_path):
+        with open(hf_quant_config_path) as f:
+            quant_config = json.load(f)
+        quant_algo = quant_config.get("quantization", {}).get("quant_algo", "")
+        if "NVFP4" in quant_algo:
+            return "modelopt_fp4"
+
+    return None
+
+
+def ensure_tokenizer_files(model_path: str, source_model_id: str) -> None:
+    """Copy tokenizer files from HF model to local quantized model dir if missing."""
+    if not os.path.isdir(model_path):
+        return  # Not a local path, nothing to do
+
+    # Check if tokenizer files are missing
+    missing_files = [f for f in TOKENIZER_FILES if not os.path.exists(os.path.join(model_path, f))]
+    if not missing_files:
+        return
+
+    if snapshot_download is None:
+        print("Warning: huggingface_hub not installed, cannot download tokenizer files")
+        return
+
+    print(f"Copying missing tokenizer files from {source_model_id}...")
+    # Reuse a local source dir as-is; otherwise download only the tokenizer files from HF
+    if os.path.isdir(source_model_id):
+        cache_dir = source_model_id
+    else:
+        cache_dir = snapshot_download(
+            source_model_id,
+            allow_patterns=TOKENIZER_FILES,
+        )
+
+    for fname in TOKENIZER_FILES:
+        src = os.path.join(cache_dir, fname)
+        dst = os.path.join(model_path, fname)
+        if os.path.exists(src) and not os.path.exists(dst):
+            shutil.copy2(src, dst)
+            print(f"  Copied {fname}")
+
 
 def run_nemotron_vl_preview(
     full_model, tokenizer, input_ids, pyt_ckpt_path, stage_name, allow_fallback=False
@@ -241,9 +363,45 @@ def build_quant_cfg(
         quant_cfg["quant_cfg"]["*image*"] = {"enable": False}
         quant_cfg["quant_cfg"]["*vision*"] = {"enable": False}
 
+    if model_type in ["qwen3moe", "qwen3next"] and qformat == "nvfp4":
+        # Disable the attention projection layers to retain accuracy
+        quant_cfg["quant_cfg"]["model*.*attn*in_proj*"] = {"enable": False}
+        quant_cfg["quant_cfg"]["model*.*attn*q_proj*"] = {"enable": False}
+        quant_cfg["quant_cfg"]["model*.*attn*k_proj*"] = {"enable": False}
+        quant_cfg["quant_cfg"]["model*.*attn*v_proj*"] = {"enable": False}
+
+    if model_type == "deepseek":
+        # Disable MLA quantization for accuracy.
+        quant_cfg["quant_cfg"]["*self_attn.q*"] = {"enable": False}
+        quant_cfg["quant_cfg"]["*self_attn.kv*"] = {"enable": False}
+
+    if model_type == "qwen3omni":
+        print(
+            "Disabling quantization for conv layers, audio tower and visual encoder in Qwen3Omni model"
+        )
+        quant_cfg["quant_cfg"]["*conv*"] = {"enable": False}
+        quant_cfg["quant_cfg"]["*audio_tower*"] = {"enable": False}
+        quant_cfg["quant_cfg"]["*visual*"] = {"enable": False}
+
     return quant_cfg
 
 
+def get_generation_kwargs(model_type: str) -> dict[str, Any]:
+    """Get model-specific generation kwargs for calibration.
+
+    Args:
+        model_type: The model type string.
+
+    Returns:
+        Dictionary of generation kwargs for the model.
+    """
+    generation_kwargs = {}
+    if model_type == "qwen3omni":
+        generation_kwargs["return_audio"] = False
+        generation_kwargs["thinker_max_new_tokens"] = 1
+    return generation_kwargs
+
+
 def is_speculative(hf_config):
     """Check if the model architecture is a speculative model."""
     return hf_config.architectures and any(
@@ -284,7 +442,7 @@ def get_processor(
     if attn_implementation is not None:
         model_kwargs["attn_implementation"] = attn_implementation
 
-    if model_type == "whisper":
+    if model_type in ("whisper", "mllama", "qwen3omni"):
         processor = AutoProcessor.from_pretrained(
             ckpt_path,
             padding_side="left",
@@ -296,20 +454,11 @@ def get_processor(
             f"Pad token for {ckpt_path} cannot be set!"
         )
 
+        if model_type == "mllama":
+            return MllamaImageProcessor(processor, device)
+        elif model_type == "qwen3omni":
+            return Qwen3OmniImageProcessor(processor, device)
         return processor
-    elif model_type == "mllama":
-        processor = AutoProcessor.from_pretrained(
-            ckpt_path,
-            padding_side="left",
-            **model_kwargs,
-        )
-        if processor.tokenizer.pad_token is None:
-            processor.tokenizer.pad_token = processor.tokenizer.eos_token
-        assert processor.tokenizer.pad_token is not None, (
-            f"Pad token for {ckpt_path} cannot be set!"
-        )
-
-        return MllamaImageProcessor(processor, device)
     else:
         # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse)
         try:
@@ -838,3 +987,86 @@ def copy_custom_model_files(source_path: str, export_path: str, trust_remote_cod
         print(f"Successfully copied {len(copied_files)} custom model files to {export_path}")
     else:
         print("No custom model files found to copy")
+
+
+def get_qwen3omni_dataloader(
+    dataset_name: str | list[str] | None,
+    processor: Qwen3OmniImageProcessor | None,
+    tokenizer,
+    batch_size: int,
+    num_samples: int | list[int],
+    device: torch.device,
+    model_dtype: torch.dtype,
+    include_labels: bool = False,
+):
+    """Create a calibration dataloader for Qwen3Omni models.
+
+    Handles video, VLM, and text-only dataset configurations.
+
+    Args:
+        dataset_name: Name of the dataset(s) to use for calibration.
+        processor: The Qwen3OmniImageProcessor for multimodal inputs.
+        tokenizer: The tokenizer for text-only fallback.
+        batch_size: Batch size for the dataloader.
+        num_samples: Number of samples to use (int or list for multi-dataset).
+        device: Target device for tensors.
+        model_dtype: Model dtype for proper tensor conversion.
+        include_labels: Whether to include labels (for gradient-based auto_quantize).
+
+    Returns:
+        DataLoader for calibration.
+    """
+    if dataset_name is None:
+        dataset_name = ["cnn_dailymail", "nemotron-post-training-dataset-v2"]
+        num_samples = [512, 512]
+
+    if processor is not None:
+        # Normalize single-element list to str for supported-dataset lookups
+        if isinstance(dataset_name, list) and len(dataset_name) == 1:
+            dataset_name = dataset_name[0]
+        if dataset_name in get_supported_video_datasets():
+            assert isinstance(dataset_name, str)
+            video_processor = Qwen3OmniVideoProcessor(
+                processor.tokenizer if hasattr(processor, "tokenizer") else processor,
+                device=device,
+                dtype=model_dtype,
+                use_audio_in_video=True,
+            )
+            calib_dataloader = get_video_dataset_dataloader(
+                dataset_name=dataset_name,
+                processor=video_processor,
+                batch_size=batch_size,
+                num_samples=num_samples if isinstance(num_samples, int) else num_samples[0],
+            )
+        elif dataset_name in get_supported_vlm_datasets():
+            assert isinstance(dataset_name, str)
+            assert isinstance(processor, Qwen3OmniImageProcessor), (
+                "The Qwen3OmniImageProcessor must be set."
+            )
+            # Set dtype for proper tensor conversion in collate_function.
+            # Processor is created before model_dtype is known, so we set it here.
+            processor.dtype = model_dtype
+            calib_dataloader = get_vlm_dataset_dataloader(
+                dataset_name=dataset_name,
+                processor=processor,
+                batch_size=batch_size,
+                num_samples=num_samples if isinstance(num_samples, int) else num_samples[0],
+            )
+        else:
+            raise ValueError(
+                f"Dataset '{dataset_name}' not supported for Qwen3Omni with processor. "
+                f"Supported video datasets: {get_supported_video_datasets()}, "
+                f"Supported VLM datasets: {get_supported_vlm_datasets()}"
+            )
+    else:
+        # Text-only fallback
+        calib_dataloader = get_dataset_dataloader(
+            dataset_name=dataset_name if isinstance(dataset_name, list) else [dataset_name],
+            tokenizer=tokenizer,
+            batch_size=batch_size,
+            num_samples=num_samples if isinstance(num_samples, list) else [num_samples],
+            device=device,
+            include_labels=include_labels,
+        )
+
+    return calib_dataloader
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index b81dc60c01..2d441f4b35 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -18,6 +18,7 @@
 import random
 import time
 import warnings
+from collections import namedtuple
 from typing import Any
 
 import numpy as np
@@ -27,8 +28,10 @@
     build_quant_cfg,
     copy_custom_model_files,
     create_vlm_calibration_loop,
+    get_generation_kwargs,
     get_model,
     get_processor,
+    get_qwen3omni_dataloader,
     get_tokenizer,
     is_enc_dec,
     is_nemotron_vl,
@@ -70,7 +73,11 @@
     get_max_batch_size,
     get_supported_datasets,
 )
-from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor
+from modelopt.torch.utils.image_processor import (
+    BaseImageProcessor,
+    MllamaImageProcessor,
+    Qwen3OmniImageProcessor,
+)
 from modelopt.torch.utils.memory_monitor import launch_memory_monitor
 from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader
 from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader
@@ -208,6 +215,21 @@ def make_calib_dataloader(
             batch_size=args.batch_size,
             num_samples=args.calib_size[0],
         )
+    elif model_type == "qwen3omni":
+        # Labels are only needed for gradient-based auto_quantize
+        include_labels = (
+            args.auto_quantize_bits is not None and args.auto_quantize_method == "gradient"
+        )
+        calib_dataloader = get_qwen3omni_dataloader(
+            dataset_name=args.dataset[0] if args.dataset else None,
+            processor=processor,
+            tokenizer=tokenizer,
+            batch_size=args.batch_size,
+            num_samples=args.calib_size[0] if processor else args.calib_size,
+            device=device,
+            model_dtype=language_model.dtype,
+            include_labels=include_labels,
+        )
     elif model_type == "whisper":
         assert processor is not None and isinstance(processor, WhisperProcessor), (
             "The AutoProcessor must be set."
@@ -408,7 +430,7 @@ def load_model(args: argparse.Namespace):
         print("Nemotron VL model detected. Enabling image-text calibration by default.")
         args.calib_with_images = True
 
-    if model_type == "mllama":
+    if model_type in ["mllama", "qwen3omni"]:
         processor = get_processor(
             args.pyt_ckpt_path,
             model_type,
@@ -416,6 +438,14 @@ def load_model(args: argparse.Namespace):
             trust_remote_code=args.trust_remote_code,
             attn_implementation=args.attn_implementation,
         )
+        if model_type == "qwen3omni":
+            print("Disabling talker for Qwen3Omni model")
+            full_model.disable_talker()
+            language_model = full_model.thinker.model
+            tokenizer = processor.tokenizer.tokenizer
+            processor = None
+            default_padding_side = tokenizer.padding_side
+            default_pad_token = tokenizer.pad_token
     elif model_type == "whisper":
         processor = get_processor(
             args.pyt_ckpt_path,
@@ -555,6 +585,9 @@ def mono_quantize(
         quant_cfg["quant_cfg"]["*model_encoder*"] = {"enable": False}  # Nemotron-Parse specific
         print("Quantization will only be applied to the decoder (text generation) component")
 
+    # Get model-specific generation kwargs (e.g., for Qwen3Omni)
+    generation_kwargs = get_generation_kwargs(model_type)
+
     if not model_is_already_quantized or calibration_only:
         # quantize the model
 
@@ -569,7 +602,9 @@ def mono_quantize(
             if args.calib_with_images and is_nemotron_vl_model:
                 calibrate_loop = create_vlm_calibration_loop(full_model, calib_dataloader)
             else:
-                calibrate_loop = create_forward_loop(dataloader=calib_dataloader)
+                calibrate_loop = create_forward_loop(
+                    dataloader=calib_dataloader, generation_kwargs=generation_kwargs
+                )
 
         if calibration_only:
             language_model = mtq.calibrate(
@@ -719,6 +754,23 @@ def export_quantized(
         )
 
 
+PreQuantizeResult = namedtuple(
+    "PreQuantizeResult", ["preview_input_ids", "generated_ids_before_ptq", "calib_batch"]
+)
+
+
+def _qwen3omni_generate(model, calib_batch):
+    """Run Qwen3Omni generate and unpack the result.
+
+    Qwen3Omni returns a (text_ids, audio) tuple; text_ids may have a .sequences attribute.
+    """
+    result = model.generate(**calib_batch, return_audio=False, thinker_max_new_tokens=100)
+    if isinstance(result, tuple):
+        text_ids, _ = result
+        return text_ids.sequences if hasattr(text_ids, "sequences") else text_ids
+    return result
+
+
 def pre_quantize(
     args: argparse.Namespace,
     full_model: torch.nn.Module,
@@ -735,9 +787,10 @@ def pre_quantize(
 
     """
     # Only run single sample for preview
-    preview_input_ids = next(iter(calib_dataloader))[
-        "input_features" if model_type == "whisper" else "input_ids"
-    ][0:1]
+    calib_batch = next(iter(calib_dataloader))
+    preview_input_ids = calib_batch["input_features" if model_type == "whisper" else "input_ids"][
+        0:1
+    ]
 
     # Generate preview before quantization
     if args.skip_generate:
@@ -759,10 +812,16 @@ def pre_quantize(
             "before quantization",
             allow_fallback=False,
         )
+    elif model_type == "qwen3omni":
+        # Use only a single sample for preview generation to avoid OOM
+        single_sample = {
+            k: v[0:1] if isinstance(v, torch.Tensor) else v for k, v in calib_batch.items()
+        }
+        generated_ids_before_ptq = _qwen3omni_generate(full_model, single_sample)
     else:
         generated_ids_before_ptq = full_model.generate(preview_input_ids, max_new_tokens=100)
 
-    return preview_input_ids, generated_ids_before_ptq
+    return PreQuantizeResult(preview_input_ids, generated_ids_before_ptq, calib_batch)
 
 
 def post_quantize(
@@ -775,28 +834,59 @@ def post_quantize(
     generated_ids_before_ptq,
     is_nemotron_vl_model,
     first_text_speech_dataset,
+    calib_batch: dict | None = None,
 ):
-    """
-    Processing after the quantization.
+    """Processing after the quantization.
 
-    Currently we run one round of generation using the quantized model for a sample prompt,
-    and compare it with pre-quantize generation.
+    Runs one round of generation using the quantized model for a sample prompt and
+    compares it with the pre-quantize generation from ``pre_quantize()``.
+
+    Args:
+        args: Parsed CLI arguments. Used for ``verbose``, ``quant_summary_path``,
+            ``export_path``, ``pyt_ckpt_path``, and ``skip_generate`` flags.
+        full_model: The quantized model to run post-quantization generation on.
+        model_type: Model architecture identifier (e.g. ``"qwen3omni"``, ``"whisper"``,
+            ``"llama4"``, ``"deepseek"``). Controls model-specific generation and
+            decoding paths. ``None`` for generic models.
+        tokenizer: HF tokenizer for decoding generated token ids. May be ``None`` when
+            a ``processor`` is used instead (e.g. vision-language or speech models).
+        processor: HF image/audio processor for multimodal models. Used for decoding
+            outputs from vision-language (Mllama, Qwen3Omni) and speech (Whisper)
+            models. ``None`` for text-only models.
+        preview_input_ids: Input token ids (single sample) produced by ``pre_quantize()``
+            for the preview generation comparison.
+        generated_ids_before_ptq: Generation output from ``pre_quantize()`` to compare
+            against post-quantization output. ``None`` if generation was skipped.
+        is_nemotron_vl_model: Whether the model is a Nemotron VL model, which uses
+            ``model.chat()`` and returns text strings instead of token tensors.
+        first_text_speech_dataset: Text transcript of the first speech sample, used as
+            the display input for Whisper models since their ``input_ids`` are
+            mel-spectrogram features rather than decodable tokens.
+        calib_batch: Full calibration batch dict from ``pre_quantize``. Required for
+            multimodal models (e.g. Qwen3Omni) whose ``generate()`` needs the complete
+            input dict (audio features, attention masks, etc.) rather than just
+            ``input_ids``. For text-only models this is unused and may be ``None``.
 
     """
 
     if args.verbose:
         try:
-            mtq.print_quant_summary(full_model, args.export_path)
+            mtq.print_quant_summary(full_model, save_path=args.quant_summary_path)
             save_expert_token_count_table(full_model, args.export_path)
         except Exception as e:
-            print(f"Error saving quant summary: {e}")
-            print("Continuing with generation...")
+            print(f"Warning: Failed to print quant summary: {e}")
 
     # Run some samples
     torch.cuda.empty_cache()
     generated_ids_after_ptq = None
     if generated_ids_before_ptq is None:
         pass
+    elif model_type == "qwen3omni" and calib_batch is not None:
+        # Use only a single sample for preview generation to avoid OOM
+        single_sample = {
+            k: v[0:1] if isinstance(v, torch.Tensor) else v for k, v in calib_batch.items()
+        }
+        generated_ids_after_ptq = _qwen3omni_generate(full_model, single_sample)
     elif model_type != "llama4" and not is_nemotron_vl_model:
         # Our fake quantizer may not be fully compatible with torch.compile.
         generated_ids_after_ptq = full_model.generate(preview_input_ids, max_new_tokens=100)
@@ -815,12 +905,13 @@ def post_quantize(
         )
 
     def input_decode(input_ids):
-        if processor is not None and isinstance(processor, MllamaImageProcessor):
-            return processor.tokenizer.batch_decode(input_ids)
+        # BaseImageProcessor covers MllamaImageProcessor and Qwen3OmniImageProcessor
+        if processor is not None and isinstance(processor, BaseImageProcessor):
+            return processor.tokenizer.batch_decode(input_ids, skip_special_tokens=True)
         elif processor is not None and isinstance(processor, WhisperProcessor):
             return first_text_speech_dataset
         elif tokenizer is not None:
-            return tokenizer.batch_decode(input_ids)
+            return tokenizer.batch_decode(input_ids, skip_special_tokens=True)
         else:
             raise ValueError("The processor or tokenizer must be set")
 
@@ -832,6 +923,12 @@ def output_decode(generated_ids, input_shape):
                 return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
         elif processor is not None and isinstance(processor, MllamaImageProcessor):
             return processor.tokenizer.batch_decode(generated_ids[:, input_shape:])
+        elif processor is not None and isinstance(processor, Qwen3OmniImageProcessor):
+            return processor.tokenizer.batch_decode(
+                generated_ids[:, input_shape:],
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=False,
+            )
         elif tokenizer is not None:
             return tokenizer.batch_decode(generated_ids[:, input_shape:])
         else:
@@ -919,7 +1016,7 @@ def quantize_main(
     # Detect if this is a Nemotron VL model using architecture-based detection
     is_nemotron_vl_model = is_nemotron_vl(full_model)
 
-    preview_input_ids, generated_ids_before_ptq = pre_quantize(
+    preview_input_ids, generated_ids_before_ptq, calib_batch = pre_quantize(
         args, full_model, model_type, tokenizer, calib_dataloader, is_nemotron_vl_model
     )
 
@@ -1014,6 +1111,7 @@ def quantize_main(
         generated_ids_before_ptq,
         is_nemotron_vl_model,
         first_text_speech_dataset,
+        calib_batch,
     )
     export_quantized(
         args,
@@ -1238,6 +1336,15 @@ def parse_args() -> argparse.Namespace:
         help="Export as vLLM fake-quant checkpoint (produces vllm_fq_modelopt_state.pth "
         "for use with vllm_serve_fakequant.py).",
     )
+    parser.add_argument(
+        "--quant_summary_path",
+        type=str,
+        default=None,
+        help=(
+            "Path to save the quantization summary. If not specified, summary is printed to stdout. "
+            "Requires --verbose to be enabled (default: True)."
+        ),
+    )
 
     args = parser.parse_args()
     if args.moe_calib_experts_ratio is not None and not (0.0 < args.moe_calib_experts_ratio <= 1.0):
diff --git a/examples/llm_ptq/run_vllm.py b/examples/llm_ptq/run_vllm.py
new file mode 100644
index 0000000000..60cfcb2cd1
--- /dev/null
+++ b/examples/llm_ptq/run_vllm.py
@@ -0,0 +1,145 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unified HF checkpoint inference with vLLM.
+
+Usage:
+    python run_vllm.py --model /path/to/quantized/model
+    python run_vllm.py --model /path/to/model --tp 4
+"""
+
+from __future__ import annotations
+
+import argparse
+
+from example_utils import (
+    ensure_tokenizer_files,
+    get_model_type_from_config,
+    get_quantization_format,
+    get_sampling_params_from_config,
+)
+from transformers import AutoConfig, AutoProcessor
+from vllm import LLM, SamplingParams
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Run unified hf checkpoint inference with vLLM")
+    parser.add_argument("--model", type=str, required=True, help="Model ID or path")
+    parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size")
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=None,
+        help="Max model length (auto-detected from config if not specified)",
+    )
+    parser.add_argument("--prompt", type=str, default="What is Nvidia?", help="Text prompt")
+    parser.add_argument(
+        "--tokenizer", type=str, default=None, help="Tokenizer ID or path (defaults to model path)"
+    )
+    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature")
+    parser.add_argument("--top-p", type=float, default=0.9, help="Top-p sampling")
+    parser.add_argument("--top-k", type=int, default=-1, help="Top-k sampling (-1 to disable)")
+    parser.add_argument("--max-tokens", type=int, default=512, help="Max tokens to generate")
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        default=False,
+        help="Trust remote code from HuggingFace model repos",
+    )
+
+    args = parser.parse_args()
+
+    # Detect model type from config
+    model_type = get_model_type_from_config(args.model)
+    print(f"Detected model type: {model_type}")
+
+    # Detect quantization format
+    quantization = get_quantization_format(args.model)
+    print(f"Detected quantization: {quantization}")
+
+    # Get max_model_len from config if not specified
+    if args.max_model_len is None:
+        config = AutoConfig.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
+        args.max_model_len = getattr(config, "max_position_embeddings", 4096)
+        print(f"Using max_model_len from config: {args.max_model_len}")
+
+    # Determine tokenizer source
+    tokenizer_id = args.tokenizer or args.model
+
+    # Load processor for chat template
+    processor = AutoProcessor.from_pretrained(
+        tokenizer_id, trust_remote_code=args.trust_remote_code
+    )
+
+    # Text-only conversations
+    conversations = [
+        [
+            {
+                "role": "user",
+                "content": [{"type": "text", "text": args.prompt}],
+            }
+        ],
+    ]
+
+    # Apply chat template
+    apply_chat_kwargs = {
+        "add_generation_prompt": True,
+        "tokenize": False,
+    }
+    # Qwen3Omni-specific: disable thinking mode
+    if model_type == "qwen3omni":
+        apply_chat_kwargs["enable_thinking"] = False
+
+    texts = processor.apply_chat_template(conversations, **apply_chat_kwargs)
+
+    # Ensure tokenizer files exist in local model dir (vLLM loads processor from model path)
+    if args.tokenizer:
+        ensure_tokenizer_files(args.model, args.tokenizer)
+
+    print(f"Loading model: {args.model}")
+    llm = LLM(
+        model=args.model,
+        tokenizer=tokenizer_id,
+        tensor_parallel_size=args.tp,
+        max_model_len=args.max_model_len,
+        trust_remote_code=args.trust_remote_code,
+        quantization=quantization,
+        enforce_eager=True,
+    )
+
+    # Sampling params from generation_config.json take precedence; CLI values are the fallback
+    config_params = get_sampling_params_from_config(args.model)
+    sampling_kwargs = {
+        "temperature": config_params.get("temperature", args.temperature),
+        "top_p": config_params.get("top_p", args.top_p),
+        "max_tokens": config_params.get("max_tokens", args.max_tokens),
+    }
+    top_k = config_params.get("top_k", args.top_k)
+    if top_k > 0:
+        sampling_kwargs["top_k"] = top_k
+    print(f"Sampling params: {sampling_kwargs}")
+    sampling_params = SamplingParams(**sampling_kwargs)
+
+    print("Running inference...")
+    outputs = llm.generate(texts, sampling_params)
+
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        print("-" * 80)
+        print(f"Generated: {generated_text}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py
index 9a2cd4b2f0..641204d4f7 100755
--- a/modelopt/torch/export/layer_utils.py
+++ b/modelopt/torch/export/layer_utils.py
@@ -972,6 +972,7 @@ def module_match_name_list(module, name_list):
             "Qwen3MoeSparseMoeBlock",
             "Qwen3NextSparseMoeBlock",
             "Qwen3_5MoeSparseMoeBlock",
+            "Qwen3OmniMoeThinkerTextSparseMoeBlock",
             "DeepseekMoE",
         ],
     ):
diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py
index 3bd72d9de9..17798d0837 100755
--- a/modelopt/torch/export/model_utils.py
+++ b/modelopt/torch/export/model_utils.py
@@ -31,6 +31,7 @@
     "ChatGLM": "chatglm",
     "Qwen3Moe": "qwen3moe",
     "Qwen3Next": "qwen3next",
+    "Qwen3OmniMoeForConditionalGeneration": "qwen3omni",
     "QWen": "qwen",
     "RecurrentGemma": "recurrentgemma",
     "Gemma3": "gemma3",
@@ -42,6 +43,7 @@
     "Phi4MMForCausalLM": "phi4mm",
     "phi": "phi",
     "TLGv4ForCausalLM": "phi",
+    "NemotronH": "nemotron_h",
     "MixtralForCausalLM": "llama",
     "ArcticForCausalLM": "llama",
     "StarCoder": "gpt",
@@ -51,12 +53,11 @@
     "GLM": "glm",
     "InternLM2ForCausalLM": "internlm",
     "ExaoneForCausalLM": "exaone",
-    "NemotronH": "nemotron_h",
     "Nemotron": "gpt",
     "Deepseek": "deepseek",
     "Whisper": "whisper",
-    "gptoss": "gptoss",
     "MiniMax": "minimax",
+    "gptoss": "gptoss",
 }
 
 __doc__ = f"""Utility functions for model type detection and classification.
@@ -66,17 +67,35 @@
         {MODEL_NAME_TO_TYPE=}
 """
 
-__all__ = ["get_language_model_from_vl", "get_model_type", "is_multimodal_model"]
+__all__ = [
+    "get_language_model_from_vl",
+    "get_model_type",
+    "is_multimodal_model",
+    "match_model_type_by_name",
+]
 
 
-def get_model_type(model):
-    """Try get the model type from the model name. If not found, return None."""
+def match_model_type_by_name(name: str) -> str | None:
+    """Match a model type from MODEL_NAME_TO_TYPE by case-insensitive substring match.
+
+    Args:
+        name: String to match against (e.g. class name, architecture string, model_type field).
+
+    Returns:
+        Type of the first key (in dict order) that occurs in ``name``, or None if none does.
+    """
+    name_lower = name.lower()
     for k, v in MODEL_NAME_TO_TYPE.items():
-        if k.lower() in type(model).__name__.lower():
+        if k.lower() in name_lower:
             return v
     return None
 
 
+def get_model_type(model):
+    """Try to get the model type from the model's class name. If not found, return None."""
+    return match_model_type_by_name(type(model).__name__)
+
+
 def is_multimodal_model(model):
     """Check if a model is a Vision-Language Model (VLM) or multimodal model.
 
@@ -149,6 +168,9 @@ def get_language_model_from_vl(model) -> list[nn.Module] | None:
     if hasattr(model, "language_model"):
         return [model, model.language_model]
 
+    if hasattr(model, "thinker"):
+        return [model, model.thinker]
+
     # Pattern 3: For encoder-decoder VL models (e.g., Nemotron-Parse), the decoder is the language model.
     # Only match if the model is detected as multimodal to avoid matching non-VLM encoder-decoder
     # models like T5, Bart, Whisper which also have .decoder.
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index 14a12bcdf3..c60469a587 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -87,7 +87,7 @@
     QUANTIZATION_W4A8_AWQ,
     QUANTIZATION_W4A8_NVFP4_FP8,
 )
-from .model_utils import get_language_model_from_vl, is_multimodal_model
+from .model_utils import get_language_model_from_vl, get_model_type, is_multimodal_model
 from .plugins import SpeculativeDecodingExporter, has_spec_opt
 from .quant_utils import (
     fuse_prequant_layernorm,
@@ -357,9 +357,11 @@ def llm_dummy_forward():
                 [1, model.config.num_mel_bins, feature_extractor.nb_max_frames], dtype=model.dtype
             ).to(model.device)
 
-        if is_vl_model and "nemotron" in model_type:
-            # For Nemotron VL models, run optimization on just the language model/decoder.
-            # This avoids needing pixel_values for the vision encoder.
+        if getattr(model.config, "is_encoder_decoder", False):
+            # For encoder-decoder models, we need to pass both the encoder and decoder input ids
+            model(fake_input, decoder_input_ids=decoder_fake_input)
+        elif (is_vl_model and "nemotron" in model_type) or model_type.startswith("qwen3omni"):
+            # For Nemotron VL and Qwen3-Omni models, run optimization on the language model only
             language_model_lineage = get_language_model_from_vl(model)
 
             if language_model_lineage is not None:
@@ -371,7 +373,7 @@ def llm_dummy_forward():
                 language_model(fake_input, use_cache=False)
             else:
                 raise ValueError(
-                    f"Cannot extract language_model from Nemotron VL model (type: {model_type}). "
+                    f"Cannot extract language_model from VL model (type: {model_type}). "
                     "This is required for requantization/resmoothing optimization. "
                     "Please ensure the model architecture is supported or file an issue."
                 )
@@ -779,6 +781,16 @@ def _export_transformers_checkpoint(
                 exclude_modules.append(pattern)
                 print(f"Adding MTP layer to quantization_config ignore: {pattern}")
 
+    # Add model-specific non-quantized module exclusions
+    _model_type_exclusions = {
+        "qwen3omni": ["thinker.audio_tower*", "thinker.visual*", "thinker.lm_head"],
+    }
+    model_type = get_model_type(model)
+    for pattern in _model_type_exclusions.get(model_type, []):
+        exclude_modules = quant_config["quantization"].setdefault("exclude_modules", [])
+        if pattern not in exclude_modules:
+            exclude_modules.append(pattern)
+
     # Safety net: sync any gate/up weight quantizer amaxes that
     # requantize_resmooth_fused_llm_layers did not reach (e.g. experts not
     # activated during the dummy forward, or non-standard expert naming).
@@ -1181,6 +1193,21 @@ def export_hf_checkpoint(
         if getattr(model, "hf_quantizer", None) is not None:
             model.hf_quantizer = None
 
+        # Fix generation_config conflicts before saving
+        # Some models have temperature/top_p/top_k set but do_sample=False which causes validation errors
+        # Restore the original value after save to avoid mutating the caller's model.
+        _gen_config_restore = None
+        if hasattr(model, "generation_config") and model.generation_config is not None:
+            gen_config = model.generation_config
+            if not getattr(gen_config, "do_sample", True):
+                # Enable sampling if sampling params are present
+                if any(
+                    getattr(gen_config, attr, None) is not None
+                    for attr in ["temperature", "top_p", "top_k"]
+                ):
+                    _gen_config_restore = gen_config.do_sample
+                    gen_config.do_sample = True
+
         # Save model
         # Temporarily disable revert_weight_conversion if available — it doesn't handle
         # quantized state dicts (scalar scale tensors have 0 dimensions, causing IndexError).
@@ -1197,6 +1224,8 @@ def export_hf_checkpoint(
             )
         finally:
             _unpatch_revert_weight_conversion(_patches)
+            if _gen_config_restore is not None:
+                model.generation_config.do_sample = _gen_config_restore
 
         original_config = f"{export_dir}/config.json"
         config_data = {}
diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py
index 4aa1ff46b4..a2dae3fbe9 100644
--- a/modelopt/torch/quantization/model_quant.py
+++ b/modelopt/torch/quantization/model_quant.py
@@ -17,7 +17,6 @@
 
 import fnmatch
 import inspect
-import os
 import warnings
 from collections.abc import Callable, Iterable
 from typing import Any
@@ -583,22 +582,28 @@ def enable_quantizer(model: nn.Module, wildcard_or_filter_func: str | Callable):
 
 
 @atomic_print
-def print_quant_summary(model: nn.Module, output_dir: str | None = None):
-    """Print summary of all quantizer modules in the model."""
-    lines = [
-        f"{name:80} {mod}"
-        for name, mod in model.named_modules()
-        if isinstance(mod, TensorQuantizer)
-    ]
-    lines.append(f"{len(lines)} TensorQuantizers found in model")
-
-    if output_dir:
-        path = os.path.join(output_dir, ".quant_summary.txt")
-        with open(path, "w", encoding="utf-8") as f:
-            f.write("\n".join(lines) + "\n")
-        print(f"\033[1mQuant summary saved to {path}\033[0m")
+def print_quant_summary(model: nn.Module, save_path: str | None = None):
+    """Print summary of all quantizer modules in the model.
+
+    Args:
+        model: The model to summarize.
+        save_path: Optional path of a file to write the summary to instead of printing it.
+            The file is written as UTF-8. If None (or empty), the summary goes to stdout.
+    """
+    lines = []
+    for name, mod in model.named_modules():
+        if isinstance(mod, TensorQuantizer):
+            lines.append(f"{name:80} {mod}")
+    lines.append(f"{len(lines)} TensorQuantizers found in model")
+
+    summary = "\n".join(lines)
+    if save_path:
+        # Explicit encoding keeps the output independent of the platform's locale default.
+        with open(save_path, "w", encoding="utf-8") as f:
+            f.write(summary + "\n")
+        print(f"Quantization summary saved to {save_path}")
     else:
-        print("\n".join(lines))
+        print(summary)
 
 
 def fold_weight(model: nn.Module, keep_attrs: bool = False):
diff --git a/modelopt/torch/utils/__init__.py b/modelopt/torch/utils/__init__.py
index f026e747a8..354212d56e 100644
--- a/modelopt/torch/utils/__init__.py
+++ b/modelopt/torch/utils/__init__.py
@@ -27,4 +27,5 @@
 from .regex import *
 from .robust_json import *
 from .tensor import *
+from .video_dataset_utils import *
 from .vlm_dataset_utils import *
diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py
index 00cdff8877..f5a64054fe 100644
--- a/modelopt/torch/utils/dataset_utils.py
+++ b/modelopt/torch/utils/dataset_utils.py
@@ -112,6 +112,7 @@
     "get_dataset_samples",
     "get_jsonl_text_samples",
     "get_max_batch_size",
+    "get_qwen3omni_text_dataloader",
     "get_supported_datasets",
 ]
 
@@ -211,6 +212,88 @@ def _auto_preprocess_sample(
     )
 
 
+def _load_text_samples(dataset_name, num_samples, **kwargs):
+    """Collect raw text samples from one dataset or from several datasets in order.
+
+    Args:
+        dataset_name: A dataset name, or a list of dataset names.
+        num_samples: A sample count, or a list of counts paired with ``dataset_name``.
+        **kwargs: Extra keyword arguments passed through to get_dataset_samples().
+
+    Returns:
+        One flat list with the samples of every dataset, in the order the names were given.
+    """
+    # Promote scalar arguments to one-element lists so both call forms share one code path.
+    names = [dataset_name] if isinstance(dataset_name, str) else dataset_name
+    counts = [num_samples] if isinstance(num_samples, int) else num_samples
+    assert len(names) == len(counts), (
+        "dataset_name and num_samples must be the same length"
+    )
+
+    collected = []
+    # zip() pairs each dataset with its own sample count.
+    for name, count in zip(names, counts):
+        collected += get_dataset_samples(name, count, **kwargs)
+    return collected
+
+
+class _ListDataset(torch.utils.data.Dataset):
+    """Map-style dataset that serves the items of an in-memory list by index."""
+
+    def __init__(self, samples):
+        self.samples = samples
+
+    def __len__(self):
+        return len(self.samples)
+
+    def __getitem__(self, idx):
+        return self.samples[idx]
+
+
+def get_qwen3omni_text_dataloader(
+    dataset_name: str | list[str] = "cnn_dailymail",
+    processor=None,
+    batch_size: int = 1,
+    num_samples: int | list[int] = 512,
+) -> DataLoader:
+    """Build a text-only dataloader for Qwen3-Omni from one or more text datasets.
+
+    Each raw sample goes through ``processor.preprocess_function`` (Qwen3OmniTextProcessor
+    applies the chat template there) and is stored as plain lists until collation.
+
+    See: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Thinking
+
+    Args:
+        dataset_name: Name of the dataset(s) to load.
+        processor: Qwen3OmniTextProcessor instance wrapping the Qwen3OmniMoeProcessor.
+        batch_size: Batch size of the returned dataloader. Qwen3OmniTextProcessor's collate
+            function raises for any value other than 1.
+        num_samples: Number of samples to load per dataset.
+
+    Returns:
+        A non-shuffling DataLoader that collates with ``processor.collate_function``.
+    """
+    assert processor is not None, "Please provide a Qwen3OmniTextProcessor."
+
+    raw_texts = _load_text_samples(dataset_name, num_samples)
+
+    from .image_processor import _Qwen3OmniProcessorMixin
+
+    to_lists = _Qwen3OmniProcessorMixin._serialize_for_arrow
+    # map() is lazy, so each sample is tokenized and converted before the next is touched.
+    rows = [
+        to_lists(values, list(values.keys()))
+        for values in map(processor.preprocess_function, raw_texts)
+    ]
+
+    return DataLoader(
+        _ListDataset(rows),
+        batch_size=batch_size,
+        shuffle=False,
+        collate_fn=processor.collate_function,
+    )
+
+
 def get_dataset_samples(
     dataset_name: str,
     num_samples: int,
@@ -367,23 +450,13 @@ def get_dataset_dataloader(
             "Tokenizer with the right padding_side may impact calibration accuracy. Recommend set to left"
         )
 
-    if isinstance(num_samples, int):
-        num_samples = [num_samples]
-
-    if isinstance(dataset_name, str):
-        dataset_name = [dataset_name]
-
-    assert len(dataset_name) == len(num_samples), (
-        "dataset_name and num_samples must be the same length"
+    all_samples = _load_text_samples(
+        dataset_name,
+        num_samples,
+        apply_chat_template=apply_chat_template,
+        tokenizer=tokenizer,
     )
 
-    all_samples = []
-    for ds_name, num_sample in zip(dataset_name, num_samples):
-        samples = get_dataset_samples(
-            ds_name, num_sample, apply_chat_template=apply_chat_template, tokenizer=tokenizer
-        )
-        all_samples.extend(samples)
-
     batch_encoded = tokenizer(
         all_samples,
         return_tensors="pt",
@@ -452,8 +525,8 @@ def _get_free_gpu_mem():
     torch.cuda.empty_cache()
 
     free_mem_before, max_allocated_before = _get_free_gpu_mem()
-    is_enc_dec = model_type_is_enc_dec(model)
-    infer_method = model.generate if is_enc_dec else model.forward
+    use_generate = _should_use_generate(model)
+    infer_method = model.generate if use_generate else model.forward
 
     if sample_input_single_batch is None:
         sample_input_single_batch = (
@@ -508,22 +581,29 @@ def _get_free_gpu_mem():
         return 512
 
 
-def _process_batch(batch_data, infer_method, max_working_batch_size=None):
+def _process_batch(batch_data, infer_method, generation_kwargs=None, max_working_batch_size=None):
     """Process a batch of data through the model's inference method.
 
     Args:
         batch_data: Dictionary containing the batch data
         infer_method: Model's inference method (either forward or generate)
+        generation_kwargs: Keyword arguments to pass to the model.generate() method.
         max_working_batch_size: Maximum batch size known to work without OOM
 
     Returns:
         The maximum batch size that worked successfully
     """
-    assert all(torch.is_tensor(data) or data is None for data in batch_data.values()), (
-        "batch_data values must be tensors"
+    if generation_kwargs is None:
+        generation_kwargs = {}
+    # Separate tensor values from scalar parameters (like max_new_tokens)
+    tensor_data = {k: v for k, v in batch_data.items() if torch.is_tensor(v) or v is None}
+    scalar_data = {k: v for k, v in batch_data.items() if not torch.is_tensor(v) and v is not None}
+
+    assert all(torch.is_tensor(data) or data is None for data in tensor_data.values()), (
+        "tensor_data values must be tensors"
     )
-    # Get the batch size of current data
-    batch_size = batch_data[next(iter(batch_data.keys()))].shape[0]
+    # Get the batch size from the first non-None tensor value
+    batch_size = next(v for v in tensor_data.values() if v is not None).shape[0]
 
     # If we know a smaller batch size works, preemptively split
     if max_working_batch_size is not None and batch_size > max_working_batch_size:
@@ -531,11 +611,13 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None):
         for i in range(0, batch_size, max_working_batch_size):
             end_idx = min(i + max_working_batch_size, batch_size)
             split_data = {}
-            for key in batch_data:
-                if batch_data[key] is None:
+            for key in tensor_data:
+                if tensor_data[key] is None:
                     split_data[key] = None
                 else:
-                    split_data[key] = batch_data[key][i:end_idx, ...]
+                    split_data[key] = tensor_data[key][i:end_idx, ...]
+            # Add back scalar data (non-tensor params like max_new_tokens)
+            split_data.update(scalar_data)
 
             max_working_batch_size = _process_batch(
-                split_data, infer_method, max_working_batch_size
+                split_data, infer_method, generation_kwargs, max_working_batch_size
@@ -545,7 +627,7 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None):
 
     # Try processing with current batch size
     try:
-        infer_method(**batch_data)
+        infer_method(**batch_data, **generation_kwargs)
         return (
             batch_size
             if max_working_batch_size is None
@@ -562,8 +644,11 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None):
     # Split the batch in half
     mid = (batch_size + 1) // 2
     warn(f"CUDA out of memory with batch size {batch_size}, trying with batch size {mid}")
-    split_data_1 = {key: batch_data[key][:mid, ...] for key in batch_data}
-    split_data_2 = {key: batch_data[key][mid:, ...] for key in batch_data}
+    split_data_1 = {k: None if v is None else v[:mid, ...] for k, v in tensor_data.items()}
+    split_data_2 = {k: None if v is None else v[mid:, ...] for k, v in tensor_data.items()}
+    # Add back scalar data (non-tensor params like max_new_tokens)
+    split_data_1.update(scalar_data)
+    split_data_2.update(scalar_data)
 
     # Recursively process each half and track max working batch size
-    max_working_batch_size = _process_batch(split_data_1, infer_method)
+    max_working_batch_size = _process_batch(split_data_1, infer_method, generation_kwargs)
@@ -573,21 +658,28 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None):
     return max_working_batch_size
 
 
-def _forward_loop(model: torch.nn.Module, dataloader: DataLoader) -> None:
+def _forward_loop(
+    model: torch.nn.Module, dataloader: DataLoader, generation_kwargs: dict | None = None
+) -> None:
     """Runs forward passes through the model using data from the dataloader.
 
     Args:
         model: The PyTorch model to run inference on
         dataloader: DataLoader containing the batched input data
+        generation_kwargs: Extra kwargs for every inference call (generate() or forward()).
     """
+    if generation_kwargs is None:
+        generation_kwargs = {}
     with torch.no_grad():
-        is_enc_dec = model_type_is_enc_dec(model)
-        infer_method = model.generate if is_enc_dec else model.forward
+        use_generate = _should_use_generate(model)
+        infer_method = model.generate if use_generate else model.forward
         max_working_batch_size = None  # Initialize max working batch size as None
 
         for _, data in enumerate(tqdm(dataloader)):
             # Process batch and update max working batch size
-            max_working_batch_size = _process_batch(data, infer_method, max_working_batch_size)
+            max_working_batch_size = _process_batch(
+                data, infer_method, generation_kwargs, max_working_batch_size
+            )
 
 
 def create_forward_loop(
@@ -600,6 +692,7 @@ def create_forward_loop(
     device: str | None = None,
     include_labels: bool = False,
     dataloader: DataLoader | None = None,
+    generation_kwargs: dict | None = None,
 ) -> Callable:
     """Creates and returns a forward loop function configured for a specific model, dataset, and tokenizer.
 
@@ -618,7 +711,7 @@ def create_forward_loop(
         device: Target device for the returned dataloader.
         include_labels: Whether to include labels in the dataloader.
         dataloader: If provided, use the provided dataloader instead.
-
+        generation_kwargs: Keyword arguments to pass to the model.generate() method.
     Example usage for quantization:
 
     .. code-block:: python
@@ -641,6 +734,8 @@ def create_forward_loop(
         A forward loop function that can be called with no arguments. When called, this function iterates over
             the dataset specified by `dataset_name`.
     """
+    if generation_kwargs is None:
+        generation_kwargs = {}
     if dataloader is None:
         if batch_size == 0:
             # We let the system to determine the max data batch for each forward.
@@ -657,7 +752,7 @@ def create_forward_loop(
             include_labels=include_labels,
         )
 
-    return lambda model: _forward_loop(model, dataloader)
+    return lambda model: _forward_loop(model, dataloader, generation_kwargs)
 
 
 def model_type_is_enc_dec(model):
@@ -753,3 +848,18 @@ def download_hf_dataset_as_jsonl(
         jsonl_paths.append(jsonl_file_path)
 
     return jsonl_paths
+
+
+def _should_use_generate(model):
+    """Decide whether calibration should call ``model.generate()`` rather than ``forward()``.
+
+    A model qualifies when it exposes ``generate`` and is either:
+    - an encoder-decoder model, as reported by ``model_type_is_enc_dec``, or
+    - a model whose class name contains an entry of the generate-only list (qwen3omni)
+    """
+    class_name = model.__class__.__name__.lower()
+    generate_only_markers = ("qwen3omni",)
+    wants_generate = model_type_is_enc_dec(model) or any(
+        marker in class_name for marker in generate_only_markers
+    )
+    return wants_generate and hasattr(model, "generate")
diff --git a/modelopt/torch/utils/image_processor.py b/modelopt/torch/utils/image_processor.py
index 6374642e3d..2f226e41c5 100644
--- a/modelopt/torch/utils/image_processor.py
+++ b/modelopt/torch/utils/image_processor.py
@@ -16,6 +16,8 @@
 # Adapted from tensorrt_llm/quantization/image_processing.py
 """Utility classes for image processing."""
 
+from typing import Any
+
 import torch
 
 
@@ -39,6 +41,33 @@ def collate_function(self, examples):
         """Collate function to process images during data loading."""
         raise NotImplementedError("Each image processor must implement its own collate method")
 
+    def _collate_first_item(self, batch, long_keys=(), float_keys=(), dtype=None):
+        """Turn the single sample of a batch into a dict of tensors on ``self.device``.
+
+        Args:
+            batch: List of sample dicts from the DataLoader; must hold exactly one sample.
+            long_keys: Keys whose values are wrapped with torch.LongTensor.
+            float_keys: Keys whose values are wrapped with torch.tensor, then cast to dtype.
+            dtype: Optional dtype for float_keys tensors; no cast is applied when None.
+
+        Returns:
+            Dict with one tensor per requested key whose value in the sample is not None.
+
+        Raises:
+            ValueError: If the batch does not contain exactly one sample.
+        """
+        if len(batch) != 1:
+            raise ValueError(f"{type(self).__name__} currently supports batch_size=1 only.")
+        sample, tensors = batch[0], {}
+        for key in long_keys:
+            if (value := sample.get(key)) is not None:
+                tensors[key] = torch.LongTensor(value).to(self.device)
+        for key in float_keys:
+            if (value := sample.get(key)) is not None:
+                cast = torch.tensor(value)
+                tensors[key] = (cast if dtype is None else cast.to(dtype)).to(self.device)
+        return tensors
+
 
 # A light Encapsulation for Huggingface MllamaImageProcessor
 
@@ -110,3 +139,173 @@ def collate_function(self, batch):
             ).to(self.device)
 
         return batch[0]
+
+
+class Qwen3OmniTextProcessor(BaseImageProcessor):
+    """Text-only calibration processor for Qwen3-Omni.
+
+    Raw text is wrapped into a single-turn user conversation, rendered with the chat
+    template (thinking disabled) and then tokenized. Use it for text-only datasets.
+
+    See: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Thinking
+    """
+
+    def __init__(self, processor, device="auto", dtype=None):
+        """Constructor.
+
+        Args:
+            processor: The Qwen3OmniMoeProcessor (from AutoProcessor.from_pretrained).
+            device: Device to move tensors to.
+            dtype: dtype for float tensors (e.g., torch.bfloat16). Stored as ``self.dtype``;
+                ``collate_function`` of this class only builds integer tensors.
+        """
+        super().__init__(processor, device)
+        self.dtype = dtype
+
+    def preprocess_function(self, text: str) -> dict:
+        """Render one raw text sample through the chat template and tokenize it.
+
+        Args:
+            text: Raw text string from dataset.
+
+        Returns:
+            The processor output for the rendered prompt; tensors are requested in
+            PyTorch format (``return_tensors="pt"``).
+        """
+        # Single user turn with one text part, in the Qwen conversation format.
+        user_turn = {"role": "user", "content": [{"type": "text", "text": text}]}
+        prompt = self.tokenizer.apply_chat_template(
+            [user_turn], add_generation_prompt=True, tokenize=False, enable_thinking=False
+        )
+
+        # Text-only call: every multimodal input is passed explicitly as None.
+        tokenize_kwargs = {
+            "text": prompt,
+            "audio": None,
+            "images": None,
+            "videos": None,
+            "return_tensors": "pt",
+            "padding": True,
+        }
+        return self.tokenizer(**tokenize_kwargs)
+
+    def collate_function(self, batch):
+        """Collate a single-sample batch into ``input_ids`` and ``attention_mask`` tensors."""
+        # Both keys hold integer data, so no float keys and no dtype are passed on.
+        long_keys = ("input_ids", "attention_mask")
+        return self._collate_first_item(batch, long_keys=long_keys)
+
+
+class _Qwen3OmniProcessorMixin:
+    """Tokenization and serialization helpers shared by the Qwen3-Omni image/video processors."""
+
+    tokenizer: Any
+    process_mm_info: Any
+    use_audio_in_video: Any
+
+    def _tokenize_conversation(self, conversation):
+        """Render a conversation with the chat template and run the processor on it.
+
+        Args:
+            conversation: List of conversation dicts in Qwen format.
+
+        Returns:
+            Processor output dict with tensors.
+        """
+        prompt = self.tokenizer.apply_chat_template(
+            conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False
+        )
+        with_audio = self.use_audio_in_video
+        # Returns the (audios, images, videos) triple that is handed to the processor below.
+        audios, images, videos = self.process_mm_info(conversation, use_audio_in_video=with_audio)
+        return self.tokenizer(
+            text=prompt,
+            audio=audios,
+            images=images,
+            videos=videos,
+            return_tensors="pt",
+            padding=True,
+            use_audio_in_video=with_audio,
+        )
+
+    @staticmethod
+    def _serialize_for_arrow(values, all_keys):
+        """Convert processor outputs to plain Python values for Arrow serialization.
+
+        Args:
+            values: Processor output dict (may contain tensors).
+            all_keys: Keys that must always be present in the result (consistent schema).
+
+        Returns:
+            Dict holding every key of ``all_keys`` (None by default) plus each non-None entry
+            of ``values``, with ``tolist()`` applied to values that provide it.
+        """
+        serialized = dict.fromkeys(all_keys)
+        for key, val in values.items():
+            if val is None:
+                continue
+            serialized[key] = val.tolist() if hasattr(val, "tolist") else val
+        return serialized
+
+
+class Qwen3OmniImageProcessor(_Qwen3OmniProcessorMixin, BaseImageProcessor):
+    """Image processor for Qwen3-Omni multimodal model."""
+
+    _ALL_KEYS = [
+        "input_ids",
+        "attention_mask",
+        "pixel_values",
+        "image_grid_thw",
+        "audio_features",
+        "audio_feature_lens",
+        "video_grid_thw",
+    ]
+
+    def __init__(self, tokenizer, device="auto", dtype=None, use_audio_in_video=False):
+        """Store the settings and import ``process_mm_info`` from qwen_omni_utils."""
+        super().__init__(tokenizer, device)
+        self.dtype = dtype
+        self.use_audio_in_video = use_audio_in_video
+        # Imported here so that only this processor requires qwen_omni_utils.
+        try:
+            from qwen_omni_utils import process_mm_info
+        except ImportError as err:
+            # Chain the original error so the real import failure stays visible.
+            raise ImportError(
+                "qwen_omni_utils is required for Qwen3OmniImageProcessor. "
+                "Please install it from https://github.com/QwenLM/Qwen3-Omni"
+            ) from err
+        self.process_mm_info = process_mm_info
+
+    def preprocess_function(self, examples):
+        """Build a single-turn conversation from one example and tokenize it into lists."""
+        question = examples.get("question", "Describe this image.")
+
+        # Build conversation in Qwen format
+        content = []
+        if examples.get("image") is not None:
+            content.append({"type": "image", "image": examples["image"]})
+        if examples.get("audio") is not None:
+            content.append({"type": "audio", "audio": examples["audio"]})
+        if examples.get("video") is not None:
+            content.append({"type": "video", "video": examples["video"]})
+        content.append({"type": "text", "text": question})
+
+        conversation = [{"role": "user", "content": content}]
+        values = self._tokenize_conversation(conversation)
+        return self._serialize_for_arrow(values, self._ALL_KEYS)
+
+    def collate_function(self, batch):
+        """Collate a single-sample batch; float inputs are cast to ``self.dtype`` if set."""
+        return self._collate_first_item(
+            batch,
+            long_keys=(
+                "input_ids",
+                "attention_mask",
+                "image_grid_thw",
+                "audio_feature_lens",
+                "video_grid_thw",
+            ),
+            float_keys=("pixel_values", "audio_features"),
+            dtype=self.dtype,
+        )
diff --git a/modelopt/torch/utils/video_dataset_utils.py b/modelopt/torch/utils/video_dataset_utils.py
new file mode 100644
index 0000000000..d8b02b7ee1
--- /dev/null
+++ b/modelopt/torch/utils/video_dataset_utils.py
@@ -0,0 +1,277 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utility functions for getting samples and forward loop function for video datasets."""
+
+import os
+import tempfile
+from typing import Any
+
+import torch
+from torch.utils.data import DataLoader
+
+from .image_processor import BaseImageProcessor, _Qwen3OmniProcessorMixin
+
+# Maps each supported dataset name to {"config": kwargs unpacked into datasets.load_dataset}.
+SUPPORTED_VIDEO_DATASET_CONFIG: dict[str, dict[str, Any]] = {
+    "finevideo": {
+        "config": {"path": "HuggingFaceFV/finevideo", "split": "train", "streaming": True}
+    },
+}
+
+__all__ = [
+    "Qwen3OmniVideoProcessor",
+    "get_supported_video_datasets",
+    "get_video_dataset_dataloader",
+]
+
+
+def _get_video_dataset(dataset_name: str, num_samples: int):
+    """Load up to ``num_samples`` examples of a supported video dataset.
+
+    Args:
+        dataset_name: Name of the dataset to load.
+        num_samples: Maximum number of samples to load from the dataset.
+
+    Returns:
+        A hugging face Dataset with at most ``num_samples`` examples.
+
+    Raises:
+        NotImplementedError: If ``dataset_name`` is not in ``SUPPORTED_VIDEO_DATASET_CONFIG``.
+    """
+    if dataset_name not in SUPPORTED_VIDEO_DATASET_CONFIG:
+        raise NotImplementedError(
+            f"dataset {dataset_name} is not supported. Please use one of the following:"
+            f" {get_supported_video_datasets()}."
+        )
+
+    from datasets import Dataset, load_dataset
+
+    config = SUPPORTED_VIDEO_DATASET_CONFIG[dataset_name]["config"]
+    dataset = load_dataset(**config)
+    if config.get("streaming", False):
+        # For streaming datasets, use take() and convert to list then Dataset
+        return Dataset.from_list(list(dataset.take(num_samples)))
+
+    # Clamp so a dataset shorter than num_samples yields what it has instead of raising.
+    return dataset.select(range(min(num_samples, len(dataset))))
+
+
+def get_supported_video_datasets() -> list[str]:
+    """Return the names of the video datasets this module can load.
+
+    Returns:
+        The keys of ``SUPPORTED_VIDEO_DATASET_CONFIG`` as a new list, in definition order.
+
+    Example usage:
+
+    .. code-block:: python
+
+        from modelopt.torch.utils import get_supported_video_datasets
+
+        print("Supported video datasets:", get_supported_video_datasets())
+    """
+    return list(SUPPORTED_VIDEO_DATASET_CONFIG)
+
+
+def get_video_dataset_dataloader(
+    dataset_name: str = "finevideo",
+    processor: "Qwen3OmniVideoProcessor | None" = None,
+    batch_size: int = 1,
+    num_samples: int = 512,
+    cache_dir: str | None = None,
+) -> DataLoader:
+    """Get a dataloader with the dataset name and processor of the target model.
+
+    Preprocessed samples are cached on disk. The cache file name depends only on
+    ``dataset_name`` and ``num_samples``, so use another ``cache_dir`` (or delete the file)
+    after changing the processor or its settings.
+
+    Args:
+        dataset_name: Name of the dataset to load.
+        processor: Processor used for encoding video and text data. Must not be None.
+        batch_size: Batch size of the returned dataloader.
+        num_samples: Number of samples from the dataset.
+        cache_dir: Directory to cache the processed dataset. Defaults to a temp directory.
+            If the cache exists, it will be loaded instead of reprocessing.
+
+    Returns:
+        An instance of dataloader.
+    """
+    assert processor is not None, "Please provide a valid processor."
+
+    from datasets import Dataset
+
+    if cache_dir is None:
+        cache_dir = os.path.join(tempfile.gettempdir(), "modelopt_video_dataset_cache")
+    # The cache uses torch.save/load to avoid Arrow 32-bit offset overflow.
+    cache_path = os.path.join(cache_dir, f"{dataset_name}_n{num_samples}_processed.pt")
+
+    processed_dataset = None
+    if os.path.exists(cache_path):
+        try:
+            # NOTE(security): weights_only=False unpickles arbitrary objects, so this is only
+            # safe while cache_dir is trusted, i.e. only this function writes to it.
+            processed_samples = torch.load(cache_path, weights_only=False)
+            processed_dataset = Dataset.from_list(processed_samples)
+            print(f"Loaded processed dataset from cache: {cache_path}")
+        except Exception as e:
+            # Best effort: an unreadable cache only costs one reprocessing pass.
+            print(f"Failed to load cache from {cache_path}: {e}. Reprocessing...")
+            processed_dataset = None
+
+    if processed_dataset is None:
+        dataset = _get_video_dataset(dataset_name, num_samples=num_samples)
+
+        # Process samples manually to avoid Arrow 32-bit offset overflow
+        # (dataset.map() uses Arrow internally which can't handle large nested lists)
+        processed_samples = []
+        for i, sample in enumerate(dataset, start=1):
+            processed_samples.append(processor.preprocess_function(sample))
+            if i % 10 == 0:
+                print(f"Processed {i}/{len(dataset)} samples...")
+        processed_dataset = Dataset.from_list(processed_samples)
+
+        # Save under a temporary name and rename afterwards, so an interrupted save never
+        # leaves a truncated file at cache_path.
+        os.makedirs(cache_dir, exist_ok=True)
+        partial_path = f"{cache_path}.{os.getpid()}.tmp"
+        torch.save(processed_samples, partial_path)
+        os.replace(partial_path, cache_path)
+        print(f"Saved processed dataset to cache: {cache_path}")
+
+    # Create DataLoader with the custom collate function
+    return DataLoader(
+        processed_dataset,
+        batch_size=batch_size,
+        shuffle=False,
+        collate_fn=processor.collate_function,
+    )
+
+
+class Qwen3OmniVideoProcessor(_Qwen3OmniProcessorMixin, BaseImageProcessor):
+    """Video processor for Qwen3-Omni multimodal model with finevideo dataset support."""
+
+    _ALL_KEYS = [
+        "input_ids",
+        "attention_mask",
+        "pixel_values_videos",
+        "video_grid_thw",
+        "video_second_per_grid",
+        "feature_attention_mask",
+        "input_features",
+    ]
+    # Class-level default so cleanup()/__del__ still work if __init__ raised before mkdtemp.
+    _temp_dir = None
+
+    def __init__(self, tokenizer, device="cuda", dtype=None, use_audio_in_video=True):
+        """Constructor.
+
+        Args:
+            tokenizer: The Qwen3OmniMoeProcessor for tokenizing and processing inputs.
+            device: Device to move tensors to.
+            dtype: dtype for float tensors (e.g., torch.bfloat16). If None, uses default.
+            use_audio_in_video: Whether to extract and use audio from video files.
+
+        Raises:
+            ImportError: If ``qwen_omni_utils`` cannot be imported.
+        """
+        super().__init__(tokenizer, device)
+        self.dtype = dtype
+        self.use_audio_in_video = use_audio_in_video
+        # Check the dependency before mkdtemp so a failed import leaves no directory behind.
+        try:
+            from qwen_omni_utils import process_mm_info
+        except ImportError as err:
+            raise ImportError(
+                "qwen_omni_utils is required for Qwen3OmniVideoProcessor. "
+                "Please install it from https://github.com/QwenLM/Qwen3-Omni"
+            ) from err
+        self.process_mm_info = process_mm_info
+        self._temp_dir = tempfile.mkdtemp(prefix="qwen3omni_video_")
+        self._video_counter = 0
+
+    def _save_video_bytes_to_file(self, video_bytes: bytes) -> str:
+        """Save video bytes to a temporary file and return the path.
+
+        Args:
+            video_bytes: Raw video bytes (e.g., from finevideo's 'mp4' field).
+
+        Returns:
+            Path to the temporary video file.
+        """
+        video_path = os.path.join(self._temp_dir, f"video_{self._video_counter}.mp4")
+        self._video_counter += 1
+        with open(video_path, "wb") as f:
+            f.write(video_bytes)
+        return video_path
+
+    def preprocess_function(self, examples):
+        """Preprocess function for Qwen3-Omni with video support.
+
+        Handles both standard video paths and raw video bytes (finevideo format). Reads the
+        ``json``, ``question``, ``mp4`` and ``video`` fields of ``examples`` and returns the
+        output of ``_serialize_for_arrow`` for ``_ALL_KEYS``.
+        """
+        # Get question/prompt - finevideo has metadata in 'json' field
+        if examples.get("json") is not None:
+            category = examples["json"].get("content_fine_category", "")
+            question = (
+                f"Describe what is happening in this video in detail. Category hint: {category}"
+            )
+        else:
+            question = examples.get("question", "Describe this video in detail.")
+
+        # Raw bytes ('mp4', finevideo format) take precedence over a ready-made 'video' path.
+        if examples.get("mp4") is not None:
+            video_path = self._save_video_bytes_to_file(examples["mp4"])
+        else:
+            video_path = examples.get("video")
+
+        # Build conversation in Qwen format: optional video entry, then the text prompt.
+        content = []
+        if video_path is not None:
+            content.append({"type": "video", "video": video_path})
+        content.append({"type": "text", "text": question})
+
+        values = self._tokenize_conversation([{"role": "user", "content": content}])
+        return self._serialize_for_arrow(values, self._ALL_KEYS)
+
+    def collate_function(self, batch):
+        """Collate function to process inputs during data loading."""
+        result = self._collate_first_item(
+            batch,
+            long_keys=("input_ids", "attention_mask", "video_grid_thw", "feature_attention_mask"),
+            float_keys=("pixel_values_videos", "video_second_per_grid", "input_features"),
+            dtype=self.dtype,
+        )
+        # Pass use_audio_in_video flag to model.generate() for Qwen3Omni
+        result["use_audio_in_video"] = self.use_audio_in_video
+        return result
+
+    def cleanup(self):
+        """Clean up temporary video files.
+
+        Does nothing if the temporary directory was never created or is already gone, so it
+        is safe to call repeatedly and from ``__del__`` on a partially built instance.
+        """
+        import shutil
+
+        if self._temp_dir is not None and os.path.exists(self._temp_dir):
+            shutil.rmtree(self._temp_dir)
+
+    def __del__(self):
+        """Ensure temporary files are cleaned up when the processor is garbage collected."""
+        self.cleanup()