Bug fixes

ajrasane · ajrasane · commit 7a857ade2bde · 2026-04-01T18:44:35.000Z
Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
@@ -45,7 +45,6 @@
 except ImportError:
     snapshot_download = None
 
-import modelopt.torch.quantization as mtq
 from modelopt.torch.export.model_utils import MODEL_NAME_TO_TYPE
 from modelopt.torch.utils.dataset_utils import get_dataset_dataloader
 from modelopt.torch.utils.image_processor import (
@@ -1074,6 +1073,9 @@ def get_qwen3omni_dataloader(
         num_samples = [512, 512]
 
     if processor is not None:
+        # Normalize single-element list to str for supported-dataset lookups
+        if isinstance(dataset_name, list) and len(dataset_name) == 1:
+            dataset_name = dataset_name[0]
         if dataset_name in get_supported_video_datasets():
             assert isinstance(dataset_name, str)
             video_processor = Qwen3OmniVideoProcessor(
@@ -1093,7 +1095,8 @@ def get_qwen3omni_dataloader(
             assert isinstance(processor, Qwen3OmniImageProcessor), (
                 "The Qwen3OmniImageProcessor must be set."
             )
-            # Set the dtype for proper tensor conversion in collate_function
+            # Set dtype for proper tensor conversion in collate_function.
+            # Processor is created before model_dtype is known, so we set it here.
             processor.dtype = model_dtype
             calib_dataloader = get_vlm_dataset_dataloader(
                 dataset_name=dataset_name,
diff --git a/examples/llm_ptq/run_vllm.py b/examples/llm_ptq/run_vllm.py
@@ -52,6 +52,12 @@ def main():
     parser.add_argument("--top-p", type=float, default=0.9, help="Top-p sampling")
     parser.add_argument("--top-k", type=int, default=-1, help="Top-k sampling (-1 to disable)")
     parser.add_argument("--max-tokens", type=int, default=512, help="Max tokens to generate")
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        default=False,
+        help="Trust remote code from HuggingFace model repos",
+    )
 
     args = parser.parse_args()
 
@@ -65,15 +71,17 @@ def main():
 
     # Get max_model_len from config if not specified
     if args.max_model_len is None:
-        config = AutoConfig.from_pretrained(args.model, trust_remote_code=True)
+        config = AutoConfig.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
         args.max_model_len = getattr(config, "max_position_embeddings", 4096)
         print(f"Using max_model_len from config: {args.max_model_len}")
 
     # Determine tokenizer source
     tokenizer_id = args.tokenizer or args.model
 
     # Load processor for chat template
-    processor = AutoProcessor.from_pretrained(tokenizer_id, trust_remote_code=True)
+    processor = AutoProcessor.from_pretrained(
+        tokenizer_id, trust_remote_code=args.trust_remote_code
+    )
 
     # Text-only conversations
     conversations = [
@@ -106,7 +114,7 @@ def main():
         tokenizer=tokenizer_id,
         tensor_parallel_size=args.tp,
         max_model_len=args.max_model_len,
-        trust_remote_code=True,
+        trust_remote_code=args.trust_remote_code,
         quantization=quantization,
     )
 
diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py
@@ -17,7 +17,6 @@
 
 import fnmatch
 import inspect
-import os
 import warnings
 from collections.abc import Callable, Iterable
 from typing import Any
diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py
@@ -606,8 +606,8 @@ def _process_batch(batch_data, infer_method, generation_kwargs={}, max_working_b
     assert all(torch.is_tensor(data) or data is None for data in tensor_data.values()), (
         "tensor_data values must be tensors"
     )
-    # Get the batch size of current data
-    batch_size = tensor_data[next(iter(batch_data.keys()))].shape[0]
+    # Get the batch size from the first non-None tensor value
+    batch_size = next(v for v in tensor_data.values() if v is not None).shape[0]
 
     # If we know a smaller batch size works, preemptively split
     if max_working_batch_size is not None and batch_size > max_working_batch_size:
diff --git a/modelopt/torch/utils/image_processor.py b/modelopt/torch/utils/image_processor.py
@@ -175,9 +175,10 @@ def collate_function(self, batch):
 class Qwen3OmniImageProcessor(BaseImageProcessor):
     """Image processor for Qwen3-Omni multimodal model."""
 
-    def __init__(self, tokenizer, device="auto", use_audio_in_video=False):
+    def __init__(self, tokenizer, device="auto", dtype=None, use_audio_in_video=False):
         """Constructor."""
         super().__init__(tokenizer, device)
+        self.dtype = dtype
         self.use_audio_in_video = use_audio_in_video
         # Try to import qwen_omni_utils for multimodal processing
         try:
@@ -251,7 +252,8 @@ def collate_function(self, batch):
         """Collate function to process inputs during data loading."""
         result = {}
 
-        # Take first item from batch (batch_size handling)
+        # Take first item only — multimodal inputs have variable-length sequences
+        # (images, audio) that cannot be stacked, so batch_size=1 is expected.
         first = batch[0]
 
         # Convert lists to tensors and move to device
@@ -262,7 +264,10 @@ def collate_function(self, batch):
 
         # Handle pixel values for images
         if first.get("pixel_values") is not None:
-            result["pixel_values"] = torch.tensor(first["pixel_values"]).to(self.device)
+            pv = torch.tensor(first["pixel_values"])
+            if self.dtype is not None:
+                pv = pv.to(self.dtype)
+            result["pixel_values"] = pv.to(self.device)
 
         # Handle image grid thw (tile height width info)
         if first.get("image_grid_thw") is not None:
@@ -274,7 +279,10 @@ def collate_function(self, batch):
                 self.device
             )
         if first.get("audio_features") is not None:
-            result["audio_features"] = torch.tensor(first["audio_features"]).to(self.device)
+            af = torch.tensor(first["audio_features"])
+            if self.dtype is not None:
+                af = af.to(self.dtype)
+            result["audio_features"] = af.to(self.device)
 
         # Handle video features if present
         if first.get("video_grid_thw") is not None:
diff --git a/modelopt/torch/utils/video_dataset_utils.py b/modelopt/torch/utils/video_dataset_utils.py
@@ -121,6 +121,7 @@ def get_video_dataset_dataloader(
             try:
                 from datasets import Dataset
 
+                # weights_only=False is safe here: the cache file is self-generated by this module
                 processed_samples = torch.load(cache_path, weights_only=False)
                 processed_dataset = Dataset.from_list(processed_samples)
                 print(f"Loaded processed dataset from cache: {cache_path}")
@@ -282,7 +283,8 @@ def collate_function(self, batch):
         """Collate function to process inputs during data loading."""
         result = {}
 
-        # Take first item from batch (batch_size handling)
+        # Take first item only — multimodal inputs have variable-length sequences
+        # (video frames, audio) that cannot be stacked, so batch_size=1 is expected.
         first = batch[0]
 
         # Convert lists to tensors and move to device