Commit 7a857ad — Bug fixes

Author: ajrasane
Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
Parent commit: 2a730e3

File tree: 6 files changed, +33 insertions, -13 deletions

examples/llm_ptq/example_utils.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@
4545
except ImportError:
4646
snapshot_download = None
4747

48-
import modelopt.torch.quantization as mtq
4948
from modelopt.torch.export.model_utils import MODEL_NAME_TO_TYPE
5049
from modelopt.torch.utils.dataset_utils import get_dataset_dataloader
5150
from modelopt.torch.utils.image_processor import (
@@ -1074,6 +1073,9 @@ def get_qwen3omni_dataloader(
10741073
num_samples = [512, 512]
10751074

10761075
if processor is not None:
1076+
# Normalize single-element list to str for supported-dataset lookups
1077+
if isinstance(dataset_name, list) and len(dataset_name) == 1:
1078+
dataset_name = dataset_name[0]
10771079
if dataset_name in get_supported_video_datasets():
10781080
assert isinstance(dataset_name, str)
10791081
video_processor = Qwen3OmniVideoProcessor(
@@ -1093,7 +1095,8 @@ def get_qwen3omni_dataloader(
10931095
assert isinstance(processor, Qwen3OmniImageProcessor), (
10941096
"The Qwen3OmniImageProcessor must be set."
10951097
)
1096-
# Set the dtype for proper tensor conversion in collate_function
1098+
# Set dtype for proper tensor conversion in collate_function.
1099+
# Processor is created before model_dtype is known, so we set it here.
10971100
processor.dtype = model_dtype
10981101
calib_dataloader = get_vlm_dataset_dataloader(
10991102
dataset_name=dataset_name,

examples/llm_ptq/run_vllm.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,12 @@ def main():
5252
parser.add_argument("--top-p", type=float, default=0.9, help="Top-p sampling")
5353
parser.add_argument("--top-k", type=int, default=-1, help="Top-k sampling (-1 to disable)")
5454
parser.add_argument("--max-tokens", type=int, default=512, help="Max tokens to generate")
55+
parser.add_argument(
56+
"--trust-remote-code",
57+
action="store_true",
58+
default=False,
59+
help="Trust remote code from HuggingFace model repos",
60+
)
5561

5662
args = parser.parse_args()
5763

@@ -65,15 +71,17 @@ def main():
6571

6672
# Get max_model_len from config if not specified
6773
if args.max_model_len is None:
68-
config = AutoConfig.from_pretrained(args.model, trust_remote_code=True)
74+
config = AutoConfig.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
6975
args.max_model_len = getattr(config, "max_position_embeddings", 4096)
7076
print(f"Using max_model_len from config: {args.max_model_len}")
7177

7278
# Determine tokenizer source
7379
tokenizer_id = args.tokenizer or args.model
7480

7581
# Load processor for chat template
76-
processor = AutoProcessor.from_pretrained(tokenizer_id, trust_remote_code=True)
82+
processor = AutoProcessor.from_pretrained(
83+
tokenizer_id, trust_remote_code=args.trust_remote_code
84+
)
7785

7886
# Text-only conversations
7987
conversations = [
@@ -106,7 +114,7 @@ def main():
106114
tokenizer=tokenizer_id,
107115
tensor_parallel_size=args.tp,
108116
max_model_len=args.max_model_len,
109-
trust_remote_code=True,
117+
trust_remote_code=args.trust_remote_code,
110118
quantization=quantization,
111119
)
112120

modelopt/torch/quantization/model_quant.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717

1818
import fnmatch
1919
import inspect
20-
import os
2120
import warnings
2221
from collections.abc import Callable, Iterable
2322
from typing import Any

modelopt/torch/utils/dataset_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -606,8 +606,8 @@ def _process_batch(batch_data, infer_method, generation_kwargs={}, max_working_b
606606
assert all(torch.is_tensor(data) or data is None for data in tensor_data.values()), (
607607
"tensor_data values must be tensors"
608608
)
609-
# Get the batch size of current data
610-
batch_size = tensor_data[next(iter(batch_data.keys()))].shape[0]
609+
# Get the batch size from the first non-None tensor value
610+
batch_size = next(v for v in tensor_data.values() if v is not None).shape[0]
611611

612612
# If we know a smaller batch size works, preemptively split
613613
if max_working_batch_size is not None and batch_size > max_working_batch_size:

modelopt/torch/utils/image_processor.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -175,9 +175,10 @@ def collate_function(self, batch):
175175
class Qwen3OmniImageProcessor(BaseImageProcessor):
176176
"""Image processor for Qwen3-Omni multimodal model."""
177177

178-
def __init__(self, tokenizer, device="auto", use_audio_in_video=False):
178+
def __init__(self, tokenizer, device="auto", dtype=None, use_audio_in_video=False):
179179
"""Constructor."""
180180
super().__init__(tokenizer, device)
181+
self.dtype = dtype
181182
self.use_audio_in_video = use_audio_in_video
182183
# Try to import qwen_omni_utils for multimodal processing
183184
try:
@@ -251,7 +252,8 @@ def collate_function(self, batch):
251252
"""Collate function to process inputs during data loading."""
252253
result = {}
253254

254-
# Take first item from batch (batch_size handling)
255+
# Take first item only — multimodal inputs have variable-length sequences
256+
# (images, audio) that cannot be stacked, so batch_size=1 is expected.
255257
first = batch[0]
256258

257259
# Convert lists to tensors and move to device
@@ -262,7 +264,10 @@ def collate_function(self, batch):
262264

263265
# Handle pixel values for images
264266
if first.get("pixel_values") is not None:
265-
result["pixel_values"] = torch.tensor(first["pixel_values"]).to(self.device)
267+
pv = torch.tensor(first["pixel_values"])
268+
if self.dtype is not None:
269+
pv = pv.to(self.dtype)
270+
result["pixel_values"] = pv.to(self.device)
266271

267272
# Handle image grid thw (tile height width info)
268273
if first.get("image_grid_thw") is not None:
@@ -274,7 +279,10 @@ def collate_function(self, batch):
274279
self.device
275280
)
276281
if first.get("audio_features") is not None:
277-
result["audio_features"] = torch.tensor(first["audio_features"]).to(self.device)
282+
af = torch.tensor(first["audio_features"])
283+
if self.dtype is not None:
284+
af = af.to(self.dtype)
285+
result["audio_features"] = af.to(self.device)
278286

279287
# Handle video features if present
280288
if first.get("video_grid_thw") is not None:

modelopt/torch/utils/video_dataset_utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ def get_video_dataset_dataloader(
121121
try:
122122
from datasets import Dataset
123123

124+
# weights_only=False is safe here: the cache file is self-generated at line 151
124125
processed_samples = torch.load(cache_path, weights_only=False)
125126
processed_dataset = Dataset.from_list(processed_samples)
126127
print(f"Loaded processed dataset from cache: {cache_path}")
@@ -282,7 +283,8 @@ def collate_function(self, batch):
282283
"""Collate function to process inputs during data loading."""
283284
result = {}
284285

285-
# Take first item from batch (batch_size handling)
286+
# Take first item only — multimodal inputs have variable-length sequences
287+
# (video frames, audio) that cannot be stacked, so batch_size=1 is expected.
286288
first = batch[0]
287289

288290
# Convert lists to tensors and move to device

Commit comments: 0