Skip to content

Commit 4f92fbf

Browse files
committed
restore configs
Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
1 parent aa77565 commit 4f92fbf

5 files changed

Lines changed: 27 additions & 199 deletions

File tree

examples/llm_ptq/example_utils.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -244,21 +244,27 @@ def build_quant_cfg(
244244
quant_cfg["quant_cfg"]["*self_attn.q*"] = {"enable": False}
245245
quant_cfg["quant_cfg"]["*self_attn.kv*"] = {"enable": False}
246246

247-
# Qwen3 specific quantizer disabling patterns (thinker.model.layers only)
248-
if "qkv_disabled" in qformat:
249-
quant_cfg = copy.deepcopy(quant_cfg) # Don't modify global config
247+
if model_type == "qwen3omni":
248+
if qformat == "qwen3_nvfp4_qkv_disabled":
250249
for proj in ["q_proj", "k_proj", "v_proj"]:
251250
quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = {
252251
"enable": False
253252
}
254-
if "qkvo_disabled" in qformat:
255-
if "qkv_disabled" not in qformat: # Avoid double deepcopy
256-
quant_cfg = copy.deepcopy(quant_cfg)
257-
for proj in ["o_proj"]:
253+
elif qformat == "qwen3_nvfp4_qkvo_disabled":
254+
for proj in ["q_proj", "k_proj", "v_proj", "o_proj"]:
258255
quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = {
259256
"enable": False
260257
}
261258

259+
elif qformat == "qwen3_nvfp4_first_and_last_n_disabled":
260+
# Disable both first N and last N layers
261+
total_layers = 48
262+
n_layers_to_disable = 4
263+
for i in range(n_layers_to_disable):
264+
quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False}
265+
for i in range(total_layers - n_layers_to_disable, total_layers):
266+
quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False}
267+
262268
return quant_cfg
263269

264270

examples/llm_ptq/hf_ptq.py

Lines changed: 2 additions & 174 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@
1414
# limitations under the License.
1515

1616
import argparse
17-
import contextlib
18-
import os
1917
import random
2018
import time
2119
import warnings
@@ -104,7 +102,7 @@
104102
"mxfp8": mtq.MXFP8_DEFAULT_CFG,
105103
"qwen3_nvfp4_qkv_disabled": mtq.NVFP4_DEFAULT_CFG,
106104
"qwen3_nvfp4_qkvo_disabled": mtq.NVFP4_DEFAULT_CFG,
107-
"qwen3_first_and_last_n_disabled": mtq.NVFP4_DEFAULT_CFG,
105+
"qwen3_nvfp4_first_and_last_n_disabled": mtq.NVFP4_DEFAULT_CFG,
108106
}
109107

110108
KV_QUANT_CFG_CHOICES = {
@@ -199,9 +197,6 @@ def make_calib_dataloader(
199197
num_samples=args.calib_size[0],
200198
)
201199
elif model_type == "qwen3omni":
202-
assert len(args.calib_size) == 1, (
203-
"qwen3omni only supports one dataset for calibration, can extend this in the future"
204-
)
205200
assert processor is not None, "The processor must be set for qwen3omni model."
206201
dataset_name = args.dataset[0] if args.dataset else "cnn_dailymail"
207202
# Check if using video dataset (e.g., finevideo)
@@ -394,10 +389,6 @@ def load_model(args: argparse.Namespace):
394389
attn_implementation=args.attn_implementation,
395390
)
396391

397-
# Uncomment this to load the model from a .pt file
398-
# model = mto.restore(model, "./qwen3_omni_30b_nvfp4/model.pt")
399-
# print("Qwen3Omni model restored from checkpoint")
400-
401392
quant_cfg = QUANT_CFG_CHOICES[args.qformat]
402393
else:
403394
assert args.qformat in QUANT_CFG_CHOICES, (
@@ -425,18 +416,13 @@ def load_model(args: argparse.Namespace):
425416
calibration_only = True
426417

427418
model_type = get_model_type(full_model)
428-
if model_type == "qwen3omni" and os.environ.get("DISABLE_TALKER", "0") == "1":
419+
if model_type == "qwen3omni":
429420
print("Disabling talker for Qwen3Omni model")
430421
full_model.disable_talker()
431422

432423
device = full_model.device
433424
if hasattr(full_model, "model"):
434425
device = full_model.model.device
435-
# For multi-GPU models with device_map="auto", model.device may return 'meta' or 'cpu'
436-
# since parameters are distributed. Force cuda:0 for input tensors.
437-
if device is None or str(device) in ("meta", "cpu"):
438-
device = "cuda"
439-
print(f"Overriding device to {device}")
440426

441427
processor = None
442428
tokenizer = None
@@ -620,158 +606,6 @@ def mono_quantize(
620606
if language_model_lineage is not None:
621607
print("Updating full_model with quantized language_model...")
622608
language_model_lineage[-2].language_model = language_model
623-
624-
# Qwen3 specific quantizer disabling patterns (thinker.model.layers only)
625-
if "qkv_disabled" in args.qformat:
626-
# Disable q_proj, k_proj, v_proj quantizers
627-
for proj in ["q_proj", "k_proj", "v_proj"]:
628-
quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = {
629-
"enable": False
630-
}
631-
if "qkvo_disabled" in args.qformat:
632-
# Disable q_proj, k_proj, v_proj, o_proj quantizers
633-
for proj in ["o_proj"]:
634-
quant_cfg["quant_cfg"][f"*thinker.model.layers.*.self_attn.{proj}*"] = {
635-
"enable": False
636-
}
637-
if "first_and_last_n_disabled" in args.qformat:
638-
# Disable both first N and last N layers
639-
total_layers = 48
640-
n_layers_to_disable = 4
641-
for i in range(n_layers_to_disable):
642-
quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False}
643-
for i in range(total_layers - n_layers_to_disable, total_layers):
644-
quant_cfg["quant_cfg"][f"*thinker.model.layers.{i}.*"] = {"enable": False}
645-
646-
if not model_is_already_quantized or calibration_only:
647-
# Only run single sample for preview
648-
calib_batch = next(iter(calib_dataloader))
649-
input_ids = calib_batch["input_features" if model_type == "whisper" else "input_ids"][
650-
0:1
651-
]
652-
653-
# Generate preview before quantization
654-
if is_nemotron_vl_model and tokenizer is not None:
655-
generated_ids_before_ptq = run_nemotron_vl_preview(
656-
full_model,
657-
tokenizer,
658-
input_ids,
659-
args.pyt_ckpt_path,
660-
"before quantization",
661-
allow_fallback=True,
662-
)
663-
elif model_type == "qwen3omni":
664-
# Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences
665-
# Pass full batch with all multimodal inputs
666-
result = full_model.generate(**calib_batch, max_new_tokens=100)
667-
if isinstance(result, tuple):
668-
text_ids, _ = result
669-
generated_ids_before_ptq = (
670-
text_ids.sequences if hasattr(text_ids, "sequences") else text_ids
671-
)
672-
else:
673-
generated_ids_before_ptq = result
674-
else:
675-
# Standard generation for non-Nemotron VL models
676-
generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100)
677-
if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only":
678-
print("Applying nvfp4 quantization (MoE only) for gpt-oss")
679-
680-
# quantize the model
681-
model = quantize_model(model, quant_cfg, args, calib_dataloader, calibration_only)
682-
683-
# For VL models, update full_model to use the quantized language model
684-
if is_nemotron_vl_model:
685-
language_model_lineage = get_language_model_from_vl(full_model)
686-
if language_model_lineage is not None:
687-
print("Updating full_model with quantized language_model...")
688-
language_model_lineage[-2].language_model = model
689-
690-
if args.verbose:
691-
with open("./quant_summary.txt", "w") as f, contextlib.redirect_stdout(f):
692-
mtq.print_quant_summary(full_model)
693-
694-
# Run some samples
695-
torch.cuda.empty_cache()
696-
generated_ids_after_ptq = None
697-
if model_type == "qwen3omni":
698-
# Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences
699-
# Pass full batch with all multimodal inputs
700-
result = full_model.generate(**calib_batch, max_new_tokens=100)
701-
if isinstance(result, tuple):
702-
text_ids, _ = result
703-
generated_ids_after_ptq = (
704-
text_ids.sequences if hasattr(text_ids, "sequences") else text_ids
705-
)
706-
else:
707-
generated_ids_after_ptq = result
708-
elif model_type != "llama4" and not is_nemotron_vl_model:
709-
# Our fake quantizer may not be fully compatible with torch.compile.
710-
generated_ids_after_ptq = full_model.generate(input_ids, max_new_tokens=100)
711-
elif is_nemotron_vl_model and tokenizer is not None:
712-
generated_ids_after_ptq = run_nemotron_vl_preview(
713-
full_model,
714-
tokenizer,
715-
input_ids,
716-
args.pyt_ckpt_path,
717-
"after quantization",
718-
allow_fallback=False,
719-
)
720-
else:
721-
warnings.warn(
722-
"Llama4 Maverick generation after quantization has a bug. Skipping generation sample."
723-
)
724-
725-
def input_decode(input_ids):
726-
# BaseImageProcessor covers MllamaImageProcessor and Qwen3OmniImageProcessor
727-
if processor is not None and isinstance(processor, BaseImageProcessor):
728-
return processor.tokenizer.batch_decode(input_ids)
729-
elif processor is not None and isinstance(processor, WhisperProcessor):
730-
return first_text
731-
elif tokenizer is not None:
732-
return tokenizer.batch_decode(input_ids)
733-
else:
734-
raise ValueError("The processor or tokenizer must be set")
735-
736-
def output_decode(generated_ids, input_shape):
737-
if is_enc_dec(model_type):
738-
if processor is not None and isinstance(processor, WhisperProcessor):
739-
return processor.tokenizer.batch_decode(
740-
generated_ids, skip_special_tokens=True
741-
)[0]
742-
elif tokenizer is not None:
743-
return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
744-
elif processor is not None and isinstance(processor, MllamaImageProcessor):
745-
return processor.tokenizer.batch_decode(generated_ids[:, input_shape:])
746-
elif processor is not None and isinstance(processor, Qwen3OmniImageProcessor):
747-
return processor.tokenizer.batch_decode(
748-
generated_ids[:, input_shape:],
749-
skip_special_tokens=True,
750-
clean_up_tokenization_spaces=False,
751-
)
752-
elif tokenizer is not None:
753-
return tokenizer.batch_decode(generated_ids[:, input_shape:])
754-
else:
755-
raise ValueError("The processor or tokenizer must be set")
756-
757-
if generated_ids_after_ptq is not None:
758-
print("--------")
759-
if is_nemotron_vl_model:
760-
# For Nemotron VL models, generated_ids are text strings from model.chat()
761-
print("Nemotron VL model text-only generation results:")
762-
print(f"Text response before quantization: {generated_ids_before_ptq}")
763-
print("--------")
764-
print(f"Text response after quantization: {generated_ids_after_ptq}")
765-
print("--------")
766-
print("Note: Additional VL tests with images were run separately above")
767-
else:
768-
# For regular LLMs, generated_ids are token tensors that need decoding
769-
print(f"example test input: {input_decode(input_ids)}")
770-
print("--------")
771-
print(
772-
f"example outputs before ptq: {output_decode(generated_ids_before_ptq, input_ids.shape[1])}"
773-
)
774-
print("--------")
775609
else:
776610
warnings.warn("Skipping quantization: model is already quantized.")
777611

@@ -785,12 +619,6 @@ def export_quantized(
785619
default_padding_side,
786620
default_pad_token,
787621
):
788-
# Uncomment this to save the model as a .pt file
789-
# if model_type == "qwen3omni":
790-
# print("Export of Qwen3Omni model is not supported yet. Saving .pt file instead.")
791-
# os.makedirs(os.path.dirname(args.export_path), exist_ok=True)
792-
# mto.save(full_model, f"{args.export_path}/model.pt")
793-
794622
with torch.inference_mode():
795623
if model_type is None:
796624
print(f"Unknown model type {type(language_model).__name__}. Continue exporting...")

modelopt/torch/export/unified_export_hf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -631,7 +631,7 @@ def _process_quantized_modules(
631631
_export_quantized_weight(sub_module, dtype, weight_name)
632632

633633

634-
def _export_hf_checkpoint(
634+
def _export_transformers_checkpoint(
635635
model: nn.Module,
636636
dtype: torch.dtype | None = None,
637637
is_modelopt_qlora: bool = False,
@@ -1003,7 +1003,7 @@ def export_hf_checkpoint(
10031003
# Packed weights are only for TRT-LLM consumption
10041004
# Set this to true if you want to save the weights in the original precision
10051005
pack_weights = True
1006-
post_state_dict, hf_quant_config = _export_hf_checkpoint(
1006+
post_state_dict, hf_quant_config = _export_transformers_checkpoint(
10071007
model, dtype, pack_weights=pack_weights
10081008
)
10091009

modelopt/torch/utils/dataset_utils.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@
7474
},
7575
"cnn_dailymail": {
7676
"config": {"path": "abisee/cnn_dailymail", "name": "3.0.0", "split": ["train"]},
77-
"preprocess": lambda sample: "/no_think " + sample["article"],
77+
"preprocess": lambda sample: sample["article"],
7878
},
7979
"pile": {
8080
"config": {"path": "monology/pile-uncopyrighted", "name": "v1.0", "split": ["train"]},
@@ -365,9 +365,8 @@ def _get_free_gpu_mem():
365365
torch.cuda.empty_cache()
366366

367367
free_mem_before, max_allocated_before = _get_free_gpu_mem()
368-
is_enc_dec = model_type_is_enc_dec(model)
369-
requires_generate = _model_requires_generate(model)
370-
infer_method = model.generate if (is_enc_dec or requires_generate) else model.forward
368+
use_generate = _should_use_generate(model)
369+
infer_method = model.generate if use_generate else model.forward
371370

372371
if sample_input_single_batch is None:
373372
sample_input_single_batch = (
@@ -504,9 +503,7 @@ def _forward_loop(model: torch.nn.Module, dataloader: DataLoader) -> None:
504503
dataloader: DataLoader containing the batched input data
505504
"""
506505
with torch.no_grad():
507-
is_enc_dec = model_type_is_enc_dec(model)
508-
requires_generate = _model_requires_generate(model)
509-
use_generate = is_enc_dec or requires_generate
506+
use_generate = _should_use_generate(model)
510507
infer_method = model.generate if use_generate else model.forward
511508
max_working_batch_size = None # Initialize max working batch size as None
512509

@@ -593,13 +590,13 @@ def model_type_is_enc_dec(model):
593590
return any(model_name in model.__class__.__name__.lower() for model_name in enc_dec_model_list)
594591

595592

596-
def _model_requires_generate(model):
597-
"""Check if model requires generate() instead of forward() for calibration.
593+
def _should_use_generate(model):
594+
"""Check if model should use generate() instead of forward() for calibration.
598595
599-
Some conditional generation models (like Qwen3-Omni) don't have a standard
600-
forward(input_ids, ...) signature and need to use generate() for calibration.
596+
Returns True for:
597+
- Encoder-decoder models (t5, bart, whisper)
598+
- Conditional generation models that don't support standard forward() (qwen3omni)
601599
"""
602-
# Models that require generate() for calibration instead of forward()
603600
generate_model_list = ["qwen3omni"]
604601
model_name = model.__class__.__name__.lower()
605-
return any(name in model_name for name in generate_model_list)
602+
return model_type_is_enc_dec(model) or any(name in model_name for name in generate_model_list)

modelopt/torch/utils/image_processor.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,6 @@ class BaseImageProcessor:
2525
def __init__(self, tokenizer, device="cuda"):
2626
"""Constructor."""
2727
self.tokenizer = tokenizer
28-
# Handle invalid device values that can come from multi-GPU models with device_map="auto"
29-
if device is None or str(device) in ("auto", "meta", "cpu"):
30-
device = "cuda"
3128
self.device = device
3229

3330
def __call__(self, **kwargs):

0 commit comments

Comments (0)