NVIDIA
diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
Lines changed: 2 additions & 2 deletions
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
Lines changed: 65 additions & 20 deletions
diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py
Lines changed: 19 additions & 0 deletions
@@ -583,7 +583,7 @@ def get_model(
     model_kwargs = config_kwargs.copy()
     # Don't set torch_dtype for VILA models as they handle it explicitly in their builder
     if "vila" not in ckpt_path.lower():
-        model_kwargs.setdefault("dtype", "auto")
+        model_kwargs.setdefault("torch_dtype", "auto")
 
     if "vila" in ckpt_path.lower():
         hf_vila = AutoModel.from_pretrained(
@@ -666,7 +666,7 @@ def has_pack_quantized_config(config):
                 model_kwargs2 = model_kwargs.copy()
                 if auto_model_module not in [AutoModelForCausalLM, AutoModel]:
                     model_kwargs2.pop("trust_remote_code", None)
-                model_kwargs2["dtype"] = torch_dtype
+                model_kwargs2["torch_dtype"] = torch_dtype
                 model_kwargs2.pop("max_memory", None)
                 model = from_config(hf_config, **model_kwargs2)
 
 
@@ -71,6 +71,7 @@
 )
 from modelopt.torch.utils.dataset_utils import (
     create_forward_loop,
+    get_calib_and_holdout_dataloaders,
     get_dataset_dataloader,
     get_max_batch_size,
     get_supported_datasets,
@@ -203,9 +204,10 @@ def make_calib_dataloader(
     tokenizer: PreTrainedTokenizerBase | None,
     device: torch.device,
     model_type: str | None,
-) -> tuple[DataLoader | _DeviceDataLoader, str | None]:
+) -> tuple[DataLoader | _DeviceDataLoader, str | None, Path | None]:
     calib_dataloader = None
     first_text_speech_dataset = None
+    holdout_path = None
     if args.specdec_offline_dataset is not None:
         offline_data_path = Path(args.specdec_offline_dataset)
         dumped_files = sorted(str(p) for p in offline_data_path.glob("*.pt"))
@@ -283,15 +285,29 @@ def make_calib_dataloader(
         include_labels = (
             args.auto_quantize_bits is not None and args.auto_quantize_method == "gradient"
         )
-        calib_dataloader = get_dataset_dataloader(
-            dataset_name=args.dataset,
-            tokenizer=tokenizer,
-            batch_size=args.batch_size,
-            num_samples=args.calib_size,
-            device=device,
-            include_labels=include_labels,
-        )
-    return calib_dataloader, first_text_speech_dataset
+
+        if args.holdout_size > 0:
+            calib_dataloader, holdout_path = get_calib_and_holdout_dataloaders(
+                dataset_name=args.dataset,
+                tokenizer=tokenizer,
+                batch_size=args.batch_size,
+                calib_size=args.calib_size,
+                holdout_size=args.holdout_size,
+                max_sample_length=args.calib_seq,
+                device=device,
+                include_labels=include_labels,
+                save_dir=args.calib_data_dir,
+            )
+        else:
+            calib_dataloader = get_dataset_dataloader(
+                dataset_name=args.dataset,
+                tokenizer=tokenizer,
+                batch_size=args.batch_size,
+                num_samples=args.calib_size,
+                device=device,
+                include_labels=include_labels,
+            )
+    return calib_dataloader, first_text_speech_dataset, holdout_path
 
 
 def auto_quantize(
@@ -419,10 +435,15 @@ def load_model(args: argparse.Namespace):
             attn_implementation=args.attn_implementation,
         )
     else:
-        assert args.qformat in QUANT_CFG_CHOICES, (
-            f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}"
-        )
-        quant_cfg = QUANT_CFG_CHOICES[args.qformat]
+        if args.qformat in QUANT_CFG_CHOICES:
+            quant_cfg = QUANT_CFG_CHOICES[args.qformat]
+        elif hasattr(mtq, args.qformat):
+            quant_cfg = getattr(mtq, args.qformat)
+        else:
+            raise AssertionError(
+                f"Quantization format is not supported for low memory mode. "
+                f"Supported formats: {QUANT_CFG_CHOICES.keys()}"
+            )
         if args.kv_cache_qformat != "none":
             quant_cfg = mtq.utils.update_quant_cfg_with_kv_cache_quant(
                 quant_cfg,
@@ -1028,7 +1049,7 @@ def quantize_main(
 
     print(f"Use calib batch_size {args.batch_size}")
 
-    calib_dataloader, first_text_speech_dataset = make_calib_dataloader(
+    calib_dataloader, first_text_speech_dataset, holdout_path = make_calib_dataloader(
         args, language_model, processor, tokenizer, device, model_type
     )
 
@@ -1066,10 +1087,14 @@ def quantize_main(
                 "Plain quantization supports only one quantization format."
             )
 
-            assert args.qformat in QUANT_CFG_CHOICES, (
-                f"Unsupported quantization format: {args.qformat}, choices are: {list(QUANT_CFG_CHOICES.keys())}"
-            )
-            quant_cfg = QUANT_CFG_CHOICES[args.qformat]
+            if args.qformat in QUANT_CFG_CHOICES:
+                quant_cfg = QUANT_CFG_CHOICES[args.qformat]
+            elif hasattr(mtq, args.qformat):
+                quant_cfg = getattr(mtq, args.qformat)
+            else:
+                raise AssertionError(
+                    f"Unsupported quantization format: {args.qformat}, choices are: {list(QUANT_CFG_CHOICES.keys())}"
+                )
 
             quant_cfg = build_quant_cfg(
                 args.qformat,
@@ -1104,7 +1129,7 @@ def quantize_main(
             quant_cfg = copy.deepcopy(quant_cfg)
             _set_kv_cache_constant_amax(quant_cfg["quant_cfg"])
 
-        if args.qformat in QUANT_CFG_CHOICES:
+        if args.qformat in QUANT_CFG_CHOICES or hasattr(mtq, args.qformat):
             mono_quantize(
                 args,
                 quant_cfg,
@@ -1180,6 +1205,26 @@ def parse_args() -> argparse.Namespace:
         type=str,
         default="512",
     )
+    parser.add_argument(
+        "--holdout_size",
+        help=(
+            "Number of holdout samples to save as a .pt file for evaluation. "
+            "Holdout samples are drawn from the same dataset immediately after "
+            "the calibration samples so there is no overlap. 0 disables holdout."
+        ),
+        type=int,
+        default=0,
+    )
+    parser.add_argument(
+        "--calib_data_dir",
+        help=(
+            "Directory to save/load calib.pt and holdout.pt. "
+            "If both files exist, data is reloaded from disk instead of re-downloading. "
+            "Defaults to --export_path if not specified."
+        ),
+        type=str,
+        default=None,
+    )
     parser.add_argument(
         "--calib_seq",
         help="Maximum sequence length for calibration.",
 
@@ -1589,6 +1589,21 @@ def sequential_calibrate(
 
             def _layer_forward_loop(m, _inputs=layer_inputs):
                 for args, kwargs_input in _inputs:
+                    # Reset past_key_values to prevent the KV cache from
+                    # accumulating across multiple forward replays (e.g.
+                    # max_calibrate then Hessian collection in GPTQ).
+                    # The layer doesn't need stale KV data — each replay
+                    # should start with a fresh cache.
+                    if (
+                        "past_key_values" in kwargs_input
+                        and kwargs_input["past_key_values"] is not None
+                    ):
+                        kwargs_input = dict(kwargs_input)
+                        cache = kwargs_input["past_key_values"]
+                        if hasattr(cache, "reset"):
+                            cache.reset()
+                        else:
+                            kwargs_input["past_key_values"] = None
                     m(*args, **kwargs_input)
 
             calib_func(layer, _layer_forward_loop, **calib_kwargs)
@@ -1665,6 +1680,10 @@ def gptq(
     print_rank_0("Updating weights using GPTQ algorithm...")
     for handle in gptq_handles.values():
         handle.update_weights(block_size, perc_damp)
+
+        # Disable weight quantizer after running GPTQ update since weights are already QDQ'ed
+        if hasattr(handle.module, "weight_quantizer"):
+            handle.module.weight_quantizer.disable()
         handle.free()
     del gptq_handles