Merge branch 'main' into jingyux/diffusion-skip-softmax

jingyu-ml · web-flow · commit 3d5646d5c332 · 2026-04-17T17:12:02.000-07:00
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
@@ -283,6 +283,7 @@ def make_calib_dataloader(
         include_labels = (
             args.auto_quantize_bits is not None and args.auto_quantize_method == "gradient"
         )
+
         calib_dataloader = get_dataset_dataloader(
             dataset_name=args.dataset,
             tokenizer=tokenizer,
diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py
@@ -49,7 +49,7 @@
     reduce_amax,
     weight_attr_names,
 )
-from .utils.calib_utils import GPTQHelper
+from .utils.calib_utils import _GPTQ_HELPER_REGISTRY, GPTQHelper
 
 __all__ = [
     "awq",
@@ -1589,6 +1589,21 @@ def sequential_calibrate(
 
             def _layer_forward_loop(m, _inputs=layer_inputs):
                 for args, kwargs_input in _inputs:
+                    # Reset past_key_values to prevent the KV cache from
+                    # accumulating across multiple forward replays (e.g.
+                    # max_calibrate then Hessian collection in GPTQ).
+                    # The layer doesn't need stale KV data — each replay
+                    # should start with a fresh cache.
+                    if (
+                        "past_key_values" in kwargs_input
+                        and kwargs_input["past_key_values"] is not None
+                    ):
+                        kwargs_input = dict(kwargs_input)
+                        cache = kwargs_input["past_key_values"]
+                        if hasattr(cache, "reset"):
+                            cache.reset()
+                        else:
+                            kwargs_input["past_key_values"] = None
                     m(*args, **kwargs_input)
 
             calib_func(layer, _layer_forward_loop, **calib_kwargs)
@@ -1648,7 +1663,15 @@ def gptq(
         print_rank_0("No quantized linear layers found, skipping GPTQ")
         return
 
-    gptq_handles = {name: GPTQHelper(m, name, offload_to_cpu=True) for name, m in quantized_layers}
+    def _make_gptq_handle(name, m):
+        backend = getattr(m.weight_quantizer, "backend", None)
+        if backend is None:
+            cls = GPTQHelper
+        else:
+            cls = _GPTQ_HELPER_REGISTRY.get(backend, GPTQHelper)
+        return cls(m, name, offload_to_cpu=True)
+
+    gptq_handles = {name: _make_gptq_handle(name, m) for name, m in quantized_layers}
     for handle in gptq_handles.values():
         handle.setup()
 
diff --git a/modelopt/torch/quantization/utils/calib_utils.py b/modelopt/torch/quantization/utils/calib_utils.py
@@ -143,9 +143,7 @@ def update_weights(self, block_size, perc_damp):
         hessian = self.hessian.to(self.module.weight.device)
         self.weight = self.module.weight.data.float().clone()
         self._prepare_hessian_inverse(hessian, perc_damp)
-
         self._blockwise_update(block_size)
-
         self._print_mse_error(hessian)
         self.module.weight.data = self.weight.reshape(self.module.weight.shape).to(
             self.module.weight.data.dtype
@@ -231,3 +229,16 @@ def _print_mse_error(self, hessian):
         mse = (delta).mm(hessian).mul(delta).mean() / (w_orig.mm(hessian).mul(w_orig).mean() + 1e-6)
         suffix = f", n_hessian_samples: {self.n_samples}" if self.n_samples else ""
         print_rank_0(f"[{self.name}] Relative MSE error: {mse.item():.2e}{suffix}")
+
+
+_GPTQ_HELPER_REGISTRY: dict[str, type[GPTQHelper]] = {}
+
+
+def register_gptq_helper(backend: str, factory: type[GPTQHelper]) -> None:
+    """Register a :class:`GPTQHelper` subclass for a quantizer backend.
+
+    When :func:`modelopt.torch.quantization.model_calib.gptq` encounters a
+    module whose ``weight_quantizer.backend`` matches ``backend``, it will
+    construct ``factory`` instead of the default ``GPTQHelper``.
+    """
+    _GPTQ_HELPER_REGISTRY[backend] = factory

Original file line number	Diff line number	Diff line change
`@@ -283,6 +283,7 @@ def make_calib_dataloader(`
`283`	`283`	`include_labels = (`
`284`	`284`	`args.auto_quantize_bits is not None and args.auto_quantize_method == "gradient"`
`285`	`285`	`)`
	`286`	`+`
`286`	`287`	`calib_dataloader = get_dataset_dataloader(`
`287`	`288`	`dataset_name=args.dataset,`
`288`	`289`	`tokenizer=tokenizer,`