Skip to content

Commit 9b41b8e

Browse files
committed
Integrate WaterSIC KV-cache in hf_ptq.py
Signed-off-by: Kai Xu <kaix@nvidia.com>
1 parent 9015ac7 commit 9b41b8e

File tree

9 files changed

+101
-162
lines changed

9 files changed

+101
-162
lines changed

examples/llm_ptq/hf_ptq.py

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None:
127127
"nvfp4": "NVFP4_KV_CFG",
128128
"nvfp4_affine": "NVFP4_AFFINE_KV_CFG",
129129
"nvfp4_rotate": "NVFP4_KV_ROTATE_CFG",
130+
"watersic_kv": "WATERSIC_KV_CFG",
130131
}
131132

132133
# Formats that use use_constant_amax (no calibration needed).
@@ -384,7 +385,7 @@ def forward_step(model, batch):
384385
# We need to explicitly set up KV cache quantization after auto_quantize
385386
enable_quant_kv_cache = args.kv_cache_qformat != "none"
386387
print(f"{'Enable' if enable_quant_kv_cache else 'Disable'} KV cache quantization")
387-
if enable_quant_kv_cache:
388+
if enable_quant_kv_cache and args.kv_cache_qformat != "watersic_kv":
388389
kv_cache_quant_cfg = copy.deepcopy(
389390
getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"]
390391
)
@@ -403,6 +404,16 @@ def forward_step(model, batch):
403404
[{"quantizer_name": "*", "enable": False}, *kv_cache_quant_cfg],
404405
):
405406
mtq.calibrate(language_model, algorithm="max", forward_loop=calibrate_loop)
407+
408+
# WaterSIC KV-cache needs a separate quantization pass with its own algorithm
409+
if args.kv_cache_qformat == "watersic_kv":
410+
watersic_cfg = copy.deepcopy(getattr(mtq, KV_QUANT_CFG_CHOICES["watersic_kv"]))
411+
if args.watersic_target_rate is not None:
412+
watersic_cfg["algorithm"]["target_rate"] = args.watersic_target_rate
413+
if args.watersic_kl_aware:
414+
watersic_cfg["algorithm"]["kl_aware"] = True
415+
language_model = mtq.quantize(language_model, watersic_cfg, forward_loop=calibrate_loop)
416+
406417
return language_model
407418

408419

@@ -423,7 +434,7 @@ def load_model(args: argparse.Namespace):
423434
f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}"
424435
)
425436
quant_cfg = QUANT_CFG_CHOICES[args.qformat]
426-
if args.kv_cache_qformat != "none":
437+
if args.kv_cache_qformat not in {"none", "watersic_kv"}:
427438
quant_cfg = mtq.utils.update_quant_cfg_with_kv_cache_quant(
428439
quant_cfg,
429440
getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"],
@@ -652,6 +663,15 @@ def mono_quantize(
652663
else:
653664
language_model = mtq.quantize(language_model, quant_cfg, forward_loop=calibrate_loop)
654665

666+
# WaterSIC KV-cache needs a separate quantization pass with its own algorithm
667+
if args.kv_cache_qformat == "watersic_kv":
668+
watersic_cfg = copy.deepcopy(getattr(mtq, KV_QUANT_CFG_CHOICES["watersic_kv"]))
669+
if args.watersic_target_rate is not None:
670+
watersic_cfg["algorithm"]["target_rate"] = args.watersic_target_rate
671+
if args.watersic_kl_aware:
672+
watersic_cfg["algorithm"]["kl_aware"] = True
673+
language_model = mtq.quantize(language_model, watersic_cfg, forward_loop=calibrate_loop)
674+
655675
# For VL models, update full_model to use the quantized language model
656676
if is_nemotron_vl_model:
657677
language_model_lineage = get_language_model_from_vl(full_model)
@@ -1083,7 +1103,8 @@ def quantize_main(
10831103
print(f"{'Enable' if enable_quant_kv_cache else 'Disable'} KV cache quantization")
10841104

10851105
# Check if any bmm_quantizer is in the quant_cfg. If so, we need to enable the bmm_quantizer.
1086-
if enable_quant_kv_cache:
1106+
# WaterSIC KV-cache uses a separate quantization pass, so skip merging here.
1107+
if enable_quant_kv_cache and args.kv_cache_qformat != "watersic_kv":
10871108
quant_cfg = mtq.update_quant_cfg_with_kv_cache_quant(
10881109
quant_cfg,
10891110
getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"],
@@ -1242,6 +1263,17 @@ def parse_args() -> argparse.Namespace:
12421263
"Other formats (fp8, nvfp4, etc.) use data-driven calibration."
12431264
),
12441265
)
1266+
parser.add_argument(
1267+
"--watersic_target_rate",
1268+
type=float,
1269+
default=None,
1270+
help="Target bits per element for WaterSIC KV-cache quantization (default: 2.0)",
1271+
)
1272+
parser.add_argument(
1273+
"--watersic_kl_aware",
1274+
action="store_true",
1275+
help="Enable KL-aware importance weighting for WaterSIC KV-cache quantization",
1276+
)
12451277
parser.add_argument(
12461278
"--export_fmt",
12471279
required=False,

modelopt/torch/quantization/algorithms/watersic_kv/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,6 @@
1818
from __future__ import annotations
1919

2020
from .config import WaterSICKVCalibConfig
21-
from .kv_quantizer import WaterSICKVHelper, WaterSICKVState
21+
from .helper import WaterSICKVHelper, WaterSICKVState
2222

2323
__all__ = ["WaterSICKVCalibConfig", "WaterSICKVHelper", "WaterSICKVState"]

modelopt/torch/quantization/algorithms/watersic_kv/config.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,11 +105,11 @@ class WaterSICKVCalibConfig(QuantizeAlgorithmConfig):
105105
)
106106

107107
use_sequential: bool = ModeloptField(
108-
default=True,
108+
default=False,
109109
title="Enable sequential layer-by-layer calibration.",
110110
description=(
111-
"When True, the WaterSIC calibration is applied layer-by-layer in "
112-
"decoder-block order so that each layer's quantized KV representation "
113-
"is propagated to subsequent layers before they are calibrated."
111+
"Must be False for WaterSIC. Unlike weight quantization, KV-cache "
112+
"quantization does not have progressive error accumulation between "
113+
"layers, so sequential calibration is not needed."
114114
),
115115
)

modelopt/torch/quantization/algorithms/watersic_kv/kv_quantizer.py renamed to modelopt/torch/quantization/algorithms/watersic_kv/helper.py

Lines changed: 20 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,6 @@
3232

3333
from .zsic import _compute_hessian_cholesky, binary_search_c, damp_for_rate, watersic_quantize
3434

35-
# ---------------------------------------------------------------------------
36-
# Data structures
37-
# ---------------------------------------------------------------------------
38-
3935

4036
@dataclass
4137
class WaterSICKVState:
@@ -53,11 +49,6 @@ class WaterSICKVState:
5349
"""Achieved coding rate (bits per element)."""
5450

5551

56-
# ---------------------------------------------------------------------------
57-
# Importance weighting
58-
# ---------------------------------------------------------------------------
59-
60-
6152
def _compute_importance_weights(P: Tensor, importance_clip: float = 50.0) -> Tensor:
6253
"""Derive per-token importance weights from an attention probability matrix.
6354
@@ -90,63 +81,6 @@ def _compute_importance_weights(P: Tensor, importance_clip: float = 50.0) -> Ten
9081
return w.sqrt().unsqueeze(1) # (N, 1)
9182

9283

93-
# ---------------------------------------------------------------------------
94-
# KL divergence in logit space
95-
# ---------------------------------------------------------------------------
96-
97-
98-
def kl_divergence_logits(
99-
Q: Tensor,
100-
K: Tensor,
101-
K_q: Tensor,
102-
temperature: float = 1.0,
103-
) -> float:
104-
"""Compute the KL divergence between attention distributions induced by *K* and *K_q*.
105-
106-
Uses the logit identity to avoid materialising the full attention matrix:
107-
108-
KL(P || P_q) = E_x[ P^T (s - s_q) + logsumexp(s_q) - logsumexp(s) ]
109-
110-
where ``s = Q K^T / temperature`` and ``s_q = Q K_q^T / temperature``.
111-
112-
Parameters
113-
----------
114-
Q : Tensor (..., S, D)
115-
K : Tensor (..., N, D)
116-
K_q : Tensor (..., N, D)
117-
temperature : float
118-
119-
Returns:
120-
-------
121-
kl : float
122-
Mean KL divergence in **bits** (i.e. divided by ln 2).
123-
"""
124-
Q64 = Q.double()
125-
K64 = K.double()
126-
Kq64 = K_q.double()
127-
128-
s = Q64 @ K64.transpose(-2, -1) / temperature # (..., S, N)
129-
s_q = Q64 @ Kq64.transpose(-2, -1) / temperature # (..., S, N)
130-
131-
log_Z = torch.logsumexp(s, dim=-1) # (..., S)
132-
log_Z_q = torch.logsumexp(s_q, dim=-1) # (..., S)
133-
134-
P = torch.softmax(s, dim=-1) # (..., S, N)
135-
136-
# KL per query position: sum_n P_n (s_n - s_q_n) + log_Z_q - log_Z
137-
kl_per_query = (P * (s - s_q)).sum(dim=-1) + log_Z_q - log_Z # (..., S)
138-
139-
# Convert nats to bits and return mean.
140-
import math
141-
142-
return (kl_per_query.mean() / math.log(2)).item()
143-
144-
145-
# ---------------------------------------------------------------------------
146-
# WaterSICKVHelper
147-
# ---------------------------------------------------------------------------
148-
149-
15084
class WaterSICKVHelper:
15185
"""Hook-based helper that captures Q/K activations and runs WaterSIC quantisation.
15286
@@ -178,8 +112,6 @@ def __init__(
178112

179113
self._original_fn = None
180114

181-
# ----- patching --------------------------------------------------
182-
183115
def setup(self):
184116
"""Patch ``_quantized_attention`` on the module instance to capture Q/K."""
185117
# The original is a @staticmethod on the class - grab the underlying function.
@@ -220,8 +152,6 @@ def cleanup(self):
220152
if "_quantized_attention" in vars(self.module):
221153
delattr(self.module, "_quantized_attention")
222154

223-
# ----- quantisation -----------------------------------------------
224-
225155
def quantize(
226156
self,
227157
target_rate: float = 4.0,
@@ -262,14 +192,17 @@ def quantize(
262192

263193
damp_pct = damp_for_rate(target_rate)
264194

195+
# Run quantization on GPU if available (much faster for real models).
196+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
197+
265198
for h in range(H):
266199
# K_h shape: (B, S_k, D) → treat as weight matrix (a, n) where
267200
# a = B * S_k (token-batch dimension) and n = D (head dimension).
268-
K_h = K_all[:, h, :, :].reshape(-1, D).double() # (B*S_k, D)
201+
K_h = K_all[:, h, :, :].reshape(-1, D).to(device=device, dtype=torch.float64)
269202

270203
# Activation matrix: use Q_h^T so the Hessian reflects query-key
271204
# interaction. A shape: (D, B*S_q).
272-
Q_h = Q_all[:, h, :, :].reshape(-1, D).double() # (B*S_q, D)
205+
Q_h = Q_all[:, h, :, :].reshape(-1, D).to(device=device, dtype=torch.float64)
273206
A = Q_h.T # (D, B*S_q)
274207

275208
# Optional importance weighting — scale K rows (not A) so that
@@ -320,19 +253,26 @@ def quantize(
320253
# Recover per-head state.
321254
# alpha = c / L.diag() (same as inside watersic_quantize).
322255
alpha_h = (c / L.diag()).float()
323-
324-
Z_heads.append(Z_h)
325-
alpha_heads.append(alpha_h)
326-
gamma_heads.append(gamma_h.float())
327-
perm_heads.append(perm)
256+
if perm is not None:
257+
inv_perm = torch.argsort(perm)
258+
alpha_h = alpha_h[inv_perm]
259+
260+
# Move results to CPU to free GPU memory for next head.
261+
Z_heads.append(Z_h.cpu())
262+
alpha_heads.append(alpha_h.cpu())
263+
gamma_heads.append(gamma_h.float().cpu())
264+
perm_heads.append(perm.cpu() if perm is not None else None)
328265
rates.append(rate)
329266

267+
if torch.cuda.is_available():
268+
torch.cuda.empty_cache()
269+
330270
mean_rate = sum(rates) / len(rates) if rates else 0.0
331271

332272
state = WaterSICKVState(
333-
Z=torch.stack(Z_heads), # (H, B*S_k, D)
334-
alpha=torch.stack(alpha_heads), # (H, D)
335-
gamma=torch.stack(gamma_heads), # (H, D)
273+
Z=torch.stack(Z_heads),
274+
alpha=torch.stack(alpha_heads),
275+
gamma=torch.stack(gamma_heads),
336276
perm=torch.stack(perm_heads) if perm_heads and perm_heads[0] is not None else None,
337277
rate=mean_rate,
338278
)
@@ -342,8 +282,6 @@ def quantize(
342282

343283
return state
344284

345-
# ----- cleanup -----------------------------------------------------
346-
347285
def free(self):
348286
"""Release collected calibration data."""
349287
self.collected_Q.clear()

modelopt/torch/quantization/algorithms/watersic_kv/zsic.py

Lines changed: 14 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@
1616
"""Core ZSIC (Zero-Shot Integer Compression) algorithm for WaterSIC KV-cache quantization.
1717
1818
This is a pure math module with no Model-Optimizer dependencies. It implements
19-
the sequential integer coding algorithm described in the WaterSIC paper, ported
20-
from psx-vfp commit 39073b1.
19+
the sequential integer coding algorithm described in the WaterSIC paper.
2120
"""
2221

2322
from __future__ import annotations
@@ -27,10 +26,6 @@
2726
import torch
2827
from torch import Tensor
2928

30-
# ---------------------------------------------------------------------------
31-
# Helpers
32-
# ---------------------------------------------------------------------------
33-
3429

3530
def damp_for_rate(target_rate: float, base: float = 1e-4, knee: float = 5.0) -> float:
3631
"""Return a damping coefficient that decays for rates above *knee*.
@@ -51,15 +46,20 @@ def compute_entropy(Z: Tensor) -> float:
5146

5247

5348
def compute_output_nmse(W: Tensor, W_q: Tensor, A: Tensor) -> float:
54-
"""Normalised MSE measured in the output space: ``||err @ A||^2 / ||W @ A||^2``."""
55-
err = (W - W_q) @ A
56-
ref = W @ A
57-
return (err.norm() ** 2 / ref.norm() ** 2).item()
58-
49+
"""Normalised MSE measured in the output space: ``||err @ A||^2 / ||W @ A||^2``.
5950
60-
# ---------------------------------------------------------------------------
61-
# Hessian / Cholesky
62-
# ---------------------------------------------------------------------------
51+
Uses the trace identity ``||M @ N||_F^2 = tr(M^T M N N^T)`` to avoid
52+
materialising the ``(a, a)`` output matrix, which can be prohibitively large
53+
when the number of tokens *a* is high (e.g. real-model calibration).
54+
Only ``(n, n)`` intermediates are needed, where *n* = ``A.shape[0]``.
55+
"""
56+
Sigma_X = A @ A.T # (n, n)
57+
delta = W - W_q # (a, n)
58+
err_gram = delta.T @ delta # (n, n)
59+
ref_gram = W.T @ W # (n, n)
60+
err_sq = (err_gram * Sigma_X).sum()
61+
ref_sq = (ref_gram * Sigma_X).sum()
62+
return (err_sq / ref_sq).item()
6363

6464

6565
def _compute_hessian_cholesky(
@@ -111,11 +111,6 @@ def _compute_hessian_cholesky(
111111
return Sigma_X, L, perm
112112

113113

114-
# ---------------------------------------------------------------------------
115-
# Rescaler optimisation
116-
# ---------------------------------------------------------------------------
117-
118-
119114
def _optimize_rescalers(
120115
W_hat_0: Tensor,
121116
W: Tensor,
@@ -157,11 +152,6 @@ def _optimize_rescalers(
157152
return t.unsqueeze(1) * W_hat_0 * gamma.unsqueeze(0)
158153

159154

160-
# ---------------------------------------------------------------------------
161-
# Core sequential coding
162-
# ---------------------------------------------------------------------------
163-
164-
165155
def zsic_quantize(
166156
W: Tensor,
167157
A: Tensor,
@@ -234,11 +224,6 @@ def zsic_quantize(
234224
return W_hat, rate, nmse, Z, gamma
235225

236226

237-
# ---------------------------------------------------------------------------
238-
# WaterSIC interface
239-
# ---------------------------------------------------------------------------
240-
241-
242227
def watersic_quantize(
243228
W: Tensor,
244229
A: Tensor,
@@ -302,11 +287,6 @@ def watersic_quantize(
302287
return W_hat, rate, nmse, Z, gamma
303288

304289

305-
# ---------------------------------------------------------------------------
306-
# Binary search for c
307-
# ---------------------------------------------------------------------------
308-
309-
310290
def binary_search_c(
311291
W: Tensor,
312292
A: Tensor,

0 commit comments

Comments (0)