Skip to content

Commit 4b44815

Browse files
committed
Integrate WaterSIC KV-cache in hf_ptq.py
Signed-off-by: Kai Xu <kaix@nvidia.com>
1 parent 1877ee7 commit 4b44815

9 files changed

Lines changed: 121 additions & 165 deletions

File tree

examples/llm_ptq/hf_ptq.py

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None:
127127
"nvfp4": "NVFP4_KV_CFG",
128128
"nvfp4_affine": "NVFP4_AFFINE_KV_CFG",
129129
"nvfp4_rotate": "NVFP4_KV_ROTATE_CFG",
130+
"watersic_kv": "WATERSIC_KV_CFG",
130131
}
131132

132133
# Formats that use use_constant_amax (no calibration needed).
@@ -384,7 +385,7 @@ def forward_step(model, batch):
384385
# We need to explicitly set up KV cache quantization after auto_quantize
385386
enable_quant_kv_cache = args.kv_cache_qformat != "none"
386387
print(f"{'Enable' if enable_quant_kv_cache else 'Disable'} KV cache quantization")
387-
if enable_quant_kv_cache:
388+
if enable_quant_kv_cache and args.kv_cache_qformat != "watersic_kv":
388389
kv_cache_quant_cfg = copy.deepcopy(
389390
getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"]
390391
)
@@ -403,6 +404,16 @@ def forward_step(model, batch):
403404
[{"quantizer_name": "*", "enable": False}, *kv_cache_quant_cfg],
404405
):
405406
mtq.calibrate(language_model, algorithm="max", forward_loop=calibrate_loop)
407+
408+
# WaterSIC KV-cache needs a separate quantization pass with its own algorithm
409+
if args.kv_cache_qformat == "watersic_kv":
410+
watersic_cfg = copy.deepcopy(getattr(mtq, KV_QUANT_CFG_CHOICES["watersic_kv"]))
411+
if args.watersic_target_rate is not None:
412+
watersic_cfg["algorithm"]["target_rate"] = args.watersic_target_rate
413+
if args.watersic_kl_aware:
414+
watersic_cfg["algorithm"]["kl_aware"] = True
415+
language_model = mtq.quantize(language_model, watersic_cfg, forward_loop=calibrate_loop)
416+
406417
return language_model
407418

408419

@@ -423,7 +434,7 @@ def load_model(args: argparse.Namespace):
423434
f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}"
424435
)
425436
quant_cfg = QUANT_CFG_CHOICES[args.qformat]
426-
if args.kv_cache_qformat != "none":
437+
if args.kv_cache_qformat not in {"none", "watersic_kv"}:
427438
quant_cfg = mtq.utils.update_quant_cfg_with_kv_cache_quant(
428439
quant_cfg,
429440
getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"],
@@ -652,6 +663,15 @@ def mono_quantize(
652663
else:
653664
language_model = mtq.quantize(language_model, quant_cfg, forward_loop=calibrate_loop)
654665

666+
# WaterSIC KV-cache needs a separate quantization pass with its own algorithm
667+
if args.kv_cache_qformat == "watersic_kv":
668+
watersic_cfg = copy.deepcopy(getattr(mtq, KV_QUANT_CFG_CHOICES["watersic_kv"]))
669+
if args.watersic_target_rate is not None:
670+
watersic_cfg["algorithm"]["target_rate"] = args.watersic_target_rate
671+
if args.watersic_kl_aware:
672+
watersic_cfg["algorithm"]["kl_aware"] = True
673+
language_model = mtq.quantize(language_model, watersic_cfg, forward_loop=calibrate_loop)
674+
655675
# For VL models, update full_model to use the quantized language model
656676
if is_nemotron_vl_model:
657677
language_model_lineage = get_language_model_from_vl(full_model)
@@ -1083,7 +1103,8 @@ def quantize_main(
10831103
print(f"{'Enable' if enable_quant_kv_cache else 'Disable'} KV cache quantization")
10841104

10851105
# Check if any bmm_quantizer is in the quant_cfg. If so, we need to enable the bmm_quantizer.
1086-
if enable_quant_kv_cache:
1106+
# WaterSIC KV-cache uses a separate quantization pass, so skip merging here.
1107+
if enable_quant_kv_cache and args.kv_cache_qformat != "watersic_kv":
10871108
quant_cfg = mtq.update_quant_cfg_with_kv_cache_quant(
10881109
quant_cfg,
10891110
getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"],
@@ -1242,6 +1263,17 @@ def parse_args() -> argparse.Namespace:
12421263
"Other formats (fp8, nvfp4, etc.) use data-driven calibration."
12431264
),
12441265
)
1266+
parser.add_argument(
1267+
"--watersic_target_rate",
1268+
type=float,
1269+
default=None,
1270+
help="Target bits per element for WaterSIC KV-cache quantization (default: 2.0)",
1271+
)
1272+
parser.add_argument(
1273+
"--watersic_kl_aware",
1274+
action="store_true",
1275+
help="Enable KL-aware importance weighting for WaterSIC KV-cache quantization",
1276+
)
12451277
parser.add_argument(
12461278
"--export_fmt",
12471279
required=False,

modelopt/torch/quantization/algorithms/watersic_kv/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,6 @@
1818
from __future__ import annotations
1919

2020
from .config import WaterSICKVCalibConfig
21-
from .kv_quantizer import WaterSICKVHelper, WaterSICKVState
21+
from .helper import WaterSICKVHelper, WaterSICKVState
2222

2323
__all__ = ["WaterSICKVCalibConfig", "WaterSICKVHelper", "WaterSICKVState"]

modelopt/torch/quantization/algorithms/watersic_kv/config.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,11 +105,11 @@ class WaterSICKVCalibConfig(QuantizeAlgorithmConfig):
105105
)
106106

107107
use_sequential: bool = ModeloptField(
108-
default=True,
108+
default=False,
109109
title="Enable sequential layer-by-layer calibration.",
110110
description=(
111-
"When True, the WaterSIC calibration is applied layer-by-layer in "
112-
"decoder-block order so that each layer's quantized KV representation "
113-
"is propagated to subsequent layers before they are calibrated."
111+
"Must be False for WaterSIC. Unlike weight quantization, KV-cache "
112+
"quantization does not have progressive error accumulation between "
113+
"layers, so sequential calibration is not needed."
114114
),
115115
)

modelopt/torch/quantization/algorithms/watersic_kv/kv_quantizer.py renamed to modelopt/torch/quantization/algorithms/watersic_kv/helper.py

Lines changed: 27 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,6 @@
3232

3333
from .zsic import _compute_hessian_cholesky, binary_search_c, damp_for_rate, watersic_quantize
3434

35-
# ---------------------------------------------------------------------------
36-
# Data structures
37-
# ---------------------------------------------------------------------------
38-
3935

4036
@dataclass
4137
class WaterSICKVState:
@@ -53,11 +49,6 @@ class WaterSICKVState:
5349
"""Achieved coding rate (bits per element)."""
5450

5551

56-
# ---------------------------------------------------------------------------
57-
# Importance weighting
58-
# ---------------------------------------------------------------------------
59-
60-
6152
def _compute_importance_weights(P: Tensor, importance_clip: float = 50.0) -> Tensor:
6253
"""Derive per-token importance weights from an attention probability matrix.
6354
@@ -90,63 +81,6 @@ def _compute_importance_weights(P: Tensor, importance_clip: float = 50.0) -> Ten
9081
return w.sqrt().unsqueeze(1) # (N, 1)
9182

9283

93-
# ---------------------------------------------------------------------------
94-
# KL divergence in logit space
95-
# ---------------------------------------------------------------------------
96-
97-
98-
def kl_divergence_logits(
99-
Q: Tensor,
100-
K: Tensor,
101-
K_q: Tensor,
102-
temperature: float = 1.0,
103-
) -> float:
104-
"""Compute the KL divergence between attention distributions induced by *K* and *K_q*.
105-
106-
Uses the logit identity to avoid materialising the full attention matrix:
107-
108-
KL(P || P_q) = E_x[ P^T (s - s_q) + logsumexp(s_q) - logsumexp(s) ]
109-
110-
where ``s = Q K^T / temperature`` and ``s_q = Q K_q^T / temperature``.
111-
112-
Parameters
113-
----------
114-
Q : Tensor (..., S, D)
115-
K : Tensor (..., N, D)
116-
K_q : Tensor (..., N, D)
117-
temperature : float
118-
119-
Returns:
120-
-------
121-
kl : float
122-
Mean KL divergence in **bits** (i.e. divided by ln 2).
123-
"""
124-
Q64 = Q.double()
125-
K64 = K.double()
126-
Kq64 = K_q.double()
127-
128-
s = Q64 @ K64.transpose(-2, -1) / temperature # (..., S, N)
129-
s_q = Q64 @ Kq64.transpose(-2, -1) / temperature # (..., S, N)
130-
131-
log_Z = torch.logsumexp(s, dim=-1) # (..., S)
132-
log_Z_q = torch.logsumexp(s_q, dim=-1) # (..., S)
133-
134-
P = torch.softmax(s, dim=-1) # (..., S, N)
135-
136-
# KL per query position: sum_n P_n (s_n - s_q_n) + log_Z_q - log_Z
137-
kl_per_query = (P * (s - s_q)).sum(dim=-1) + log_Z_q - log_Z # (..., S)
138-
139-
# Convert nats to bits and return mean.
140-
import math
141-
142-
return (kl_per_query.mean() / math.log(2)).item()
143-
144-
145-
# ---------------------------------------------------------------------------
146-
# WaterSICKVHelper
147-
# ---------------------------------------------------------------------------
148-
149-
15084
class WaterSICKVHelper:
15185
"""Hook-based helper that captures Q/K activations and runs WaterSIC quantisation.
15286
@@ -178,8 +112,6 @@ def __init__(
178112

179113
self._original_fn = None
180114

181-
# ----- patching --------------------------------------------------
182-
183115
def setup(self):
184116
"""Patch ``_quantized_attention`` on the module instance to capture Q/K."""
185117
# The original is a @staticmethod on the class - grab the underlying function.
@@ -220,8 +152,6 @@ def cleanup(self):
220152
if "_quantized_attention" in vars(self.module):
221153
delattr(self.module, "_quantized_attention")
222154

223-
# ----- quantisation -----------------------------------------------
224-
225155
def quantize(
226156
self,
227157
target_rate: float = 4.0,
@@ -246,6 +176,13 @@ def quantize(
246176
-------
247177
WaterSICKVState
248178
"""
179+
if not self.collected_Q or not self.collected_K:
180+
raise RuntimeError(
181+
f"[{self.name}] No Q/K activations were collected during the calibration "
182+
f"forward pass. Ensure setup() was called before the forward loop and that "
183+
f"the forward loop passes data through this attention layer."
184+
)
185+
249186
# Concatenate collected activations across calibration batches.
250187
# Each tensor is (batch, n_heads, seq, d_head).
251188
Q_all = torch.cat(self.collected_Q, dim=0) # (B_total, H, S_q, D)
@@ -262,14 +199,17 @@ def quantize(
262199

263200
damp_pct = damp_for_rate(target_rate)
264201

202+
# Run quantization on GPU if available (much faster for real models).
203+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
204+
265205
for h in range(H):
266206
# K_h shape: (B, S_k, D) → treat as weight matrix (a, n) where
267207
# a = B * S_k (token-batch dimension) and n = D (head dimension).
268-
K_h = K_all[:, h, :, :].reshape(-1, D).double() # (B*S_k, D)
208+
K_h = K_all[:, h, :, :].reshape(-1, D).to(device=device, dtype=torch.float64)
269209

270210
# Activation matrix: use Q_h^T so the Hessian reflects query-key
271211
# interaction. A shape: (D, B*S_q).
272-
Q_h = Q_all[:, h, :, :].reshape(-1, D).double() # (B*S_q, D)
212+
Q_h = Q_all[:, h, :, :].reshape(-1, D).to(device=device, dtype=torch.float64)
273213
A = Q_h.T # (D, B*S_q)
274214

275215
# Optional importance weighting — scale K rows (not A) so that
@@ -320,19 +260,26 @@ def quantize(
320260
# Recover per-head state.
321261
# alpha = c / L.diag() (same as inside watersic_quantize).
322262
alpha_h = (c / L.diag()).float()
323-
324-
Z_heads.append(Z_h)
325-
alpha_heads.append(alpha_h)
326-
gamma_heads.append(gamma_h.float())
327-
perm_heads.append(perm)
263+
if perm is not None:
264+
inv_perm = torch.argsort(perm)
265+
alpha_h = alpha_h[inv_perm]
266+
267+
# Move results to CPU to free GPU memory for next head.
268+
Z_heads.append(Z_h.cpu())
269+
alpha_heads.append(alpha_h.cpu())
270+
gamma_heads.append(gamma_h.float().cpu())
271+
perm_heads.append(perm.cpu() if perm is not None else None)
328272
rates.append(rate)
329273

274+
if torch.cuda.is_available():
275+
torch.cuda.empty_cache()
276+
330277
mean_rate = sum(rates) / len(rates) if rates else 0.0
331278

332279
state = WaterSICKVState(
333-
Z=torch.stack(Z_heads), # (H, B*S_k, D)
334-
alpha=torch.stack(alpha_heads), # (H, D)
335-
gamma=torch.stack(gamma_heads), # (H, D)
280+
Z=torch.stack(Z_heads),
281+
alpha=torch.stack(alpha_heads),
282+
gamma=torch.stack(gamma_heads),
336283
perm=torch.stack(perm_heads) if perm_heads and perm_heads[0] is not None else None,
337284
rate=mean_rate,
338285
)
@@ -342,8 +289,6 @@ def quantize(
342289

343290
return state
344291

345-
# ----- cleanup -----------------------------------------------------
346-
347292
def free(self):
348293
"""Release collected calibration data."""
349294
self.collected_Q.clear()

0 commit comments

Comments (0)