Skip to content

Commit 7b0bb08

Browse files
committed
Integrate WaterSIC KV-cache in hf_ptq.py
Signed-off-by: Kai Xu <kaix@nvidia.com>
1 parent 9015ac7 commit 7b0bb08

File tree

4 files changed

+47
-54
lines changed

4 files changed

+47
-54
lines changed

examples/llm_ptq/hf_ptq.py

Lines changed: 35 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -127,6 +127,7 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None:
127127
"nvfp4": "NVFP4_KV_CFG",
128128
"nvfp4_affine": "NVFP4_AFFINE_KV_CFG",
129129
"nvfp4_rotate": "NVFP4_KV_ROTATE_CFG",
130+
"watersic_kv": "WATERSIC_KV_CFG",
130131
}
131132

132133
# Formats that use use_constant_amax (no calibration needed).
@@ -384,7 +385,7 @@ def forward_step(model, batch):
384385
# We need to explicitly set up KV cache quantization after auto_quantize
385386
enable_quant_kv_cache = args.kv_cache_qformat != "none"
386387
print(f"{'Enable' if enable_quant_kv_cache else 'Disable'} KV cache quantization")
387-
if enable_quant_kv_cache:
388+
if enable_quant_kv_cache and args.kv_cache_qformat != "watersic_kv":
388389
kv_cache_quant_cfg = copy.deepcopy(
389390
getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"]
390391
)
@@ -403,6 +404,16 @@ def forward_step(model, batch):
403404
[{"quantizer_name": "*", "enable": False}, *kv_cache_quant_cfg],
404405
):
405406
mtq.calibrate(language_model, algorithm="max", forward_loop=calibrate_loop)
407+
408+
# WaterSIC KV-cache needs a separate quantization pass with its own algorithm
409+
if args.kv_cache_qformat == "watersic_kv":
410+
watersic_cfg = copy.deepcopy(getattr(mtq, KV_QUANT_CFG_CHOICES["watersic_kv"]))
411+
if args.watersic_target_rate is not None:
412+
watersic_cfg["algorithm"]["target_rate"] = args.watersic_target_rate
413+
if args.watersic_kl_aware:
414+
watersic_cfg["algorithm"]["kl_aware"] = True
415+
language_model = mtq.quantize(language_model, watersic_cfg, forward_loop=calibrate_loop)
416+
406417
return language_model
407418

408419

@@ -423,7 +434,7 @@ def load_model(args: argparse.Namespace):
423434
f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}"
424435
)
425436
quant_cfg = QUANT_CFG_CHOICES[args.qformat]
426-
if args.kv_cache_qformat != "none":
437+
if args.kv_cache_qformat not in {"none", "watersic_kv"}:
427438
quant_cfg = mtq.utils.update_quant_cfg_with_kv_cache_quant(
428439
quant_cfg,
429440
getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"],
@@ -652,6 +663,15 @@ def mono_quantize(
652663
else:
653664
language_model = mtq.quantize(language_model, quant_cfg, forward_loop=calibrate_loop)
654665

666+
# WaterSIC KV-cache needs a separate quantization pass with its own algorithm
667+
if args.kv_cache_qformat == "watersic_kv":
668+
watersic_cfg = copy.deepcopy(getattr(mtq, KV_QUANT_CFG_CHOICES["watersic_kv"]))
669+
if args.watersic_target_rate is not None:
670+
watersic_cfg["algorithm"]["target_rate"] = args.watersic_target_rate
671+
if args.watersic_kl_aware:
672+
watersic_cfg["algorithm"]["kl_aware"] = True
673+
language_model = mtq.quantize(language_model, watersic_cfg, forward_loop=calibrate_loop)
674+
655675
# For VL models, update full_model to use the quantized language model
656676
if is_nemotron_vl_model:
657677
language_model_lineage = get_language_model_from_vl(full_model)
@@ -1083,7 +1103,8 @@ def quantize_main(
10831103
print(f"{'Enable' if enable_quant_kv_cache else 'Disable'} KV cache quantization")
10841104

10851105
# Check if any bmm_quantizer is in the quant_cfg. If so, we need to enable the bmm_quantizer.
1086-
if enable_quant_kv_cache:
1106+
# WaterSIC KV-cache uses a separate quantization pass, so skip merging here.
1107+
if enable_quant_kv_cache and args.kv_cache_qformat != "watersic_kv":
10871108
quant_cfg = mtq.update_quant_cfg_with_kv_cache_quant(
10881109
quant_cfg,
10891110
getattr(mtq, KV_QUANT_CFG_CHOICES[args.kv_cache_qformat])["quant_cfg"],
@@ -1242,6 +1263,17 @@ def parse_args() -> argparse.Namespace:
12421263
"Other formats (fp8, nvfp4, etc.) use data-driven calibration."
12431264
),
12441265
)
1266+
parser.add_argument(
1267+
"--watersic_target_rate",
1268+
type=float,
1269+
default=None,
1270+
help="Target bits per element for WaterSIC KV-cache quantization (default: 2.0)",
1271+
)
1272+
parser.add_argument(
1273+
"--watersic_kl_aware",
1274+
action="store_true",
1275+
help="Enable KL-aware importance weighting for WaterSIC KV-cache quantization",
1276+
)
12451277
parser.add_argument(
12461278
"--export_fmt",
12471279
required=False,

modelopt/torch/quantization/algorithms/watersic_kv/kv_quantizer.py

Lines changed: 0 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -32,10 +32,6 @@
3232

3333
from .zsic import _compute_hessian_cholesky, binary_search_c, damp_for_rate, watersic_quantize
3434

35-
# ---------------------------------------------------------------------------
36-
# Data structures
37-
# ---------------------------------------------------------------------------
38-
3935

4036
@dataclass
4137
class WaterSICKVState:
@@ -53,11 +49,6 @@ class WaterSICKVState:
5349
"""Achieved coding rate (bits per element)."""
5450

5551

56-
# ---------------------------------------------------------------------------
57-
# Importance weighting
58-
# ---------------------------------------------------------------------------
59-
60-
6152
def _compute_importance_weights(P: Tensor, importance_clip: float = 50.0) -> Tensor:
6253
"""Derive per-token importance weights from an attention probability matrix.
6354
@@ -90,11 +81,6 @@ def _compute_importance_weights(P: Tensor, importance_clip: float = 50.0) -> Ten
9081
return w.sqrt().unsqueeze(1) # (N, 1)
9182

9283

93-
# ---------------------------------------------------------------------------
94-
# KL divergence in logit space
95-
# ---------------------------------------------------------------------------
96-
97-
9884
def kl_divergence_logits(
9985
Q: Tensor,
10086
K: Tensor,
@@ -142,11 +128,6 @@ def kl_divergence_logits(
142128
return (kl_per_query.mean() / math.log(2)).item()
143129

144130

145-
# ---------------------------------------------------------------------------
146-
# WaterSICKVHelper
147-
# ---------------------------------------------------------------------------
148-
149-
150131
class WaterSICKVHelper:
151132
"""Hook-based helper that captures Q/K activations and runs WaterSIC quantisation.
152133
@@ -178,7 +159,6 @@ def __init__(
178159

179160
self._original_fn = None
180161

181-
# ----- patching --------------------------------------------------
182162

183163
def setup(self):
184164
"""Patch ``_quantized_attention`` on the module instance to capture Q/K."""
@@ -220,7 +200,6 @@ def cleanup(self):
220200
if "_quantized_attention" in vars(self.module):
221201
delattr(self.module, "_quantized_attention")
222202

223-
# ----- quantisation -----------------------------------------------
224203

225204
def quantize(
226205
self,
@@ -342,7 +321,6 @@ def quantize(
342321

343322
return state
344323

345-
# ----- cleanup -----------------------------------------------------
346324

347325
def free(self):
348326
"""Release collected calibration data."""

modelopt/torch/quantization/algorithms/watersic_kv/zsic.py

Lines changed: 0 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -27,10 +27,6 @@
2727
import torch
2828
from torch import Tensor
2929

30-
# ---------------------------------------------------------------------------
31-
# Helpers
32-
# ---------------------------------------------------------------------------
33-
3430

3531
def damp_for_rate(target_rate: float, base: float = 1e-4, knee: float = 5.0) -> float:
3632
"""Return a damping coefficient that decays for rates above *knee*.
@@ -57,11 +53,6 @@ def compute_output_nmse(W: Tensor, W_q: Tensor, A: Tensor) -> float:
5753
return (err.norm() ** 2 / ref.norm() ** 2).item()
5854

5955

60-
# ---------------------------------------------------------------------------
61-
# Hessian / Cholesky
62-
# ---------------------------------------------------------------------------
63-
64-
6556
def _compute_hessian_cholesky(
6657
A: Tensor,
6758
damp_pct: float = 1e-4,
@@ -111,11 +102,6 @@ def _compute_hessian_cholesky(
111102
return Sigma_X, L, perm
112103

113104

114-
# ---------------------------------------------------------------------------
115-
# Rescaler optimisation
116-
# ---------------------------------------------------------------------------
117-
118-
119105
def _optimize_rescalers(
120106
W_hat_0: Tensor,
121107
W: Tensor,
@@ -157,11 +143,6 @@ def _optimize_rescalers(
157143
return t.unsqueeze(1) * W_hat_0 * gamma.unsqueeze(0)
158144

159145

160-
# ---------------------------------------------------------------------------
161-
# Core sequential coding
162-
# ---------------------------------------------------------------------------
163-
164-
165146
def zsic_quantize(
166147
W: Tensor,
167148
A: Tensor,
@@ -234,11 +215,6 @@ def zsic_quantize(
234215
return W_hat, rate, nmse, Z, gamma
235216

236217

237-
# ---------------------------------------------------------------------------
238-
# WaterSIC interface
239-
# ---------------------------------------------------------------------------
240-
241-
242218
def watersic_quantize(
243219
W: Tensor,
244220
A: Tensor,
@@ -302,11 +278,6 @@ def watersic_quantize(
302278
return W_hat, rate, nmse, Z, gamma
303279

304280

305-
# ---------------------------------------------------------------------------
306-
# Binary search for c
307-
# ---------------------------------------------------------------------------
308-
309-
310281
def binary_search_c(
311282
W: Tensor,
312283
A: Tensor,

modelopt/torch/quantization/config.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -741,6 +741,17 @@ def _nvfp4_selective_quant_cfg(
741741
"algorithm": "max",
742742
}
743743

744+
WATERSIC_KV_CFG = {
745+
"quant_cfg": [
746+
{"quantizer_name": "*", "enable": False},
747+
{"quantizer_name": "*[kv]_bmm_quantizer", "enable": True},
748+
],
749+
"algorithm": {
750+
"method": "watersic_kv",
751+
"target_rate": 2.0,
752+
},
753+
}
754+
744755
NVFP4_SVDQUANT_DEFAULT_CFG = _nvfp4_selective_quant_cfg(
745756
["*"], algorithm={"method": "svdquant", "lowrank": 32}
746757
)
@@ -833,6 +844,7 @@ def _nvfp4_selective_quant_cfg(
833844
"MAMBA_MOE_FP8_CONSERVATIVE_CFG",
834845
"MAMBA_MOE_FP8_AGGRESSIVE_CFG",
835846
"NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG",
847+
"WATERSIC_KV_CFG",
836848
}
837849

838850
BiasType = Literal["static", "dynamic"]

0 commit comments

Comments (0)