Add WaterSIC KV-cache calibration config and update package exports

kaix-nv · kaix-nv · commit e1d32e2c0098 · 2026-04-10T13:05:27.000-07:00
Signed-off-by: Kai Xu <kaix@nvidia.com>
diff --git a/modelopt/torch/quantization/algorithms/watersic_kv/__init__.py b/modelopt/torch/quantization/algorithms/watersic_kv/__init__.py
@@ -16,3 +16,8 @@
 """WaterSIC KV-cache quantization algorithm."""
 
 from __future__ import annotations
+
+from .config import WaterSICKVCalibConfig
+from .kv_quantizer import WaterSICKVHelper, WaterSICKVState
+
+__all__ = ["WaterSICKVCalibConfig", "WaterSICKVHelper", "WaterSICKVState"]
diff --git a/modelopt/torch/quantization/algorithms/watersic_kv/config.py b/modelopt/torch/quantization/algorithms/watersic_kv/config.py
@@ -0,0 +1,119 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Configuration for the WaterSIC KV-cache quantization algorithm."""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from modelopt.torch.opt.config import ModeloptField
+from modelopt.torch.quantization.config import QuantizeAlgorithmConfig
+
+
+class WaterSICKVCalibConfig(QuantizeAlgorithmConfig):
+    """Configuration for WaterSIC KV-cache quantization.
+
+    WaterSIC (Water-filling Successive Interference Cancellation) is a
+    rate-adaptive quantization method for KV-cache compression.  It
+    applies the ZSIC algorithm with optional KL-aware importance
+    weighting and LMMSE shrinkage correction to minimize attention-output
+    distortion at a target bits-per-element budget.
+
+    Reference: "WaterSIC: Water-filling Successive Interference
+    Cancellation for KV-Cache Quantization" (2024).
+    """
+
+    method: Literal["watersic_kv"] = ModeloptField(
+        default="watersic_kv",
+        title="Calibration algorithm identifier.",
+        description="Fixed identifier for the WaterSIC KV-cache calibration method.",
+    )
+
+    target_rate: float = ModeloptField(
+        default=2.0,
+        gt=0.0,
+        title="Target bits per element.",
+        description=(
+            "Average number of bits per quantized KV-cache element.  The binary "
+            "search over the ZSIC damping parameter c is driven to hit this rate."
+        ),
+    )
+
+    kl_aware: bool = ModeloptField(
+        default=False,
+        title="Enable KL-aware importance weighting.",
+        description=(
+            "When True, per-token importance weights derived from the attention "
+            "distribution are folded into the Hessian so that tokens with higher "
+            "attention mass receive tighter quantization."
+        ),
+    )
+
+    importance_clip: float = ModeloptField(
+        default=50.0,
+        gt=0.0,
+        title="Importance weight clipping ratio.",
+        description=(
+            "Maximum ratio by which a single token's importance weight may exceed "
+            "the mean weight.  Clips extreme outlier tokens to prevent them from "
+            "dominating the Hessian estimate."
+        ),
+    )
+
+    use_lmmse: bool = ModeloptField(
+        default=True,
+        title="Apply LMMSE shrinkage correction.",
+        description=(
+            "When True, the LMMSE (Linear Minimum Mean-Squared Error) shrinkage "
+            "correction is applied after ZSIC quantization to partially undo "
+            "quantization bias and reduce reconstruction NMSE."
+        ),
+    )
+
+    n_rescaler_iters: int = ModeloptField(
+        default=0,
+        ge=0,
+        title="Diagonal rescaler optimization iterations.",
+        description=(
+            "Number of coordinate-descent iterations for the diagonal rescaler "
+            "that adjusts per-column scale factors after LMMSE.  Set to 0 to "
+            "disable the rescaler (faster but slightly higher distortion)."
+        ),
+    )
+
+    sample_frac: float | None = ModeloptField(
+        default=None,
+        # A fraction is only meaningful in (0, 1]; reject anything else at validation time.
+        gt=0.0,
+        le=1.0,
+        title="Row subsampling fraction for binary search.",
+        description=(
+            "If set, only this fraction of rows (KV heads) are used during the "
+            "binary search for c.  Full rows are then quantized with the found c.  "
+            "Speeds up calibration on large KV caches at a small accuracy cost.  "
+            "Must lie in (0, 1]."
+        ),
+    )
+
+    use_sequential: bool = ModeloptField(
+        default=True,
+        title="Enable sequential layer-by-layer calibration.",
+        description=(
+            "When True, the WaterSIC calibration is applied layer-by-layer in "
+            "decoder-block order so that each layer's quantized KV representation "
+            "is propagated to subsequent layers before they are calibrated."
+        ),
+    )
diff --git a/tests/unit/torch/quantization/test_watersic_kv.py b/tests/unit/torch/quantization/test_watersic_kv.py
@@ -362,3 +362,47 @@ def test_state_creation(self):
         assert state.gamma is gamma
         assert state.perm is None
         assert state.rate == 2.5
+
+
+# ---------------------------------------------------------------------------
+# TestWaterSICKVCalibConfig
+# ---------------------------------------------------------------------------
+
+
+class TestWaterSICKVCalibConfig:
+    """Checks the defaults, keyword overrides, and dump/reload of WaterSICKVCalibConfig."""
+
+    def test_defaults(self):
+        """A config built with no arguments exposes every declared default."""
+        from modelopt.torch.quantization.algorithms.watersic_kv.config import WaterSICKVCalibConfig
+
+        cfg = WaterSICKVCalibConfig()
+        assert cfg.method == "watersic_kv"
+        assert cfg.target_rate == 2.0
+        assert cfg.kl_aware is False
+        assert cfg.importance_clip == 50.0
+        assert cfg.use_lmmse is True
+        assert cfg.n_rescaler_iters == 0
+        assert cfg.sample_frac is None
+        assert cfg.use_sequential is True
+
+    def test_custom_values(self):
+        """Explicit keyword arguments override the corresponding defaults."""
+        from modelopt.torch.quantization.algorithms.watersic_kv.config import WaterSICKVCalibConfig
+
+        cfg = WaterSICKVCalibConfig(target_rate=4.0, kl_aware=True, importance_clip=20.0)
+        assert cfg.target_rate == 4.0
+        assert cfg.kl_aware is True
+        assert cfg.importance_clip == 20.0
+
+    def test_serialization_roundtrip(self):
+        """A config rebuilt from its own ``model_dump()`` output matches the original."""
+        from modelopt.torch.quantization.algorithms.watersic_kv.config import WaterSICKVCalibConfig
+
+        cfg = WaterSICKVCalibConfig(target_rate=3.0, kl_aware=True)
+        data = cfg.model_dump()
+        cfg2 = WaterSICKVCalibConfig(**data)
+        assert cfg2.target_rate == 3.0
+        assert cfg2.kl_aware is True
+        # Compare the full dumps so a field that fails to round-trip is caught too.
+        assert cfg2.model_dump() == data