Skip to content

Commit b210fdb

Browse files
feat: integrate KVPress for KV cache compression (#366) (#623)
* feat: integrate KVPress for KV cache compression Add NVIDIA KVPress as an optional dependency, enabling 31 KV cache compression strategies for causal language models. Includes algorithm class, test suite, and compatibility updates across existing LLM algorithms. * feat: bump kvpress to >=0.5.2, add FastKVzipPress kvpress 0.5.2 relaxes the datasets<3 constraint and reverts to transformers>=4.56, resolving the dependency conflict. uv sync --extra kvpress now works without workarounds. * feat: add press_kwargs for press-specific parameters Allow passing additional keyword arguments to the press constructor via the press_kwargs hyperparameter, enabling fine-grained control over press-specific settings like window_size, n_sink, etc. * fix: compatibility, press_kwargs, unit tests, remove wrappers - Replace tags.QUANTIZER with explicit LLM algorithm names to avoid false symmetry matches with diffuser algorithms - Fix SmashConfig.add() dict flattening: only flatten when key is a registered algorithm name, not for dict-valued hyperparameters - Remove wrapper/special presses from PRESS_TYPES (CriticalKVPress and others that don't accept compression_ratio directly) - Add unit tests for press type validation and kwargs forwarding - Add SnapKV integration test with press_kwargs * feat: add KV_CACHER tag, replace explicit kvpress references Add a new KV_CACHER algorithm tag for KV cache compression algorithms, separate from CACHER (used by diffuser cachers). Use the tag in all LLM algorithm compatibility lists instead of explicit "kvpress" strings. * refactor: rename KV_CACHER tag to KV_COMPRESSOR, improve docstrings * docs: document excluded wrapper presses in kvpress docstring * refactor: remove KV_COMPRESSOR tag, reference kvpress by name Drop the dedicated KV_COMPRESSOR tag and use tags.PRUNER as kvpress's group tag, matching how other pruners are categorized. 
Replace all tags.KV_COMPRESSOR references in compatible_before/after lists with the string "kvpress" to align with the repo convention of naming specific algorithms in compatibility lists. * fix: handle transformers pipeline in kvpress _apply Add pipeline guard at the top of _apply to delegate to _apply_to_model_within_transformers_pipeline when the model is a TextGenerationPipeline, matching the pattern used by gptq, torch_compile, and other algorithms. * ci: register requires_kvpress marker for optional extra * test: mark kvpress as require_kvpress --------- Co-authored-by: Gaspar Rochette <gaspar.rochette@pruna.ai>
1 parent 6d1d9b0 commit b210fdb

15 files changed

Lines changed: 249 additions & 9 deletions

File tree

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ conflicts = [
8787
[{ extra = "intel" }, { extra = "stable-fast" }, { extra = "stable-fast-extraindex" }],
8888
[{ extra = "intel" }, { extra = "full" }, { extra = "stable-fast-extraindex" }],
8989
[{ extra = "intel" }, { extra = "vllm" }],
90+
[{ extra = "kvpress" }, { extra = "vbench" }],
9091
]
9192

9293
[tool.uv.sources]
@@ -248,6 +249,9 @@ intel = [
248249
"torch>=2.7.0,<2.9.0",
249250
"torchvision>=0.22.0,<0.24.0",
250251
]
252+
kvpress = [
253+
"kvpress>=0.5.2",
254+
]
251255

252256
[build-system]
253257
requires = ["hatchling"]

src/pruna/algorithms/gptq_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ class GPTQ(PrunaAlgorithmBase):
4646
processor_required: bool = False
4747
runs_on: list[str] = ["cuda"]
4848
dataset_required: bool = True
49-
compatible_after: Iterable[str] = ["torch_compile", "sage_attn"]
49+
compatible_after: Iterable[str] = ["torch_compile", "sage_attn", "kvpress"]
5050
required_install: str = (
5151
"You must first install the base package with ``pip install pruna`` "
5252
"before installing the GPTQ extension with ``pip install pruna[gptq] --extra-index-url https://prunaai.pythonanywhere.com/``"

src/pruna/algorithms/half.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ class Half(PrunaAlgorithmBase):
5050
"stable_fast",
5151
"torch_compile",
5252
"ifw",
53+
"kvpress",
5354
"whisper_s2t",
5455
"sage_attn",
5556
"hyper",

src/pruna/algorithms/hqq.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ class HQQ(PrunaAlgorithmBase):
6363
runs_on: list[str] = ["cuda"]
6464
dataset_required: bool = False
6565
compatible_before: Iterable[str] = ["torch_structured", "moe_kernel_tuner"]
66-
compatible_after: Iterable[str] = ["torch_compile", "sage_attn", "moe_kernel_tuner"]
66+
compatible_after: Iterable[str] = ["torch_compile", "sage_attn", "kvpress", "moe_kernel_tuner"]
6767
disjointly_compatible_before: Iterable[str] = []
6868
disjointly_compatible_after: Iterable[str] = ["torchao"]
6969

src/pruna/algorithms/huggingface_llm_int8.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ class LLMInt8(PrunaAlgorithmBase):
5858
runs_on: list[str] = ["cuda", "accelerate"]
5959
save_fn: None = None
6060
compatible_before: Iterable[str] = ["moe_kernel_tuner"]
61-
compatible_after: Iterable[str] = ["torch_compile", "sage_attn", "moe_kernel_tuner"]
61+
compatible_after: Iterable[str] = ["torch_compile", "sage_attn", "kvpress", "moe_kernel_tuner"]
6262

6363
def get_hyperparameters(self) -> list:
6464
"""

src/pruna/algorithms/kvpress.py

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
# Copyright 2025 - Pruna AI GmbH. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import functools
18+
from collections.abc import Iterable
19+
from typing import Any, Dict
20+
21+
from ConfigSpace import CategoricalHyperparameter, UniformFloatHyperparameter
22+
23+
from pruna.algorithms.base.pruna_base import PrunaAlgorithmBase
24+
from pruna.algorithms.base.tags import AlgorithmTag as tags
25+
from pruna.config.hyperparameters import UnconstrainedHyperparameter
26+
from pruna.config.smash_config import SmashConfigPrefixWrapper
27+
from pruna.engine.model_checks import is_causal_lm, is_transformers_pipeline_with_causal_lm
28+
from pruna.engine.save import SAVE_FUNCTIONS
29+
30+
# Press classes exposed as `press_type` choices: scorer and standalone presses
# whose constructors accept `compression_ratio` directly. Wrapper presses that
# require a nested press (ChunkPress, AdaKVPress, PerLayerCompressionPress, ...)
# and ThinKPress (channel-dimension compression) are deliberately not listed.
PRESS_TYPES = list(
    (
        "CompactorPress",
        "CURPress",
        "ExpectedAttentionPress",
        "ExpectedAttentionStatsPress",
        "FastKVzipPress",
        "FinchPress",
        "KnormPress",
        "KVzapPress",
        "KVzipPress",
        "KeyDiffPress",
        "LagKVPress",
        "LeverageScorePress",
        "NonCausalAttnPress",
        "ObservedAttentionPress",
        "PyramidKVPress",
        "QFilterPress",
        "RandomPress",
        "SnapKVPress",
        "StreamingLLMPress",
        "TOVAPress",
    )
)
52+
53+
54+
class KVPress(PrunaAlgorithmBase):
    """
    Compress the KV cache of causal language models using KVPress.

    KVPress is a library by NVIDIA that provides over 20 compression strategies (presses) for
    reducing the memory footprint of the key-value cache during long-context inference. Each press
    scores and prunes KV pairs after the prefill phase according to a chosen importance criterion.

    This integration covers all scorer and standalone presses. Wrapper presses (e.g., ChunkPress,
    AdaKVPress, PerLayerCompressionPress) that require a nested scorer press as input are not
    included, as well as ThinKPress which compresses along the channel dimension with a different
    parameter interface.
    """

    algorithm_name: str = "kvpress"
    group_tags: list[tags] = [tags.PRUNER]
    # Reapply on load: the press wrapper is not serializable, so the algorithm is
    # re-run on the reloaded base model (see the double-wrap guard in _apply).
    save_fn: SAVE_FUNCTIONS = SAVE_FUNCTIONS.reapply
    references: dict[str, str] = {
        "GitHub": "https://github.com/NVIDIA/kvpress",
        "Article": "https://huggingface.co/blog/nvidia/kvpress",
    }
    required_install: str = "pip install pruna[kvpress]"
    tokenizer_required: bool = False
    processor_required: bool = False
    dataset_required: bool = False
    runs_on: list[str] = ["cuda"]
    compatible_before: Iterable[str] = [
        "awq", "gptq", "half", "hqq", "llm_int8",
        "quanto", "sage_attn", "torchao", "moe_kernel_tuner",
    ]
    compatible_after: Iterable[str] = ["torch_compile", "moe_kernel_tuner"]

    def get_hyperparameters(self) -> list:
        """
        Configure all algorithm-specific hyperparameters with ConfigSpace.

        Returns
        -------
        list
            The hyperparameters: the press class to use, the fraction of the KV
            cache to prune, and optional press-specific constructor kwargs.
        """
        return [
            CategoricalHyperparameter(
                "press_type",
                choices=PRESS_TYPES,
                default_value="ExpectedAttentionPress",
                meta={"desc": "The KV cache compression strategy to use."},
            ),
            UniformFloatHyperparameter(
                "compression_ratio",
                lower=0.0,
                upper=1.0,
                default_value=0.5,
                meta={"desc": "Fraction of KV pairs to remove. 0.0 means no compression."},
            ),
            # Unconstrained because valid keys (window_size, n_sink, ...) differ per press.
            UnconstrainedHyperparameter(
                "press_kwargs",
                default_value=None,
                meta={"desc": "Additional keyword arguments passed to the press constructor."},
            ),
        ]

    def model_check_fn(self, model: Any) -> bool:
        """
        Check if the model is a causal language model or a pipeline wrapping one.

        Parameters
        ----------
        model : Any
            The model to check.

        Returns
        -------
        bool
            True if the model is compatible with KV cache compression, False otherwise.
        """
        return is_causal_lm(model) or is_transformers_pipeline_with_causal_lm(model)

    def _apply(self, model: Any, smash_config: SmashConfigPrefixWrapper) -> Any:
        """
        Wrap the model's generate method to apply KV cache compression via a press context manager.

        Parameters
        ----------
        model : Any
            The causal language model to compress.
        smash_config : SmashConfigPrefixWrapper
            The algorithm-prefixed configuration containing press_type, compression_ratio, and press_kwargs.

        Returns
        -------
        Any
            The model with its generate method wrapped to compress the KV cache on each call.
        """
        # Pipelines delegate to the shared helper that unwraps/rewraps the inner model,
        # matching the pattern used by gptq, torch_compile, and other algorithms.
        if is_transformers_pipeline_with_causal_lm(model):
            return self._apply_to_model_within_transformers_pipeline(model, smash_config)

        imported_modules = self.import_algorithm_packages()

        press_type = smash_config["press_type"]
        compression_ratio = smash_config["compression_ratio"]
        # press_kwargs may be None (the hyperparameter default); normalize to {}.
        press_kwargs = smash_config["press_kwargs"] or {}

        press_cls = imported_modules[press_type]
        press = press_cls(compression_ratio=compression_ratio, **press_kwargs)

        # save_fn is SAVE_FUNCTIONS.reapply, so _apply may run on a model that was
        # already smashed. Rewrap the pristine generate (stored on first application)
        # instead of the current one, otherwise press context managers would stack.
        original_generate = getattr(model, "_kvpress_original_generate", model.generate)

        @functools.wraps(original_generate)
        def generate_with_press(*args, **kwargs):
            # Entering the press context installs the compression hooks for this call only.
            with press(model):
                return original_generate(*args, **kwargs)

        model.generate = generate_with_press
        # Keep handles for un-wrapping / inspection by later tooling.
        model._kvpress_original_generate = original_generate
        model._kvpress_press = press

        return model

    def import_algorithm_packages(self) -> dict[str, Any]:
        """
        Lazily import kvpress and collect all supported press classes.

        Returns
        -------
        dict[str, Any]
            A dictionary mapping press class names to their classes.
        """
        import kvpress

        return {name: getattr(kvpress, name) for name in PRESS_TYPES}

src/pruna/algorithms/llm_compressor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ class LLMCompressor(PrunaAlgorithmBase):
5252
dataset_required: bool = True
5353
runs_on: list[str] = ["cuda"]
5454
compatible_before: Iterable[str] = ["moe_kernel_tuner"]
55-
compatible_after: Iterable[str] = ["sage_attn", "moe_kernel_tuner"]
55+
compatible_after: Iterable[str] = ["sage_attn", "kvpress", "moe_kernel_tuner"]
5656
required_install = "``uv pip install 'pruna[awq]'``"
5757

5858
def get_hyperparameters(self) -> list:

src/pruna/algorithms/moe_kernel_tuner.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,13 @@ class MoeKernelTuner(PrunaAlgorithmBase):
5050
dataset_required: bool = False
5151
compatible_before: Iterable[str] = [
5252
"awq", "deepcache", "diffusers_int8", "fastercache", "flash_attn3",
53-
"fora", "hqq", "hqq_diffusers", "llm_int8", "pab", "padding_pruning",
53+
"fora", "hqq", "hqq_diffusers", "kvpress", "llm_int8", "pab", "padding_pruning",
5454
"qkv_diffusers", "quanto", "reduce_noe", "ring_attn", "sage_attn",
5555
"torch_compile", "torchao",
5656
]
5757
compatible_after: Iterable[str] = [
5858
"awq", "deepcache", "diffusers_int8", "fastercache", "flash_attn3",
59-
"fora", "hqq", "hqq_diffusers", "llm_int8", "pab", "padding_pruning",
59+
"fora", "hqq", "hqq_diffusers", "kvpress", "llm_int8", "pab", "padding_pruning",
6060
"qkv_diffusers", "quanto", "ring_attn", "sage_attn",
6161
"torch_compile", "torchao",
6262
]

src/pruna/algorithms/quanto.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ class Quanto(PrunaAlgorithmBase):
5252
compatible_before: Iterable[str] = ["qkv_diffusers", "moe_kernel_tuner"]
5353
compatible_after: Iterable[str] = [
5454
"deepcache",
55+
"kvpress",
5556
"sage_attn",
5657
"text_to_image_distillation_inplace_perp",
5758
"text_to_image_distillation_lora",

src/pruna/algorithms/sage_attn.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ class SageAttn(PrunaAlgorithmBase):
5050
runs_on: list[str] = ["cuda", "accelerate"]
5151
dataset_required: bool = False
5252
compatible_before: Iterable[str | tags] = [tags.QUANTIZER, "moe_kernel_tuner"]
53-
compatible_after: Iterable[str | tags] = ["torch_compile", tags.CACHER, "moe_kernel_tuner"]
53+
compatible_after: Iterable[str | tags] = ["torch_compile", tags.CACHER, "kvpress", "moe_kernel_tuner"]
5454

5555
def model_check_fn(self, model: Any) -> bool:
5656
"""

0 commit comments

Comments
 (0)