feat(kvcache): add KV-cache-compression reclaim hook to KVCacheManagerV2.update_resources

Hudayday · Hudayday · commit 5cdbbf3c55b4 · 2026-06-28T20:28:04.000-07:00
KVCacheManagerV2.update_resources grows every generation request's KV history by one
token per decode step. KV-cache compression methods compact a request's kept tokens in
place, after which the history should shrink and the freed blocks return to the pool.

This establishes the integration point: a compression-evicted request reports its
evicted-token count in py_kv_evicted_tokens, and update_resources has the hook where the
non-growing reclaim (shrink to the compacted length + free the blocks) belongs.

The reclaim itself needs a non-growing shrink/free primitive on _KVCache (the planned
page-sharing fork) and is left as a TODO follow-up; until then the hook is a no-op and
every request grows exactly as before (no behavior change).

Adds unit tests pinning the grow / completing / suspended paths so the hook stays
non-breaking, and registers them in l0_a10.yml.

Signed-off-by: Tianrui Hu <tianruih@nvidia.com>
diff --git a/tensorrt_llm/_torch/pyexecutor/kv_cache_manager_v2.py b/tensorrt_llm/_torch/pyexecutor/kv_cache_manager_v2.py
@@ -2480,11 +2480,23 @@ def update_resources(
             # will be resumed by the scheduler on the next iteration.
             if not kv_cache.is_active:
                 continue
-            new_capacity = (
-                None
-                if req.state in (LlmRequestState.GENERATION_COMPLETE, LlmRequestState.CONTEXT_INIT)
-                else kv_cache.capacity - req.py_rewind_len
+            completing = req.state in (
+                LlmRequestState.GENERATION_COMPLETE,
+                LlmRequestState.CONTEXT_INIT,
             )
+            # KV-cache-compression reclaim hook. A request whose kept tokens were
+            # compacted to the front reports its evicted-token count in
+            # py_kv_evicted_tokens; when that is > 0 the non-growing reclaim — shrink
+            # history to (max_beam_num_tokens - evicted) and return the freed blocks to
+            # the pool — belongs here, in place of the grow below.
+            # TODO(kvcache): implement once _KVCache exposes the non-growing shrink/free
+            # primitive (the planned page-sharing fork); out of scope for this change.
+            # Until then every request grows as usual (still correct — the compression
+            # manager reconciles the read length; the freed blocks are not yet reclaimed).
+            evicted = int(getattr(req, "py_kv_evicted_tokens", 0) or 0)
+            if evicted > 0 and not completing:
+                pass  # TODO(kvcache): reclaim to (req.max_beam_num_tokens - evicted)
+            new_capacity = None if completing else kv_cache.capacity - req.py_rewind_len
             success = kv_cache.resize(new_capacity, req.max_beam_num_tokens - 1)
             if not success:
                 raise ValueError(
diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml
@@ -35,6 +35,7 @@ l0_a10:
   - unittest/_torch/executor/test_kv_pool_rebalance.py
   - unittest/_torch/executor/test_disagg_index_mapper_early_release.py
   - unittest/_torch/pyexecutor/test_kv_cache_compression_manager.py
+  - unittest/_torch/pyexecutor/test_kv_cache_v2_compression_reclaim.py
   - unittest/_torch/modules/dwdp/test_dwdp_fixup_moe_backends.py
   - unittest/_torch/modules/dwdp/test_dwdp_manager.py
   - unittest/_torch/modules/dwdp/test_dwdp_mapping.py
diff --git a/tests/unittest/_torch/pyexecutor/test_kv_cache_v2_compression_reclaim.py b/tests/unittest/_torch/pyexecutor/test_kv_cache_v2_compression_reclaim.py
@@ -0,0 +1,88 @@
+"""Unit tests for KVCacheManagerV2's KV-cache-compression reclaim hook in
+``update_resources``.
+
+A request whose cache was compacted in place reports its evicted-token count in
+``py_kv_evicted_tokens``. The actual non-growing reclaim (shrink history + return the
+freed blocks to the pool) needs a ``_KVCache`` shrink/free primitive and is a TODO
+follow-up, so for now the hook is a no-op and every request grows as before. These
+tests pin that the hook does not change the existing grow / completing / suspended
+paths (i.e. it is byte-identical, non-breaking).
+"""
+
+from unittest.mock import MagicMock
+
+from tensorrt_llm._torch.pyexecutor.kv_cache_manager_v2 import KVCacheManagerV2
+from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequestState
+
+
+def _fake_manager():
+    """A KVCacheManagerV2 with only the attributes update_resources touches."""
+    m = KVCacheManagerV2.__new__(KVCacheManagerV2)
+    m.is_draft = True  # skip the draft-token-location helper (orthogonal)
+    m.kv_cache_map = {}
+    return m
+
+
+def _req(rid, max_beam, evicted=0, rewind=0, state=LlmRequestState.GENERATION_IN_PROGRESS):
+    r = MagicMock()
+    r.py_request_id = rid
+    r.max_beam_num_tokens = max_beam
+    r.py_kv_evicted_tokens = evicted
+    r.py_rewind_len = rewind
+    r.state = state
+    return r
+
+
+def _kvcache(capacity=4096):
+    k = MagicMock()
+    k.is_active = True
+    k.capacity = capacity
+    k.resize.return_value = True
+    return k
+
+
+def _batch(reqs):
+    b = MagicMock()
+    b.generation_requests = reqs
+    return b
+
+
+def _run(manager, reqs):
+    for r in reqs:
+        manager.kv_cache_map[r.py_request_id] = _kvcache()
+    manager.update_resources(_batch(reqs))
+    return {r.py_request_id: manager.kv_cache_map[r.py_request_id] for r in reqs}
+
+
+def test_evicted_request_grows_until_reclaim_implemented():
+    """A compression-evicted request reports py_kv_evicted_tokens, but until the
+    reclaim primitive lands the hook is a no-op and the request still grows like any
+    other (resize(capacity, max_beam-1)) -- the hook must not break the grow path."""
+    m = _fake_manager()
+    k = _run(m, [_req(1, max_beam=200, evicted=50, rewind=0)])[1]
+    k.resize.assert_called_once_with(4096, 199)
+
+
+def test_unevicted_request_grows_exactly_as_before():
+    """Backward-compat: no eviction -> resize(capacity - rewind, max_beam - 1)."""
+    m = _fake_manager()
+    k = _run(m, [_req(2, max_beam=200, evicted=0, rewind=3)])[2]
+    k.resize.assert_called_once_with(4096 - 3, 199)
+
+
+def test_completing_request_resizes_with_none_capacity():
+    """A completing/generation-done request keeps the resize(None, max_beam-1) path."""
+    m = _fake_manager()
+    k = _run(m, [_req(3, max_beam=200, state=LlmRequestState.GENERATION_COMPLETE)])[3]
+    k.resize.assert_called_once_with(None, 199)
+
+
+def test_inactive_cache_is_skipped():
+    """A suspended (overlap-scheduler) cache is skipped entirely -- no resize."""
+    m = _fake_manager()
+    r = _req(6, max_beam=200, evicted=50)
+    k = _kvcache()
+    k.is_active = False
+    m.kv_cache_map[6] = k
+    m.update_resources(_batch([r]))
+    k.resize.assert_not_called()