NVIDIA
diff --git a/tensorrt_llm/_torch/pyexecutor/kv_cache_manager_v2.py b/tensorrt_llm/_torch/pyexecutor/kv_cache_manager_v2.py
Lines changed: 34 additions & 217 deletions
@@ -801,10 +801,6 @@ def append_to_kv_heads_per_layer(
         self.enable_block_reuse = kv_cache_config.enable_block_reuse
         self.enable_partial_reuse = kv_cache_config.enable_partial_reuse
         self.disk_prefetch_num_reqs = kv_cache_config.disk_prefetch_num_reqs
-        self._decode_capacity_only_requests: set[int] = set()
-        self._pending_compacted_capacities: dict[
-            int, tuple[int, int, Optional[torch.cuda.Event]]
-        ] = {}
 
         # With pipeline parallelism, multiple microbatches can be in-flight
         # simultaneously, so we need slots for all concurrent sequences.
@@ -1287,142 +1283,6 @@ def is_request_active(self, request_id: int) -> bool:
         kv_cache = self.kv_cache_map.get(request_id)
         return kv_cache is not None and kv_cache.is_active
 
-    def enable_decode_capacity_only(self, request_id: int) -> None:
-        """Preserve a request's finalized history while decoding.
-
-        Capacity-only decode is intended for KV compression methods that compact
-        uncommitted decode KV to the front of an all-full-attention cache. Context
-        updates still advance history to the finalized prompt prefix; decode
-        updates only resize capacity around that prefix.
-
-        Args:
-            request_id: Request whose decode capacity will be managed explicitly.
-
-        Raises:
-            ValueError: If block reuse is enabled or any local layer has an SWA or
-                SSM lifecycle.
-        """
-        if self.enable_block_reuse:
-            raise ValueError("Decode capacity-only mode requires block reuse to be disabled")
-        if (
-            self.max_beam_width != 1
-            or self.num_extra_kv_tokens
-            or self.max_total_draft_tokens
-            or self._kv_reserve_draft_tokens
-        ):
-            raise ValueError(
-                "Decode capacity-only mode currently supports single-token, beam-width-one "
-                "decoding only"
-            )
-        if any(window is not None for window in self.max_attention_window_vec) or any(
-            not isinstance(layer, AttentionLayerConfig) or layer.sliding_window_size is not None
-            for layer in self.kv_cache_manager_py_config.layers
-        ):
-            raise ValueError(
-                "Decode capacity-only mode supports full-attention layers only; "
-                "SWA, VSWA, and SSM layers are not supported"
-            )
-        self._decode_capacity_only_requests.add(request_id)
-
-    def has_pending_compacted_capacity(self, request_id: int) -> bool:
-        """Return whether a compacted capacity target is waiting to be consumed."""
-        return request_id in self._pending_compacted_capacities
-
-    def get_pre_forward_kv_length(self, request_id: int) -> int:
-        """Return written KV tokens after scheduling and before the next forward.
-
-        The generation scheduler has reserved one unwritten slot at this point.
-        A pending compaction target can coexist with a later overlap reservation,
-        so derive the effective capacity from both instead of from request logical
-        length.
-        """
-        if request_id not in self._decode_capacity_only_requests:
-            raise ValueError(f"Request {request_id} is not enabled for decode capacity-only mode")
-        kv_cache = self.kv_cache_map.get(request_id)
-        if kv_cache is None or not kv_cache.is_active:
-            raise ValueError(f"Request {request_id} has no active KV cache")
-        allocated_draft_len = self._allocated_draft_lens.get(request_id)
-        if allocated_draft_len is None:
-            raise ValueError(
-                f"Request {request_id} has no generation capacity reserved for this forward"
-            )
-        if allocated_draft_len:
-            raise ValueError(
-                "Decode capacity-only mode currently supports single-token, beam-width-one "
-                "decoding only"
-            )
-        effective_capacity = kv_cache.capacity
-        pending_target = self._pending_compacted_capacities.get(request_id)
-        if pending_target is not None:
-            target_capacity, published_capacity, _ = pending_target
-            capacity_growth = kv_cache.capacity - published_capacity
-            if capacity_growth < 0:
-                raise ValueError(
-                    f"Request {request_id} capacity {kv_cache.capacity} fell below "
-                    f"published capacity {published_capacity}"
-                )
-            effective_capacity = target_capacity + capacity_growth
-        if effective_capacity < 1:
-            raise ValueError(
-                f"Request {request_id} has invalid pre-forward capacity {effective_capacity}"
-            )
-        written_length = effective_capacity - 1
-        if written_length < kv_cache.history_length:
-            raise ValueError(
-                f"Request {request_id} pre-forward KV length {written_length} is below "
-                f"finalized history {kv_cache.history_length}"
-            )
-        return written_length
-
-    def set_compacted_capacity(
-        self,
-        request_id: int,
-        target_capacity: int,
-        event: Optional[torch.cuda.Event] = None,
-    ) -> None:
-        """Queue a one-shot physical capacity target for a compacted request.
-
-        The target is consumed by the next active generation update. Capacity
-        reserved after this call is added to the target, so an overlapped next
-        forward cannot lose its slot. If an event is supplied, the manager's
-        execution stream waits for it before releasing trailing KV pages.
-
-        Args:
-            request_id: Request previously enabled for capacity-only decode.
-            target_capacity: Physical capacity before the generation rewind is
-                applied.
-            event: Optional CUDA event recorded after compaction.
-
-        Raises:
-            ValueError: If the request is not enabled, has no KV cache, the target
-                does not leave a forward slot above finalized history, exceeds
-                current capacity, or another target is still pending.
-        """
-        if request_id not in self._decode_capacity_only_requests:
-            raise ValueError(f"Request {request_id} is not enabled for decode capacity-only mode")
-        if target_capacity < 0:
-            raise ValueError(f"Compacted capacity must be non-negative, got {target_capacity}")
-        kv_cache = self.kv_cache_map.get(request_id)
-        if kv_cache is None:
-            raise ValueError(f"Request {request_id} has no KV cache to compact")
-        if target_capacity > kv_cache.capacity:
-            raise ValueError(
-                f"Compacted capacity {target_capacity} for request {request_id} "
-                f"cannot exceed current capacity {kv_cache.capacity}"
-            )
-        if target_capacity <= kv_cache.history_length:
-            raise ValueError(
-                f"Compacted capacity {target_capacity} for request {request_id} "
-                f"must leave a forward slot above finalized history {kv_cache.history_length}"
-            )
-        if request_id in self._pending_compacted_capacities:
-            raise ValueError(f"Request {request_id} already has a pending compacted capacity")
-        self._pending_compacted_capacities[request_id] = (
-            target_capacity,
-            kv_cache.capacity,
-            event,
-        )
-
     def _effective_draft_len(self, req: LlmRequest) -> int:
         """Draft token length to use for next-step KV capacity calculation.
 
@@ -1524,53 +1384,23 @@ def revert_allocate_generation(self, req: LlmRequest) -> None:
         host page-index buffer.
 
         Mirror the effective draft length used in _required_gen_capacity
-        so disagg-gen-trans-complete revert stays symmetric. The scheduler
-        overwrites ``_allocated_draft_lens`` for every revert-eligible
-        allocation; a successful revert consumes that marker.
+        so disagg-gen-trans-complete revert stays symmetric.
         """
         kv_cache = self.kv_cache_map.get(req.py_request_id)
         if kv_cache is None or not kv_cache.is_active:
             return
-        has_allocation_marker = req.py_request_id in self._allocated_draft_lens
-        draft_len = self._allocated_draft_lens.get(
+        draft_len = self._allocated_draft_lens.pop(
             req.py_request_id, self._effective_draft_len(req)
         )
-        pending_target = self._pending_compacted_capacities.get(req.py_request_id)
-        published_this_allocation = (
-            has_allocation_marker
-            and pending_target is not None
-            and pending_target[1] == kv_cache.capacity
-        )
         reverted_cap = kv_cache.capacity - 1 - draft_len
         if reverted_cap < 0:
-            self._allocated_draft_lens.pop(req.py_request_id, None)
             return
-        reverted_pending_target = None
-        if published_this_allocation:
-            target_capacity, published_capacity, event = pending_target
-            reverted_target = target_capacity - 1 - draft_len
-            if reverted_target < kv_cache.history_length:
-                raise RuntimeError(
-                    f"Reverting request {req.py_request_id} would move compacted "
-                    f"capacity {reverted_target} below finalized history "
-                    f"{kv_cache.history_length}"
-                )
-            reverted_pending_target = (
-                reverted_target,
-                published_capacity - 1 - draft_len,
-                event,
-            )
-        if pending_target is not None and pending_target[2] is not None:
-            self._stream.wait_event(pending_target[2])
         if not kv_cache.resize(reverted_cap):
             raise RuntimeError(
                 f"Failed to revert KV cache capacity for request "
                 f"{req.py_request_id} from {kv_cache.capacity} to "
                 f"{reverted_cap}"
             )
-        self._allocated_draft_lens.pop(req.py_request_id, None)
-        if reverted_pending_target is not None:
-            self._pending_compacted_capacities[req.py_request_id] = reverted_pending_target
 
     def revert_allocate_context(self, req: LlmRequest) -> None:
         """Undo the capacity growth from this iter's ``resize_context``.
@@ -2365,25 +2195,19 @@ def release_index_slot(self, request_id: int) -> None:
         self._early_freed_index_requests.add(request_id)
 
     def free_resources(self, request: LlmRequest, pin_on_release: bool = False):
-        request_id = request.py_request_id
-        self._allocated_draft_lens.pop(request_id, None)
-        pending_target = self._pending_compacted_capacities.get(request_id)
-        if pending_target is not None and pending_target[2] is not None:
-            self._stream.wait_event(pending_target[2])
-        self._decode_capacity_only_requests.discard(request_id)
-        self._pending_compacted_capacities.pop(request_id, None)
-        kv_cache = self.kv_cache_map.pop(request_id, None)
+        self._allocated_draft_lens.pop(request.py_request_id, None)
+        kv_cache = self.kv_cache_map.pop(request.py_request_id, None)
         if kv_cache is None:
-            self.impl.clear_stats_excluded(request_id)
+            self.impl.clear_stats_excluded(request.py_request_id)
             return
         kv_cache.discard_pending_stats()
         self.try_commit_blocks_for_reuse(request, kv_cache)
         kv_cache.close()
-        self.impl.clear_stats_excluded(request_id)
-        if request_id in self._early_freed_index_requests:
-            self._early_freed_index_requests.discard(request_id)
+        self.impl.clear_stats_excluded(request.py_request_id)
+        if request.py_request_id in self._early_freed_index_requests:
+            self._early_freed_index_requests.discard(request.py_request_id)
         else:
-            self.index_mapper.remove_sequence(request_id)
+            self.index_mapper.remove_sequence(request.py_request_id)
 
     def get_batch_cache_indices(
         self, request_ids: List[int], layer_idx: Optional[int] = None
@@ -2656,46 +2480,39 @@ def update_resources(
             # will be resumed by the scheduler on the next iteration.
             if not kv_cache.is_active:
                 continue
-            completing = req.state in (
-                LlmRequestState.GENERATION_COMPLETE,
-                LlmRequestState.CONTEXT_INIT,
+            new_capacity = (
+                None
+                if req.state in (LlmRequestState.GENERATION_COMPLETE, LlmRequestState.CONTEXT_INIT)
+                else kv_cache.capacity - req.py_rewind_len
             )
-            request_id = req.py_request_id
-            if request_id in self._decode_capacity_only_requests:
-                pending_target = self._pending_compacted_capacities.get(request_id)
-                if pending_target is not None:
-                    target_capacity, published_capacity, event = pending_target
-                    if event is not None:
-                        self._stream.wait_event(event)
-                if completing:
-                    new_capacity = None
-                elif pending_target is None:
-                    new_capacity = kv_cache.capacity - req.py_rewind_len
-                else:
-                    capacity_growth = kv_cache.capacity - published_capacity
-                    if capacity_growth < 0:
-                        raise ValueError(
-                            f"Request {request_id} capacity {kv_cache.capacity} fell below "
-                            f"published capacity {published_capacity}"
-                        )
-                    new_capacity = target_capacity + capacity_growth - req.py_rewind_len
-                success = kv_cache.resize(new_capacity, None)
-                if not success:
+            capacity_only = getattr(req, "py_kv_cache_decode_capacity_only", False) is True
+            history_length = None if capacity_only else req.max_beam_num_tokens - 1
+            compaction = getattr(req, "py_kv_cache_compaction", None)
+            consume_compaction = capacity_only and compaction is not None
+            if consume_compaction:
+                target_capacity, published_capacity, event = compaction
+                capacity_growth = kv_cache.capacity - published_capacity
+                if capacity_growth < 0:
                     raise ValueError(
-                        f"Failed to resize KV cache for request {request_id} "
-                        f"to capacity {new_capacity} while preserving its finalized history"
+                        f"Request {req.py_request_id} capacity {kv_cache.capacity} "
+                        f"fell below published capacity {published_capacity}"
                     )
-                if completing or pending_target is not None:
-                    self._pending_compacted_capacities.pop(request_id, None)
-                continue
-            new_capacity = None if completing else kv_cache.capacity - req.py_rewind_len
-            success = kv_cache.resize(new_capacity, req.max_beam_num_tokens - 1)
+                # K+1 retains every block addressable by this forward. Resizing
+                # may race the full-table offset copy, but only rewrites the
+                # unreachable tail; the stream event protects page reuse.
+                if event is not None:
+                    self._stream.wait_event(event)
+                if new_capacity is not None:
+                    new_capacity = target_capacity + capacity_growth - req.py_rewind_len
+            success = kv_cache.resize(new_capacity, history_length)
             if not success:
                 raise ValueError(
                     f"Failed to resize KV cache for request {req.py_request_id} "
                     f"to capacity {new_capacity} and history length "
-                    f"{req.max_beam_num_tokens - 1} tokens at generation update"
+                    f"{history_length} tokens at generation update"
                 )
+            if consume_compaction:
+                req.py_kv_cache_compaction = None
 
     def copy_batch_block_offsets(
         self,