Move chunk_block_offset into KVSlice dataclass

chienchunhung · athena-nv · commit 596d45388a1a · 2026-06-22T22:30:53.000Z
Per reviewer feedback (chuangz0, Shixiaowei02): chunk_block_offset
belongs as a member of KVSlice rather than a function parameter on
send(). The KVSlice dataclass was designed to carry all slice metadata.

- Add chunk_block_offset: int = 0 to KVSlice dataclass
- Remove chunk_block_offset from TxSessionBase.send() signature
- Remove chunk_block_offset from TxSession.send() signature
- Remove chunk_block_offset from KVSendTask.__init__
- Read chunk_block_offset from task._slice in _build_kv_write_meta
  and _deliver_kv_to_agent callback
- Set chunk_block_offset on each KVSlice in _create_kv_slices
- Update all tests accordingly

Signed-off-by: Chien-Chun Hung <2679986+chienchunhung@users.noreply.github.com>
Made-with: Cursor
Signed-off-by: Chien-Chun Hung <2679986+chienchunhung@users.noreply.github.com>
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -681,6 +681,17 @@ class GenerationRequest
         ++mNumFrontBlocksRemovedPerWindow.at(windowSize);
     }
 
+    //! \brief Advance ``mNumFrontBlocksRemoved`` without touching cache blocks.
+    //! \details Used by ``BlockManager::releasePrefixBlocks`` to advance the
+    //! shared front-block counter once after every ``WindowBlockManager`` has
+    //! processed the same prefix range.  This has clearer intent than calling
+    //! ``removeFrontBlock`` with a sentinel ``windowSize`` value, and is robust
+    //! to future changes that consume the ``windowSize`` argument.
+    void incrementNumFrontBlocksRemoved()
+    {
+        ++mNumFrontBlocksRemoved;
+    }
+
     void removeLastBlock(SizeType32 windowSize)
     {
         for (auto& beamBlockIds : mCacheBlockIds.at(windowSize))
diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -2916,9 +2916,12 @@ void BlockManager::releasePrefixBlocks(GenerationRequest& sequence, SizeType32 n
         manager.releasePrefixBlocks(sequence, startIdx, numBlocks);
     }
     // Advance the shared counter once, after all managers have released.
+    // Uses incrementNumFrontBlocksRemoved (counter-only) instead of
+    // removeFrontBlock so the intent is explicit and we do not depend on
+    // removeFrontBlock ignoring its windowSize argument.
     while (sequence.getNumFrontBlocksRemoved() < numBlocks)
     {
-        sequence.removeFrontBlock(0);
+        sequence.incrementNumFrontBlocksRemoved();
     }
 }
 
@@ -3942,6 +3945,16 @@ std::optional<KVCacheBlock::IdType> KVCacheManager::removeSequence(
 
 void KVCacheManager::releasePrefixBlocks(RequestIdType requestId, SizeType32 numBlocks)
 {
+    // Hard precondition: BlockManager::releasePrefixBlocks advances the shared
+    // mNumFrontBlocksRemoved counter to numBlocks for every WindowBlockManager,
+    // even when a window has fewer than numBlocks allocated.  Under variable
+    // sliding window attention (VSWA), that would cause WindowBlockManager::
+    // releaseBlocks (called during removeSequence) to underrun rbegin() and
+    // skip tail blocks for the smaller window.  Disagg serving already gates
+    // VSWA out, but we enforce the assumption here so the C++ API contract is
+    // self-defending instead of relying on caller discipline.
+    TLLM_CHECK_WITH_INFO(
+        !mBlockManager.isVariableWindow(), "releasePrefixBlocks does not support variable sliding window attention");
     if (numBlocks <= 0)
     {
         return;
diff --git a/tensorrt_llm/_torch/disaggregation/base/transfer.py b/tensorrt_llm/_torch/disaggregation/base/transfer.py
@@ -65,6 +65,7 @@ class KVSlice:
     )  # Physical block IDs per layer group, each np.ndarray(dtype=np.int64)
     is_last_slice: bool = False
     mamba_state_index: Optional[int] = None
+    chunk_block_offset: int = 0
 
 
 class SessionStatus(Enum):
@@ -158,15 +159,14 @@ def __init__(self, sender: SenderBase, args: SessionArgsBase):
         self._sender = sender
 
     @abstractmethod
-    def send(self, slice: KVSlice, chunk_block_offset: int = 0) -> None:
+    def send(self, slice: KVSlice) -> None:
         """Send a KV slice.
 
         Args:
             slice: The KV slice describing which source blocks to send.
-            chunk_block_offset: Block offset into the receiver's full
-                destination block list for this chunk. Used by sender-side
-                chunking to slice the receiver's destination blocks correctly.
-                Defaults to 0 for monolithic transfer.
+                The slice's ``chunk_block_offset`` field indicates the offset
+                into the receiver's destination block list for sender-side
+                chunking.
         """
         ...
 
diff --git a/tensorrt_llm/_torch/disaggregation/native/transfer.py b/tensorrt_llm/_torch/disaggregation/native/transfer.py
@@ -208,11 +208,10 @@ class KVSendTask(SendTaskBase):
 
     Args:
         kv_slice: The KV slice describing which blocks to transfer.
+            The slice's ``chunk_block_offset`` field indicates the
+            offset into the receiver's destination block list.
         params: Disaggregated serving parameters for this request.
         slice_id: Index of this slice within the session's task list.
-        chunk_block_offset: Block offset into the receiver's full
-            destination block list.  Used by sender-side chunking to
-            slice the receiver's destination blocks correctly.
     """
 
     def __init__(
@@ -222,15 +221,13 @@ def __init__(
         slice_id: int,
         prompt_len: Optional[int] = None,
         beam_width: int = 1,
-        chunk_block_offset: int = 0,
     ) -> None:
         super().__init__(params)
         self.slice_id = slice_id
         self.transferred_count = 0
         self._slice = kv_slice
         self._prompt_len = prompt_len
         self._beam_width = beam_width
-        self.chunk_block_offset = chunk_block_offset
 
 
 class Sender(SenderBase):
@@ -587,7 +584,7 @@ def _deliver_kv_to_agent(self, write_meta: WriteMeta):
                         )
                         session._on_chunk_transferred(
                             request_id=session.request_id,
-                            chunk_block_offset=task.chunk_block_offset,
+                            chunk_block_offset=task._slice.chunk_block_offset,
                             num_blocks=num_blocks,
                         )
                     except Exception as e:
@@ -727,7 +724,7 @@ def _build_kv_write_meta(self, task: KVSendTask, req_info: RecvReqInfo) -> Write
             dst_block_ids_per_groups = req_info.block_ids_per_layer_groups
             src_block_ids_per_groups = task._slice.block_ids_per_layer_groups
 
-            chunk_offset = task.chunk_block_offset
+            chunk_offset = task._slice.chunk_block_offset
             for (self_lg, self_pi), (peer_lg, peer_pi) in pool_mapping.items():
                 src_block_ids = src_block_ids_per_groups[self_lg]
                 full_dst_block_ids = dst_block_ids_per_groups[peer_lg]
@@ -1182,7 +1179,7 @@ def status(self) -> SessionStatus:
             return SessionStatus.TRANSFERRING
         return SessionStatus.READY if self.receiver_ready else SessionStatus.INIT
 
-    def send(self, slice: KVSlice, chunk_block_offset: int = 0) -> None:
+    def send(self, slice: KVSlice) -> None:
         with self.lock:
             params = self._base_args.params
             slice_id = len(self.kv_tasks)
@@ -1192,7 +1189,6 @@ def send(self, slice: KVSlice, chunk_block_offset: int = 0) -> None:
                 slice_id,
                 prompt_len=self._base_args.prompt_len,
                 beam_width=self._base_args.beam_width,
-                chunk_block_offset=chunk_block_offset,
             )
             task._unique_rid = self.disagg_request_id
             self.kv_tasks.append(task)
diff --git a/tensorrt_llm/_torch/disaggregation/transceiver.py b/tensorrt_llm/_torch/disaggregation/transceiver.py
@@ -146,6 +146,10 @@ def shutdown(self):
         if getattr(self, "_shutdown", False):
             return
         self._shutdown = True
+        # Drain any pending prefix-release entries before tearing down sessions
+        # so memory frees in the same shutdown step instead of leaking until
+        # removeSequence cleans up at session close.
+        self._drain_pending_releases()
         for session in list(self._send_sessions.values()):
             session.close()
         for session in list(self._recv_sessions.values()):
@@ -272,6 +276,7 @@ def _create_kv_slices(self, req: LlmRequest) -> List[KVSlice]:
 
         num_chunks = math.ceil(max_blocks / self._chunk_size_blocks)
         slices: List[KVSlice] = []
+        block_offset = 0
         for chunk_idx in range(num_chunks):
             start = chunk_idx * self._chunk_size_blocks
             end = start + self._chunk_size_blocks
@@ -284,8 +289,16 @@ def _create_kv_slices(self, req: LlmRequest) -> List[KVSlice]:
                     block_ids_per_layer_groups=chunk_block_ids,
                     mamba_state_index=base_slice.mamba_state_index,
                     token_range=base_slice.token_range,
+                    chunk_block_offset=block_offset,
                 )
             )
+            # Use the max length across layer groups to advance the receiver
+            # offset.  This is the contract that lets receiver-side slicing in
+            # native/transfer.py (`_build_kv_write_meta`) trim the per-LG dst
+            # range with `len(src_block_ids)`, so asymmetric layer groups still
+            # land at the right destination position even though the offset is
+            # shared across groups.
+            block_offset += max((len(ids) for ids in chunk_block_ids), default=0)
 
         for lg_idx, original_ids in enumerate(all_block_ids):
             reassembled = np.concatenate([s.block_ids_per_layer_groups[lg_idx] for s in slices])
@@ -318,8 +331,24 @@ def _make_chunk_callback(self) -> Optional[Callable]:
         """
         if self._chunk_size_blocks is None:
             return None
+        manager_name = type(self._kv_cache_manager).__name__
         if not hasattr(self._kv_cache_manager, "release_prefix_blocks"):
+            # Surface the gate decision in logs so a typo or missing wrapper on
+            # the manager side is observable at startup, not silent.
+            logger.warning(
+                "Chunked KV transfer is enabled (chunk_size_blocks=%s) but %s "
+                "does not implement release_prefix_blocks; early prefix block "
+                "release is disabled. Blocks will be freed at session teardown.",
+                self._chunk_size_blocks,
+                manager_name,
+            )
             return None
+        logger.info(
+            "Chunked KV transfer with early prefix block release enabled "
+            "(chunk_size_blocks=%s, manager=%s).",
+            self._chunk_size_blocks,
+            manager_name,
+        )
 
         release_queue = self._pending_prefix_releases
 
@@ -515,6 +544,11 @@ def _build_to_process(
         return to_process
 
     def _close_failed_sessions(self, sessions: dict, reqs: dict, failed: list):
+        # Drain pending prefix releases before closing failed sessions so that
+        # already-completed chunks of healthy sister sessions free memory now
+        # rather than waiting for the next check_context_transfer_status pass.
+        # No-op when the queue is empty, including on the gen-side path.
+        self._drain_pending_releases()
         for rid in failed:
             reqs[rid].state = LlmRequestState.DISAGG_TRANS_ERROR
             sessions[rid].close()
@@ -574,12 +608,8 @@ def _finalize_send(self, req: LlmRequest, session: TxSessionBase):
     def respond_and_send_async(self, req: LlmRequest):
         session = self._get_or_create_send_session(req)
         req.state = LlmRequestState.DISAGG_CONTEXT_TRANS_IN_PROGRESS
-        chunk_block_offset = 0
         for kv_slice in self._create_kv_slices(req):
-            session.send(kv_slice, chunk_block_offset=chunk_block_offset)
-            chunk_block_offset += max(
-                (len(ids) for ids in kv_slice.block_ids_per_layer_groups), default=0
-            )
+            session.send(kv_slice)
         self._finalize_send(req, session)
 
     @nvtx_range("KvCacheTransceiverV2.request_and_receive_sync")
diff --git a/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py b/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py
@@ -78,9 +78,15 @@ def create_kv_cache_transceiver(
     if (not use_python
             and cache_transceiver_config.chunk_size_blocks is not None):
         if cache_transceiver_config.backend in (None, "DEFAULT", "NIXL"):
-            logger.info(
-                "chunk_size_blocks is set; auto-selecting Python transceiver "
-                "for chunked KV cache transfer support")
+            # Use warning (not info) so users notice the transceiver swap and
+            # the implied perf / staging-buffer characteristics change.  Set
+            # transceiver_runtime='CPP' explicitly to opt out (and lose
+            # chunked transfer + early block release).
+            logger.warning(
+                "chunk_size_blocks is set; auto-selecting the Python "
+                "transceiver instead of the C++ transceiver to enable "
+                "chunked KV cache transfer + early block release. "
+                "Set transceiver_runtime='CPP' to disable this auto-selection.")
             use_python = True
         else:
             logger.warning(
@@ -90,6 +96,20 @@ def create_kv_cache_transceiver(
                 f"chunk_size_blocks will be ignored. Use NIXL backend to "
                 f"enable chunked transfer.")
 
+    # Warn when chunk_size_blocks is below the recommended floor.  The Pydantic
+    # field is PositiveInt (>=1), but values below ~16 push the per-chunk RDMA
+    # overhead into the regime where it dominates transfer throughput.
+    _MIN_RECOMMENDED_CHUNK_SIZE_BLOCKS = 16
+    if (cache_transceiver_config.chunk_size_blocks is not None
+            and cache_transceiver_config.chunk_size_blocks
+            < _MIN_RECOMMENDED_CHUNK_SIZE_BLOCKS):
+        logger.warning(
+            f"chunk_size_blocks={cache_transceiver_config.chunk_size_blocks} "
+            f"is below the recommended floor of "
+            f"{_MIN_RECOMMENDED_CHUNK_SIZE_BLOCKS}; per-chunk RDMA overhead "
+            f"may dominate transfer throughput. Consider 64-128 for "
+            f"long-context workloads (ISL >= 32K).")
+
     # Select transceiver implementation based on transceiver_runtime
     # transceiver_runtime == None or "CPP" -> use C++ transceiver (default)
     # transceiver_runtime == "PYTHON" -> use Python transceiver
diff --git a/tests/unittest/disaggregated/test_chunked_transfer.py b/tests/unittest/disaggregated/test_chunked_transfer.py
@@ -74,8 +74,9 @@ def _make_tx_session(num_slices: int, rid: int = 42, **kwargs) -> TxSession:
         s = KVSlice(
             is_last_slice=(i == num_slices - 1),
             block_ids_per_layer_groups=[[i]],
+            chunk_block_offset=i,
         )
-        session.send(s, chunk_block_offset=i)
+        session.send(s)
     return session
 
 
@@ -103,19 +104,19 @@ def _make_rx_session(num_slices: int, rid: int = 42) -> RxSession:
 
 
 def test_kv_send_task_chunk_block_offset():
-    """KVSendTask stores chunk_block_offset correctly."""
-    s = KVSlice(is_last_slice=False, block_ids_per_layer_groups=[[0, 1]])
-    task = KVSendTask(s, _make_params(), slice_id=1, chunk_block_offset=512)
-    assert task.chunk_block_offset == 512
+    """KVSendTask reads chunk_block_offset from the slice."""
+    s = KVSlice(is_last_slice=False, block_ids_per_layer_groups=[[0, 1]], chunk_block_offset=512)
+    task = KVSendTask(s, _make_params(), slice_id=1)
+    assert task._slice.chunk_block_offset == 512
     assert task.slice_id == 1
     assert task._slice is s
 
 
 def test_kv_send_task_default_offset():
-    """Default chunk_block_offset is 0."""
+    """Default chunk_block_offset on KVSlice is 0."""
     s = KVSlice(is_last_slice=True, block_ids_per_layer_groups=[[0]])
     task = KVSendTask(s, _make_params(), slice_id=0)
-    assert task.chunk_block_offset == 0
+    assert task._slice.chunk_block_offset == 0
 
 
 # ---------------------------------------------------------------------------
@@ -281,6 +282,47 @@ def test_drain_pending_releases():
     assert calls[2].args == (20, 32)
 
 
+def test_drain_pending_releases_tolerates_stale_rid():
+    """A pending release for a request that was already removed must be a no-op.
+
+    Models the production race where the sender worker enqueues a release
+    after the main thread has already torn the sequence down via
+    ``removeSequence``.  ``KVCacheManager.release_prefix_blocks`` returns
+    early in that case, so ``_drain_pending_releases`` must not raise.
+    """
+    from tensorrt_llm._torch.disaggregation.transceiver import KvCacheTransceiverV2
+
+    transceiver = MagicMock()
+    transceiver._pending_prefix_releases = queue.Queue()
+    transceiver._kv_cache_manager = MagicMock()
+    # Manager wrapper is a no-op for unknown rids; drain must propagate that
+    # no-op semantics rather than crashing.
+    transceiver._kv_cache_manager.release_prefix_blocks = MagicMock(return_value=None)
+
+    transceiver._pending_prefix_releases.put((9999, 64))  # unknown rid
+    transceiver._pending_prefix_releases.put((9999, 128))
+
+    KvCacheTransceiverV2._drain_pending_releases(transceiver)
+
+    calls = transceiver._kv_cache_manager.release_prefix_blocks.call_args_list
+    assert len(calls) == 2
+    assert calls[0].args == (9999, 64)
+    assert calls[1].args == (9999, 128)
+
+
+def test_drain_pending_releases_empty_queue_is_noop():
+    """Drain on an empty queue is a no-op and never calls the manager."""
+    from tensorrt_llm._torch.disaggregation.transceiver import KvCacheTransceiverV2
+
+    transceiver = MagicMock()
+    transceiver._pending_prefix_releases = queue.Queue()
+    transceiver._kv_cache_manager = MagicMock()
+
+    KvCacheTransceiverV2._drain_pending_releases(transceiver)
+
+    transceiver._kv_cache_manager.release_prefix_blocks.assert_not_called()
+
+
 @pytest.mark.parametrize(
     "has_release,chunk_size,expected_none",
     [
diff --git a/tests/unittest/disaggregated/test_kv_transfer.py b/tests/unittest/disaggregated/test_kv_transfer.py
@@ -1564,8 +1564,9 @@ def add_and_verify_chunked_request(
             kv_slice = KVSlice(
                 is_last_slice=is_last,
                 block_ids_per_layer_groups=chunk_block_ids,
+                chunk_block_offset=chunk_offset,
             )
-            sender_session.send(kv_slice, chunk_block_offset=chunk_offset)
+            sender_session.send(kv_slice)
             chunk_offset += max(len(ids) for ids in chunk_block_ids)
 
     receiver_sessions = [

Original file line number	Diff line number	Diff line change
`@@ -2916,9 +2916,12 @@ void BlockManager::releasePrefixBlocks(GenerationRequest& sequence, SizeType32 n`
`2916`	`2916`	`manager.releasePrefixBlocks(sequence, startIdx, numBlocks);`
`2917`	`2917`	`}`
`2918`	`2918`	`// Advance the shared counter once, after all managers have released.`
	`2919`	`+ // Uses incrementNumFrontBlocksRemoved (counter-only) instead of`
	`2920`	`+ // removeFrontBlock so the intent is explicit and we do not depend on`
	`2921`	`+ // removeFrontBlock ignoring its windowSize argument.`
`2919`	`2922`	`while (sequence.getNumFrontBlocksRemoved() < numBlocks)`
`2920`	`2923`	`{`
`2921`		`- sequence.removeFrontBlock(0);`
	`2924`	`+ sequence.incrementNumFrontBlocksRemoved();`
`2922`	`2925`	`}`
`2923`	`2926`	`}`
`2924`	`2927`
`@@ -3942,6 +3945,16 @@ std::optional<KVCacheBlock::IdType> KVCacheManager::removeSequence(`
`3942`	`3945`
`3943`	`3946`	`void KVCacheManager::releasePrefixBlocks(RequestIdType requestId, SizeType32 numBlocks)`
`3944`	`3947`	`{`
	`3948`	`+ // Hard precondition: BlockManager::releasePrefixBlocks advances the shared`
	`3949`	`+ // mNumFrontBlocksRemoved counter to numBlocks for every WindowBlockManager,`
	`3950`	`+ // even when a window has fewer than numBlocks allocated. Under variable`
	`3951`	`+ // sliding window attention (VSWA), that would cause WindowBlockManager::`
	`3952`	`+ // releaseBlocks (called during removeSequence) to underrun rbegin() and`
	`3953`	`+ // skip tail blocks for the smaller window. Disagg serving already gates`
	`3954`	`+ // VSWA out, but we enforce the assumption here so the C++ API contract is`
	`3955`	`+ // self-defending instead of relying on caller discipline.`
	`3956`	`+ TLLM_CHECK_WITH_INFO(`
	`3957`	`+ !mBlockManager.isVariableWindow(), "releasePrefixBlocks does not support variable sliding window attention");`
`3945`	`3958`	`if (numBlocks <= 0)`
`3946`	`3959`	`{`
`3947`	`3960`	`return;`