NVIDIA
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
Lines changed: 0 additions & 29 deletions
diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
Lines changed: 0 additions & 90 deletions
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp
Lines changed: 1 addition & 3 deletions
diff --git a/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp b/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp
Lines changed: 0 additions & 53 deletions
diff --git a/tensorrt_llm/_torch/disaggregation/native/transfer.py b/tensorrt_llm/_torch/disaggregation/native/transfer.py
Lines changed: 3 additions & 36 deletions
@@ -681,15 +681,6 @@ class GenerationRequest
         ++mNumFrontBlocksRemovedPerWindow.at(windowSize);
     }
 
-    //! \brief Advance the per-window front-block counter without touching cache blocks.
-    //! \details Used by ``BlockManager::releasePrefixBlocks`` to advance the
-    //! single-window front-block counter once after every ``WindowBlockManager`` has
-    //! processed the same prefix range.
-    void incrementNumFrontBlocksRemoved(SizeType32 windowSize)
-    {
-        ++mNumFrontBlocksRemovedPerWindow.at(windowSize);
-    }
-
     void removeLastBlock(SizeType32 windowSize)
     {
         for (auto& beamBlockIds : mCacheBlockIds.at(windowSize))
@@ -982,14 +973,6 @@ class WindowBlockManager
     std::optional<KVCacheBlock::IdType> releaseBlocks(
         GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest);
 
-    //! \brief Release prefix blocks in range [startIdx, numBlocks) for a sequence.
-    //! \details Used by disaggregated serving to free sender-side KV memory
-    //! for blocks whose data has already been transferred.  Reuses the
-    //! detachFrontBlock mechanism (decRefCount + eviction policy release).
-    //! Called by BlockManager::releasePrefixBlocks which coordinates the
-    //! single-window front-block counter across all window managers.
-    void releasePrefixBlocks(GenerationRequest& sequence, SizeType32 startIdx, SizeType32 numBlocks);
-
     //! \brief Simulate freeing all blocks for that sequence to check impact on number of free blocks
     void schedulingReleaseBlocks(LlmRequest::RequestIdType requestId);
 
@@ -1531,13 +1514,6 @@ class BlockManager
     std::optional<KVCacheBlock::IdType> releaseBlocks(
         GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt, bool pinBlocks = false);
 
-    //! \brief Release the first numBlocks prefix blocks of a sequence.
-    //! \details Mirrors detachFrontBlock logic: decRefCount + eviction policy
-    //! release for each prefix block.  The front-block counter on
-    //! GenerationRequest ensures releaseBlocks (called during removeSequence)
-    //! skips already-freed prefix blocks.
-    void releasePrefixBlocks(GenerationRequest& sequence, SizeType32 numBlocks);
-
     [[nodiscard]] std::vector<KVCacheBlock::IdType> storeBlocksForReuse(
         GenerationRequest& sequence, OptionalRef<LlmRequest const> llmRequest = std::nullopt, bool pinBlocks = false);
 
@@ -2455,11 +2431,6 @@ class KVCacheManager : public BaseKVCacheManager
     [[nodiscard]] std::optional<KVCacheBlock::IdType> removeSequence(LlmRequest::RequestIdType requestId,
         OptionalRef<LlmRequest const> llmRequest = std::nullopt, bool pinOnRelease = false) override;
 
-    //! \brief Release prefix blocks for a sequence without removing it.
-    //! \details Used by disaggregated serving for early block release during
-    //! chunked KV cache transfer.  No-op if the sequence does not exist.
-    void releasePrefixBlocks(LlmRequest::RequestIdType requestId, SizeType32 numBlocks);
-
     void schedulingRemoveSequence(LlmRequest::RequestIdType requestId) override;
 
     [[nodiscard]] runtime::ITensor::SharedPtr getBlockPoolPointers() const override
 
@@ -2897,34 +2897,6 @@ std::optional<KVCacheBlock::IdType> BlockManager::releaseBlocks(
     return lastStoredId;
 }
 
-void BlockManager::releasePrefixBlocks(GenerationRequest& sequence, SizeType32 numBlocks)
-{
-    // NOTE: This assumes a single window size (no VSWA).  With different window
-    // sizes, each WindowBlockManager may have a different number of allocated
-    // blocks, so releasing the same numBlocks from all managers would need
-    // per-window-size handling.  Disaggregated serving does not support VSWA
-    // today (gated by should_store_blocks: not is_vswa in the executor and
-    // beamWidth == 1 assertion in WindowBlockManager::releasePrefixBlocks).
-    //
-    auto const windowSize = mWindowBlockManagers.cbegin()->first;
-    // Snapshot the counter before iterating so that every WindowBlockManager
-    // releases the same range.  Without this, the first manager would advance
-    // the single-window front-block counter and subsequent managers would see
-    // the counter already at the target, skipping their own blocks.
-    SizeType32 const startIdx = sequence.getNumFrontBlocksRemoved(windowSize);
-    for (auto& [_, manager] : mWindowBlockManagers)
-    {
-        manager.releasePrefixBlocks(sequence, startIdx, numBlocks);
-    }
-    // Advance the single-window counter once, after all managers have released.
-    // Uses incrementNumFrontBlocksRemoved (counter-only) instead of
-    // removeFrontBlock so the intent is explicit.
-    while (sequence.getNumFrontBlocksRemoved(windowSize) < numBlocks)
-    {
-        sequence.incrementNumFrontBlocksRemoved(windowSize);
-    }
-}
-
 void BlockManager::pinBlocks(GenerationRequest& sequence)
 {
     for (auto& [_, manager] : mWindowBlockManagers)
@@ -3737,43 +3709,6 @@ void WindowBlockManager::detachFrontBlock(GenerationRequest& sequence)
         sequence.getNumFrontBlocksRemoved(mWindowSize));
 }
 
-void WindowBlockManager::releasePrefixBlocks(GenerationRequest& sequence, SizeType32 startIdx, SizeType32 numBlocks)
-{
-    TLLM_CHECK_WITH_INFO(
-        sequence.getBeamWidth() == 1, "[kv cache manager] releasePrefixBlocks does not support beamWidth > 1");
-
-    auto const requestId = sequence.getRequestId();
-    auto& allocatedBlocks = mAllocatedBlocksPerSeq.at(requestId);
-    SizeType32 const target = std::min(numBlocks, static_cast<SizeType32>(allocatedBlocks.size()));
-
-    // Release blocks in range [startIdx, target).  The single-window
-    // front-block counter is advanced by BlockManager after
-    // all WindowBlockManagers have processed the same range.
-    for (SizeType32 blockIdx = startIdx; blockIdx < target; ++blockIdx)
-    {
-        auto& block = allocatedBlocks.at(blockIdx);
-        auto releasedBlock = block;
-
-        TLLM_LOG_DEBUG("%s::releasePrefixBlocks - Releasing block %d from sequence %lu", mLogPrefix.c_str(),
-            releasedBlock->getBlockId(), requestId);
-
-        // Replace the sequence slot with a placeholder, matching detachFrontBlock().
-        // removeSequence later walks allocatedBlocks in releaseBlocks(); leaving the
-        // real block here would release it a second time and corrupt the eviction
-        // policy's free-block count.
-        block = KVCacheBlock::createPlaceholder();
-
-        if (releasedBlock->hasRefs())
-        {
-            releasedBlock->decRefCount();
-        }
-        if (!releasedBlock->hasRefs())
-        {
-            mEvictionPolicy->releaseBlock(releasedBlock);
-        }
-    }
-}
-
 PrefixReuseSummary KVCacheManager::analyzePrefixReuse(
     VecUniqueTokens const& uniqueTokens, LlmRequest const& llmRequest) const
 {
@@ -3950,31 +3885,6 @@ std::optional<KVCacheBlock::IdType> KVCacheManager::removeSequence(
     return lastStoredId;
 }
 
-void KVCacheManager::releasePrefixBlocks(RequestIdType requestId, SizeType32 numBlocks)
-{
-    // Hard precondition: BlockManager::releasePrefixBlocks advances the
-    // single-window front-block counter to numBlocks for every WindowBlockManager,
-    // even when a window has fewer than numBlocks allocated.  Under variable
-    // sliding window attention (VSWA), that would cause WindowBlockManager::
-    // releaseBlocks (called during removeSequence) to underrun rbegin() and
-    // skip tail blocks for the smaller window.  Disagg serving already gates
-    // VSWA out, but we enforce the assumption here so the C++ API contract is
-    // self-defending instead of relying on caller discipline.
-    TLLM_CHECK_WITH_INFO(
-        !mBlockManager.isVariableWindow(), "releasePrefixBlocks does not support variable sliding window attention");
-    if (numBlocks <= 0)
-    {
-        return;
-    }
-    std::scoped_lock lock(mSequencesMtx);
-    auto it = mSequences.find(requestId);
-    if (it == mSequences.end())
-    {
-        return;
-    }
-    mBlockManager.releasePrefixBlocks(it->second, numBlocks);
-}
-
 std::vector<KVCacheBlock::IdType> KVCacheManager::storeBlocksForReuse(
     RequestIdType requestId, OptionalRef<LlmRequest const> llmRequest, bool pinBlocks)
 {
 
@@ -683,9 +683,7 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(nb::module_& m)
         .def("copy_linear_attention_block", &tbk::KVCacheManager::copyLinearAttentionBlock, nb::arg("llm_request"),
             nb::call_guard<nb::gil_scoped_release>())
         .def("copy_linear_attention_block_batch", &tbk::KVCacheManager::copyLinearAttentionBlockBatch,
-            nb::arg("llm_requests"), nb::call_guard<nb::gil_scoped_release>())
-        .def("release_prefix_blocks", &tbk::KVCacheManager::releasePrefixBlocks, nb::arg("request_id"),
-            nb::arg("num_blocks"), nb::call_guard<nb::gil_scoped_release>());
+            nb::arg("llm_requests"), nb::call_guard<nb::gil_scoped_release>());
 }
 
 void tb::BasePeftCacheManagerBindings::initBindings(nb::module_& m)
 
@@ -270,59 +270,6 @@ TEST_F(KVCacheManagerTest, BlockManagerTest)
         std::runtime_error);
 }
 
-TEST_F(KVCacheManagerTest, BlockManagerReleasePrefixBlocksDoesNotDoubleFreeOnTeardown)
-{
-    auto constexpr numLayers = 12;
-    auto constexpr numKvHeads = 6;
-    auto constexpr sizePerHead = 128;
-    auto constexpr tokensPerBlock = 4;
-    auto constexpr blocksInPrimaryPool = 8;
-    auto constexpr blocksInSecondaryPool = 0;
-    auto constexpr maxNumSequences = 8;
-    auto const stream = std::make_shared<tr::CudaStream>();
-
-    auto constexpr beamWidth = 1;
-    auto constexpr numBlocksPerBeam = 4;
-    auto constexpr numTokens = tokensPerBlock * numBlocksPerBeam;
-    auto constexpr maxAttentionWindow = numTokens;
-
-    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}};
-
-    BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow,
-        maxNumSequences, stream, maxAttentionWindow, beamWidth,
-        std::vector<BlockManager::SizeType32>{maxAttentionWindow}, nvinfer1::DataType::kHALF, 0, maxAttentionWindow);
-    blockManager.allocatePools(false);
-
-    SizeType32 constexpr maxNewTokens{0};
-    tr::SamplingConfig const samplingConfig{beamWidth};
-    bool constexpr isStreaming{false};
-
-    auto tokens = std::make_shared<VecTokens>();
-    for (SizeType32 i = 0; i < numTokens; ++i)
-    {
-        tokens->push_back(i);
-    }
-
-    LlmRequest::RequestIdType constexpr requestId{42};
-    auto llmReq = std::make_shared<LlmRequest>(requestId, maxNewTokens, tokens, samplingConfig, isStreaming);
-    GenerationRequest seq{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
-
-    (void) blockManager.addSequenceBatch(
-        {&seq}, {numTokens}, {numBlocksPerBeam}, {std::ref(*llmReq)}, maxAttentionWindow, /*isEnableBlockReuse=*/false);
-    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocksPerBeam);
-
-    blockManager.releasePrefixBlocks(seq, 2);
-    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - 2);
-
-    // releasePrefixBlocks has cumulative semantics. This should release only
-    // one additional block rather than releasing the first two again.
-    blockManager.releasePrefixBlocks(seq, 3);
-    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - 1);
-
-    blockManager.releaseBlocks(seq);
-    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);
-}
-
 template <typename T>
 void writePatternToOffloadedBlocksDRAM(T* rawBlockPtr, int blockSize, int mask)
 {
 
@@ -7,7 +7,7 @@
 import weakref
 from dataclasses import dataclass
 from enum import Enum
-from typing import Callable, List, Optional, Union
+from typing import List, Optional, Union
 
 import msgpack
 import numpy as np
@@ -57,8 +57,6 @@
 AttentionTypeCpp = tensorrt_llm.bindings.internal.batch_manager.AttentionType
 LlmRequestType = tensorrt_llm.bindings.internal.batch_manager.LlmRequestType
 
-OnChunkTransferredCallback = Callable[[int, int, int], None]
-
 # Number of worker threads for KV transfer queues (default: 1)
 KV_TRANSFER_NUM_THREADS = int(os.environ.get("TRTLLM_KV_TRANSFER_NUM_THREADS", "1"))
 
@@ -569,29 +567,6 @@ def _deliver_kv_to_agent(self, write_meta: WriteMeta):
                 )
             else:
                 task.complete()
-                if session._on_chunk_transferred is not None:
-                    try:
-                        # Use the max across layer groups as the
-                        # cumulative release count.  For asymmetric
-                        # layer groups (e.g., sliding window), shorter
-                        # groups may have fewer blocks per chunk, but
-                        # each WindowBlockManager independently clamps
-                        # to its own allocated block count via
-                        # min(numBlocks, allocatedBlocks.size()).
-                        num_blocks = max(
-                            (len(ids) for ids in task._slice.block_ids_per_layer_groups),
-                            default=0,
-                        )
-                        session._on_chunk_transferred(
-                            request_id=session.request_id,
-                            chunk_block_offset=task._slice.chunk_block_offset,
-                            num_blocks=num_blocks,
-                        )
-                    except Exception as e:
-                        logger.warning(
-                            f"on_chunk_transferred callback failed for "
-                            f"request {session.request_id} slice {write_meta.slice_id}: {e}"
-                        )
 
         logger.debug(
             f"deliver_kv_to_agent completed: unique_rid={write_meta.unique_rid}, "
@@ -751,10 +726,10 @@ def _build_kv_write_meta(self, task: KVSendTask, req_info: RecvReqInfo) -> Write
                         f"src={src_block_ids.size}, dst={dst_block_ids.size}"
                     )
                     dst_block_ids = dst_block_ids[:-1]
-                elif block_diff != 0:
+                elif block_diff > 1:
                     raise ValueError(
                         f"src/dst block count mismatch: {src_block_ids.size} vs "
-                        f"{dst_block_ids.size} (expected 0 <= diff <= 1)"
+                        f"{dst_block_ids.size} (expected diff <= 1)"
                     )
                 tpb = extractor.page_table.tokens_per_block
                 token_range = task._slice.token_range
@@ -1131,7 +1106,6 @@ def __init__(
         timeout_s: Optional[float] = None,
         prompt_len: Optional[int] = None,
         beam_width: int = 1,
-        on_chunk_transferred: Optional[OnChunkTransferredCallback] = None,
     ):
         super().__init__(
             sender,
@@ -1147,7 +1121,6 @@ def __init__(
         self.kv_tasks = []
         self.aux_task = None
         self.lock = threading.Lock()
-        self._on_chunk_transferred = on_chunk_transferred
 
         self._exception: Optional[Exception] = None
         self._closed = False
@@ -2046,16 +2019,11 @@ def populate_instance_and_rank_info(self, endpoints: list[str], layer_num_per_pp
     def create_tx_session(
         self,
         request: LlmRequest,
-        on_chunk_transferred: Optional[OnChunkTransferredCallback] = None,
     ) -> TxSession:
         """Create a TxSession for the given request.
 
         Args:
             request: The LLM request to create a send session for.
-            on_chunk_transferred: Optional callback invoked on the
-                sender worker thread after each chunk's RDMA completes.
-                Signature: ``(request_id: int, chunk_block_offset: int,
-                num_blocks: int) -> None``.
 
         Returns:
             A new ``TxSession`` ready to accept ``send()`` calls.
@@ -2070,7 +2038,6 @@ def create_tx_session(
             timeout_s=self._config.tx_timeout_s,
             prompt_len=request.prompt_len,
             beam_width=request.py_beam_width,
-            on_chunk_transferred=on_chunk_transferred,
         )
 
     def create_rx_session(self, request: LlmRequest) -> RxSession: