fix ups

athena-nv · athena-nv · commit 7746bbcd35f9 · 2026-06-22T22:41:20.000Z
Signed-off-by: Athena Cai <athenac@nvidia.com>
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -681,15 +681,13 @@ class GenerationRequest
         ++mNumFrontBlocksRemovedPerWindow.at(windowSize);
     }
 
-    //! \brief Advance ``mNumFrontBlocksRemoved`` without touching cache blocks.
+    //! \brief Advance the per-window front-block counter without touching cache blocks.
     //! \details Used by ``BlockManager::releasePrefixBlocks`` to advance the
-    //! shared front-block counter once after every ``WindowBlockManager`` has
-    //! processed the same prefix range.  Has clearer intent than calling
-    //! ``removeFrontBlock`` with a sentinel ``windowSize`` value, and is robust
-    //! to future changes that consume the ``windowSize`` argument.
-    void incrementNumFrontBlocksRemoved()
+    //! front-block counter of ``windowSize`` once after every ``WindowBlockManager``
+    //! has processed the same prefix range.
+    void incrementNumFrontBlocksRemoved(SizeType32 windowSize)
     {
-        ++mNumFrontBlocksRemoved;
+        ++mNumFrontBlocksRemovedPerWindow.at(windowSize);
     }
 
     void removeLastBlock(SizeType32 windowSize)
@@ -989,7 +987,7 @@ class WindowBlockManager
     //! for blocks whose data has already been transferred.  Reuses the
     //! detachFrontBlock mechanism (decRefCount + eviction policy release).
     //! Called by BlockManager::releasePrefixBlocks which coordinates the
-    //! shared mNumFrontBlocksRemoved counter across all window managers.
+    //! single-window front-block counter across all window managers.
     void releasePrefixBlocks(GenerationRequest& sequence, SizeType32 startIdx, SizeType32 numBlocks);
 
     //! \brief Simulate freeing all blocks for that sequence to check impact on number of free blocks
@@ -1535,7 +1533,7 @@ class BlockManager
 
     //! \brief Release the first numBlocks prefix blocks of a sequence.
     //! \details Mirrors detachFrontBlock logic: decRefCount + eviction policy
-    //! release for each prefix block.  The mNumFrontBlocksRemoved counter on
+    //! release for each prefix block.  The front-block counter on
     //! GenerationRequest ensures releaseBlocks (called during removeSequence)
     //! skips already-freed prefix blocks.
     void releasePrefixBlocks(GenerationRequest& sequence, SizeType32 numBlocks);
diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -2906,22 +2906,22 @@ void BlockManager::releasePrefixBlocks(GenerationRequest& sequence, SizeType32 n
     // today (gated by should_store_blocks: not is_vswa in the executor and
     // beamWidth == 1 assertion in WindowBlockManager::releasePrefixBlocks).
     //
+    auto const windowSize = mWindowBlockManagers.cbegin()->first;
     // Snapshot the counter before iterating so that every WindowBlockManager
     // releases the same range.  Without this, the first manager would advance
-    // the shared mNumFrontBlocksRemoved counter and subsequent managers would
-    // see the counter already at the target, skipping their own blocks.
-    SizeType32 const startIdx = sequence.getNumFrontBlocksRemoved();
+    // the single-window front-block counter and subsequent managers would see
+    // the counter already at the target, skipping their own blocks.
+    SizeType32 const startIdx = sequence.getNumFrontBlocksRemoved(windowSize);
     for (auto& [_, manager] : mWindowBlockManagers)
     {
         manager.releasePrefixBlocks(sequence, startIdx, numBlocks);
     }
-    // Advance the shared counter once, after all managers have released.
+    // Advance the single-window counter once, after all managers have released.
     // Uses incrementNumFrontBlocksRemoved (counter-only) instead of
-    // removeFrontBlock so the intent is explicit and we do not depend on
-    // removeFrontBlock ignoring its windowSize argument.
-    while (sequence.getNumFrontBlocksRemoved() < numBlocks)
+    // removeFrontBlock so the intent is explicit.
+    while (sequence.getNumFrontBlocksRemoved(windowSize) < numBlocks)
     {
-        sequence.incrementNumFrontBlocksRemoved();
+        sequence.incrementNumFrontBlocksRemoved(windowSize);
     }
 }
 
@@ -3746,23 +3746,30 @@ void WindowBlockManager::releasePrefixBlocks(GenerationRequest& sequence, SizeTy
     auto& allocatedBlocks = mAllocatedBlocksPerSeq.at(requestId);
     SizeType32 const target = std::min(numBlocks, static_cast<SizeType32>(allocatedBlocks.size()));
 
-    // Release blocks in range [startIdx, target).  The shared
-    // mNumFrontBlocksRemoved counter is advanced by BlockManager after
+    // Release blocks in range [startIdx, target).  The single-window
+    // front-block counter is advanced by BlockManager after
     // all WindowBlockManagers have processed the same range.
     for (SizeType32 blockIdx = startIdx; blockIdx < target; ++blockIdx)
     {
         auto& block = allocatedBlocks.at(blockIdx);
+        auto releasedBlock = block;
 
         TLLM_LOG_DEBUG("%s::releasePrefixBlocks - Releasing block %d from sequence %lu", mLogPrefix.c_str(),
-            block->getBlockId(), requestId);
+            releasedBlock->getBlockId(), requestId);
 
-        if (block->hasRefs())
+        // Replace the sequence slot with a placeholder, matching detachFrontBlock().
+        // removeSequence later walks allocatedBlocks in releaseBlocks(); leaving the
+        // real block here would release it a second time and corrupt the eviction
+        // policy's free-block count.
+        block = KVCacheBlock::createPlaceholder();
+
+        if (releasedBlock->hasRefs())
         {
-            block->decRefCount();
+            releasedBlock->decRefCount();
         }
-        if (!block->hasRefs())
+        if (!releasedBlock->hasRefs())
         {
-            mEvictionPolicy->releaseBlock(block);
+            mEvictionPolicy->releaseBlock(releasedBlock);
         }
     }
 }
@@ -3945,8 +3952,8 @@ std::optional<KVCacheBlock::IdType> KVCacheManager::removeSequence(
 
 void KVCacheManager::releasePrefixBlocks(RequestIdType requestId, SizeType32 numBlocks)
 {
-    // Hard precondition: BlockManager::releasePrefixBlocks advances the shared
-    // mNumFrontBlocksRemoved counter to numBlocks for every WindowBlockManager,
+    // Hard precondition: BlockManager::releasePrefixBlocks advances the
+    // single-window front-block counter to numBlocks for every WindowBlockManager,
     // even when a window has fewer than numBlocks allocated.  Under variable
     // sliding window attention (VSWA), that would cause WindowBlockManager::
     // releaseBlocks (called during removeSequence) to underrun rbegin() and
diff --git a/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp b/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp
@@ -270,6 +270,59 @@ TEST_F(KVCacheManagerTest, BlockManagerTest)
         std::runtime_error);
 }
 
+TEST_F(KVCacheManagerTest, BlockManagerReleasePrefixBlocksDoesNotDoubleFreeOnTeardown)
+{
+    auto constexpr numLayers = 12;
+    auto constexpr numKvHeads = 6;
+    auto constexpr sizePerHead = 128;
+    auto constexpr tokensPerBlock = 4;
+    auto constexpr blocksInPrimaryPool = 8;
+    auto constexpr blocksInSecondaryPool = 0;
+    auto constexpr maxNumSequences = 8;
+    auto const stream = std::make_shared<tr::CudaStream>();
+
+    auto constexpr beamWidth = 1;
+    auto constexpr numBlocksPerBeam = 4;
+    auto constexpr numTokens = tokensPerBlock * numBlocksPerBeam;
+    auto constexpr maxAttentionWindow = numTokens;
+
+    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {blocksInPrimaryPool, blocksInSecondaryPool}}};
+
+    BlockManager blockManager(std::vector(numLayers, numKvHeads), sizePerHead, tokensPerBlock, blocksPerWindow,
+        maxNumSequences, stream, maxAttentionWindow, beamWidth,
+        std::vector<BlockManager::SizeType32>{maxAttentionWindow}, nvinfer1::DataType::kHALF, 0, maxAttentionWindow);
+    blockManager.allocatePools(false);
+
+    SizeType32 constexpr maxNewTokens{0};
+    tr::SamplingConfig const samplingConfig{beamWidth};
+    bool constexpr isStreaming{false};
+
+    auto tokens = std::make_shared<VecTokens>();
+    for (SizeType32 i = 0; i < numTokens; ++i)
+    {
+        tokens->push_back(i);
+    }
+
+    LlmRequest::RequestIdType constexpr requestId{42};
+    auto llmReq = std::make_shared<LlmRequest>(requestId, maxNewTokens, tokens, samplingConfig, isStreaming);
+    GenerationRequest seq{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()};
+
+    (void) blockManager.addSequenceBatch(
+        {&seq}, {numTokens}, {numBlocksPerBeam}, {std::ref(*llmReq)}, maxAttentionWindow, /*isEnableBlockReuse=*/false);
+    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocksPerBeam);
+
+    blockManager.releasePrefixBlocks(seq, 2);
+    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - 2);
+
+    // releasePrefixBlocks has cumulative semantics. This should release only
+    // one additional block rather than releasing the first two again.
+    blockManager.releasePrefixBlocks(seq, 3);
+    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - 1);
+
+    blockManager.releaseBlocks(seq);
+    EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool);
+}
+
 template <typename T>
 void writePatternToOffloadedBlocksDRAM(T* rawBlockPtr, int blockSize, int mask)
 {
diff --git a/tensorrt_llm/_torch/disaggregation/native/transfer.py b/tensorrt_llm/_torch/disaggregation/native/transfer.py
@@ -1168,6 +1168,10 @@ def disagg_request_id(self) -> int:
     def status(self) -> SessionStatus:
         if self._terminal_status is not None:
             return self._terminal_status
+        if self._exception is not None or any(t.status == TaskStatus.ERROR for t in self.kv_tasks):
+            return SessionStatus.ERROR
+        if self.aux_task is not None and self.aux_task.status == TaskStatus.ERROR:
+            return SessionStatus.ERROR
         kv_all_transferred = bool(self.kv_tasks) and all(
             t.status == TaskStatus.TRANSFERRED for t in self.kv_tasks
         )
@@ -1755,15 +1759,15 @@ def process_kv_agent_result(
                 )
 
     def process_aux_agent_result(self, _peer_rank: int, status: AgentResult):
-        # Aux is session-level (not per-slice); expected_transfers is identical
-        # across all kv_tasks, so any task provides the right count.
+        # Aux is session-level (not per-slice); use the final KV task's
+        # expected transfer count so chunked sessions wait for all senders.
         with self.lock:
             if not self._kv_tasks:
                 logger.warning(
                     f"Aux result received before any KV tasks for request {self.request_id}"
                 )
                 return
-            task = self._kv_tasks[0]
+            task = self._kv_tasks[-1]
             if status == AgentResult.SUCCESS:
                 self._aux_count += 1
 
diff --git a/tensorrt_llm/_torch/disaggregation/transceiver.py b/tensorrt_llm/_torch/disaggregation/transceiver.py
@@ -353,6 +353,10 @@ def _make_chunk_callback(self) -> Optional[Callable]:
         release_queue = self._pending_prefix_releases
 
         def _on_chunk_transferred(request_id: int, chunk_block_offset: int, num_blocks: int):
+            logger.debug(
+                f"Early release _on_chunk_transferred: request_id: {request_id}, "
+                f"chunk_block_offset: {chunk_block_offset}, num_blocks: {num_blocks}"
+            )
             cumulative_blocks = chunk_block_offset + num_blocks
             release_queue.put((request_id, cumulative_blocks))
 
diff --git a/tensorrt_llm/_torch/models/__init__.py b/tensorrt_llm/_torch/models/__init__.py
@@ -15,7 +15,6 @@
 from .modeling_deepseekv3 import DeepseekV3ForCausalLM
 from .modeling_exaone4 import Exaone4ForCausalLM
 from .modeling_exaone4_5 import Exaone4_5_ForConditionalGeneration
-from .modeling_exaone_moe import ExaoneMoeForCausalLM
 from .modeling_gemma3 import Gemma3ForCausalLM
 from .modeling_gemma3vl import Gemma3VLM
 from .modeling_glm import Glm4MoeForCausalLM
@@ -57,6 +56,11 @@
 from .modeling_utils import get_model_architecture
 from .modeling_vila import VilaModel
 
+try:
+    from .modeling_exaone_moe import ExaoneMoeForCausalLM
+except ImportError:
+    ExaoneMoeForCausalLM = None
+
 # Note: for better readiblity, this should have same order as imports above
 __all__ = [
     "AfmoeForCausalLM",
@@ -67,7 +71,6 @@
     "DeepseekV3ForCausalLM",
     "Exaone4ForCausalLM",
     "Exaone4_5_ForConditionalGeneration",
-    "ExaoneMoeForCausalLM",
     "Gemma3ForCausalLM",
     "Gemma3VLM",
     "HCXVisionForCausalLM",
@@ -116,6 +119,9 @@
     "Step3p7VLForConditionalGeneration",
 ]
 
+if ExaoneMoeForCausalLM is not None:
+    __all__.append("ExaoneMoeForCausalLM")
+
 if transformers.__version__ >= "4.45.1":
     from .modeling_mllama import MllamaForConditionalGeneration  # noqa
 
diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -733,6 +733,49 @@ def test_kv_cache_v2_nixl_python(self):
                                       self.MODEL_PATH) as llm:
             run_accuracy_test(llm, self.MODEL_NAME, ["GSM8K"])
 
+    @skip_pre_hopper
+    @pytest.mark.skip_less_device(2)
+    @parametrize_with_ids("chunk_size_blocks", [64])
+    @parametrize_with_ids("enable_block_reuse", [False, True])
+    def test_chunked_kv_transfer_nixl_python_accuracy(self,
+                                                      chunk_size_blocks: int,
+                                                      enable_block_reuse: bool):
+        """Test chunked KV transfer accuracy using Python transceiver and C++ KVCacheManager."""
+        kv_cache_config = {
+            "use_kv_cache_manager_v2": False,
+            "enable_block_reuse": enable_block_reuse,
+        }
+        cache_transceiver_config = {
+            "backend": "NIXL",
+            "transceiver_runtime": "PYTHON",
+            "max_tokens_in_buffer": 4096,
+            "chunk_size_blocks": chunk_size_blocks,
+        }
+        ctx_server_config = {
+            "disable_overlap_scheduler": True,
+            "kv_cache_config": dict(kv_cache_config),
+            "cache_transceiver_config": dict(cache_transceiver_config),
+        }
+        gen_server_config = {
+            "disable_overlap_scheduler": False,
+            "kv_cache_config": dict(kv_cache_config),
+            "cache_transceiver_config": dict(cache_transceiver_config),
+        }
+        disaggregated_server_config = {
+            "hostname": "localhost",
+            "backend": "pytorch",
+            "context_servers": {
+                "num_instances": 1,
+            },
+            "generation_servers": {
+                "num_instances": 1,
+            },
+        }
+        with launch_disaggregated_llm(disaggregated_server_config,
+                                      ctx_server_config, gen_server_config,
+                                      self.MODEL_PATH) as llm:
+            run_accuracy_test(llm, self.MODEL_NAME, ["GSM8K"])
+
     @pytest.mark.skip_less_device(2)
     def test_ngram(self):
         speculative_decoding_config = {
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -18,6 +18,7 @@ l0_dgx_b200:
   - unittest/_torch/misc/test_autotuner.py::test_autotuner_distributed_strategy
   - accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16[tp2-CUTLASS]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_bf16[tp2-TRTLLM]
+  - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_chunked_kv_transfer_nixl_python_accuracy
   # ------------- KV Cache V2 Scheduler IT (multi-GPU) ---------------
   - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2DSv3Lite::test_mtp_draft_tokens
   - kv_cache/test_kv_cache_v2_scheduler.py::TestKVCacheV2DSv3Lite::test_mtp_chunked_draft_tokens
diff --git a/tests/unittest/disaggregated/test_chunked_transfer.py b/tests/unittest/disaggregated/test_chunked_transfer.py
diff --git a/tests/unittest/disaggregated/test_kv_transfer.py b/tests/unittest/disaggregated/test_kv_transfer.py