NVIDIA
diff --git a/‎cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp‎
Lines changed: 39 additions & 17 deletions b/‎cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp‎
Lines changed: 39 additions & 17 deletions
diff --git a/‎cpp/tensorrt_llm/nanobind/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎cpp/tensorrt_llm/nanobind/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cpp/tensorrt_llm/nanobind/bindings.cpp‎
Lines changed: 2 additions & 0 deletions b/‎cpp/tensorrt_llm/nanobind/bindings.cpp‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎cpp/tensorrt_llm/nanobind/testing/kvCacheManagerTestUtilBinding.cpp‎
Lines changed: 37 additions & 0 deletions b/‎cpp/tensorrt_llm/nanobind/testing/kvCacheManagerTestUtilBinding.cpp‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎cpp/tensorrt_llm/nanobind/testing/kvCacheManagerTestUtilBinding.h‎
Lines changed: 29 additions & 0 deletions b/‎cpp/tensorrt_llm/nanobind/testing/kvCacheManagerTestUtilBinding.h‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎cpp/tensorrt_llm/testing/kvCacheManagerTestUtil.h‎
Lines changed: 43 additions & 0 deletions b/‎cpp/tensorrt_llm/testing/kvCacheManagerTestUtil.h‎
Lines changed: 43 additions & 0 deletions
diff --git a/‎cpp/tests/unit_tests/batch_manager/capacitySchedulerTest.cpp‎
Lines changed: 2 additions & 0 deletions b/‎cpp/tests/unit_tests/batch_manager/capacitySchedulerTest.cpp‎
Lines changed: 2 additions & 0 deletions
@@ -87,6 +87,33 @@ std::vector<BlockPtr> getAllSequenceBlocks(BlockPtr lastBlock)
     return sequenceBlocks;
 }
 
+// Returns how many of the request's unique tokens count as already computed (by prefill and generation).
+//
+// Chunked prefill is taken into account so that state which has not been written to KV cache yet is not stored:
+//  - LlmRequest::getContextRemainingLength() == 0: prefill has no tokens left to compute (generation mode), so every
+//    token in uniqueTokens counts as computed.
+//  - LlmRequest::getContextRemainingLength() > 0: prefill is still in progress, so the count is capped at
+//    LlmRequest::getContextCurrentPosition().
+//
+// uniqueTokens is the token sequence of the request (callers pass llmRequest.getUniqueTokens(beamIdx)).
+SizeType32 getMaterializedUniqueTokenCountForReuse(
+    VecUniqueTokens const& uniqueTokens, tensorrt_llm::batch_manager::LlmRequest const& llmRequest)
+{
+    auto const numUniqueTokens = static_cast<SizeType32>(uniqueTokens.size());
+
+    // Prefill finished: nothing to cap, all tokens held by the request are considered computed.
+    if (llmRequest.getContextRemainingLength() <= 0)
+    {
+        return numUniqueTokens;
+    }
+
+    // Prefill still running: never report more tokens than the current context position.
+    return std::min(numUniqueTokens, llmRequest.getContextCurrentPosition());
+}
+
+// Returns the number of tokens that may be stored for reuse.
+//
+// The last computed token is never stored in KV cache and therefore cannot be stored for reuse, so the result is
+// getMaterializedUniqueTokenCountForReuse() - 1, clamped so that it never drops below 0.
+SizeType32 getUsableUniqueTokenCountForReuse(
+    VecUniqueTokens const& uniqueTokens, tensorrt_llm::batch_manager::LlmRequest const& llmRequest)
+{
+    auto const computedTokenCount = getMaterializedUniqueTokenCountForReuse(uniqueTokens, llmRequest);
+
+    // Nothing computed yet: there is no token to exclude and nothing to store.
+    if (computedTokenCount <= 0)
+    {
+        return 0;
+    }
+
+    // Exclude the last computed token.
+    return computedTokenCount - 1;
+}
+
 } // namespace
 
 namespace tensorrt_llm::batch_manager::kv_cache_manager
@@ -926,12 +953,9 @@ void BlockManager::storeContextBlocks(GenerationRequest& sequence, LlmRequest co
         auto const& uniqueTokens = llmRequest.getUniqueTokens(beamIdx);
         TLLM_LOG_DEBUG("storeContextBlocks for request %lu on window %d with %d unique tokens", llmRequest.mRequestId,
             windowSize, uniqueTokens.size());
-        // only store the tokens that have been completed
-        size_t const completedTokens = llmRequest.getContextCurrentPosition();
-        auto usableSize = std::min(completedTokens, uniqueTokens.size() - 1);
-
+        auto const usableUniqueTokenCount = getUsableUniqueTokenCountForReuse(uniqueTokens, llmRequest);
         auto blockedUniqueTokens
-            = chopVectorIntoBlocks<UniqueToken>(uniqueTokens, usableSize, getTokensPerBlock(), false);
+            = chopVectorIntoBlocks<UniqueToken>(uniqueTokens, usableUniqueTokenCount, getTokensPerBlock(), false);
         auto blockKeys = buildBlockKeys(blockedUniqueTokens, llmRequest);
         (void) manager.storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx]);
     }
@@ -2369,17 +2393,17 @@ std::vector<KVCacheBlock::IdType> WindowBlockManager::storeBlocksForReuse(
     auto constexpr beamIdx = 0;
     auto const& uniqueTokens = llmRequest->getUniqueTokens(beamIdx);
     auto const& cacheBlockIds = sequence.getCacheBlockIds(mWindowSize);
-    // TODO: get the caller to mark tokens as filled / not filled, so that the kv-cache manager doesn't
-    // have to guess. Only (length - 1) tokens of the sequence have their kv-state recorded in kv-cache. We assume
-    // the last token's state is not filled yet.
-    auto usableSize = static_cast<runtime::SizeType32>(uniqueTokens.size()) - 1;
+
+    auto usableUniqueTokenCount = getUsableUniqueTokenCountForReuse(uniqueTokens, *llmRequest);
     if (isRecurrentState())
     {
-        usableSize = std::min(llmRequest->getPromptLen() - 1, usableSize); // TODO: enable store for completed sequences
+        usableUniqueTokenCount = std::min(
+            llmRequest->getPromptLen() - 1, usableUniqueTokenCount); // TODO: enable store for completed sequences
     }
     TLLM_LOG_DEBUG("%s::storeBlocksForReuse: req=%lu, windowSize=%d, uniqueTokens.size()=%zu, usableSize=%zu",
-        mLogPrefix.c_str(), llmRequest->mRequestId, mWindowSize, uniqueTokens.size(), usableSize);
-    auto blockedUniqueTokens = chopVectorIntoBlocks<UniqueToken>(uniqueTokens, usableSize, mTokensPerBlock, true);
+        mLogPrefix.c_str(), llmRequest->mRequestId, mWindowSize, uniqueTokens.size(), usableUniqueTokenCount);
+    auto blockedUniqueTokens
+        = chopVectorIntoBlocks<UniqueToken>(uniqueTokens, usableUniqueTokenCount, mTokensPerBlock, true);
     auto blockKeys = buildBlockKeys(blockedUniqueTokens, *llmRequest);
 
     auto [numStored, pinnedBlockIds] = storeBlocks(std::move(blockKeys), cacheBlockIds[beamIdx], pinBlocks);
@@ -2414,11 +2438,9 @@ std::optional<KVCacheBlock::IdType> WindowBlockManager::releaseBlocks(
                     sequence.getRequestId());
             }
             auto const& uniqueTokens = llmRequest->getUniqueTokens(/*beamIdx=*/0);
-            // Only (length - 1) tokens of the sequence have their kv-state
-            // recorded in kv-cache. We assume the last token's state is not filled yet.
-            auto const usableSize = static_cast<runtime::SizeType32>(uniqueTokens.size()) - 1;
-            auto blockedUniqueTokens
-                = chopVectorIntoBlocks<UniqueToken>(uniqueTokens, usableSize, mTokensPerBlock, /*allowPartial=*/true);
+            auto const usableUniqueTokenCount = getUsableUniqueTokenCountForReuse(uniqueTokens, *llmRequest);
+            auto blockedUniqueTokens = chopVectorIntoBlocks<UniqueToken>(
+                uniqueTokens, usableUniqueTokenCount, mTokensPerBlock, /*allowPartial=*/true);
             auto blockKeys = buildBlockKeys(blockedUniqueTokens, *llmRequest);
 
             std::vector<KVCacheBlock::IdType> cacheBlockIds(allocatedBlocks.size());
 
@@ -22,6 +22,7 @@ set(SRCS
     runtime/hostfunc.cpp
     runtime/moeBindings.cpp
     suffixAutomaton/bindings.cpp
+    testing/kvCacheManagerTestUtilBinding.cpp
     testing/modelSpecBinding.cpp
     userbuffers/bindings.cpp
     thop/bindings.cpp
 
@@ -45,6 +45,7 @@
 #include "tensorrt_llm/nanobind/process_group/bindings.h"
 #include "tensorrt_llm/nanobind/runtime/bindings.h"
 #include "tensorrt_llm/nanobind/suffixAutomaton/bindings.h"
+#include "tensorrt_llm/nanobind/testing/kvCacheManagerTestUtilBinding.h"
 #include "tensorrt_llm/nanobind/testing/modelSpecBinding.h"
 #include "tensorrt_llm/nanobind/thop/bindings.h"
 #include "tensorrt_llm/nanobind/userbuffers/bindings.h"
@@ -512,6 +513,7 @@ NB_MODULE(TRTLLM_NB_MODULE, m)
     tpb::Buffers::initBindings(mInternalBatchManager);
     tensorrt_llm::nanobind::runtime::initBindings(mInternalRuntime);
     tensorrt_llm::nanobind::testing::initBindings(mInternalTesting);
+    tensorrt_llm::nanobind::testing::initKvCacheTestUtilBindings(mInternalTesting);
     tpb::initBindings(mInternalBatchManager);
 
     tb::kv_cache_manager::KVCacheManagerConnectorBindings::initBindings(mInternalBatchManager);
 
@@ -0,0 +1,37 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kvCacheManagerTestUtilBinding.h"
+#include "tensorrt_llm/nanobind/common/customCasters.h"
+#include "tensorrt_llm/testing/kvCacheManagerTestUtil.h"
+
+#include <nanobind/nanobind.h>
+
+namespace nb = nanobind;
+
+namespace tensorrt_llm::nanobind::testing
+{
+
+// Registers the KV cache manager test helpers on the given nanobind module.
+//
+// Exposes KvCacheManagerTestUtil::simulatePrefillCompletion to Python under the name
+// "simulate_prefill_completion_only_use_for_testing", taking a single argument named "llm_request". The call is
+// wrapped in a nb::gil_scoped_release call guard. Intended for tests only.
+void initKvCacheTestUtilBindings(nb::module_& m)
+{
+    using TestUtil = tensorrt_llm::testing::KvCacheManagerTestUtil;
+
+    m.def("simulate_prefill_completion_only_use_for_testing", &TestUtil::simulatePrefillCompletion,
+        nb::arg("llm_request"), nb::call_guard<nb::gil_scoped_release>(),
+        "NEVER USE IN PRODUCTION. Simulates prefill completion on an LlmRequest for test purposes.");
+}
+
+} // namespace tensorrt_llm::nanobind::testing
@@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <nanobind/nanobind.h>
+
+namespace nb = nanobind;
+
+namespace tensorrt_llm::nanobind::testing
+{
+
+void initKvCacheTestUtilBindings(nb::module_& m);
+
+} // namespace tensorrt_llm::nanobind::testing
@@ -0,0 +1,43 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "tensorrt_llm/batch_manager/llmRequest.h"
+
+namespace tensorrt_llm::testing
+{
+
+/// @brief Helpers for KV cache manager unit tests. NEVER use in production code.
+class KvCacheManagerTestUtil
+{
+public:
+    /// @brief Put an LlmRequest into the state it would have right after prefill finished.
+    ///
+    /// NEVER CALL FROM PRODUCTION CODE. This exists solely for unit tests.
+    ///
+    /// Most BlockManager/KVCacheManager functions (storeContextBlocks, releaseBlocks,
+    /// removeSequence, releaseSequence) require prefill to be complete before they are
+    /// called. Unit tests call this first so that those functions can be invoked correctly.
+    ///
+    /// @param llmRequest Request to update; its context current position is set to its prompt length.
+    static void simulatePrefillCompletion(batch_manager::LlmRequest& llmRequest)
+    {
+        // Moving the context position to the end of the prompt marks the whole prompt as processed.
+        auto const promptLength = llmRequest.getPromptLen();
+        llmRequest.setContextCurrentPosition(promptLength);
+    }
+};
+
+} // namespace tensorrt_llm::testing
@@ -28,6 +28,7 @@
 #include "tensorrt_llm/executor/executor.h"
 #include "tensorrt_llm/executor/requestUtils.h"
 #include "tensorrt_llm/executor/types.h"
+#include "tensorrt_llm/testing/kvCacheManagerTestUtil.h"
 
 #include <NvInferPlugin.h>
 
@@ -401,6 +402,7 @@ int runTest(CapacityScheduler& capacityScheduler,
 
                 if (llmReq->getContextRemainingLength() == 0)
                 {
+                    tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(*llmReq);
                     kvCacheManager->storeContextBlocks(*llmReq);
                     if (crossKvCacheManager)
                     {
Original file line number	Diff line number	Diff line change
`@@ -28,6 +28,7 @@`
`28`	`28`	`#include "tensorrt_llm/executor/executor.h"`
`29`	`29`	`#include "tensorrt_llm/executor/requestUtils.h"`
`30`	`30`	`#include "tensorrt_llm/executor/types.h"`
	`31`	`+#include "tensorrt_llm/testing/kvCacheManagerTestUtil.h"`
`31`	`32`
`32`	`33`	`#include <NvInferPlugin.h>`
`33`	`34`
`@@ -401,6 +402,7 @@ int runTest(CapacityScheduler& capacityScheduler,`
`401`	`402`
`402`	`403`	`if (llmReq->getContextRemainingLength() == 0)`
`403`	`404`	`{`
	`405`	`+ tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(*llmReq);`
`404`	`406`	`kvCacheManager->storeContextBlocks(*llmReq);`
`405`	`407`	`if (crossKvCacheManager)`
`406`	`408`	`{`