NVIDIA
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
Lines changed: 28 additions & 0 deletions
diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
Lines changed: 73 additions & 0 deletions
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp
Lines changed: 3 additions & 1 deletion
diff --git a/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp b/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp
Lines changed: 210 additions & 0 deletions
@@ -2196,6 +2196,31 @@ class BaseKVCacheManager
     [[nodiscard]] virtual executor::RetentionPriority getPriorityByBlockId(
         KVCacheBlock::IdType blockId, SizeType32 windowSize) const
         = 0;
+
+    //! @brief Commit and return the chain of stored block hashes for \p llmRequest's currently-full blocks.
+    //! @details `numFullBlocks` = min(beam-0 unique token count / tokensPerBlock, beam-0 allocated block count).
+    //!   For each block index `b` in `[0, numFullBlocks)`:
+    //!   - if the block has already been marked full (`isFull() == true`), reuse its stored hash;
+    //!   - otherwise, build the BlockKey from `llmRequest`'s tokens for block `b`, then call
+    //!     `setBlockKey(blockKey, /*isFull=*/true)` and `setHash()` so the block holds the same
+    //!     hash that storeBlocks would later compute. Hashes chain through `mPrevBlockInSeq`,
+    //!     identical to `BlockKeyHasher::hash(blockKey, prevHash)`.
+    //!
+    //!   Beam-width-1 only. The connector enforces this at startup; the override asserts it defensively.
+    //!   Sliding-window attention with detached front blocks is not supported: once front
+    //!   blocks are evicted they remain in the cache block ID list but no longer align with
+    //!   token positions, so this method asserts `getNumFrontBlocksRemoved(windowSize) == 0`.
+    //!
+    //! @note This base-class default always throws (TLLM_THROW); managers that support it must override.
+    //! @param llmRequest Request whose currently-allocated blocks should be hashed.
+    //! @param windowSize Attention window size identifying the per-window block manager.
+    //! @return Ordered hashes for full blocks at indices `[0, numFullBlocks)`, chained from
+    //!     `mPrevBlockInSeq`. Empty when the request has no full blocks yet.
+    [[nodiscard]] virtual std::vector<executor::IdType> commitAndGetBlockHashesForRequest(
+        LlmRequest const& llmRequest, SizeType32 windowSize)
+    {
+        TLLM_THROW("commitAndGetBlockHashesForRequest is not implemented for this KV cache manager.");
+    }
 };
 
 class KVCacheManager : public BaseKVCacheManager
@@ -2515,6 +2540,9 @@ class KVCacheManager : public BaseKVCacheManager
     [[nodiscard]] executor::RetentionPriority getPriorityByBlockId(
         KVCacheBlock::IdType blockId, SizeType32 windowSize) const override;
 
+    [[nodiscard]] std::vector<executor::IdType> commitAndGetBlockHashesForRequest(
+        LlmRequest const& llmRequest, SizeType32 windowSize) override;
+
     std::optional<KVCacheBlock::IdType> getLastBlockId(LlmRequest::RequestIdType requestId) const override;
 
     /// @brief Calculates the number of kv-cache blocks that a sequence will require, for a single beam.
 
@@ -4408,6 +4408,79 @@ std::vector<std::vector<SizeType32>> const& KVCacheManager::getCacheBlockIds(
     return getSequence(requestId).getCacheBlockIds(windowSize);
 }
 
+// Commits block keys/hashes for every full block of the request (beam 0 only) and returns the
+// resulting hash chain in block order. Blocks already marked full are only read; blocks that are
+// not yet marked full get their key and hash set here. Returns an empty vector when there is
+// nothing to commit.
+std::vector<executor::IdType> KVCacheManager::commitAndGetBlockHashesForRequest(
+    LlmRequest const& llmRequest, SizeType32 windowSize)
+{
+    // Only beam 0 is ever inspected; any other beam count is rejected before touching state.
+    constexpr SizeType32 beamIdx = 0;
+    TLLM_CHECK_WITH_INFO(
+        llmRequest.getTokens().size() == 1, "commitAndGetBlockHashesForRequest only supports beam width 1.");
+
+    auto const& seq = getSequence(llmRequest.mRequestId);
+
+    // With sliding-window attention, front blocks that were detached stay in the cache block ID
+    // list (see WindowBlockManager::detachFrontBlock), yet they stop mapping to the token range
+    // [b * tokensPerBlock, ...). Iterating over them would hash/mutate recycled blocks and break
+    // the index<->token correspondence assumed below, so bail out until SWA is supported.
+    TLLM_CHECK_WITH_INFO(seq.getNumFrontBlocksRemoved(windowSize) == 0,
+        "commitAndGetBlockHashesForRequest does not support sliding-window attention with detached front blocks "
+        "(windowSize=%d, request %lu).",
+        windowSize, static_cast<unsigned long>(llmRequest.mRequestId));
+
+    auto const& blockIdsPerBeam = seq.getCacheBlockIds(windowSize);
+    if (blockIdsPerBeam.empty())
+    {
+        return {};
+    }
+    auto const& beamBlockIds = blockIdsPerBeam[beamIdx];
+    if (beamBlockIds.empty())
+    {
+        return {};
+    }
+
+    auto const& uniqueTokens = llmRequest.getUniqueTokens(beamIdx);
+    auto const tokensPerBlock = getTokensPerBlock();
+    // The full-block count is deliberately derived from uniqueTokens.size() rather than
+    // getUsableUniqueTokenCountForReuse. The connector chain runs ahead of storeBlocks and must
+    // commit a block's hash as soon as the block fills, even when that block ends exactly on a
+    // block boundary and is the trailing one. getUsableUniqueTokenCountForReuse discounts the
+    // final unmaterialized token, which would exclude that trailing block and quietly turn
+    // front-running off. See KVCacheManagerTest.CommitAndGetBlockHashesFrontRunsTrailingFullBlock.
+    auto const fullBlocksByTokens = static_cast<SizeType32>(uniqueTokens.size()) / tokensPerBlock;
+    auto const allocatedBlocks = static_cast<SizeType32>(beamBlockIds.size());
+    // A (partial) trailing block may already be allocated; take the smaller of the two counts
+    // so neither the token vector nor the block ID list is indexed out of range.
+    auto const numBlocksToCommit = std::min(fullBlocksByTokens, allocatedBlocks);
+    if (numBlocksToCommit == 0)
+    {
+        return {};
+    }
+
+    // Request-level inputs to every BlockKey built below.
+    bool const usesExtraIds = llmRequest.getInputTokensExtraIds().has_value();
+    auto const loraTaskId = llmRequest.getLoraTaskId();
+    auto const cacheSaltID = llmRequest.getCacheSaltID();
+
+    std::vector<executor::IdType> chain;
+    chain.reserve(static_cast<size_t>(numBlocksToCommit));
+    for (SizeType32 blockIdx = 0; blockIdx < numBlocksToCommit; ++blockIdx)
+    {
+        auto const blockId = beamBlockIds[blockIdx];
+        auto blockPtr = mBlockManager.getBlockById(blockId, windowSize);
+        TLLM_CHECK_WITH_INFO(blockPtr != nullptr,
+            "commitAndGetBlockHashesForRequest: null block at index %d (blockId=%d, request %lu).", blockIdx, blockId,
+            static_cast<unsigned long>(llmRequest.mRequestId));
+        if (!blockPtr->isFull())
+        {
+            SizeType32 const firstToken = blockIdx * tokensPerBlock;
+            SizeType32 const pastLastToken = firstToken + tokensPerBlock;
+            auto extraKeys = generateBlockHashExtraKeys(llmRequest, firstToken, pastLastToken);
+            BlockKey blockKey(usesExtraIds, loraTaskId,
+                VecUniqueTokens(uniqueTokens.begin() + firstToken, uniqueTokens.begin() + pastLastToken),
+                std::move(extraKeys), cacheSaltID);
+            blockPtr->setBlockKey(blockKey, /*isFull=*/true);
+            // setHash() derives the hash via mPrevBlockInSeq (linked up in addBlockToBeam).
+            // Blocks are visited in allocation order, so the predecessor of this block, if it
+            // has one, was committed on an earlier iteration and already has a stable hash.
+            blockPtr->setHash();
+        }
+        chain.emplace_back(static_cast<executor::IdType>(blockPtr->getHash()));
+    }
+    return chain;
+}
+
 std::vector<std::vector<std::vector<SizeType32>>> KVCacheManager::getBatchCacheBlockIds(
     std::vector<LlmRequest::RequestIdType> const& requestIds, SizeType32 windowSize) const
 {
 
@@ -629,7 +629,9 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(nb::module_& m)
         .def("unpin_blocks_by_id", &BaseKVCacheManager::unpinBlocksById, nb::call_guard<nb::gil_scoped_release>())
         .def("reset_reuse_state", &BaseKVCacheManager::resetReuseState, nb::call_guard<nb::gil_scoped_release>())
         .def("get_priority_by_block_id", &BaseKVCacheManager::getPriorityByBlockId, nb::arg("block_id"),
-            nb::arg("window_size"), nb::call_guard<nb::gil_scoped_release>());
+            nb::arg("window_size"), nb::call_guard<nb::gil_scoped_release>())
+        .def("commit_and_get_block_hashes_for_request", &BaseKVCacheManager::commitAndGetBlockHashesForRequest,
+            nb::arg("llm_request"), nb::arg("window_size"), nb::call_guard<nb::gil_scoped_release>());
 
     nb::bind_vector<CacheBlockIds>(m, "CacheBlockIds")
         .def("__getstate__", [](CacheBlockIds const& v) { return nb::make_tuple(v); })
 
@@ -4483,6 +4483,216 @@ TEST_F(KVCacheManagerTest, GetPriorityByBlockId)
     EXPECT_EQ(invalidOutOfRange, KvCacheRetentionConfig::kDefaultRetentionPriority);
 }
 
+TEST_F(KVCacheManagerTest, CommitAndGetBlockHashesForRequest)
+{
+    // Exercises KVCacheManager::commitAndGetBlockHashesForRequest, i.e. the hash chain handed to
+    // the KV cache connector. Properties checked, in order:
+    //   1. no full block yet            -> empty chain;
+    //   2. exactly one hash per *full* block, a partial trailing block contributes nothing;
+    //   3. the chain equals BlockKeyHasher applied block-by-block over the request's tokens;
+    //   4. a block that fills during generation is committed in that same step (front-running,
+    //      i.e. the "set" branch rather than only the already-full lookup branch);
+    //   5. calling again is idempotent (already-full blocks are pure lookups);
+    //   6. committed hashes equal those later emitted by the KV cache Stored events.
+    auto constexpr numLayers = 2;
+    auto constexpr numKvHeads = 2;
+    auto constexpr sizePerHead = 16;
+    auto constexpr tokensPerBlock = 4;
+    auto constexpr numBlocks = 8;
+    auto constexpr maxAttentionWindow = 32;
+    auto constexpr maxNumSequences = 4;
+    auto constexpr beamWidth = 1;
+    auto constexpr beamIdx = 0;
+    auto constexpr dtype = nvinfer1::DataType::kHALF;
+    auto const stream = std::make_shared<tr::CudaStream>();
+    SizeType32 constexpr maxNewTokens = 8;
+    tr::SamplingConfig const samplingConfig{beamWidth};
+    bool constexpr isStreaming{false};
+
+    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {numBlocks, 0}}};
+
+    KVCacheManager kvCacheManager(numLayers, numKvHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
+        beamWidth, std::vector<BlockManager::SizeType32>{maxAttentionWindow}, dtype, 0, stream, maxAttentionWindow,
+        maxAttentionWindow, /*enableBlockReuse=*/true, CacheType::kSELF, std::nullopt,
+        std::make_unique<tlk::KVCacheEventManager>(1024));
+    kvCacheManager.allocatePools(false);
+    (void) getEvents(kvCacheManager); // Discard the Created event.
+
+    // Reference chain: run BlockKeyHasher over each full token block of the request, feeding
+    // every hash in as the parent of the next -- the same thing the production storeBlocks path
+    // (and the KV cache events) do for a freshly-allocated sequence.
+    auto const referenceChain = [&](LlmRequest const& req)
+    {
+        auto const& tokens = req.getUniqueTokens(beamIdx);
+        auto const fullBlocks = static_cast<SizeType32>(tokens.size()) / tokensPerBlock;
+        std::vector<tle::IdType> chain;
+        std::size_t prevHash = 0;
+        auto blockBegin = tokens.begin();
+        for (SizeType32 blk = 0; blk < fullBlocks; ++blk, blockBegin += tokensPerBlock)
+        {
+            BlockKey const key(/*usesExtraIds=*/false, /*loraTaskId=*/std::nullopt,
+                VecUniqueTokens(blockBegin, blockBegin + tokensPerBlock));
+            auto const blockHash = BlockKeyHasher::hash(key, prevHash);
+            chain.push_back(static_cast<tle::IdType>(blockHash));
+            prevHash = blockHash;
+        }
+        return chain;
+    };
+
+    // Case 1: two tokens never fill a block, so the chain must be empty.
+    {
+        auto shortPrompt = std::make_shared<VecTokens>(VecTokens{0, 1});
+        auto shortRequest = std::make_shared<LlmRequest>(0, maxNewTokens, shortPrompt, samplingConfig, isStreaming);
+        kvCacheManager.addSequenceBatch(
+            {{{0, static_cast<SizeType32>(shortPrompt->size()), beamWidth}}}, {std::ref(*shortRequest)});
+        auto const shortChain = kvCacheManager.commitAndGetBlockHashesForRequest(*shortRequest, maxAttentionWindow);
+        EXPECT_TRUE(shortChain.empty());
+        tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(*shortRequest);
+        (void) kvCacheManager.removeSequence(0, shortRequest);
+    }
+
+    // Case 2: a 6-token prompt gives one full block (committed at allocation, so the lookup
+    // branch is taken) followed by a partial block. The partial block is clipped, leaving a
+    // single hash.
+    auto prompt = std::make_shared<VecTokens>(VecTokens{10, 11, 12, 13, 14, 15});
+    auto request = std::make_shared<LlmRequest>(1, maxNewTokens, prompt, samplingConfig, isStreaming);
+    kvCacheManager.addSequenceBatch(
+        {{{1, static_cast<SizeType32>(prompt->size()), beamWidth}}}, {std::ref(*request)});
+    tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(*request);
+
+    auto prefillChain = kvCacheManager.commitAndGetBlockHashesForRequest(*request, maxAttentionWindow);
+    auto prefillReference = referenceChain(*request);
+    ASSERT_EQ(prefillReference.size(), 1u); // The partial 2nd block is not counted.
+    EXPECT_EQ(prefillChain, prefillReference);
+
+    // Append tokens 16, 17, 18. The 2nd block (tokens 14..17) thereby fills *during generation*;
+    // because it was allocated partial, commitAndGetBlockHashesForRequest has to go through the
+    // "set" branch: assemble the full BlockKey, flag the block full, and hash it chained off the
+    // first block. Token 18 opens a 3rd (partial) block so block 2 stops being the trailing
+    // block: storeBlocks discards the final unusable token, and block 2 should be keyed with all
+    // four tokens at store time so the stored hash equals the committed one (see event check).
+    for (auto const token : {16, 17, 18})
+    {
+        request->addNewToken(token, beamIdx);
+        kvCacheManager.addToken(1);
+    }
+
+    auto committedChain = kvCacheManager.commitAndGetBlockHashesForRequest(*request, maxAttentionWindow);
+    auto fullReference = referenceChain(*request);
+    ASSERT_EQ(fullReference.size(), 2u);
+    EXPECT_EQ(committedChain, fullReference);
+    // Front-running only appends: block 0 keeps the hash reported by the prefill-only call.
+    EXPECT_EQ(committedChain.front(), prefillChain.front());
+
+    // Idempotence: a second call is nothing but lookups on full blocks and yields the same chain.
+    EXPECT_EQ(kvCacheManager.commitAndGetBlockHashesForRequest(*request, maxAttentionWindow), committedChain);
+
+    // Once the sequence is released and its full blocks are stored for reuse, the Stored events
+    // must carry the very hashes committed above. That is guaranteed only for blocks that are not
+    // the trailing block (storeBlocks drops the final unusable token, which would shorten the
+    // trailing block's key compared with the committed one).
+    (void) getEvents(kvCacheManager); // Flush pending events so only store-time events remain.
+    tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(*request);
+    (void) kvCacheManager.removeSequence(1, request);
+
+    std::set<tle::IdType> emittedHashes;
+    for (auto const& event : getEvents(kvCacheManager))
+    {
+        auto const* stored = std::get_if<tle::KVCacheStoredData>(&event.data);
+        if (stored == nullptr)
+        {
+            continue;
+        }
+        for (auto const& storedBlock : stored->blocks)
+        {
+            emittedHashes.insert(static_cast<tle::IdType>(storedBlock.blockHash));
+        }
+    }
+    for (auto const committed : committedChain)
+    {
+        EXPECT_GT(emittedHashes.count(committed), 0u) << "committed hash not emitted by a Stored event";
+    }
+}
+
+TEST_F(KVCacheManagerTest, CommitAndGetBlockHashesFrontRunsTrailingFullBlock)
+{
+    // Guards the front-running contract at an exact block boundary. If a block fills so that it
+    // is the sequence's trailing block (nothing partial after it),
+    // commitAndGetBlockHashesForRequest must still commit and report its hash in that same step.
+    // The sibling test CommitAndGetBlockHashesForRequest only has a just-filled block that is
+    // followed by a partial one, so it would keep passing if the implementation were changed to
+    // use getUsableUniqueTokenCountForReuse (which discounts the final unmaterialized token and
+    // therefore drops a trailing full block). Here, with tokensPerBlock=4 and 8 tokens, that
+    // usable-count path would give (8 - 1) / 4 = 1 block while the correct chain has 2, so the
+    // regression is caught.
+    auto constexpr numLayers = 2;
+    auto constexpr numKvHeads = 2;
+    auto constexpr sizePerHead = 16;
+    auto constexpr tokensPerBlock = 4;
+    auto constexpr numBlocks = 8;
+    auto constexpr maxAttentionWindow = 32;
+    auto constexpr maxNumSequences = 4;
+    auto constexpr beamWidth = 1;
+    auto constexpr beamIdx = 0;
+    auto constexpr dtype = nvinfer1::DataType::kHALF;
+    auto const stream = std::make_shared<tr::CudaStream>();
+    SizeType32 constexpr maxNewTokens = 8;
+    tr::SamplingConfig const samplingConfig{beamWidth};
+    bool constexpr isStreaming{false};
+
+    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {numBlocks, 0}}};
+
+    KVCacheManager kvCacheManager(numLayers, numKvHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
+        beamWidth, std::vector<BlockManager::SizeType32>{maxAttentionWindow}, dtype, 0, stream, maxAttentionWindow,
+        maxAttentionWindow, /*enableBlockReuse=*/true, CacheType::kSELF, std::nullopt);
+    kvCacheManager.allocatePools(false);
+
+    // Reference chain: BlockKeyHasher over each full token block (count taken from
+    // uniqueTokens.size()), each hash feeding the next -- the chain the connector expects.
+    auto const referenceChain = [&](LlmRequest const& req)
+    {
+        auto const& tokens = req.getUniqueTokens(beamIdx);
+        auto const fullBlocks = static_cast<SizeType32>(tokens.size()) / tokensPerBlock;
+        std::vector<tle::IdType> chain;
+        std::size_t prevHash = 0;
+        auto blockBegin = tokens.begin();
+        for (SizeType32 blk = 0; blk < fullBlocks; ++blk, blockBegin += tokensPerBlock)
+        {
+            BlockKey const key(/*usesExtraIds=*/false, /*loraTaskId=*/std::nullopt,
+                VecUniqueTokens(blockBegin, blockBegin + tokensPerBlock));
+            auto const blockHash = BlockKeyHasher::hash(key, prevHash);
+            chain.push_back(static_cast<tle::IdType>(blockHash));
+            prevHash = blockHash;
+        }
+        return chain;
+    };
+
+    // A 6-token prompt: block 0 is full (10..13), block 1 is partial (14, 15).
+    auto prompt = std::make_shared<VecTokens>(VecTokens{10, 11, 12, 13, 14, 15});
+    auto request = std::make_shared<LlmRequest>(0, maxNewTokens, prompt, samplingConfig, isStreaming);
+    kvCacheManager.addSequenceBatch(
+        {{{0, static_cast<SizeType32>(prompt->size()), beamWidth}}}, {std::ref(*request)});
+    tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(*request);
+
+    auto prefillChain = kvCacheManager.commitAndGetBlockHashesForRequest(*request, maxAttentionWindow);
+    ASSERT_EQ(prefillChain.size(), 1u); // Block 1 has not filled yet.
+
+    // Append exactly tokens 16 and 17: block 1 (tokens 14..17) fills and is now the *trailing*
+    // block, with no further (partial) block started. It must be committed through the "set"
+    // branch in this very step.
+    for (auto const token : {16, 17})
+    {
+        request->addNewToken(token, beamIdx);
+        kvCacheManager.addToken(0);
+    }
+
+    auto committedChain = kvCacheManager.commitAndGetBlockHashesForRequest(*request, maxAttentionWindow);
+    auto reference = referenceChain(*request);
+    ASSERT_EQ(reference.size(), 2u);
+    // The point of the test: two hashes, not one. A usable-count implementation loses the last.
+    EXPECT_EQ(committedChain, reference);
+    // Front-running only appends, so block 0 keeps the hash from the prefill-only call.
+    EXPECT_EQ(committedChain.front(), prefillChain.front());
+
+    tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(*request);
+    (void) kvCacheManager.removeSequence(0, request);
+}
+
 TEST(KVCacheManagerHelpersTest, ChopVectorIntoBlocksBasicNoPartial)
 {
     using namespace tensorrt_llm::batch_manager::kv_cache_manager;