@@ -4483,6 +4483,216 @@ TEST_F(KVCacheManagerTest, GetPriorityByBlockId)
44834483 EXPECT_EQ(invalidOutOfRange, KvCacheRetentionConfig::kDefaultRetentionPriority);
44844484}
44854485
4486+ TEST_F(KVCacheManagerTest, CommitAndGetBlockHashesForRequest)
4487+ {
4488+ // Validates KVCacheManager::commitAndGetBlockHashesForRequest (the hash chain exposed to
4489+ // the KV cache connector):
4490+ // * a request with fewer than one full block yields an empty chain,
4491+ // * one hash is returned per *full* block; a partial trailing block is clipped,
4492+ // * the chain matches BlockKeyHasher applied block-by-block to the request's tokens,
4493+ // * a block that fills during generation is committed in the same step (the front-running
4494+ // semantics; this exercises the "set" branch, not just the already-full lookup branch),
4495+ // * repeated calls are idempotent (already-full blocks become pure lookups), and
4496+ // * the committed hashes equal the hashes the KV cache Stored events later emit.
4497+ auto constexpr numLayers = 2;
4498+ auto constexpr numKvHeads = 2;
4499+ auto constexpr sizePerHead = 16;
4500+ auto constexpr tokensPerBlock = 4; // small blocks so a handful of tokens spans several blocks
4501+ auto constexpr numBlocks = 8;
4502+ auto constexpr maxAttentionWindow = 32;
4503+ auto constexpr maxNumSequences = 4;
4504+ auto constexpr beamWidth = 1;
4505+ auto constexpr beamIdx = 0; // the only beam, since beamWidth == 1
4506+ auto constexpr dtype = nvinfer1::DataType::kHALF;
4507+ auto const stream = std::make_shared<tr::CudaStream>();
4508+ SizeType32 constexpr maxNewTokens = 8; // handed to LlmRequest; this test generates at most 3 tokens
4509+ tr::SamplingConfig const samplingConfig{beamWidth};
4510+ bool constexpr isStreaming{false};
4511+
4512+ auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {numBlocks, 0}}}; // keyed by window size; {numBlocks, 0} is presumably (primary, secondary) block counts -- TODO confirm
4513+
4514+ KVCacheManager kvCacheManager(numLayers, numKvHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
4515+ beamWidth, std::vector<BlockManager::SizeType32>{maxAttentionWindow}, dtype, 0, stream, maxAttentionWindow,
4516+ maxAttentionWindow, /*enableBlockReuse=*/true, CacheType::kSELF, std::nullopt,
4517+ std::make_unique<tlk::KVCacheEventManager>(1024)); // an event manager is supplied because this test reads events via getEvents(); 1024 is presumably the max queued events -- TODO confirm
4518+ kvCacheManager.allocatePools(false);
4519+ (void) getEvents(kvCacheManager); // Drain the Created event so later event reads only see this test's activity.
4520+
4521+ // Ground truth: chain BlockKeyHasher over the request's full token blocks, exactly as the
4522+ // production storeBlocks path (and KV cache events) would for a freshly-allocated sequence.
4523+ auto const expectedChain = [&](LlmRequest const& req)
4524+ {
4525+ auto const& uniqueTokens = req.getUniqueTokens(beamIdx);
4526+ auto const numFull = static_cast<SizeType32>(uniqueTokens.size()) / tokensPerBlock; // integer division: a partial trailing block contributes no hash
4527+ std::vector<tle::IdType> expected;
4528+ std::size_t parentHash = 0; // the first block is hashed with parent hash 0
4529+ for (SizeType32 b = 0; b < numFull; ++b)
4530+ {
4531+ VecUniqueTokens slice(
4532+ uniqueTokens.begin() + b * tokensPerBlock, uniqueTokens.begin() + (b + 1) * tokensPerBlock); // tokens [b * tokensPerBlock, (b + 1) * tokensPerBlock)
4533+ BlockKey const blockKey(/*usesExtraIds=*/false, /*loraTaskId=*/std::nullopt, std::move(slice));
4534+ auto const hash = BlockKeyHasher::hash(blockKey, parentHash);
4535+ expected.push_back(static_cast<tle::IdType>(hash)); // cast so the result compares directly against the returned chain
4536+ parentHash = hash; // chain: each block's hash seeds the next block's hash
4537+ }
4538+ return expected;
4539+ };
4540+
4541+ // Case 1: fewer than one full block -> empty chain.
4542+ {
4543+ auto inputTokens = std::make_shared<VecTokens>(VecTokens{0, 1}); // 2 tokens < tokensPerBlock (4)
4544+ auto llmRequest = std::make_shared<LlmRequest>(0, maxNewTokens, inputTokens, samplingConfig, isStreaming);
4545+ kvCacheManager.addSequenceBatch(
4546+ {{{0, static_cast<SizeType32>(inputTokens->size()), beamWidth}}}, {std::ref(*llmRequest)}); // tuple is presumably {requestId, inputLength, beamWidth} -- TODO confirm
4547+ EXPECT_TRUE(kvCacheManager.commitAndGetBlockHashesForRequest(*llmRequest, maxAttentionWindow).empty());
4548+ tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(*llmRequest); // NOTE(review): presumably marks context processing as done before release -- confirm
4549+ (void) kvCacheManager.removeSequence(0, llmRequest); // return value intentionally discarded
4550+ }
4551+
4552+ // Case 2: 6 context tokens -> 1 full block (committed at allocation, lookup branch) + a
4553+ // partial trailing block. The partial 2nd block must be clipped, so only one hash is returned.
4554+ auto inputTokens = std::make_shared<VecTokens>(VecTokens{10, 11, 12, 13, 14, 15});
4555+ auto llmRequest = std::make_shared<LlmRequest>(1, maxNewTokens, inputTokens, samplingConfig, isStreaming);
4556+ kvCacheManager.addSequenceBatch(
4557+ {{{1, static_cast<SizeType32>(inputTokens->size()), beamWidth}}}, {std::ref(*llmRequest)}); // same literal 1 as the LlmRequest above and the addToken/removeSequence calls below
4558+ tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(*llmRequest);
4559+
4560+ auto contextHashes = kvCacheManager.commitAndGetBlockHashesForRequest(*llmRequest, maxAttentionWindow);
4561+ auto contextExpected = expectedChain(*llmRequest);
4562+ ASSERT_EQ(contextExpected.size(), 1u); // Sanity-check the ground truth itself: 6 / 4 == 1, so the partial 2nd block is clipped.
4563+ EXPECT_EQ(contextHashes, contextExpected); // the returned chain must equal the ground truth
4564+
4565+ // Generate tokens 16, 17, 18 so the 2nd block (tokens 14..17) fills *during generation*. It
4566+ // was allocated partial, so commitAndGetBlockHashesForRequest must take the "set" branch:
4567+ // build the full BlockKey, mark the block full, and hash it chained from the first block.
4568+ // Token 18 starts a 3rd (partial) block so that block 2 is no longer the sequence's trailing
4569+ // block: storeBlocks drops the final unusable token, and we want block 2 keyed with all four
4570+ // tokens at store time so its stored hash matches the committed one (see the event check).
4571+ for (auto const token : {16, 17, 18})
4572+ {
4573+ llmRequest->addNewToken(token, beamIdx); // extend the request's token list first...
4574+ kvCacheManager.addToken(1); // ...then keep the cache manager's sequence (id 1) in step -- presumably advances it by one token
4575+ }
4576+
4577+ auto hashes = kvCacheManager.commitAndGetBlockHashesForRequest(*llmRequest, maxAttentionWindow);
4578+ auto expected = expectedChain(*llmRequest);
4579+ ASSERT_EQ(expected.size(), 2u); // 6 + 3 == 9 tokens; 9 / 4 == 2 full blocks plus one token in a partial 3rd block
4580+ EXPECT_EQ(hashes, expected);
4581+ // The first block's hash is unchanged from the context-only call (front-running only appends).
4582+ EXPECT_EQ(hashes.front(), contextHashes.front());
4583+
4584+ // Idempotent: a repeated call (now pure lookups on full blocks) returns the same chain.
4585+ EXPECT_EQ(kvCacheManager.commitAndGetBlockHashesForRequest(*llmRequest, maxAttentionWindow), hashes);
4586+
4587+ // The committed hashes must match the hashes the KV cache Stored events emit for the same
4588+ // blocks once the sequence is released and its full blocks are stored for reuse. This holds
4589+ // for blocks that are not the sequence's trailing block (storeBlocks drops the final unusable
4590+ // token, which would otherwise shorten the trailing block's key relative to the committed one).
4591+ (void) getEvents(kvCacheManager); // Drain pending events before storing.
4592+ tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(*llmRequest); // NOTE(review): second call on this request; presumably required before release -- confirm
4593+ (void) kvCacheManager.removeSequence(1, llmRequest);
4594+
4595+ std::set<tle::IdType> storedHashes; // a set: only membership matters, not order or multiplicity
4596+ for (auto const& event : getEvents(kvCacheManager))
4597+ {
4598+ if (std::holds_alternative<tle::KVCacheStoredData>(event.data)) // skip every event kind other than Stored
4599+ {
4600+ for (auto const& block : std::get<tle::KVCacheStoredData>(event.data).blocks)
4601+ {
4602+ storedHashes.insert(static_cast<tle::IdType>(block.blockHash));
4603+ }
4604+ }
4605+ }
4606+ for (auto const hash : hashes)
4607+ {
4608+ EXPECT_GT(storedHashes.count(hash), 0u) << "committed hash not emitted by a Stored event"; // one-directional check: Stored events may carry additional hashes
4609+ }
4610+ }
4611+
4612+ TEST_F(KVCacheManagerTest, CommitAndGetBlockHashesFrontRunsTrailingFullBlock)
4613+ {
4614+ // Regression guard for the front-running contract: when a block fills *exactly* on a block
4615+ // boundary so that it is the sequence's trailing block (no partial block follows it),
4616+ // commitAndGetBlockHashesForRequest must still commit and return that block's hash in the
4617+ // same step. The sibling test CommitAndGetBlockHashesForRequest only covers a just-filled
4618+ // block that is followed by a partial block, so it would still pass if the implementation
4619+ // switched to getUsableUniqueTokenCountForReuse (which subtracts the final unmaterialized
4620+ // token and would drop the trailing full block). This test pins the exact-boundary case so
4621+ // that regression fails loudly: with tokensPerBlock=4 and 8 tokens, the usable-count path
4622+ // would yield (8 - 1) / 4 = 1 block, whereas the correct front-running chain has 2.
4623+ auto constexpr numLayers = 2;
4624+ auto constexpr numKvHeads = 2;
4625+ auto constexpr sizePerHead = 16;
4626+ auto constexpr tokensPerBlock = 4; // 6 context + 2 generated tokens land exactly on a block boundary
4627+ auto constexpr numBlocks = 8;
4628+ auto constexpr maxAttentionWindow = 32;
4629+ auto constexpr maxNumSequences = 4;
4630+ auto constexpr beamWidth = 1;
4631+ auto constexpr beamIdx = 0; // the only beam, since beamWidth == 1
4632+ auto constexpr dtype = nvinfer1::DataType::kHALF;
4633+ auto const stream = std::make_shared<tr::CudaStream>();
4634+ SizeType32 constexpr maxNewTokens = 8; // handed to LlmRequest; this test generates only 2 tokens
4635+ tr::SamplingConfig const samplingConfig{beamWidth};
4636+ bool constexpr isStreaming{false};
4637+
4638+ auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {numBlocks, 0}}}; // keyed by window size; {numBlocks, 0} is presumably (primary, secondary) block counts -- TODO confirm
4639+
4640+ KVCacheManager kvCacheManager(numLayers, numKvHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
4641+ beamWidth, std::vector<BlockManager::SizeType32>{maxAttentionWindow}, dtype, 0, stream, maxAttentionWindow,
4642+ maxAttentionWindow, /*enableBlockReuse=*/true, CacheType::kSELF, std::nullopt); // no KVCacheEventManager argument: this test never reads events
4643+ kvCacheManager.allocatePools(false);
4644+
4645+ // Chain BlockKeyHasher over the request's full token blocks (keyed by uniqueTokens.size()),
4646+ // mirroring the front-running chain the connector expects.
4647+ auto const expectedChain = [&](LlmRequest const& req)
4648+ {
4649+ auto const& uniqueTokens = req.getUniqueTokens(beamIdx);
4650+ auto const numFull = static_cast<SizeType32>(uniqueTokens.size()) / tokensPerBlock; // uses the full token count, with no "minus one" adjustment
4651+ std::vector<tle::IdType> expected;
4652+ std::size_t parentHash = 0; // the first block is hashed with parent hash 0
4653+ for (SizeType32 b = 0; b < numFull; ++b)
4654+ {
4655+ VecUniqueTokens slice(
4656+ uniqueTokens.begin() + b * tokensPerBlock, uniqueTokens.begin() + (b + 1) * tokensPerBlock); // tokens [b * tokensPerBlock, (b + 1) * tokensPerBlock)
4657+ BlockKey const blockKey(/*usesExtraIds=*/false, /*loraTaskId=*/std::nullopt, std::move(slice));
4658+ auto const hash = BlockKeyHasher::hash(blockKey, parentHash);
4659+ expected.push_back(static_cast<tle::IdType>(hash)); // cast so the result compares directly against the returned chain
4660+ parentHash = hash; // chain: each block's hash seeds the next block's hash
4661+ }
4662+ return expected;
4663+ };
4664+
4665+ // 6 context tokens -> block 0 full (10..13), block 1 partial (14, 15).
4666+ auto inputTokens = std::make_shared<VecTokens>(VecTokens{10, 11, 12, 13, 14, 15});
4667+ auto llmRequest = std::make_shared<LlmRequest>(0, maxNewTokens, inputTokens, samplingConfig, isStreaming);
4668+ kvCacheManager.addSequenceBatch(
4669+ {{{0, static_cast<SizeType32>(inputTokens->size()), beamWidth}}}, {std::ref(*llmRequest)}); // same literal 0 as the LlmRequest above and the addToken/removeSequence calls below
4670+ tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(*llmRequest); // NOTE(review): presumably marks context processing as done -- confirm
4671+
4672+ auto contextHashes = kvCacheManager.commitAndGetBlockHashesForRequest(*llmRequest, maxAttentionWindow);
4673+ ASSERT_EQ(contextHashes.size(), 1u); // block 1 is still partial here (6 / 4 == 1); ASSERT because .front() is read below.
4674+
4675+ // Generate exactly tokens 16, 17 so block 1 (tokens 14..17) fills and becomes the *trailing*
4676+ // block -- no further (partial) block is started. The just-filled trailing block must be
4677+ // committed via the "set" branch in this same step.
4678+ for (auto const token : {16, 17})
4679+ {
4680+ llmRequest->addNewToken(token, beamIdx); // extend the request's token list first...
4681+ kvCacheManager.addToken(0); // ...then keep the cache manager's sequence (id 0) in step -- presumably advances it by one token
4682+ }
4683+
4684+ auto hashes = kvCacheManager.commitAndGetBlockHashesForRequest(*llmRequest, maxAttentionWindow);
4685+ auto expected = expectedChain(*llmRequest);
4686+ ASSERT_EQ(expected.size(), 2u); // 6 + 2 == 8 tokens; 8 / 4 == 2 full blocks and no partial block
4687+ // The crux: 2 hashes, not 1. A usable-count implementation would drop the trailing block.
4688+ EXPECT_EQ(hashes, expected);
4689+ // Front-running only appends; block 0's hash is unchanged from the context-only call.
4690+ EXPECT_EQ(hashes.front(), contextHashes.front());
4691+
4692+ tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(*llmRequest); // NOTE(review): second call on this request; presumably required before release -- confirm
4693+ (void) kvCacheManager.removeSequence(0, llmRequest); // return value intentionally discarded
4694+ }
4695+
44864696TEST(KVCacheManagerHelpersTest, ChopVectorIntoBlocksBasicNoPartial)
44874697{
44884698 using namespace tensorrt_llm::batch_manager::kv_cache_manager;
0 commit comments