Skip to content

Commit 96bf189

Browse files
Merge branch 'main' into initial-stats-sweep
2 parents cb8cd45 + 358505c commit 96bf189

9 files changed

Lines changed: 428 additions & 9 deletions

File tree

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2196,6 +2196,31 @@ class BaseKVCacheManager
21962196
[[nodiscard]] virtual executor::RetentionPriority getPriorityByBlockId(
21972197
KVCacheBlock::IdType blockId, SizeType32 windowSize) const
21982198
= 0;
2199+
2200+
//! @brief Commit and return the chain of stored block hashes for \p llmRequest's currently-full blocks.
2201+
//! @details For each block index `b` in `[0, numFullBlocks)`:
2202+
//! - if the block has already been marked full (`isFull() == true`), reuse its stored hash;
2203+
//! - otherwise, build the BlockKey from `llmRequest`'s tokens for block `b`, then call
2204+
//! `setBlockKey(blockKey, /*isFull=*/true)` and `setHash()` so the block holds the same
2205+
//! hash that storeBlocks would later compute. Hashes chain through `mPrevBlockInSeq`,
2206+
//! identical to `BlockKeyHasher::hash(blockKey, prevHash)`.
2207+
//!
2208+
//! Beam-width-1 only. The connector enforces this at startup; this method
2209+
//! asserts the invariant defensively.
2210+
//!
2211+
//! Sliding-window attention with detached front blocks is not supported: once front
2212+
//! blocks are evicted they remain in the cache block ID list but no longer align with
2213+
//! token positions, so this method asserts `getNumFrontBlocksRemoved(windowSize) == 0`.
2214+
//!
2215+
//! @param llmRequest Request whose currently-allocated blocks should be hashed.
2216+
//! @param windowSize Attention window size identifying the per-window block manager.
2217+
//! @return Ordered hashes for full blocks at indices `[0, numFullBlocks)`, chained from
2218+
//! `mPrevBlockInSeq`. Empty when the request has no full blocks yet.
2219+
[[nodiscard]] virtual std::vector<executor::IdType> commitAndGetBlockHashesForRequest(
2220+
LlmRequest const& llmRequest, SizeType32 windowSize)
2221+
{
2222+
TLLM_THROW("commitAndGetBlockHashesForRequest is not implemented for this KV cache manager.");
2223+
}
21992224
};
22002225

22012226
class KVCacheManager : public BaseKVCacheManager
@@ -2515,6 +2540,9 @@ class KVCacheManager : public BaseKVCacheManager
25152540
[[nodiscard]] executor::RetentionPriority getPriorityByBlockId(
25162541
KVCacheBlock::IdType blockId, SizeType32 windowSize) const override;
25172542

2543+
[[nodiscard]] std::vector<executor::IdType> commitAndGetBlockHashesForRequest(
2544+
LlmRequest const& llmRequest, SizeType32 windowSize) override;
2545+
25182546
std::optional<KVCacheBlock::IdType> getLastBlockId(LlmRequest::RequestIdType requestId) const override;
25192547

25202548
/// @brief Calculates the number of kv-cache blocks that a sequence will require, for a single beam.

cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4408,6 +4408,79 @@ std::vector<std::vector<SizeType32>> const& KVCacheManager::getCacheBlockIds(
44084408
return getSequence(requestId).getCacheBlockIds(windowSize);
44094409
}
44104410

4411+
// Commits the block key/hash of every full block of llmRequest (beam 0) that has not been
// marked full yet, and returns the hashes of all full blocks in block order.
//
// Returns an empty vector when the sequence has no allocated blocks for windowSize or when
// the request does not yet hold a full block's worth of tokens.
//
// Fails a TLLM_CHECK_WITH_INFO when:
//   * the request carries more than one beam,
//   * the sequence has detached front blocks for windowSize (sliding-window attention), or
//   * a block ID of the sequence cannot be resolved to a block.
std::vector<executor::IdType> KVCacheManager::commitAndGetBlockHashesForRequest(
    LlmRequest const& llmRequest, SizeType32 windowSize)
{
    constexpr SizeType32 beamIdx = 0;
    // getTokens() is expected to hold one token vector per beam, so size() == 1 <=> beam width 1.
    TLLM_CHECK_WITH_INFO(
        llmRequest.getTokens().size() == 1, "commitAndGetBlockHashesForRequest only supports beam width 1.");

    auto const& sequence = getSequence(llmRequest.mRequestId);

    // Under sliding-window attention, detached front blocks remain in the cache block ID list
    // (see WindowBlockManager::detachFrontBlock) but no longer correspond to token range
    // [b * tokensPerBlock, ...). Walking them here would hash/mutate recycled blocks and break
    // the index<->token alignment this method relies on, so fail fast until SWA is supported.
    // The request ID is widened to unsigned long long (%llu) so the message shows the full
    // 64-bit value even on platforms where unsigned long is only 32 bits wide.
    TLLM_CHECK_WITH_INFO(sequence.getNumFrontBlocksRemoved(windowSize) == 0,
        "commitAndGetBlockHashesForRequest does not support sliding-window attention with detached front blocks "
        "(windowSize=%d, request %llu).",
        windowSize, static_cast<unsigned long long>(llmRequest.mRequestId));

    auto const& perBeamBlockIds = sequence.getCacheBlockIds(windowSize);
    if (perBeamBlockIds.empty() || perBeamBlockIds[beamIdx].empty())
    {
        return {};
    }
    auto const& blockIds = perBeamBlockIds[beamIdx];

    auto const& uniqueTokens = llmRequest.getUniqueTokens(beamIdx);
    auto const tokensPerBlock = getTokensPerBlock();
    // Count full blocks from uniqueTokens.size() (NOT getUsableUniqueTokenCountForReuse).
    // This is intentional: the connector chain front-runs storeBlocks, committing a block's
    // hash the moment the block fills -- including a trailing block that lands exactly on a
    // block boundary. getUsableUniqueTokenCountForReuse subtracts the final unmaterialized
    // token, which would drop that just-filled trailing block and silently disable
    // front-running. See KVCacheManagerTest.CommitAndGetBlockHashesFrontRunsTrailingFullBlock.
    auto const numFullTokenBlocks = static_cast<SizeType32>(uniqueTokens.size()) / tokensPerBlock;
    auto const numAllocatedBlocks = static_cast<SizeType32>(blockIds.size());
    // The allocator may have allocated a (partial) trailing block; clip to whichever count is
    // smaller so we never index past either side.
    auto const limit = std::min(numFullTokenBlocks, numAllocatedBlocks);
    if (limit == 0)
    {
        return {};
    }

    // Request-level BlockKey ingredients are identical for every block, so read them once.
    bool const usesExtraIds = llmRequest.getInputTokensExtraIds().has_value();
    auto const loraTaskId = llmRequest.getLoraTaskId();
    auto const cacheSaltID = llmRequest.getCacheSaltID();

    std::vector<executor::IdType> hashes;
    hashes.reserve(static_cast<size_t>(limit));
    for (SizeType32 b = 0; b < limit; ++b)
    {
        auto block = mBlockManager.getBlockById(blockIds[b], windowSize);
        TLLM_CHECK_WITH_INFO(block != nullptr,
            "commitAndGetBlockHashesForRequest: null block at index %d (blockId=%d, request %llu).", b, blockIds[b],
            static_cast<unsigned long long>(llmRequest.mRequestId));
        if (!block->isFull())
        {
            // "Set" branch: the block filled since it was allocated, so key it with the
            // tokensPerBlock tokens it covers. tokenEnd never exceeds uniqueTokens.size()
            // because b < limit <= uniqueTokens.size() / tokensPerBlock.
            SizeType32 const tokenStart = b * tokensPerBlock;
            SizeType32 const tokenEnd = tokenStart + tokensPerBlock;
            auto extraKeys = generateBlockHashExtraKeys(llmRequest, tokenStart, tokenEnd);
            VecUniqueTokens blockTokens(uniqueTokens.begin() + tokenStart, uniqueTokens.begin() + tokenEnd);
            BlockKey blockKey(usesExtraIds, loraTaskId, std::move(blockTokens), std::move(extraKeys), cacheSaltID);
            block->setBlockKey(blockKey, /*isFull=*/true);
            // setHash() chains through mPrevBlockInSeq, which was wired in addBlockToBeam. The
            // loop walks blocks in allocation order, so by the time we reach block b its
            // predecessor (if any) has already been committed and exposes a stable hash.
            block->setHash();
        }
        // Already-full blocks fall straight through: their stored hash is reused as-is.
        hashes.push_back(static_cast<executor::IdType>(block->getHash()));
    }
    return hashes;
}
4483+
44114484
std::vector<std::vector<std::vector<SizeType32>>> KVCacheManager::getBatchCacheBlockIds(
44124485
std::vector<LlmRequest::RequestIdType> const& requestIds, SizeType32 windowSize) const
44134486
{

cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -629,7 +629,9 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(nb::module_& m)
629629
.def("unpin_blocks_by_id", &BaseKVCacheManager::unpinBlocksById, nb::call_guard<nb::gil_scoped_release>())
630630
.def("reset_reuse_state", &BaseKVCacheManager::resetReuseState, nb::call_guard<nb::gil_scoped_release>())
631631
.def("get_priority_by_block_id", &BaseKVCacheManager::getPriorityByBlockId, nb::arg("block_id"),
632-
nb::arg("window_size"), nb::call_guard<nb::gil_scoped_release>());
632+
nb::arg("window_size"), nb::call_guard<nb::gil_scoped_release>())
633+
.def("commit_and_get_block_hashes_for_request", &BaseKVCacheManager::commitAndGetBlockHashesForRequest,
634+
nb::arg("llm_request"), nb::arg("window_size"), nb::call_guard<nb::gil_scoped_release>());
633635

634636
nb::bind_vector<CacheBlockIds>(m, "CacheBlockIds")
635637
.def("__getstate__", [](CacheBlockIds const& v) { return nb::make_tuple(v); })

cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4483,6 +4483,216 @@ TEST_F(KVCacheManagerTest, GetPriorityByBlockId)
44834483
EXPECT_EQ(invalidOutOfRange, KvCacheRetentionConfig::kDefaultRetentionPriority);
44844484
}
44854485

4486+
TEST_F(KVCacheManagerTest, CommitAndGetBlockHashesForRequest)
{
    // Validates KVCacheManager::commitAndGetBlockHashesForRequest (the hash chain exposed to
    // the KV cache connector):
    //   * a request with fewer than one full block yields an empty chain,
    //   * one hash is returned per *full* block; a partial trailing block is clipped,
    //   * the chain matches BlockKeyHasher applied block-by-block to the request's tokens,
    //   * a block that fills during generation is committed in the same step (the front-running
    //     semantic exercising the "set" branch, not just the already-full lookup branch),
    //   * repeated calls are idempotent (already-full blocks become pure lookups), and
    //   * the committed hashes equal the hashes the KV cache Stored events later emit.
    auto constexpr numLayers = 2;
    auto constexpr numKvHeads = 2;
    auto constexpr sizePerHead = 16;
    auto constexpr tokensPerBlock = 4;
    auto constexpr numBlocks = 8;
    auto constexpr maxAttentionWindow = 32;
    auto constexpr maxNumSequences = 4;
    auto constexpr beamWidth = 1;
    auto constexpr beamIdx = 0;
    auto constexpr dtype = nvinfer1::DataType::kHALF;
    auto const stream = std::make_shared<tr::CudaStream>();
    SizeType32 constexpr maxNewTokens = 8;
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool constexpr isStreaming{false};

    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {numBlocks, 0}}};

    KVCacheManager kvCacheManager(numLayers, numKvHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
        beamWidth, std::vector<BlockManager::SizeType32>{maxAttentionWindow}, dtype, 0, stream, maxAttentionWindow,
        maxAttentionWindow, /*enableBlockReuse=*/true, CacheType::kSELF, std::nullopt,
        std::make_unique<tlk::KVCacheEventManager>(1024));
    kvCacheManager.allocatePools(false);
    (void) getEvents(kvCacheManager); // Drain the Created event.

    // Ground truth: chain BlockKeyHasher over the request's full token blocks, exactly as the
    // production storeBlocks path (and KV cache events) would for a freshly-allocated sequence.
    auto const expectedChain = [&](LlmRequest const& req)
    {
        auto const& uniqueTokens = req.getUniqueTokens(beamIdx);
        auto const numFull = static_cast<SizeType32>(uniqueTokens.size()) / tokensPerBlock;
        std::vector<tle::IdType> expected;
        std::size_t parentHash = 0;
        for (SizeType32 b = 0; b < numFull; ++b)
        {
            VecUniqueTokens slice(
                uniqueTokens.begin() + b * tokensPerBlock, uniqueTokens.begin() + (b + 1) * tokensPerBlock);
            BlockKey const blockKey(/*usesExtraIds=*/false, /*loraTaskId=*/std::nullopt, std::move(slice));
            auto const hash = BlockKeyHasher::hash(blockKey, parentHash);
            expected.push_back(static_cast<tle::IdType>(hash));
            parentHash = hash;
        }
        return expected;
    };

    // Case 1: fewer than one full block -> empty chain.
    {
        auto inputTokens = std::make_shared<VecTokens>(VecTokens{0, 1});
        auto llmRequest = std::make_shared<LlmRequest>(0, maxNewTokens, inputTokens, samplingConfig, isStreaming);
        kvCacheManager.addSequenceBatch(
            {{{0, static_cast<SizeType32>(inputTokens->size()), beamWidth}}}, {std::ref(*llmRequest)});
        EXPECT_TRUE(kvCacheManager.commitAndGetBlockHashesForRequest(*llmRequest, maxAttentionWindow).empty());
        tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(*llmRequest);
        (void) kvCacheManager.removeSequence(0, llmRequest);
    }

    // Case 2: 6 context tokens -> 1 full block (committed at allocation, lookup branch) + a
    // partial trailing block. The partial 2nd block must be clipped, so only one hash is returned.
    auto inputTokens = std::make_shared<VecTokens>(VecTokens{10, 11, 12, 13, 14, 15});
    auto llmRequest = std::make_shared<LlmRequest>(1, maxNewTokens, inputTokens, samplingConfig, isStreaming);
    kvCacheManager.addSequenceBatch(
        {{{1, static_cast<SizeType32>(inputTokens->size()), beamWidth}}}, {std::ref(*llmRequest)});
    tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(*llmRequest);

    auto contextHashes = kvCacheManager.commitAndGetBlockHashesForRequest(*llmRequest, maxAttentionWindow);
    auto contextExpected = expectedChain(*llmRequest);
    ASSERT_EQ(contextExpected.size(), 1u); // Partial 2nd block must be clipped.
    // Fatal on mismatch: contextHashes.front() is dereferenced further down, which would be
    // undefined behaviour if the returned chain were empty.
    ASSERT_EQ(contextHashes, contextExpected);

    // Generate tokens 16, 17, 18 so the 2nd block (tokens 14..17) fills *during generation*. It
    // was allocated partial, so commitAndGetBlockHashesForRequest must take the "set" branch:
    // build the full BlockKey, mark the block full, and hash it chained from the first block.
    // Token 18 starts a 3rd (partial) block so that block 2 is no longer the sequence's trailing
    // block: storeBlocks drops the final unusable token, and we want block 2 keyed with all four
    // tokens at store time so its stored hash matches the committed one (see the event check).
    for (auto const token : {16, 17, 18})
    {
        llmRequest->addNewToken(token, beamIdx);
        kvCacheManager.addToken(1);
    }

    auto hashes = kvCacheManager.commitAndGetBlockHashesForRequest(*llmRequest, maxAttentionWindow);
    auto expected = expectedChain(*llmRequest);
    ASSERT_EQ(expected.size(), 2u);
    // Fatal on mismatch so that hashes.front() below is only evaluated on a non-empty chain.
    ASSERT_EQ(hashes, expected);
    // The first block's hash is unchanged from the context-only call (front-running only appends).
    EXPECT_EQ(hashes.front(), contextHashes.front());

    // Idempotent: a repeated call (now pure lookups on full blocks) returns the same chain.
    EXPECT_EQ(kvCacheManager.commitAndGetBlockHashesForRequest(*llmRequest, maxAttentionWindow), hashes);

    // The committed hashes must match the hashes the KV cache Stored events emit for the same
    // blocks once the sequence is released and its full blocks are stored for reuse. This holds
    // for blocks that are not the sequence's trailing block (storeBlocks drops the final unusable
    // token, which would otherwise shorten the trailing block's key relative to the committed one).
    (void) getEvents(kvCacheManager); // Drain pending events before storing.
    tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(*llmRequest);
    (void) kvCacheManager.removeSequence(1, llmRequest);

    std::set<tle::IdType> storedHashes;
    for (auto const& event : getEvents(kvCacheManager))
    {
        if (std::holds_alternative<tle::KVCacheStoredData>(event.data))
        {
            for (auto const& block : std::get<tle::KVCacheStoredData>(event.data).blocks)
            {
                storedHashes.insert(static_cast<tle::IdType>(block.blockHash));
            }
        }
    }
    for (auto const hash : hashes)
    {
        EXPECT_GT(storedHashes.count(hash), 0u) << "committed hash not emitted by a Stored event";
    }
}
4611+
4612+
TEST_F(KVCacheManagerTest, CommitAndGetBlockHashesFrontRunsTrailingFullBlock)
{
    // Regression guard for the front-running contract: when a block fills *exactly* on a block
    // boundary so that it is the sequence's trailing block (no partial block follows it),
    // commitAndGetBlockHashesForRequest must still commit and return that block's hash in the
    // same step. The sibling test CommitAndGetBlockHashesForRequest only covers a just-filled
    // block that is followed by a partial block, so it would still pass if the implementation
    // switched to getUsableUniqueTokenCountForReuse (which subtracts the final unmaterialized
    // token and would drop the trailing full block). This test pins the exact-boundary case so
    // that regression fails loudly: with tokensPerBlock=4 and 8 tokens, the usable-count path
    // would yield (8 - 1) / 4 = 1 block, whereas the correct front-running chain has 2.
    auto constexpr numLayers = 2;
    auto constexpr numKvHeads = 2;
    auto constexpr sizePerHead = 16;
    auto constexpr tokensPerBlock = 4;
    auto constexpr numBlocks = 8;
    auto constexpr maxAttentionWindow = 32;
    auto constexpr maxNumSequences = 4;
    auto constexpr beamWidth = 1;
    auto constexpr beamIdx = 0;
    auto constexpr dtype = nvinfer1::DataType::kHALF;
    auto const stream = std::make_shared<tr::CudaStream>();
    SizeType32 constexpr maxNewTokens = 8;
    tr::SamplingConfig const samplingConfig{beamWidth};
    bool constexpr isStreaming{false};

    auto const blocksPerWindow = BlocksPerWindow{{maxAttentionWindow, {numBlocks, 0}}};

    KVCacheManager kvCacheManager(numLayers, numKvHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
        beamWidth, std::vector<BlockManager::SizeType32>{maxAttentionWindow}, dtype, 0, stream, maxAttentionWindow,
        maxAttentionWindow, /*enableBlockReuse=*/true, CacheType::kSELF, std::nullopt);
    kvCacheManager.allocatePools(false);

    // Chain BlockKeyHasher over the request's full token blocks (keyed by uniqueTokens.size()),
    // mirroring the front-running chain the connector expects.
    auto const expectedChain = [&](LlmRequest const& req)
    {
        auto const& uniqueTokens = req.getUniqueTokens(beamIdx);
        auto const numFull = static_cast<SizeType32>(uniqueTokens.size()) / tokensPerBlock;
        std::vector<tle::IdType> expected;
        std::size_t parentHash = 0;
        for (SizeType32 b = 0; b < numFull; ++b)
        {
            VecUniqueTokens slice(
                uniqueTokens.begin() + b * tokensPerBlock, uniqueTokens.begin() + (b + 1) * tokensPerBlock);
            BlockKey const blockKey(/*usesExtraIds=*/false, /*loraTaskId=*/std::nullopt, std::move(slice));
            auto const hash = BlockKeyHasher::hash(blockKey, parentHash);
            expected.push_back(static_cast<tle::IdType>(hash));
            parentHash = hash;
        }
        return expected;
    };

    // 6 context tokens -> block 0 full (10..13), block 1 partial (14, 15).
    auto inputTokens = std::make_shared<VecTokens>(VecTokens{10, 11, 12, 13, 14, 15});
    auto llmRequest = std::make_shared<LlmRequest>(0, maxNewTokens, inputTokens, samplingConfig, isStreaming);
    kvCacheManager.addSequenceBatch(
        {{{0, static_cast<SizeType32>(inputTokens->size()), beamWidth}}}, {std::ref(*llmRequest)});
    tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(*llmRequest);

    auto contextHashes = kvCacheManager.commitAndGetBlockHashesForRequest(*llmRequest, maxAttentionWindow);
    ASSERT_EQ(contextHashes.size(), 1u); // block 1 is still partial here.

    // Generate exactly tokens 16, 17 so block 1 (tokens 14..17) fills and becomes the *trailing*
    // block -- no further (partial) block is started. The just-filled trailing block must be
    // committed via the "set" branch in this same step.
    for (auto const token : {16, 17})
    {
        llmRequest->addNewToken(token, beamIdx);
        kvCacheManager.addToken(0);
    }

    auto hashes = kvCacheManager.commitAndGetBlockHashesForRequest(*llmRequest, maxAttentionWindow);
    auto expected = expectedChain(*llmRequest);
    ASSERT_EQ(expected.size(), 2u);
    // The crux: 2 hashes, not 1. A usable-count implementation would drop the trailing block.
    // Fatal on mismatch so that hashes.front() below is never evaluated on an empty chain
    // (which would be undefined behaviour rather than a clean test failure).
    ASSERT_EQ(hashes, expected);
    // Front-running only appends; block 0's hash is unchanged from the context-only call.
    EXPECT_EQ(hashes.front(), contextHashes.front());

    tensorrt_llm::testing::KvCacheManagerTestUtil::simulatePrefillCompletion(*llmRequest);
    (void) kvCacheManager.removeSequence(0, llmRequest);
}
4695+
44864696
TEST(KVCacheManagerHelpersTest, ChopVectorIntoBlocksBasicNoPartial)
44874697
{
44884698
using namespace tensorrt_llm::batch_manager::kv_cache_manager;

0 commit comments

Comments
 (0)