@@ -2897,34 +2897,6 @@ std::optional<KVCacheBlock::IdType> BlockManager::releaseBlocks(
28972897 return lastStoredId;
28982898}
28992899
2900- void BlockManager::releasePrefixBlocks (GenerationRequest& sequence, SizeType32 numBlocks)
2901- {
2902- // NOTE: This assumes a single window size (no VSWA). With different window
2903- // sizes, each WindowBlockManager may have a different number of allocated
2904- // blocks, so releasing the same numBlocks from all managers would need
2905- // per-window-size handling. Disaggregated serving does not support VSWA
2906- // today (gated by should_store_blocks: not is_vswa in the executor and
2907- // beamWidth == 1 assertion in WindowBlockManager::releasePrefixBlocks).
2908- //
2909- auto const windowSize = mWindowBlockManagers .cbegin ()->first ;
2910- // Snapshot the counter before iterating so that every WindowBlockManager
2911- // releases the same range. Without this, the first manager would advance
2912- // the single-window front-block counter and subsequent managers would see
2913- // the counter already at the target, skipping their own blocks.
2914- SizeType32 const startIdx = sequence.getNumFrontBlocksRemoved (windowSize);
2915- for (auto & [_, manager] : mWindowBlockManagers )
2916- {
2917- manager.releasePrefixBlocks (sequence, startIdx, numBlocks);
2918- }
2919- // Advance the single-window counter once, after all managers have released.
2920- // Uses incrementNumFrontBlocksRemoved (counter-only) instead of
2921- // removeFrontBlock so the intent is explicit.
2922- while (sequence.getNumFrontBlocksRemoved (windowSize) < numBlocks)
2923- {
2924- sequence.incrementNumFrontBlocksRemoved (windowSize);
2925- }
2926- }
2927-
29282900void BlockManager::pinBlocks (GenerationRequest& sequence)
29292901{
29302902 for (auto & [_, manager] : mWindowBlockManagers )
@@ -3737,43 +3709,6 @@ void WindowBlockManager::detachFrontBlock(GenerationRequest& sequence)
37373709 sequence.getNumFrontBlocksRemoved (mWindowSize ));
37383710}
37393711
3740- void WindowBlockManager::releasePrefixBlocks (GenerationRequest& sequence, SizeType32 startIdx, SizeType32 numBlocks)
3741- {
3742- TLLM_CHECK_WITH_INFO (
3743- sequence.getBeamWidth () == 1 , " [kv cache manager] releasePrefixBlocks does not support beamWidth > 1" );
3744-
3745- auto const requestId = sequence.getRequestId ();
3746- auto & allocatedBlocks = mAllocatedBlocksPerSeq .at (requestId);
3747- SizeType32 const target = std::min (numBlocks, static_cast <SizeType32>(allocatedBlocks.size ()));
3748-
3749- // Release blocks in range [startIdx, target). The single-window
3750- // front-block counter is advanced by BlockManager after
3751- // all WindowBlockManagers have processed the same range.
3752- for (SizeType32 blockIdx = startIdx; blockIdx < target; ++blockIdx)
3753- {
3754- auto & block = allocatedBlocks.at (blockIdx);
3755- auto releasedBlock = block;
3756-
3757- TLLM_LOG_DEBUG (" %s::releasePrefixBlocks - Releasing block %d from sequence %lu" , mLogPrefix .c_str (),
3758- releasedBlock->getBlockId (), requestId);
3759-
3760- // Replace the sequence slot with a placeholder, matching detachFrontBlock().
3761- // removeSequence later walks allocatedBlocks in releaseBlocks(); leaving the
3762- // real block here would release it a second time and corrupt the eviction
3763- // policy's free-block count.
3764- block = KVCacheBlock::createPlaceholder ();
3765-
3766- if (releasedBlock->hasRefs ())
3767- {
3768- releasedBlock->decRefCount ();
3769- }
3770- if (!releasedBlock->hasRefs ())
3771- {
3772- mEvictionPolicy ->releaseBlock (releasedBlock);
3773- }
3774- }
3775- }
3776-
37773712PrefixReuseSummary KVCacheManager::analyzePrefixReuse (
37783713 VecUniqueTokens const & uniqueTokens, LlmRequest const & llmRequest) const
37793714{
@@ -3950,31 +3885,6 @@ std::optional<KVCacheBlock::IdType> KVCacheManager::removeSequence(
39503885 return lastStoredId;
39513886}
39523887
3953- void KVCacheManager::releasePrefixBlocks (RequestIdType requestId, SizeType32 numBlocks)
3954- {
3955- // Hard precondition: BlockManager::releasePrefixBlocks advances the
3956- // single-window front-block counter to numBlocks for every WindowBlockManager,
3957- // even when a window has fewer than numBlocks allocated. Under variable
3958- // sliding window attention (VSWA), that would cause WindowBlockManager::
3959- // releaseBlocks (called during removeSequence) to underrun rbegin() and
3960- // skip tail blocks for the smaller window. Disagg serving already gates
3961- // VSWA out, but we enforce the assumption here so the C++ API contract is
3962- // self-defending instead of relying on caller discipline.
3963- TLLM_CHECK_WITH_INFO (
3964- !mBlockManager .isVariableWindow (), " releasePrefixBlocks does not support variable sliding window attention" );
3965- if (numBlocks <= 0 )
3966- {
3967- return ;
3968- }
3969- std::scoped_lock lock (mSequencesMtx );
3970- auto it = mSequences .find (requestId);
3971- if (it == mSequences .end ())
3972- {
3973- return ;
3974- }
3975- mBlockManager .releasePrefixBlocks (it->second , numBlocks);
3976- }
3977-
39783888std::vector<KVCacheBlock::IdType> KVCacheManager::storeBlocksForReuse (
39793889 RequestIdType requestId, OptionalRef<LlmRequest const > llmRequest, bool pinBlocks)
39803890{
0 commit comments