@@ -2906,22 +2906,22 @@ void BlockManager::releasePrefixBlocks(GenerationRequest& sequence, SizeType32 n
29062906 // today (gated by should_store_blocks: not is_vswa in the executor and
29072907 // beamWidth == 1 assertion in WindowBlockManager::releasePrefixBlocks).
29082908 //
2909+ auto const windowSize = mWindowBlockManagers .cbegin ()->first ;
29092910 // Snapshot the counter before iterating so that every WindowBlockManager
29102911 // releases the same range. Without this, the first manager would advance
2911- // the shared mNumFrontBlocksRemoved counter and subsequent managers would
2912- // see the counter already at the target, skipping their own blocks.
2913- SizeType32 const startIdx = sequence.getNumFrontBlocksRemoved ();
2912+ // the single-window front-block counter and subsequent managers would see
2913+ // the counter already at the target, skipping their own blocks.
2914+ SizeType32 const startIdx = sequence.getNumFrontBlocksRemoved (windowSize );
29142915 for (auto & [_, manager] : mWindowBlockManagers )
29152916 {
29162917 manager.releasePrefixBlocks (sequence, startIdx, numBlocks);
29172918 }
2918- // Advance the shared counter once, after all managers have released.
2919+ // Advance the single-window counter once, after all managers have released.
29192920 // Uses incrementNumFrontBlocksRemoved (counter-only) instead of
2920- // removeFrontBlock so the intent is explicit and we do not depend on
2921- // removeFrontBlock ignoring its windowSize argument.
2922- while (sequence.getNumFrontBlocksRemoved () < numBlocks)
2921+ // removeFrontBlock so the intent is explicit.
2922+ while (sequence.getNumFrontBlocksRemoved (windowSize) < numBlocks)
29232923 {
2924- sequence.incrementNumFrontBlocksRemoved ();
2924+ sequence.incrementNumFrontBlocksRemoved (windowSize );
29252925 }
29262926}
29272927
@@ -3746,23 +3746,30 @@ void WindowBlockManager::releasePrefixBlocks(GenerationRequest& sequence, SizeTy
37463746 auto & allocatedBlocks = mAllocatedBlocksPerSeq .at (requestId);
37473747 SizeType32 const target = std::min (numBlocks, static_cast <SizeType32>(allocatedBlocks.size ()));
37483748
3749- // Release blocks in range [startIdx, target). The shared
3750- // mNumFrontBlocksRemoved counter is advanced by BlockManager after
3749+ // Release blocks in range [startIdx, target). The single-window
3750+ // front-block counter is advanced by BlockManager after
37513751 // all WindowBlockManagers have processed the same range.
37523752 for (SizeType32 blockIdx = startIdx; blockIdx < target; ++blockIdx)
37533753 {
37543754 auto & block = allocatedBlocks.at (blockIdx);
3755+ auto releasedBlock = block;
37553756
37563757 TLLM_LOG_DEBUG (" %s::releasePrefixBlocks - Releasing block %d from sequence %lu" , mLogPrefix .c_str (),
3757- block ->getBlockId (), requestId);
3758+ releasedBlock ->getBlockId (), requestId);
37583759
3759- if (block->hasRefs ())
3760+ // Replace the sequence slot with a placeholder, matching detachFrontBlock().
3761+ // removeSequence later walks allocatedBlocks in releaseBlocks(); leaving the
3762+ // real block here would release it a second time and corrupt the eviction
3763+ // policy's free-block count.
3764+ block = KVCacheBlock::createPlaceholder ();
3765+
3766+ if (releasedBlock->hasRefs ())
37603767 {
3761- block ->decRefCount ();
3768+ releasedBlock ->decRefCount ();
37623769 }
3763- if (!block ->hasRefs ())
3770+ if (!releasedBlock ->hasRefs ())
37643771 {
3765- mEvictionPolicy ->releaseBlock (block );
3772+ mEvictionPolicy ->releaseBlock (releasedBlock );
37663773 }
37673774 }
37683775}
@@ -3945,8 +3952,8 @@ std::optional<KVCacheBlock::IdType> KVCacheManager::removeSequence(
39453952
39463953void KVCacheManager::releasePrefixBlocks (RequestIdType requestId, SizeType32 numBlocks)
39473954{
3948- // Hard precondition: BlockManager::releasePrefixBlocks advances the shared
3949- // mNumFrontBlocksRemoved counter to numBlocks for every WindowBlockManager,
3955+ // Hard precondition: BlockManager::releasePrefixBlocks advances the
3956+ // single-window front-block counter to numBlocks for every WindowBlockManager,
39503957 // even when a window has fewer than numBlocks allocated. Under variable
39513958 // sliding window attention (VSWA), that would cause WindowBlockManager::
39523959 // releaseBlocks (called during removeSequence) to underrun rbegin() and
0 commit comments