@@ -729,7 +729,7 @@ class WindowBlockManager
729729 std::vector<SizeType32> const & managedLayers, std::vector<SizeType32> const & numKvHeadsPerLayer,
730730 SizeType32 sizePerHead, SizeType32 tokensPerBlock, bool isSWA, SizeType32 blocksInPrimaryPool,
731731 SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream,
732- bool onboardBlocks, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
732+ CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
733733 std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse,
734734 std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager,
735735 radix_block_tree::UnifiedBlockTree& lookupTree, std::shared_ptr<kvc::BaseLoopbackAgent> loopbackAgent = nullptr ,
@@ -1132,8 +1132,6 @@ class WindowBlockManager
11321132 // getPoolLayerIdx
11331133 std::unordered_map<SizeType32, SizeType32> mLayerToIndexWithinPool ;
11341134
1135- // Whether offloaded blocks should be onboarded before reuse.
1136- bool mOnboardBlocks ;
11371135 // Buffer manager
11381136 runtime::BufferManager mBufferManager ;
11391137
@@ -1241,7 +1239,7 @@ class BlockManager
12411239 CudaStreamPtr stream, SizeType32 maxSequenceLength, SizeType32 maxBeamWidth,
12421240 std::vector<SizeType32> const & maxAttentionWindowVec,
12431241 std::optional<TempAttentionWindowInputs> const & tempAttentionWindowInputs, nvinfer1::DataType dtype,
1244- SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType = CacheType::kSELF ,
1242+ SizeType32 sinkBubbleLength, CacheType cacheType = CacheType::kSELF ,
12451243 std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt ,
12461244 std::shared_ptr<KVCacheEventManager> eventManager = nullptr , bool enablePartialReuse = true ,
12471245 bool copyOnPartialReuse = true ,
@@ -1985,7 +1983,7 @@ class KVCacheManager : public BaseKVCacheManager
19851983 std::vector<SizeType32> const & maxAttentionWindowVec,
19861984 std::optional<TempAttentionWindowInputs> const & tempAttentionWindowInputs, nvinfer1::DataType dtype,
19871985 SizeType32 sinkTokenLength, CudaStreamPtr stream, SizeType32 maxSequenceLength, bool enableBlockReuse = false ,
1988- bool onboardBlocks = true , CacheType cacheType = CacheType::kSELF ,
1986+ CacheType cacheType = CacheType::kSELF ,
19891987 std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt ,
19901988 std::shared_ptr<KVCacheEventManager> eventManager = nullptr , bool enablePartialReuse = true ,
19911989 bool copyOnpartialReuse = true ,
@@ -1999,7 +1997,7 @@ class KVCacheManager : public BaseKVCacheManager
19991997 std::vector<SizeType32> const & maxAttentionWindowVec,
20001998 std::optional<TempAttentionWindowInputs> const & tempAttentionWindowInputs, nvinfer1::DataType dtype,
20011999 SizeType32 sinkTokenLength, int64_t stream, SizeType32 maxSequenceLength, bool enableBlockReuse = false ,
2002- bool onboardBlocks = true , CacheType cacheType = CacheType::kSELF ,
2000+ CacheType cacheType = CacheType::kSELF ,
20032001 std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt ,
20042002 std::shared_ptr<KVCacheEventManager> eventManager = nullptr , bool enablePartialReuse = true ,
20052003 bool copyOnpartialReuse = true ,
@@ -2013,7 +2011,7 @@ class KVCacheManager : public BaseKVCacheManager
20132011 std::vector<SizeType32> const & maxAttentionWindowVec,
20142012 std::optional<TempAttentionWindowInputs> const & tempAttentionWindowInputs, nvinfer1::DataType dtype,
20152013 SizeType32 sinkTokenLength, CudaStreamPtr stream, SizeType32 maxSequenceLength, bool enableBlockReuse = true ,
2016- bool onboardBlocks = true , CacheType cacheType = CacheType::kSELF ,
2014+ CacheType cacheType = CacheType::kSELF ,
20172015 std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt ,
20182016 std::shared_ptr<KVCacheEventManager> eventManager = nullptr , bool enablePartialReuse = true ,
20192017 bool copyOnpartialReuse = true ,
@@ -2027,8 +2025,8 @@ class KVCacheManager : public BaseKVCacheManager
20272025 std::vector<SizeType32> const & maxAttentionWindowVec,
20282026 std::optional<TempAttentionWindowInputs> const & tempAttentionWindowInputs, nvinfer1::DataType dtype,
20292027 SizeType32 sinkTokenLength, int64_t stream, SizeType32 maxSequenceLength, bool enableBlockReuse = false ,
2030- bool onboardBlocks = true , CacheType cacheType = CacheType::kSELF , bool enablePartialReuse = true ,
2031- bool copyOnpartialReuse = true , bool enableIndexerKCache = false , SizeType32 indexerKCacheQuantBlockSize = 128 ,
2028+ CacheType cacheType = CacheType::kSELF , bool enablePartialReuse = true , bool copyOnpartialReuse = true ,
2029+ bool enableIndexerKCache = false , SizeType32 indexerKCacheQuantBlockSize = 128 ,
20322030 SizeType32 indexerKCacheIndexHeadDim = 0 ,
20332031 std::optional<LinearAttentionMetadata> linearAttentionMetadata = std::nullopt );
20342032
0 commit comments