NVIDIA
diff --git a/‎CODING_GUIDELINES.md‎
Lines changed: 1 addition & 1 deletion b/‎CODING_GUIDELINES.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarks/cpp/disaggServerBenchmark.cpp‎
Lines changed: 1 addition & 7 deletions b/‎benchmarks/cpp/disaggServerBenchmark.cpp‎
Lines changed: 1 addition & 7 deletions
diff --git a/‎benchmarks/cpp/gptManagerBenchmark.cpp‎
Lines changed: 1 addition & 6 deletions b/‎benchmarks/cpp/gptManagerBenchmark.cpp‎
Lines changed: 1 addition & 6 deletions
diff --git a/‎benchmarks/cpp/utils/utils.h‎
Lines changed: 0 additions & 1 deletion b/‎benchmarks/cpp/utils/utils.h‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h‎
Lines changed: 7 additions & 9 deletions b/‎cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h‎
Lines changed: 7 additions & 9 deletions
diff --git a/‎cpp/include/tensorrt_llm/executor/executor.h‎
Lines changed: 1 addition & 6 deletions b/‎cpp/include/tensorrt_llm/executor/executor.h‎
Lines changed: 1 addition & 6 deletions
@@ -480,7 +480,7 @@ char* const errStr = getErrorStr(status);        // const pointer to mutable cha
 Code should adhere to [PEP 8](https://peps.python.org/pep-0008/#fn-hi), unless otherwise noted.
 
 #### Python Standard
-1. The code developed for TensorRT-LLM should conform to Python 3.8+.
+1. The code developed for TensorRT-LLM should conform to Python 3.10+.
 
 #### Formatting
 
 
@@ -10,7 +10,7 @@ TensorRT LLM
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-13.1.1-green)](https://developer.nvidia.com/cuda-downloads)
 [![torch](https://img.shields.io/badge/torch-2.10.0-green)](https://pytorch.org)
-[![version](https://img.shields.io/badge/release-1.3.0rc11-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-1.3.0rc12-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/LICENSE)
 
 [Architecture](https://nvidia.github.io/TensorRT-LLM/developer-guide/overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](https://nvidia.github.io/TensorRT-LLM/developer-guide/perf-overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](https://nvidia.github.io/TensorRT-LLM/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
 
@@ -610,7 +610,7 @@ class DisaggExecutorServer
             texec::KvCacheConfig kvCacheConfig(benchmarkParams.enableBlockReuse,
                 benchmarkParams.maxTokensInPagedKvCache, benchmarkParams.maxAttentionWindowVec,
                 benchmarkParams.sinkTokenLength, benchmarkParams.freeGpuMemoryFractions.at(in),
-                benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks);
+                benchmarkParams.kvHostCacheSize);
             texec::ExtendedRuntimePerfKnobConfig extendedRuntimePerfKnobConfig(benchmarkParams.multiBlockMode,
                 benchmarkParams.enableContextFMHAFP32Acc, benchmarkParams.cudaGraphMode,
                 benchmarkParams.cudaGraphCacheSize);
@@ -1213,8 +1213,6 @@ int main(int argc, char* argv[])
     options.add_options()("kv_host_cache_bytes",
         "Size of secondary memory pool used for offloading kv cache blocks (in bytes).",
         cxxopts::value<size_t>()->default_value("0"));
-    options.add_options()("kv_onboard_blocks", "If offloaded blocks should be onboarded to primary memory before reuse",
-        cxxopts::value<bool>()->default_value("true"));
     options.add_options()(
         "max_prompt_len", "Truncate all prompts from dataset to the length specified.", cxxopts::value<SizeType32>());
     options.add_options()("gpu_weights_percent",
@@ -1482,10 +1480,6 @@ int main(int argc, char* argv[])
     TLLM_CHECK_WITH_INFO(
         benchmarkParams.kvHostCacheSize == false, "Currently disaggServer don't support kv_host_cache!");
 
-    // Argument: If offloaded blocks should be onboarded to primary memory before they are reused.
-    benchmarkParams.kvOnboardBlocks = result["kv_onboard_blocks"].as<bool>();
-    TLLM_CHECK_WITH_INFO(
-        benchmarkParams.kvOnboardBlocks == true, "Currently disaggServer don't support kv_onboard_blocks =false!");
     // Argument: Medusa choices for the Medusa speculative decoding.
     if (result.count("medusa_choices"))
     {
 
@@ -630,7 +630,7 @@ class ExecutorServer
 
         texec::KvCacheConfig kvCacheConfig(benchmarkParams.enableBlockReuse, benchmarkParams.maxTokensInPagedKvCache,
             benchmarkParams.maxAttentionWindowVec, benchmarkParams.sinkTokenLength,
-            benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks,
+            benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize,
             benchmarkParams.crossKvCacheFraction);
         texec::PeftCacheConfig peftCacheConfig(0, benchmarkParams.loraDeviceNumModLayers, 8, 64, 4, 4, 4, 24, 8,
             std::nullopt, benchmarkParams.loraHostCacheSize);
@@ -1133,8 +1133,6 @@ int main(int argc, char* argv[])
     options.add_options()("kv_host_cache_bytes",
         "Size of secondary memory pool used for offloading kv cache blocks (in bytes).",
         cxxopts::value<size_t>()->default_value("0"));
-    options.add_options()("kv_onboard_blocks", "If offloaded blocks should be onboarded to primary memory before reuse",
-        cxxopts::value<bool>()->default_value("true"));
     options.add_options()(
         "max_prompt_len", "Truncate all prompts from dataset to the length specified.", cxxopts::value<SizeType32>());
 
@@ -1355,9 +1353,6 @@ int main(int argc, char* argv[])
     // Argument: Size of the secondary (host) memory pool used for offloading KV cache blocks, in bytes.
     benchmarkParams.kvHostCacheSize = result["kv_host_cache_bytes"].as<size_t>();
 
-    // Argument: If offloaded blocks should be onboarded to primary memory before they are reused.
-    benchmarkParams.kvOnboardBlocks = result["kv_onboard_blocks"].as<bool>();
-
     // Argument: Medusa choices for the Medusa speculative decoding.
     if (result.count("medusa_choices"))
     {
 
@@ -80,7 +80,6 @@ struct BenchmarkParams
 
     // KV cache block offloading
     size_t kvHostCacheSize{0};
-    bool kvOnboardBlocks{true};
 
     // Weights offloading
     float gpuWeightsPercent{1.0};
 
@@ -729,7 +729,7 @@ class WindowBlockManager
         std::vector<SizeType32> const& managedLayers, std::vector<SizeType32> const& numKvHeadsPerLayer,
         SizeType32 sizePerHead, SizeType32 tokensPerBlock, bool isSWA, SizeType32 blocksInPrimaryPool,
         SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream,
-        bool onboardBlocks, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
+        CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
         std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse,
         std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager,
         radix_block_tree::UnifiedBlockTree& lookupTree, std::shared_ptr<kvc::BaseLoopbackAgent> loopbackAgent = nullptr,
@@ -1132,8 +1132,6 @@ class WindowBlockManager
     // getPoolLayerIdx
     std::unordered_map<SizeType32, SizeType32> mLayerToIndexWithinPool;
 
-    // Whether offloaded blocks should be onboarded before reuse.
-    bool mOnboardBlocks;
     // Buffer manager
     runtime::BufferManager mBufferManager;
 
@@ -1241,7 +1239,7 @@ class BlockManager
         CudaStreamPtr stream, SizeType32 maxSequenceLength, SizeType32 maxBeamWidth,
         std::vector<SizeType32> const& maxAttentionWindowVec,
         std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
-        SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType = CacheType::kSELF,
+        SizeType32 sinkBubbleLength, CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
         std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
         bool copyOnPartialReuse = true,
@@ -1985,7 +1983,7 @@ class KVCacheManager : public BaseKVCacheManager
         std::vector<SizeType32> const& maxAttentionWindowVec,
         std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
         SizeType32 sinkTokenLength, CudaStreamPtr stream, SizeType32 maxSequenceLength, bool enableBlockReuse = false,
-        bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
+        CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
         std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
         bool copyOnpartialReuse = true,
@@ -1999,7 +1997,7 @@ class KVCacheManager : public BaseKVCacheManager
         std::vector<SizeType32> const& maxAttentionWindowVec,
         std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
         SizeType32 sinkTokenLength, int64_t stream, SizeType32 maxSequenceLength, bool enableBlockReuse = false,
-        bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
+        CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
         std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
         bool copyOnpartialReuse = true,
@@ -2013,7 +2011,7 @@ class KVCacheManager : public BaseKVCacheManager
         std::vector<SizeType32> const& maxAttentionWindowVec,
         std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
         SizeType32 sinkTokenLength, CudaStreamPtr stream, SizeType32 maxSequenceLength, bool enableBlockReuse = true,
-        bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
+        CacheType cacheType = CacheType::kSELF,
         std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
         std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
         bool copyOnpartialReuse = true,
@@ -2027,8 +2025,8 @@ class KVCacheManager : public BaseKVCacheManager
         std::vector<SizeType32> const& maxAttentionWindowVec,
         std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
         SizeType32 sinkTokenLength, int64_t stream, SizeType32 maxSequenceLength, bool enableBlockReuse = false,
-        bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF, bool enablePartialReuse = true,
-        bool copyOnpartialReuse = true, bool enableIndexerKCache = false, SizeType32 indexerKCacheQuantBlockSize = 128,
+        CacheType cacheType = CacheType::kSELF, bool enablePartialReuse = true, bool copyOnpartialReuse = true,
+        bool enableIndexerKCache = false, SizeType32 indexerKCacheQuantBlockSize = 128,
         SizeType32 indexerKCacheIndexHeadDim = 0,
         std::optional<LinearAttentionMetadata> linearAttentionMetadata = std::nullopt);
 
 
@@ -1039,7 +1039,7 @@ class KvCacheConfig
         std::optional<std::vector<SizeType32>> const& maxAttentionWindowVec = std::nullopt,
         std::optional<SizeType32> const& sinkTokenLength = std::nullopt,
         std::optional<FloatType> const& freeGpuMemoryFraction = std::nullopt,
-        std::optional<size_t> const& hostCacheSize = std::nullopt, bool onboardBlocks = true,
+        std::optional<size_t> const& hostCacheSize = std::nullopt,
         std::optional<FloatType> const& crossKvCacheFraction = std::nullopt,
         std::optional<RetentionPriority> secondaryOffloadMinPriority = std::nullopt, size_t eventBufferMaxSize = 0,
         bool enablePartialReuse = true, bool copyOnPartialReuse = true, bool useUvm = false,
@@ -1056,7 +1056,6 @@ class KvCacheConfig
     [[nodiscard]] std::optional<FloatType> getFreeGpuMemoryFraction() const;
     [[nodiscard]] std::optional<FloatType> getCrossKvCacheFraction() const;
     [[nodiscard]] std::optional<size_t> getHostCacheSize() const;
-    [[nodiscard]] bool getOnboardBlocks() const;
     [[nodiscard]] std::optional<RetentionPriority> getSecondaryOffloadMinPriority() const;
     [[nodiscard]] size_t getEventBufferMaxSize() const;
     [[nodiscard]] bool getUseUvm() const;
@@ -1072,7 +1071,6 @@ class KvCacheConfig
     void setFreeGpuMemoryFraction(FloatType freeGpuMemoryFraction);
     void setCrossKvCacheFraction(FloatType crossKvCacheFraction);
     void setHostCacheSize(size_t hostCacheSize);
-    void setOnboardBlocks(bool onboardBlocks);
     void setSecondaryOffloadMinPriority(std::optional<RetentionPriority> secondaryOffloadMinPriority);
     void setEventBufferMaxSize(size_t eventBufferMaxSize);
     void setUseUvm(bool useUvm);
@@ -1116,9 +1114,6 @@ class KvCacheConfig
     /// Having a secondary memory pool increases KV cache block reuse potential.
     std::optional<size_t> mHostCacheSize;
 
-    /// @brief Controls whether offloaded blocks should be onboarded back into primary memory before being reused.
-    bool mOnboardBlocks;
-
     /// @brief Only blocks with priority > mSecondaryOffloadMinPriority can be offloaded to secondary memory.
     std::optional<RetentionPriority> mSecondaryOffloadMinPriority;