Skip to content

Commit df73fb2

Browse files
Merge branch 'main' into initial-stats-sweep
2 parents 008f9ab + 61cef21 commit df73fb2

65 files changed

Lines changed: 1672 additions & 3435 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

CODING_GUIDELINES.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -480,7 +480,7 @@ char* const errStr = getErrorStr(status); // const pointer to mutable cha
480480
Code should adhere to [PEP 8](https://peps.python.org/pep-0008/#fn-hi), unless otherwise noted.
481481

482482
#### Python Standard
483-
1. The code developed for TensorRT-LLM should conform to Python 3.8+.
483+
1. The code developed for TensorRT-LLM should conform to Python 3.10+.
484484

485485
#### Formatting
486486

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ TensorRT LLM
1010
[![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
1111
[![cuda](https://img.shields.io/badge/cuda-13.1.1-green)](https://developer.nvidia.com/cuda-downloads)
1212
[![torch](https://img.shields.io/badge/torch-2.10.0-green)](https://pytorch.org)
13-
[![version](https://img.shields.io/badge/release-1.3.0rc11-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
13+
[![version](https://img.shields.io/badge/release-1.3.0rc12-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
1414
[![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/LICENSE)
1515

1616
[Architecture](https://nvidia.github.io/TensorRT-LLM/developer-guide/overview.html)   |   [Performance](https://nvidia.github.io/TensorRT-LLM/developer-guide/perf-overview.html)   |   [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)   |   [Documentation](https://nvidia.github.io/TensorRT-LLM/)   |   [Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)

benchmarks/cpp/disaggServerBenchmark.cpp

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -610,7 +610,7 @@ class DisaggExecutorServer
610610
texec::KvCacheConfig kvCacheConfig(benchmarkParams.enableBlockReuse,
611611
benchmarkParams.maxTokensInPagedKvCache, benchmarkParams.maxAttentionWindowVec,
612612
benchmarkParams.sinkTokenLength, benchmarkParams.freeGpuMemoryFractions.at(in),
613-
benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks);
613+
benchmarkParams.kvHostCacheSize);
614614
texec::ExtendedRuntimePerfKnobConfig extendedRuntimePerfKnobConfig(benchmarkParams.multiBlockMode,
615615
benchmarkParams.enableContextFMHAFP32Acc, benchmarkParams.cudaGraphMode,
616616
benchmarkParams.cudaGraphCacheSize);
@@ -1213,8 +1213,6 @@ int main(int argc, char* argv[])
12131213
options.add_options()("kv_host_cache_bytes",
12141214
"Size of secondary memory pool used for offloading kv cache blocks (in bytes).",
12151215
cxxopts::value<size_t>()->default_value("0"));
1216-
options.add_options()("kv_onboard_blocks", "If offloaded blocks should be onboarded to primary memory before reuse",
1217-
cxxopts::value<bool>()->default_value("true"));
12181216
options.add_options()(
12191217
"max_prompt_len", "Truncate all prompts from dataset to the length specified.", cxxopts::value<SizeType32>());
12201218
options.add_options()("gpu_weights_percent",
@@ -1482,10 +1480,6 @@ int main(int argc, char* argv[])
14821480
TLLM_CHECK_WITH_INFO(
14831481
benchmarkParams.kvHostCacheSize == false, "Currently disaggServer don't support kv_host_cache!");
14841482

1485-
// Argument: If offloaded blocks should be onboarded to primary memory before they are reused.
1486-
benchmarkParams.kvOnboardBlocks = result["kv_onboard_blocks"].as<bool>();
1487-
TLLM_CHECK_WITH_INFO(
1488-
benchmarkParams.kvOnboardBlocks == true, "Currently disaggServer don't support kv_onboard_blocks =false!");
14891483
// Argument: Medusa choices for the Medusa speculative decoding.
14901484
if (result.count("medusa_choices"))
14911485
{

benchmarks/cpp/gptManagerBenchmark.cpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -630,7 +630,7 @@ class ExecutorServer
630630

631631
texec::KvCacheConfig kvCacheConfig(benchmarkParams.enableBlockReuse, benchmarkParams.maxTokensInPagedKvCache,
632632
benchmarkParams.maxAttentionWindowVec, benchmarkParams.sinkTokenLength,
633-
benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks,
633+
benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize,
634634
benchmarkParams.crossKvCacheFraction);
635635
texec::PeftCacheConfig peftCacheConfig(0, benchmarkParams.loraDeviceNumModLayers, 8, 64, 4, 4, 4, 24, 8,
636636
std::nullopt, benchmarkParams.loraHostCacheSize);
@@ -1133,8 +1133,6 @@ int main(int argc, char* argv[])
11331133
options.add_options()("kv_host_cache_bytes",
11341134
"Size of secondary memory pool used for offloading kv cache blocks (in bytes).",
11351135
cxxopts::value<size_t>()->default_value("0"));
1136-
options.add_options()("kv_onboard_blocks", "If offloaded blocks should be onboarded to primary memory before reuse",
1137-
cxxopts::value<bool>()->default_value("true"));
11381136
options.add_options()(
11391137
"max_prompt_len", "Truncate all prompts from dataset to the length specified.", cxxopts::value<SizeType32>());
11401138

@@ -1355,9 +1353,6 @@ int main(int argc, char* argv[])
13551353
// Argument: Size of the secondary memory pool used for offloading KV cache blocks (in bytes).
13561354
benchmarkParams.kvHostCacheSize = result["kv_host_cache_bytes"].as<size_t>();
13571355

1358-
// Argument: If offloaded blocks should be onboarded to primary memory before they are reused.
1359-
benchmarkParams.kvOnboardBlocks = result["kv_onboard_blocks"].as<bool>();
1360-
13611356
// Argument: Medusa choices for the Medusa speculative decoding.
13621357
if (result.count("medusa_choices"))
13631358
{

benchmarks/cpp/utils/utils.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,6 @@ struct BenchmarkParams
8080

8181
// KV cache block offloading
8282
size_t kvHostCacheSize{0};
83-
bool kvOnboardBlocks{true};
8483

8584
// Weights offloading
8685
float gpuWeightsPercent{1.0};

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -729,7 +729,7 @@ class WindowBlockManager
729729
std::vector<SizeType32> const& managedLayers, std::vector<SizeType32> const& numKvHeadsPerLayer,
730730
SizeType32 sizePerHead, SizeType32 tokensPerBlock, bool isSWA, SizeType32 blocksInPrimaryPool,
731731
SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream,
732-
bool onboardBlocks, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
732+
CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
733733
std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse,
734734
std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager,
735735
radix_block_tree::UnifiedBlockTree& lookupTree, std::shared_ptr<kvc::BaseLoopbackAgent> loopbackAgent = nullptr,
@@ -1132,8 +1132,6 @@ class WindowBlockManager
11321132
// getPoolLayerIdx
11331133
std::unordered_map<SizeType32, SizeType32> mLayerToIndexWithinPool;
11341134

1135-
// Whether offloaded blocks should be onboarded before reuse.
1136-
bool mOnboardBlocks;
11371135
// Buffer manager
11381136
runtime::BufferManager mBufferManager;
11391137

@@ -1241,7 +1239,7 @@ class BlockManager
12411239
CudaStreamPtr stream, SizeType32 maxSequenceLength, SizeType32 maxBeamWidth,
12421240
std::vector<SizeType32> const& maxAttentionWindowVec,
12431241
std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
1244-
SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType = CacheType::kSELF,
1242+
SizeType32 sinkBubbleLength, CacheType cacheType = CacheType::kSELF,
12451243
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
12461244
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
12471245
bool copyOnPartialReuse = true,
@@ -1985,7 +1983,7 @@ class KVCacheManager : public BaseKVCacheManager
19851983
std::vector<SizeType32> const& maxAttentionWindowVec,
19861984
std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
19871985
SizeType32 sinkTokenLength, CudaStreamPtr stream, SizeType32 maxSequenceLength, bool enableBlockReuse = false,
1988-
bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
1986+
CacheType cacheType = CacheType::kSELF,
19891987
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
19901988
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
19911989
bool copyOnpartialReuse = true,
@@ -1999,7 +1997,7 @@ class KVCacheManager : public BaseKVCacheManager
19991997
std::vector<SizeType32> const& maxAttentionWindowVec,
20001998
std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
20011999
SizeType32 sinkTokenLength, int64_t stream, SizeType32 maxSequenceLength, bool enableBlockReuse = false,
2002-
bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
2000+
CacheType cacheType = CacheType::kSELF,
20032001
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
20042002
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
20052003
bool copyOnpartialReuse = true,
@@ -2013,7 +2011,7 @@ class KVCacheManager : public BaseKVCacheManager
20132011
std::vector<SizeType32> const& maxAttentionWindowVec,
20142012
std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
20152013
SizeType32 sinkTokenLength, CudaStreamPtr stream, SizeType32 maxSequenceLength, bool enableBlockReuse = true,
2016-
bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF,
2014+
CacheType cacheType = CacheType::kSELF,
20172015
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
20182016
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
20192017
bool copyOnpartialReuse = true,
@@ -2027,8 +2025,8 @@ class KVCacheManager : public BaseKVCacheManager
20272025
std::vector<SizeType32> const& maxAttentionWindowVec,
20282026
std::optional<TempAttentionWindowInputs> const& tempAttentionWindowInputs, nvinfer1::DataType dtype,
20292027
SizeType32 sinkTokenLength, int64_t stream, SizeType32 maxSequenceLength, bool enableBlockReuse = false,
2030-
bool onboardBlocks = true, CacheType cacheType = CacheType::kSELF, bool enablePartialReuse = true,
2031-
bool copyOnpartialReuse = true, bool enableIndexerKCache = false, SizeType32 indexerKCacheQuantBlockSize = 128,
2028+
CacheType cacheType = CacheType::kSELF, bool enablePartialReuse = true, bool copyOnpartialReuse = true,
2029+
bool enableIndexerKCache = false, SizeType32 indexerKCacheQuantBlockSize = 128,
20322030
SizeType32 indexerKCacheIndexHeadDim = 0,
20332031
std::optional<LinearAttentionMetadata> linearAttentionMetadata = std::nullopt);
20342032

cpp/include/tensorrt_llm/executor/executor.h

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,7 +1039,7 @@ class KvCacheConfig
10391039
std::optional<std::vector<SizeType32>> const& maxAttentionWindowVec = std::nullopt,
10401040
std::optional<SizeType32> const& sinkTokenLength = std::nullopt,
10411041
std::optional<FloatType> const& freeGpuMemoryFraction = std::nullopt,
1042-
std::optional<size_t> const& hostCacheSize = std::nullopt, bool onboardBlocks = true,
1042+
std::optional<size_t> const& hostCacheSize = std::nullopt,
10431043
std::optional<FloatType> const& crossKvCacheFraction = std::nullopt,
10441044
std::optional<RetentionPriority> secondaryOffloadMinPriority = std::nullopt, size_t eventBufferMaxSize = 0,
10451045
bool enablePartialReuse = true, bool copyOnPartialReuse = true, bool useUvm = false,
@@ -1056,7 +1056,6 @@ class KvCacheConfig
10561056
[[nodiscard]] std::optional<FloatType> getFreeGpuMemoryFraction() const;
10571057
[[nodiscard]] std::optional<FloatType> getCrossKvCacheFraction() const;
10581058
[[nodiscard]] std::optional<size_t> getHostCacheSize() const;
1059-
[[nodiscard]] bool getOnboardBlocks() const;
10601059
[[nodiscard]] std::optional<RetentionPriority> getSecondaryOffloadMinPriority() const;
10611060
[[nodiscard]] size_t getEventBufferMaxSize() const;
10621061
[[nodiscard]] bool getUseUvm() const;
@@ -1072,7 +1071,6 @@ class KvCacheConfig
10721071
void setFreeGpuMemoryFraction(FloatType freeGpuMemoryFraction);
10731072
void setCrossKvCacheFraction(FloatType crossKvCacheFraction);
10741073
void setHostCacheSize(size_t hostCacheSize);
1075-
void setOnboardBlocks(bool onboardBlocks);
10761074
void setSecondaryOffloadMinPriority(std::optional<RetentionPriority> secondaryOffloadMinPriority);
10771075
void setEventBufferMaxSize(size_t eventBufferMaxSize);
10781076
void setUseUvm(bool useUvm);
@@ -1116,9 +1114,6 @@ class KvCacheConfig
11161114
/// Having a secondary memory pool increases KV cache block reuse potential.
11171115
std::optional<size_t> mHostCacheSize;
11181116

1119-
/// @brief Controls whether offloaded blocks should be onboarded back into primary memory before being reused.
1120-
bool mOnboardBlocks;
1121-
11221117
/// @brief Only blocks with priority > mSecondaryOffloadMinPriority can be offloaded to secondary memory.
11231118
std::optional<RetentionPriority> mSecondaryOffloadMinPriority;
11241119

0 commit comments

Comments
 (0)