Skip to content

Commit cb6dd3a

Browse files
authored
Merge branch 'main' into dev-bench-moe
2 parents f39a42a + 158d684 commit cb6dd3a

277 files changed

Lines changed: 27204 additions & 2707 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

3rdparty/fetch_content.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
{
3333
"name": "deepgemm",
3434
"git_repository": "https://github.com/deepseek-ai/DeepGEMM",
35-
"git_tag": "c491439ed5966833d56883ca302b6f72e74f8105",
35+
"git_tag": "245dc5d6a5fe344c61505fe71011d203141d4479",
3636
"git_submodules_recurse": true,
3737
"source_subdir": "dont-add-this-project-with-add-subdirectory"
3838
},

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ TensorRT LLM
1010
[![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
1111
[![cuda](https://img.shields.io/badge/cuda-13.1.1-green)](https://developer.nvidia.com/cuda-downloads)
1212
[![torch](https://img.shields.io/badge/torch-2.10.0-green)](https://pytorch.org)
13-
[![version](https://img.shields.io/badge/release-1.3.0rc19-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
13+
[![version](https://img.shields.io/badge/release-1.3.0rc20-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
1414
[![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/LICENSE)
1515

1616
[Architecture](https://nvidia.github.io/TensorRT-LLM/developer-guide/overview.html)   |   [Performance](https://nvidia.github.io/TensorRT-LLM/developer-guide/perf-overview.html)   |   [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)   |   [Documentation](https://nvidia.github.io/TensorRT-LLM/)   |   [Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1908,6 +1908,15 @@ class GenericLlmRequest
19081908
return mPerfMetrics.kvCacheMetrics.numNewAllocatedBlocks;
19091909
}
19101910

1911+
void updateKvCachePerfMetrics(
1912+
SizeType32 allocTotalBlocks, SizeType32 allocNewBlocks, SizeType32 reusedBlocks, SizeType32 missedBlocks)
1913+
{
1914+
updateAllocTotalBlocksPerRequest(allocTotalBlocks);
1915+
updateAllocNewBlocksPerRequest(allocNewBlocks);
1916+
updateReusedBlocksPerRequest(reusedBlocks);
1917+
updateMissedBlocksPerRequest(missedBlocks);
1918+
}
1919+
19111920
void updateReusedBlocksPerRequest(SizeType32 reusedBlocksPerRequest)
19121921
{
19131922
mPerfMetrics.kvCacheMetrics.numReusedBlocks += reusedBlocksPerRequest;

cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2023-2026, NVIDIA CORPORATION. All rights reserved.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -208,7 +208,9 @@ class RuntimeBuffers
208208
//! Temporarily store the transposed results of multiple fragment logits, [maxBeamWidth, kCACHE_LENGTH]
209209
TensorPtr transposedLogits;
210210

211-
//! Temporarily store logits buffer address during the transposing, [kCACHE_LENGTH]
211+
//! Temporarily stores the logits buffer addresses on the device during transposition, [maxBatchSize, kCACHE_LENGTH]
212+
//! One row per batch slot (same layout as fragmentPointerHost) so concurrent flushes for
213+
//! different requests in the same batch never clobber each other's pointer arrays.
212214
TensorPtr fragmentPointerDevice;
213215

214216
//! Temporarily store logits buffer address during the transposing, [maxBatchSize, kCACHE_LENGTH]
@@ -222,11 +224,14 @@ class RuntimeBuffers
222224
workIdx = (workIdx + 1) % (fragmentPointerHost->getShape().d[0]);
223225
}
224226

225-
[[nodiscard]] TensorPtr getFragmentPointerHost()
227+
//! Returns matching host and device pointer rows for the current workIdx, then advances
228+
//! workIdx. Always call this instead of the individual getters to avoid ordering bugs.
229+
[[nodiscard]] std::pair<TensorPtr, TensorPtr> getFragmentPointerSlot()
226230
{
227-
TensorPtr slice = runtime::ITensor::slice(fragmentPointerHost, workIdx, 1);
231+
TensorPtr host = runtime::ITensor::slice(fragmentPointerHost, workIdx, 1);
232+
TensorPtr device = runtime::ITensor::slice(fragmentPointerDevice, workIdx, 1);
228233
cycleWorkIdx();
229-
return slice;
234+
return {std::move(host), std::move(device)};
230235
};
231236
};
232237

cpp/tensorrt_llm/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,8 @@ set(TRTLLM_LINK_LIBS
200200
layers_src
201201
runtime_src
202202
testing_src
203+
compressorKernels_src
204+
mhcKernels_src
203205
userbuffers_src
204206
${DECODER_SHARED_TARGET_0}
205207
${DECODER_SHARED_TARGET_1})

cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
* SPDX-License-Identifier: Apache-2.0
44
*
55
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -152,8 +152,8 @@ void RuntimeBuffers::create(SizeType32 maxBatchSize, SizeType32 maxBeamWidth,
152152
ITensor::makeShape({GenerationLogitsCache::kCACHE_LENGTH, maxBatchSize * maxBeamWidth, vocabSizePadded}),
153153
logitsType);
154154

155-
generationLogitsCache.fragmentPointerDevice
156-
= manager.gpu(ITensor::makeShape({GenerationLogitsCache::kCACHE_LENGTH}), nvinfer1::DataType::kINT64);
155+
generationLogitsCache.fragmentPointerDevice = manager.gpu(
156+
ITensor::makeShape({maxBatchSize, GenerationLogitsCache::kCACHE_LENGTH}), nvinfer1::DataType::kINT64);
157157
generationLogitsCache.fragmentPointerHost = tensorrt_llm::runtime::BufferManager::pinnedPool(
158158
ITensor::makeShape({maxBatchSize, GenerationLogitsCache::kCACHE_LENGTH}), nvinfer1::DataType::kINT64);
159159
}

cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1216,8 +1216,21 @@ void TrtGptModelInflightBatching::forwardAsync(RequestList const& activeRequests
12161216
{
12171217
for (auto const& llmReq : activeRequests)
12181218
{
1219+
// Remove from mInflightReqIds so changeBeamWidth can proceed on the next iteration.
1220+
// terminateRequest frees seqSlot/KV cache but does not clean up mInflightReqIds.
1221+
mInflightReqIds.erase(llmReq->mRequestId);
12191222
terminateRequest(llmReq);
12201223
}
1224+
// Force buffer/decoder reset to clean up any partial state from the aborted batch
1225+
// (e.g. partially-filled cross-KV block offsets from mid-context-chunk processing).
1226+
// Guard on mInflightReqIds.empty(): in pipeline-parallel multi-micro-batch mode,
1227+
// other micro-batches may still have requests tracked here; changeBeamWidth asserts
1228+
// emptiness so we skip the reset and let the next successful forwardAsync iteration
1229+
// perform it when the set is clear.
1230+
if (mWorldConfig.isLastPipelineParallelRank() && mInflightReqIds.empty())
1231+
{
1232+
changeBeamWidth(mOperatingBeamWidth);
1233+
}
12211234
}
12221235
catch (std::exception const& e)
12231236
{

cpp/tensorrt_llm/batch_manager/utils/inflightBatchingUtils.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,10 +103,11 @@ void copyGenerationLogits(RuntimeBuffers::GenerationLogitsCache& generationLogit
103103

104104
auto const fragmentSize = llmReq.getGenerationLogitsFragmentsSize();
105105

106-
// Merge logits fragments on device
106+
// Merge logits fragments on device. getFragmentPointerSlot() returns the matching host and
107+
// device rows for the current workIdx and advances the index in the same call, so concurrent flushes
108+
// for different requests in the same batch never clobber each other's pointer arrays.
107109
auto const& transposeBufferPtr = generationLogitsCache.transposedLogits;
108-
auto const& cachePointerDevice = generationLogitsCache.fragmentPointerDevice;
109-
auto const& cachePointerHost = generationLogitsCache.getFragmentPointerHost();
110+
auto [cachePointerHost, cachePointerDevice] = generationLogitsCache.getFragmentPointerSlot();
110111
tensorrt_llm::runtime::kernels::mergeLogitsFragments(bufferManager, *transposeBufferPtr,
111112
llmReq.getGenerationLogitsFragments(), *cachePointerDevice, *cachePointerHost, 0, 1, reqBeamWidth,
112113
bufferManager.getStream(), 0);

cpp/tensorrt_llm/kernels/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ add_subdirectory(dsv3MinLatencyKernels)
3030
add_subdirectory(causalConv1d)
3131
add_subdirectory(fusedGatedRMSNormQuant)
3232
add_subdirectory(mamba2MTPSSMCache)
33+
add_subdirectory(mhcKernels)
34+
add_subdirectory(compressorKernels)
3335

3436
file(GLOB_RECURSE SRC_CPP *.cpp)
3537
file(GLOB_RECURSE SRC_CU *.cu)
@@ -55,6 +57,10 @@ list(FILTER SRC_CU EXCLUDE REGEX "userbuffers/.*")
5557
list(FILTER SRC_CU EXCLUDE REGEX "fusedLayernormKernels/.*")
5658
list(FILTER SRC_CU EXCLUDE REGEX "fusedGatedRMSNormQuant/.*")
5759
list(FILTER SRC_CU EXCLUDE REGEX "mamba2MTPSSMCache/.*")
60+
list(FILTER SRC_CPP EXCLUDE REGEX "mhcKernels/.*")
61+
list(FILTER SRC_CU EXCLUDE REGEX "mhcKernels/.*")
62+
list(FILTER SRC_CPP EXCLUDE REGEX "compressorKernels/.*")
63+
list(FILTER SRC_CU EXCLUDE REGEX "compressorKernels/.*")
5864

5965
if(NOT ENABLE_MULTI_DEVICE)
6066
list(FILTER SRC_CU EXCLUDE REGEX "customAllReduceKernels*.*cu$")

cpp/tensorrt_llm/kernels/IndexerTopK.h

Lines changed: 44 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2019-2026, NVIDIA CORPORATION. All rights reserved.
33
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
44
*
55
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,47 +27,63 @@ TRTLLM_NAMESPACE_BEGIN
2727

2828
namespace kernels
2929
{
30-
/// Indexer TopK decode. Three tiers:
31-
/// - GVR Heuristic (preIdx provided, K in {512,1024,2048}, numColumns in
32-
/// [kSeqSmall, splitWorkThreshold), numRows below the
33-
/// architecture-derived wave/L2 bound).
34-
/// - Single-block (numColumns < split-work threshold)
35-
/// - Multi-pass radix (numColumns >= split-work threshold; requires
36-
/// `scratch` sized via indexerTopKDecodeScratchBytes,
37-
/// zero-init on first call and may be reused).
38-
///
39-
/// `is_prefill = true` forces single-block (split-work suppressed).
40-
void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, int const splitWorkThreshold,
41-
int const numRows, int const numColumns, int const stride0, int const stride1, int const next_n,
42-
int const topK = 2048, int const* preIdx = nullptr, int const preIdxStride = 0, int const preIdxCount = 0,
43-
float* heuristicScratch = nullptr, cudaStream_t const stream = 0, void* scratch = nullptr, size_t scratchBytes = 0,
44-
bool is_prefill = false);
30+
// Number of blocks-per-row used by the multi-block split + merge dispatch path of
31+
// invokeIndexerTopKDecode. Returns 1 when the single-block path is preferred.
32+
// Callers that allocate aux buffers must use this same helper to size them, and
33+
// must pass the same splitWorkThreshold they will pass to invokeIndexerTopKDecode
34+
// (a value <= 0 selects the internal default).
35+
int computeIndexerTopKDecodeBlocksPerRow(int numRows, int numColumns, int splitWorkThreshold = 0);
4536

46-
/// Size of the multi-pass radix `scratch` buffer for these shapes.
47-
size_t indexerTopKDecodeScratchBytes(int numRows, int numColumns, int topK);
37+
/// fp32 indexer TopK decode — L2-aware BS-threshold dispatcher with four
38+
/// fallback tiers:
39+
/// - GVR Heuristic (preIdx provided, kSeqSmall ≤ N < splitWork, BS < kBsLarge, K ∈ {512,1024,2048})
40+
/// - Insertion sort (N < kSortingAlgorithmThreshold)
41+
/// - Radix sort (kSortingAlgorithmThreshold ≤ N < splitWork)
42+
/// - Radix split-work (N ≥ splitWork — uses outLogitsAux / outIndicesAux)
43+
void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, float* outLogitsAux,
44+
int* outIndicesAux, int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0,
45+
int const stride1, int const next_n, int const topK = 2048, int const* preIdx = nullptr, int const preIdxStride = 0,
46+
int const preIdxCount = 0, float* heuristicScratch = nullptr, int const compressRatio = 1,
47+
cudaStream_t const stream = 0);
4848

49-
/// bf16 overload; same contract.
49+
/// bf16 indexer TopK decode — same dispatch axes as the fp32 entry, except
50+
/// kBsL2 uses sizeof(__nv_bfloat16) bytes/elem (L2 footprint is half) and
51+
/// the split-work tier is unsupported (the bf16/fp16 entry does not expose
52+
/// the float aux buffers required for split-work). Insertion + radix tiers
53+
/// share topKPerRowDecode with fp32 — histogram and sort run on float keys
54+
/// after a static_cast<float>(InputT) at HBM-read sites.
55+
///
56+
/// Aborts with TLLM_CHECK if numColumns ≥ splitWorkThreshold; callers in
57+
/// that regime must use the fp32 entry.
5058
void invokeIndexerTopKDecode(__nv_bfloat16 const* logits, int const* seqLens, int* indices,
5159
int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0, int const stride1,
5260
int const next_n, int const topK = 2048, int const* preIdx = nullptr, int const preIdxStride = 0,
53-
int const preIdxCount = 0, __nv_bfloat16* heuristicScratch = nullptr, cudaStream_t const stream = 0,
54-
void* scratch = nullptr, size_t scratchBytes = 0, bool is_prefill = false);
61+
int const preIdxCount = 0, __nv_bfloat16* heuristicScratch = nullptr, int const compressRatio = 1,
62+
cudaStream_t const stream = 0);
5563

56-
/// fp16 overload; same contract.
64+
/// fp16 indexer TopK decode — see bf16 overload for dispatcher contract.
5765
void invokeIndexerTopKDecode(__half const* logits, int const* seqLens, int* indices, int const splitWorkThreshold,
5866
int const numRows, int const numColumns, int const stride0, int const stride1, int const next_n,
5967
int const topK = 2048, int const* preIdx = nullptr, int const preIdxStride = 0, int const preIdxCount = 0,
60-
__half* heuristicScratch = nullptr, cudaStream_t const stream = 0, void* scratch = nullptr, size_t scratchBytes = 0,
61-
bool is_prefill = false);
68+
__half* heuristicScratch = nullptr, int const compressRatio = 1, cudaStream_t const stream = 0);
6269

6370
void invokeIndexerTopKPrefill(float const* logits, int const* rowStarts, int const* rowEnds, int* indices,
6471
int const numRows, int const numColumns, int const stride0, int const stride1, int const topK = 2048,
6572
cudaStream_t const stream = 0);
6673

67-
/// True iff invokeIndexerTopKDecode would pick the GVR tier for this shape:
68-
/// K in {512,1024,2048}, numColumns in [kSeqSmall, splitWorkThreshold), and
69-
/// numRows below the architecture-derived wave/L2 bound. Lets callers
70-
/// provision preIdx / heuristicScratch only when needed.
74+
/// Returns true iff invokeIndexerTopKDecode would route to the GVR Heuristic
75+
/// kernel for this (numRows, numColumns, topK) triple, assuming valid preIdx
76+
/// is provided and stride1 == 1. Useful for callers that need to provision a
77+
/// preIdx tensor or heuristicScratch buffer only when GVR will be selected.
78+
///
79+
/// Mirrors the gating logic of the dispatcher: K ∈ {512, 1024, 2048},
80+
/// numColumns ∈ [kSeqSmall, splitWorkThreshold), numRows < kBsLarge, where
81+
/// kBsLarge = min(kBsWave, kBsL2) and kBsL2 scales with bytesPerElem.
82+
///
83+
/// @param numRows logits rows (batch · next_n)
84+
/// @param numColumns logits columns (max sequence length)
85+
/// @param topK requested output size
86+
/// @param bytesPerElem element size of logits (4 for fp32, 2 for bf16/fp16)
7187
bool canIndexerTopKDecodeUseGvr(int numRows, int numColumns, int topK, int bytesPerElem = 4);
7288

7389
} // namespace kernels

0 commit comments

Comments
 (0)