NVIDIA
diff --git a/‎3rdparty/fetch_content.json‎
Lines changed: 1 addition & 1 deletion b/‎3rdparty/fetch_content.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cpp/include/tensorrt_llm/batch_manager/llmRequest.h‎
Lines changed: 9 additions & 0 deletions b/‎cpp/include/tensorrt_llm/batch_manager/llmRequest.h‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h‎
Lines changed: 10 additions & 5 deletions b/‎cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h‎
Lines changed: 10 additions & 5 deletions
diff --git a/‎cpp/tensorrt_llm/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions b/‎cpp/tensorrt_llm/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp‎
Lines changed: 3 additions & 3 deletions b/‎cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp‎
Lines changed: 13 additions & 0 deletions b/‎cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎cpp/tensorrt_llm/batch_manager/utils/inflightBatchingUtils.cpp‎
Lines changed: 4 additions & 3 deletions b/‎cpp/tensorrt_llm/batch_manager/utils/inflightBatchingUtils.cpp‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎cpp/tensorrt_llm/kernels/CMakeLists.txt‎
Lines changed: 6 additions & 0 deletions b/‎cpp/tensorrt_llm/kernels/CMakeLists.txt‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎cpp/tensorrt_llm/kernels/IndexerTopK.h‎
Lines changed: 44 additions & 28 deletions b/‎cpp/tensorrt_llm/kernels/IndexerTopK.h‎
Lines changed: 44 additions & 28 deletions
@@ -32,7 +32,7 @@
     {
       "name": "deepgemm",
       "git_repository": "https://github.com/deepseek-ai/DeepGEMM",
-      "git_tag": "c491439ed5966833d56883ca302b6f72e74f8105",
+      "git_tag": "245dc5d6a5fe344c61505fe71011d203141d4479",
       "git_submodules_recurse": true,
       "source_subdir": "dont-add-this-project-with-add-subdirectory"
     },
 
@@ -10,7 +10,7 @@ TensorRT LLM
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-13.1.1-green)](https://developer.nvidia.com/cuda-downloads)
 [![torch](https://img.shields.io/badge/torch-2.10.0-green)](https://pytorch.org)
-[![version](https://img.shields.io/badge/release-1.3.0rc19-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-1.3.0rc20-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/LICENSE)
 
 [Architecture](https://nvidia.github.io/TensorRT-LLM/developer-guide/overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](https://nvidia.github.io/TensorRT-LLM/developer-guide/perf-overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](https://nvidia.github.io/TensorRT-LLM/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
 
@@ -1908,6 +1908,15 @@ class GenericLlmRequest
         return mPerfMetrics.kvCacheMetrics.numNewAllocatedBlocks;
     }
 
+    void updateKvCachePerfMetrics(
+        SizeType32 allocTotalBlocks, SizeType32 allocNewBlocks, SizeType32 reusedBlocks, SizeType32 missedBlocks)
+    {
+        updateAllocTotalBlocksPerRequest(allocTotalBlocks);
+        updateAllocNewBlocksPerRequest(allocNewBlocks);
+        updateReusedBlocksPerRequest(reusedBlocks);
+        updateMissedBlocksPerRequest(missedBlocks);
+    }
+
     void updateReusedBlocksPerRequest(SizeType32 reusedBlocksPerRequest)
     {
         mPerfMetrics.kvCacheMetrics.numReusedBlocks += reusedBlocksPerRequest;
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -208,7 +208,9 @@ class RuntimeBuffers
         //! Temporarily store the transposed results of multiple fragment logits, [maxBeamWidth, kCACHE_LENGTH]
         TensorPtr transposedLogits;
 
-        //! Temporarily store logits buffer address during the transposing, [kCACHE_LENGTH]
+        //! Temporarily store logits buffer address during the transposing, [maxBatchSize, kCACHE_LENGTH]
+        //! One row per batch slot (same layout as fragmentPointerHost) so concurrent flushes for
+        //! different requests in the same batch never clobber each other's pointer arrays.
         TensorPtr fragmentPointerDevice;
 
         //! Temporarily store logits buffer address during the transposing, [maxBatchSize, kCACHE_LENGTH]
@@ -222,11 +224,14 @@ class RuntimeBuffers
             workIdx = (workIdx + 1) % (fragmentPointerHost->getShape().d[0]);
         }
 
-        [[nodiscard]] TensorPtr getFragmentPointerHost()
+        //! Returns the host and device pointer rows for the current workIdx, then advances workIdx.
+        //! Slice both rows through this helper, never separately, so they always share one slot.
+        [[nodiscard]] std::pair<TensorPtr, TensorPtr> getFragmentPointerSlot()
         {
-            TensorPtr slice = runtime::ITensor::slice(fragmentPointerHost, workIdx, 1);
+            TensorPtr host = runtime::ITensor::slice(fragmentPointerHost, workIdx, 1);
+            TensorPtr device = runtime::ITensor::slice(fragmentPointerDevice, workIdx, 1);
             cycleWorkIdx();
-            return slice;
+            return {std::move(host), std::move(device)};
         };
     };
 
 
@@ -200,6 +200,8 @@ set(TRTLLM_LINK_LIBS
     layers_src
     runtime_src
     testing_src
+    compressorKernels_src
+    mhcKernels_src
     userbuffers_src
     ${DECODER_SHARED_TARGET_0}
     ${DECODER_SHARED_TARGET_1})
 
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -152,8 +152,8 @@ void RuntimeBuffers::create(SizeType32 maxBatchSize, SizeType32 maxBeamWidth,
             ITensor::makeShape({GenerationLogitsCache::kCACHE_LENGTH, maxBatchSize * maxBeamWidth, vocabSizePadded}),
             logitsType);
 
-        generationLogitsCache.fragmentPointerDevice
-            = manager.gpu(ITensor::makeShape({GenerationLogitsCache::kCACHE_LENGTH}), nvinfer1::DataType::kINT64);
+        generationLogitsCache.fragmentPointerDevice = manager.gpu(
+            ITensor::makeShape({maxBatchSize, GenerationLogitsCache::kCACHE_LENGTH}), nvinfer1::DataType::kINT64);
         generationLogitsCache.fragmentPointerHost = tensorrt_llm::runtime::BufferManager::pinnedPool(
             ITensor::makeShape({maxBatchSize, GenerationLogitsCache::kCACHE_LENGTH}), nvinfer1::DataType::kINT64);
     }
 
@@ -1216,8 +1216,21 @@ void TrtGptModelInflightBatching::forwardAsync(RequestList const& activeRequests
         {
             for (auto const& llmReq : activeRequests)
             {
+                // Remove from mInflightReqIds so changeBeamWidth can proceed on the next iteration.
+                // terminateRequest frees seqSlot/KV cache but does not clean up mInflightReqIds.
+                mInflightReqIds.erase(llmReq->mRequestId);
                 terminateRequest(llmReq);
             }
+            // Force buffer/decoder reset to clean up any partial state from the aborted batch
+            // (e.g. partially-filled cross-KV block offsets from mid-context-chunk processing).
+            // Guard on mInflightReqIds.empty(): in pipeline-parallel multi-micro-batch mode,
+            // other micro-batches may still have requests tracked here; changeBeamWidth asserts
+            // emptiness so we skip the reset and let the next successful forwardAsync iteration
+            // perform it when the set is clear.
+            if (mWorldConfig.isLastPipelineParallelRank() && mInflightReqIds.empty())
+            {
+                changeBeamWidth(mOperatingBeamWidth);
+            }
         }
         catch (std::exception const& e)
         {
 
@@ -103,10 +103,11 @@ void copyGenerationLogits(RuntimeBuffers::GenerationLogitsCache& generationLogit
 
     auto const fragmentSize = llmReq.getGenerationLogitsFragmentsSize();
 
-    // Merge logits fragments on device
+    // Merge logits fragments on device.  getFragmentPointerSlot() returns the matching host and
+    // device rows for the current workIdx and advances the index in the same call, so consecutive
+    // flushes for different requests in the same batch use different pointer rows.
     auto const& transposeBufferPtr = generationLogitsCache.transposedLogits;
-    auto const& cachePointerDevice = generationLogitsCache.fragmentPointerDevice;
-    auto const& cachePointerHost = generationLogitsCache.getFragmentPointerHost();
+    auto [cachePointerHost, cachePointerDevice] = generationLogitsCache.getFragmentPointerSlot();
     tensorrt_llm::runtime::kernels::mergeLogitsFragments(bufferManager, *transposeBufferPtr,
         llmReq.getGenerationLogitsFragments(), *cachePointerDevice, *cachePointerHost, 0, 1, reqBeamWidth,
         bufferManager.getStream(), 0);
 
@@ -30,6 +30,8 @@ add_subdirectory(dsv3MinLatencyKernels)
 add_subdirectory(causalConv1d)
 add_subdirectory(fusedGatedRMSNormQuant)
 add_subdirectory(mamba2MTPSSMCache)
+add_subdirectory(mhcKernels)
+add_subdirectory(compressorKernels)
 
 file(GLOB_RECURSE SRC_CPP *.cpp)
 file(GLOB_RECURSE SRC_CU *.cu)
@@ -55,6 +57,10 @@ list(FILTER SRC_CU EXCLUDE REGEX "userbuffers/.*")
 list(FILTER SRC_CU EXCLUDE REGEX "fusedLayernormKernels/.*")
 list(FILTER SRC_CU EXCLUDE REGEX "fusedGatedRMSNormQuant/.*")
 list(FILTER SRC_CU EXCLUDE REGEX "mamba2MTPSSMCache/.*")
+list(FILTER SRC_CPP EXCLUDE REGEX "mhcKernels/.*")
+list(FILTER SRC_CU EXCLUDE REGEX "mhcKernels/.*")
+list(FILTER SRC_CPP EXCLUDE REGEX "compressorKernels/.*")
+list(FILTER SRC_CU EXCLUDE REGEX "compressorKernels/.*")
 
 if(NOT ENABLE_MULTI_DEVICE)
   list(FILTER SRC_CU EXCLUDE REGEX "customAllReduceKernels*.*cu$")
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2025, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2019-2026, NVIDIA CORPORATION.  All rights reserved.
  * Copyright (c) 2021, NAVER Corp.  Authored by CLOVA.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,47 +27,63 @@ TRTLLM_NAMESPACE_BEGIN
 
 namespace kernels
 {
-/// Indexer TopK decode. Three tiers:
-///   - GVR Heuristic     (preIdx provided, K in {512,1024,2048}, numColumns in
-///                       [kSeqSmall, splitWorkThreshold), numRows below the
-///                       architecture-derived wave/L2 bound).
-///   - Single-block     (numColumns < split-work threshold)
-///   - Multi-pass radix (numColumns >= split-work threshold; requires
-///                       `scratch` sized via indexerTopKDecodeScratchBytes,
-///                       zero-init on first call and may be reused).
-///
-/// `is_prefill = true` forces single-block (split-work suppressed).
-void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, int const splitWorkThreshold,
-    int const numRows, int const numColumns, int const stride0, int const stride1, int const next_n,
-    int const topK = 2048, int const* preIdx = nullptr, int const preIdxStride = 0, int const preIdxCount = 0,
-    float* heuristicScratch = nullptr, cudaStream_t const stream = 0, void* scratch = nullptr, size_t scratchBytes = 0,
-    bool is_prefill = false);
+/// Number of blocks-per-row used by the multi-block split + merge dispatch path of
+/// invokeIndexerTopKDecode. Returns 1 when the single-block path is preferred.
+/// Callers that allocate aux buffers must use this same helper to size them, and
+/// must pass the same splitWorkThreshold they will pass to invokeIndexerTopKDecode
+/// (a value <= 0 selects the internal default).
+int computeIndexerTopKDecodeBlocksPerRow(int numRows, int numColumns, int splitWorkThreshold = 0);
 
-/// Size of the multi-pass radix `scratch` buffer for these shapes.
-size_t indexerTopKDecodeScratchBytes(int numRows, int numColumns, int topK);
+/// fp32 indexer TopK decode — L2-aware BS-threshold dispatcher with four
+/// fallback tiers:
+///   - GVR Heuristic    (preIdx provided, kSeqSmall ≤ N < splitWork, BS < kBsLarge, K ∈ {512,1024,2048})
+///   - Insertion sort   (N < kSortingAlgorithmThreshold)
+///   - Radix sort       (kSortingAlgorithmThreshold ≤ N < splitWork)
+///   - Radix split-work (N ≥ splitWork — uses outLogitsAux / outIndicesAux)
+void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, float* outLogitsAux,
+    int* outIndicesAux, int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0,
+    int const stride1, int const next_n, int const topK = 2048, int const* preIdx = nullptr, int const preIdxStride = 0,
+    int const preIdxCount = 0, float* heuristicScratch = nullptr, int const compressRatio = 1,
+    cudaStream_t const stream = 0);
 
-/// bf16 overload; same contract.
+/// bf16 indexer TopK decode — same dispatch axes as the fp32 entry, except
+/// kBsL2 uses sizeof(__nv_bfloat16) bytes/elem (L2 footprint is half) and
+/// the split-work tier is unsupported (the bf16/fp16 entry does not expose
+/// the float aux buffers required for split-work). Insertion + radix tiers
+/// share topKPerRowDecode with fp32 — histogram and sort run on float keys
+/// after a static_cast<float>(InputT) at HBM-read sites.
+///
+/// Aborts with TLLM_CHECK if numColumns ≥ splitWorkThreshold; callers in
+/// that regime must use the fp32 entry.
 void invokeIndexerTopKDecode(__nv_bfloat16 const* logits, int const* seqLens, int* indices,
     int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0, int const stride1,
     int const next_n, int const topK = 2048, int const* preIdx = nullptr, int const preIdxStride = 0,
-    int const preIdxCount = 0, __nv_bfloat16* heuristicScratch = nullptr, cudaStream_t const stream = 0,
-    void* scratch = nullptr, size_t scratchBytes = 0, bool is_prefill = false);
+    int const preIdxCount = 0, __nv_bfloat16* heuristicScratch = nullptr, int const compressRatio = 1,
+    cudaStream_t const stream = 0);
 
-/// fp16 overload; same contract.
+/// fp16 indexer TopK decode — see bf16 overload for dispatcher contract.
 void invokeIndexerTopKDecode(__half const* logits, int const* seqLens, int* indices, int const splitWorkThreshold,
     int const numRows, int const numColumns, int const stride0, int const stride1, int const next_n,
     int const topK = 2048, int const* preIdx = nullptr, int const preIdxStride = 0, int const preIdxCount = 0,
-    __half* heuristicScratch = nullptr, cudaStream_t const stream = 0, void* scratch = nullptr, size_t scratchBytes = 0,
-    bool is_prefill = false);
+    __half* heuristicScratch = nullptr, int const compressRatio = 1, cudaStream_t const stream = 0);
 
 void invokeIndexerTopKPrefill(float const* logits, int const* rowStarts, int const* rowEnds, int* indices,
     int const numRows, int const numColumns, int const stride0, int const stride1, int const topK = 2048,
     cudaStream_t const stream = 0);
 
-/// True iff invokeIndexerTopKDecode would pick the GVR tier for this shape:
-/// K in {512,1024,2048}, numColumns in [kSeqSmall, splitWorkThreshold), and
-/// numRows below the architecture-derived wave/L2 bound. Lets callers
-/// provision preIdx / heuristicScratch only when needed.
+/// Returns true iff invokeIndexerTopKDecode would route to the GVR Heuristic
+/// kernel for this (numRows, numColumns, topK, bytesPerElem) shape, assuming a
+/// valid preIdx is provided and stride1 == 1. Useful for callers that need to provision
+/// a preIdx tensor or heuristicScratch buffer only when GVR will be selected.
+///
+/// Mirrors the gating logic of the dispatcher: K ∈ {512, 1024, 2048},
+/// numColumns ∈ [kSeqSmall, splitWorkThreshold), numRows < kBsLarge, where
+/// kBsLarge = min(kBsWave, kBsL2) and kBsL2 scales with bytesPerElem.
+///
+/// @param numRows         logits rows (batch · next_n)
+/// @param numColumns      logits columns (max sequence length)
+/// @param topK            requested output size
+/// @param bytesPerElem    element size of logits (4 for fp32, 2 for bf16/fp16)
 bool canIndexerTopKDecodeUseGvr(int numRows, int numColumns, int topK, int bytesPerElem = 4);
 
 } // namespace kernels
Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,7 @@`
`32`	`32`	`{`
`33`	`33`	`"name": "deepgemm",`
`34`	`34`	`"git_repository": "https://github.com/deepseek-ai/DeepGEMM",`
`35`		`- "git_tag": "c491439ed5966833d56883ca302b6f72e74f8105",`
	`35`	`+ "git_tag": "245dc5d6a5fe344c61505fe71011d203141d4479",`
`36`	`36`	`"git_submodules_recurse": true,`
`37`	`37`	`"source_subdir": "dont-add-this-project-with-add-subdirectory"`
`38`	`38`	`},`
Original file line number	Diff line number	Diff line change
`@@ -1216,8 +1216,21 @@ void TrtGptModelInflightBatching::forwardAsync(RequestList const& activeRequests`
`1216`	`1216`	`{`
`1217`	`1217`	`for (auto const& llmReq : activeRequests)`
`1218`	`1218`	`{`
	`1219`	`+ // Remove from mInflightReqIds so changeBeamWidth can proceed on the next iteration.`
	`1220`	`+ // terminateRequest frees seqSlot/KV cache but does not clean up mInflightReqIds.`
	`1221`	`+ mInflightReqIds.erase(llmReq->mRequestId);`
`1219`	`1222`	`terminateRequest(llmReq);`
`1220`	`1223`	`}`
	`1224`	`+ // Force buffer/decoder reset to clean up any partial state from the aborted batch`
	`1225`	`+ // (e.g. partially-filled cross-KV block offsets from mid-context-chunk processing).`
	`1226`	`+ // Guard on mInflightReqIds.empty(): in pipeline-parallel multi-micro-batch mode,`
	`1227`	`+ // other micro-batches may still have requests tracked here; changeBeamWidth asserts`
	`1228`	`+ // emptiness so we skip the reset and let the next successful forwardAsync iteration`
	`1229`	`+ // perform it when the set is clear.`
	`1230`	`+ if (mWorldConfig.isLastPipelineParallelRank() && mInflightReqIds.empty())`
	`1231`	`+ {`
	`1232`	`+ changeBeamWidth(mOperatingBeamWidth);`
	`1233`	`+ }`
`1221`	`1234`	`}`
`1222`	`1235`	`catch (std::exception const& e)`
`1223`	`1236`	`{`