NVIDIA
diff --git a/‎cpp/tensorrt_llm/kernels/IndexerTopK.h‎
Lines changed: 28 additions & 35 deletions b/‎cpp/tensorrt_llm/kernels/IndexerTopK.h‎
Lines changed: 28 additions & 35 deletions
@@ -27,54 +27,47 @@ TRTLLM_NAMESPACE_BEGIN
 
 namespace kernels
 {
-/// fp32 indexer TopK decode — L2-aware BS-threshold dispatcher with four
-/// fallback tiers:
-///   - GVR Heuristic    (preIdx provided, kSeqSmall ≤ N < splitWork, BS < kBsLarge, K ∈ {512,1024,2048})
-///   - Insertion sort   (N < kSortingAlgorithmThreshold)
-///   - Radix sort       (kSortingAlgorithmThreshold ≤ N < splitWork)
-///   - Radix split-work (N ≥ splitWork — uses outLogitsAux / outIndicesAux)
-void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, float* outLogitsAux,
-    int* outIndicesAux, int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0,
-    int const stride1, int const next_n, int const topK = 2048, int const* preIdx = nullptr, int const preIdxStride = 0,
-    int const preIdxCount = 0, float* heuristicScratch = nullptr, cudaStream_t const stream = 0);
-
-/// bf16 indexer TopK decode — same dispatch axes as the fp32 entry, except
-/// kBsL2 uses sizeof(__nv_bfloat16) bytes/elem (L2 footprint is half) and
-/// the split-work tier is unsupported (the bf16/fp16 entry does not expose
-/// the float aux buffers required for split-work). Insertion + radix tiers
-/// share topKPerRowDecode with fp32 — histogram and sort run on float keys
-/// after a static_cast<float>(InputT) at HBM-read sites.
+/// Indexer TopK decode. Three tiers:
+///   - GVR Heuristic     (preIdx provided, K in {512,1024,2048}, numColumns in
+///                       [kSeqSmall, splitWorkThreshold), numRows below the
+///                       architecture-derived wave/L2 bound).
+///   - Single-block     (numColumns < split-work threshold)
+///   - Multi-pass radix (numColumns >= split-work threshold; requires
+///                       `scratch` sized via indexerTopKDecodeScratchBytes,
+///                       zero-init on first call and may be reused).
 ///
-/// Aborts with TLLM_CHECK if numColumns ≥ splitWorkThreshold; callers in
-/// that regime must use the fp32 entry.
+/// `is_prefill = true` forces single-block (split-work suppressed).
+void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, int const splitWorkThreshold,
+    int const numRows, int const numColumns, int const stride0, int const stride1, int const next_n,
+    int const topK = 2048, int const* preIdx = nullptr, int const preIdxStride = 0, int const preIdxCount = 0,
+    float* heuristicScratch = nullptr, cudaStream_t const stream = 0, void* scratch = nullptr, size_t scratchBytes = 0,
+    bool is_prefill = false);
+
+/// Size of the multi-pass radix `scratch` buffer for these shapes.
+size_t indexerTopKDecodeScratchBytes(int numRows, int numColumns, int topK);
+
+/// bf16 overload; same contract.
 void invokeIndexerTopKDecode(__nv_bfloat16 const* logits, int const* seqLens, int* indices,
     int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0, int const stride1,
     int const next_n, int const topK = 2048, int const* preIdx = nullptr, int const preIdxStride = 0,
-    int const preIdxCount = 0, __nv_bfloat16* heuristicScratch = nullptr, cudaStream_t const stream = 0);
+    int const preIdxCount = 0, __nv_bfloat16* heuristicScratch = nullptr, cudaStream_t const stream = 0,
+    void* scratch = nullptr, size_t scratchBytes = 0, bool is_prefill = false);
 
-/// fp16 indexer TopK decode — see bf16 overload for dispatcher contract.
+/// fp16 overload; same contract.
 void invokeIndexerTopKDecode(__half const* logits, int const* seqLens, int* indices, int const splitWorkThreshold,
     int const numRows, int const numColumns, int const stride0, int const stride1, int const next_n,
     int const topK = 2048, int const* preIdx = nullptr, int const preIdxStride = 0, int const preIdxCount = 0,
-    __half* heuristicScratch = nullptr, cudaStream_t const stream = 0);
+    __half* heuristicScratch = nullptr, cudaStream_t const stream = 0, void* scratch = nullptr, size_t scratchBytes = 0,
+    bool is_prefill = false);
 
 void invokeIndexerTopKPrefill(float const* logits, int const* rowStarts, int const* rowEnds, int* indices,
     int const numRows, int const numColumns, int const stride0, int const stride1, int const topK = 2048,
     cudaStream_t const stream = 0);
 
-/// Returns true iff invokeIndexerTopKDecode would route to the GVR Heuristic
-/// kernel for this (numRows, numColumns, topK) triple, assuming valid preIdx
-/// is provided and stride1 == 1. Useful for callers that need to provision a
-/// preIdx tensor or heuristicScratch buffer only when GVR will be selected.
-///
-/// Mirrors the gating logic of the dispatcher: K ∈ {512, 1024, 2048},
-/// numColumns ∈ [kSeqSmall, splitWorkThreshold), numRows < kBsLarge, where
-/// kBsLarge = min(kBsWave, kBsL2) and kBsL2 scales with bytesPerElem.
-///
-/// @param numRows         logits rows (batch · next_n)
-/// @param numColumns      logits columns (max sequence length)
-/// @param topK            requested output size
-/// @param bytesPerElem    element size of logits (4 for fp32, 2 for bf16/fp16)
+/// True iff invokeIndexerTopKDecode would pick the GVR tier for this shape:
+/// K in {512,1024,2048}, numColumns in [kSeqSmall, splitWorkThreshold), and
+/// numRows below the architecture-derived wave/L2 bound. Lets callers
+/// provision preIdx / heuristicScratch only when needed.
 bool canIndexerTopKDecodeUseGvr(int numRows, int numColumns, int topK, int bytesPerElem = 4);
 
 } // namespace kernels