NVIDIA
diff --git a/README.md b/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_lora_problem_builder.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_lora_problem_builder.cu
Lines changed: 9 additions & 1 deletion
diff --git a/cpp/tensorrt_llm/kernels/decodingKernels.cu b/cpp/tensorrt_llm/kernels/decodingKernels.cu
Lines changed: 21 additions & 4 deletions
diff --git a/cpp/tensorrt_llm/kernels/decodingKernels.h b/cpp/tensorrt_llm/kernels/decodingKernels.h
Lines changed: 2 additions & 2 deletions
diff --git a/cpp/tensorrt_llm/layers/beamSearchLayer.cu b/cpp/tensorrt_llm/layers/beamSearchLayer.cu
Lines changed: 6 additions & 3 deletions
diff --git a/cpp/tensorrt_llm/nanobind/thop/bindings.cpp b/cpp/tensorrt_llm/nanobind/thop/bindings.cpp
Lines changed: 2 additions & 0 deletions
diff --git a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp
Lines changed: 2 additions & 2 deletions
@@ -10,7 +10,7 @@ TensorRT LLM
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-13.1.1-green)](https://developer.nvidia.com/cuda-downloads)
 [![torch](https://img.shields.io/badge/torch-2.10.0-green)](https://pytorch.org)
-[![version](https://img.shields.io/badge/release-1.3.0rc18-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-1.3.0rc19-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/LICENSE)
 
 [Architecture](https://nvidia.github.io/TensorRT-LLM/developer-guide/overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](https://nvidia.github.io/TensorRT-LLM/developer-guide/perf-overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](https://nvidia.github.io/TensorRT-LLM/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
 
@@ -68,8 +68,16 @@ __global__ void moeLoraProblemBuilderKernel(int32_t const* __restrict__ ranks, i
     // Problem sizes: each permuted token gets its own (M=1) GEMM. This matches
     // worst-case scheduling with no run-length aggregation; a future
     // optimization can aggregate consecutive identical-adapter tokens.
+    //
+    // Rank-0 rows carry no active adapter (base/no-LoRA request, padding, or
+    // warmup) and have null A/B pointers, so their delta is zero and the caller
+    // pre-zeroes the output. The in-GEMM already collapses to N=0 (rank is its
+    // N) and is skipped, but the out-GEMM's N is out_hidden_size; forcing it to
+    // zero here lets the grouped GEMM skip these rows too instead of launching
+    // tiles that dereference the null B pointer.
+    int const out_n = (rank > 0) ? static_cast<int>(out_hidden_size) : 0;
     problem_sizes_in[i] = cutlass::gemm::GemmCoord(1, rank, static_cast<int>(in_hidden_size));
-    problem_sizes_out[i] = cutlass::gemm::GemmCoord(1, static_cast<int>(out_hidden_size), rank);
+    problem_sizes_out[i] = cutlass::gemm::GemmCoord(1, out_n, rank);
 
     // Pointer rows. dtype_bytes scales the per-row stride so the same
     // builder serves bf16/fp16/fp32 adapters without templating.
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -728,7 +728,7 @@ namespace tensorrt_llm::runtime::kernels
 {
 // Must be similar to [cpp/tensorrt_llm/thop/gatherTreeOp.cpp] gatherTree
 void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decodingInput,
-    SamplingConfig const& samplingConfig, runtime::CudaStream const& cudaStream)
+    SamplingConfig const& samplingConfig, runtime::CudaStream const& cudaStream, runtime::SizeType32 batchSlot)
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
 
@@ -781,15 +781,32 @@ void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decod
     lengthPenaltyPtr = manager.copyFrom(lengthPenaltyVec, ITensor::makeShape({batchSize}), runtime::MemoryType::kGPU);
 
     tensorrt_llm::kernels::BeamHypotheses bh;
-    bh.nMaxBatchSize = batchSize;
+    // logProbsTiled has shape [MSL, maxNumSequences, BM] and is passed unsliced.
+    // nMaxBatchSize must equal the allocation stride (shape.d[1], i.e. maxNumSequences), not the per-slot batchSize=1.
+    // The pointer is pre-offset by batchSlot*BM so that insertUnfinishedPathKernel,
+    // which uses bid=0 / nBatchSize=1, computes:
+    //   (base + batchSlot*BM)[step * maxBS * BM + 0*BM + beamIdx]
+    //   = base[step * maxBS * BM + batchSlot * BM + beamIdx]
+    //   = logProbsTiled[step][batchSlot][beamIdx]  ✓
+    auto const logProbsTiledMaxBatchSize = static_cast<SizeType32>(decodingOutput.logProbsTiled->getShape().d[1]);
+    auto const logProbsTiledBeamWidth = static_cast<SizeType32>(decodingOutput.logProbsTiled->getShape().d[2]);
+    TLLM_CHECK_WITH_INFO(batchSlot < logProbsTiledMaxBatchSize,
+        "batchSlot (%d) must be < logProbsTiled maxBatchSize (%d); "
+        "logProbsTiled would be accessed out of bounds.",
+        batchSlot, logProbsTiledMaxBatchSize);
+    TLLM_CHECK_WITH_INFO(beamWidth == logProbsTiledBeamWidth,
+        "beamWidth (%d) must equal logProbsTiled BM dimension (%d); "
+        "pointer offset batchSlot*beamWidth would be misaligned.",
+        beamWidth, logProbsTiledBeamWidth);
+    bh.nMaxBatchSize = logProbsTiledMaxBatchSize;
     bh.nBatchSize = batchSize;
     bh.nBeamWidth = beamWidth;
     bh.nMaxSeqLen = maxSeqLength;
     bh.lengthPenalties = bufferCast<float>(*lengthPenaltyPtr);
     bh.inputLengths = bufferCast<SizeType32>(*decodingInput.lengths);
     bh.outputIds = bufferCast<TokenIdType>(finalOutputIds);
     bh.logProbs = bufferCastOrNull<float>(decodingOutput.logProbs);
-    bh.logProbsTiled = bufferCast<float>(*decodingOutput.logProbsTiled);
+    bh.logProbsTiled = bufferCast<float>(*decodingOutput.logProbsTiled) + batchSlot * beamWidth;
     bh.sequenceLengths = bufferCast<SizeType32>(*decodingOutput.lengths);
     bh.cumLogProbs = bufferCast<float>(*decodingOutput.cumLogProbs);
     bh.outputIdsCBA = bufferCast<TokenIdType>(*decodingOutput.beamHypotheses.outputIdsCBA);
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2019-2026, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -133,5 +133,5 @@ namespace tensorrt_llm::runtime::kernels
 //! \param cudaStream the CUDA stream on which to perform the operation.
 
 void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decodingInput,
-    SamplingConfig const& samplingConfig, runtime::CudaStream const& cudaStream);
+    SamplingConfig const& samplingConfig, runtime::CudaStream const& cudaStream, runtime::SizeType32 batchSlot = 0);
 } // namespace tensorrt_llm::runtime::kernels
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2019-2026, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -335,7 +335,10 @@ void BeamSearchLayer<T>::forwardAsync(std::shared_ptr<BaseDecodingOutputs> const
     BeamHypotheses bh;
     // bh's members not used in this function: outputIds, logProbs, outputIdsUnfinish, parentIdsUnfinish
     bh.bVBWS = this->mVBWS;
-    bh.nMaxBatchSize = static_cast<std::int32_t>(op->outputIdsPtr->getDimension<0>());
+    // outputIds retains its full maxBatchSize allocation; outputIdsPtr is sliced to the active
+    // batch size in DynamicDecodeLayer::prepareIdsPtrs (ITensor::slice(mOutputIdsPtrDevice, 0, batchSize))
+    // and must not be used as a stride for the [MSL, maxBatchSize, BM]-shaped logProbsTiled buffer.
+    bh.nMaxBatchSize = static_cast<std::int32_t>(op->outputIds->getDimension<0>());
     bh.nBatchSize = ip->localBatchSize;
     bh.nBeamWidth = op->outputIds->getDimension<1>();
     bh.nMaxSeqLen = op->outputIds->getDimension<2>();
@@ -397,7 +400,7 @@ void BeamSearchLayer<T>::forwardAsync(std::shared_ptr<BaseDecodingOutputs> const
     T const* bias = static_cast<T const*>(nullptr);
     TLLM_CHECK_WITH_INFO(getWorkspaceSize() >= 2 * bh.nBatchSize * bh.nBeamWidth * bh.nBeamWidth * 2,
         "Workspace size (%lu) is not enough for topk softmax required (%lu).", (uint64_t) getWorkspaceSize(),
-        (uint64_t) (2 * bh.nMaxBatchSize * bh.nBeamWidth * bh.nBeamWidth * 2));
+        (uint64_t) (2 * bh.nBatchSize * bh.nBeamWidth * bh.nBeamWidth * 2));
 
     if (this->mV2 || this->mVBWS)
     {
 
@@ -174,6 +174,8 @@ void initBindings(nb::module_& m)
         nb::arg("sage_attn_num_elts_per_blk_k") = 0, nb::arg("sage_attn_num_elts_per_blk_v") = 0,
         nb::arg("sage_attn_qk_int8") = false, nb::arg("num_contexts") = 0, nb::arg("num_ctx_tokens") = 0,
         nb::arg("trtllm_gen_jit_warmup") = false, nb::arg("compressed_kv_cache_pool_ptr") = std::nullopt,
+        nb::arg("is_cross") = false, nb::arg("cross_kv") = std::nullopt,
+        nb::arg("relative_attention_bias") = std::nullopt, nb::arg("relative_attention_max_distance") = 0,
         nb::arg("spec_decoding_target_max_draft_tokens") = std::nullopt, "Multi-head attention operation",
         nb::call_guard<nb::gil_scoped_release>());
 
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -247,7 +247,7 @@ CudaEvent GptDecoderBatched::finalize(decoder::DecoderState const& decoderState,
 
     auto [dInput, dOutput] = prepareGatherTree(decoderState, batchSlot, streaming, *mRuntimeStream);
 
-    kernels::gatherTree(dOutput, dInput, samplingConfig, *mRuntimeStream);
+    kernels::gatherTree(dOutput, dInput, samplingConfig, *mRuntimeStream, batchSlot);
 
     CudaEvent event{};
     mRuntimeStream->record(event);