Skip to content

Commit e72c27b

Browse files
authored
Merge branch 'main' into user/fanrongl/dsv4-compressor-mhc
2 parents 8b66876 + 2ef2ea5 commit e72c27b

41 files changed

Lines changed: 1744 additions & 211 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ TensorRT LLM
1010
[![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
1111
[![cuda](https://img.shields.io/badge/cuda-13.1.1-green)](https://developer.nvidia.com/cuda-downloads)
1212
[![torch](https://img.shields.io/badge/torch-2.10.0-green)](https://pytorch.org)
13-
[![version](https://img.shields.io/badge/release-1.3.0rc18-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
13+
[![version](https://img.shields.io/badge/release-1.3.0rc19-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
1414
[![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/LICENSE)
1515

1616
[Architecture](https://nvidia.github.io/TensorRT-LLM/developer-guide/overview.html)   |   [Performance](https://nvidia.github.io/TensorRT-LLM/developer-guide/perf-overview.html)   |   [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)   |   [Documentation](https://nvidia.github.io/TensorRT-LLM/)   |   [Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_lora_problem_builder.cu

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,16 @@ __global__ void moeLoraProblemBuilderKernel(int32_t const* __restrict__ ranks, i
6868
// Problem sizes: each permuted token gets its own (M=1) GEMM. This matches
6969
// worst-case scheduling with no run-length aggregation; a future
7070
// optimization can aggregate consecutive identical-adapter tokens.
71+
//
72+
// Rank-0 rows carry no active adapter (base/no-LoRA request, padding, or
73+
// warmup) and have null A/B pointers, so their delta is zero and the caller
74+
// pre-zeroes the output. The in-GEMM already collapses to N=0 (rank is its
75+
// N) and is skipped, but the out-GEMM's N is out_hidden_size; forcing it to
76+
// zero here lets the grouped GEMM skip these rows too instead of launching
77+
// tiles that dereference the null B pointer.
78+
int const out_n = (rank > 0) ? static_cast<int>(out_hidden_size) : 0;
7179
problem_sizes_in[i] = cutlass::gemm::GemmCoord(1, rank, static_cast<int>(in_hidden_size));
72-
problem_sizes_out[i] = cutlass::gemm::GemmCoord(1, static_cast<int>(out_hidden_size), rank);
80+
problem_sizes_out[i] = cutlass::gemm::GemmCoord(1, out_n, rank);
7381

7482
// Pointer rows. dtype_bytes scales the per-row stride so the same
7583
// builder serves bf16/fp16/fp32 adapters without templating.

cpp/tensorrt_llm/kernels/decodingKernels.cu

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2020-2026, NVIDIA CORPORATION. All rights reserved.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -728,7 +728,7 @@ namespace tensorrt_llm::runtime::kernels
728728
{
729729
// Must be similar to [cpp/tensorrt_llm/thop/gatherTreeOp.cpp] gatherTree
730730
void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decodingInput,
731-
SamplingConfig const& samplingConfig, runtime::CudaStream const& cudaStream)
731+
SamplingConfig const& samplingConfig, runtime::CudaStream const& cudaStream, runtime::SizeType32 batchSlot)
732732
{
733733
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
734734

@@ -781,15 +781,32 @@ void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decod
781781
lengthPenaltyPtr = manager.copyFrom(lengthPenaltyVec, ITensor::makeShape({batchSize}), runtime::MemoryType::kGPU);
782782

783783
tensorrt_llm::kernels::BeamHypotheses bh;
784-
bh.nMaxBatchSize = batchSize;
784+
// logProbsTiled has shape [MSL, maxNumSequences, BM] and is passed unsliced.
785+
// nMaxBatchSize must equal the allocation stride (dim-1), not the per-slot batchSize=1.
786+
// The pointer is pre-offset by batchSlot*BM so that insertUnfinishedPathKernel,
787+
// which uses bid=0 / nBatchSize=1, computes:
788+
// (base + batchSlot*BM)[step * maxBS * BM + 0*BM + beamIdx]
789+
// = base[step * maxBS * BM + batchSlot * BM + beamIdx]
790+
// = logProbsTiled[step][batchSlot][beamIdx] ✓
791+
auto const logProbsTiledMaxBatchSize = static_cast<SizeType32>(decodingOutput.logProbsTiled->getShape().d[1]);
792+
auto const logProbsTiledBeamWidth = static_cast<SizeType32>(decodingOutput.logProbsTiled->getShape().d[2]);
793+
TLLM_CHECK_WITH_INFO(batchSlot < logProbsTiledMaxBatchSize,
794+
"batchSlot (%d) must be < logProbsTiled maxBatchSize (%d); "
795+
"logProbsTiled would be accessed out of bounds.",
796+
batchSlot, logProbsTiledMaxBatchSize);
797+
TLLM_CHECK_WITH_INFO(beamWidth == logProbsTiledBeamWidth,
798+
"beamWidth (%d) must equal logProbsTiled BM dimension (%d); "
799+
"pointer offset batchSlot*beamWidth would be misaligned.",
800+
beamWidth, logProbsTiledBeamWidth);
801+
bh.nMaxBatchSize = logProbsTiledMaxBatchSize;
785802
bh.nBatchSize = batchSize;
786803
bh.nBeamWidth = beamWidth;
787804
bh.nMaxSeqLen = maxSeqLength;
788805
bh.lengthPenalties = bufferCast<float>(*lengthPenaltyPtr);
789806
bh.inputLengths = bufferCast<SizeType32>(*decodingInput.lengths);
790807
bh.outputIds = bufferCast<TokenIdType>(finalOutputIds);
791808
bh.logProbs = bufferCastOrNull<float>(decodingOutput.logProbs);
792-
bh.logProbsTiled = bufferCast<float>(*decodingOutput.logProbsTiled);
809+
bh.logProbsTiled = bufferCast<float>(*decodingOutput.logProbsTiled) + batchSlot * beamWidth;
793810
bh.sequenceLengths = bufferCast<SizeType32>(*decodingOutput.lengths);
794811
bh.cumLogProbs = bufferCast<float>(*decodingOutput.cumLogProbs);
795812
bh.outputIdsCBA = bufferCast<TokenIdType>(*decodingOutput.beamHypotheses.outputIdsCBA);

cpp/tensorrt_llm/kernels/decodingKernels.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2019-2026, NVIDIA CORPORATION. All rights reserved.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -133,5 +133,5 @@ namespace tensorrt_llm::runtime::kernels
133133
//! \param cudaStream the CUDA stream on which to perform the operation.
134134

135135
void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decodingInput,
136-
SamplingConfig const& samplingConfig, runtime::CudaStream const& cudaStream);
136+
SamplingConfig const& samplingConfig, runtime::CudaStream const& cudaStream, runtime::SizeType32 batchSlot = 0);
137137
} // namespace tensorrt_llm::runtime::kernels

cpp/tensorrt_llm/layers/beamSearchLayer.cu

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2019-2026, NVIDIA CORPORATION. All rights reserved.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -335,7 +335,10 @@ void BeamSearchLayer<T>::forwardAsync(std::shared_ptr<BaseDecodingOutputs> const
335335
BeamHypotheses bh;
336336
// bh's members not used in this function: outputIds, logProbs, outputIdsUnfinish, parentIdsUnfinish
337337
bh.bVBWS = this->mVBWS;
338-
bh.nMaxBatchSize = static_cast<std::int32_t>(op->outputIdsPtr->getDimension<0>());
338+
// outputIds retains its full maxBatchSize allocation; outputIdsPtr is sliced to the active
339+
// batch size in DynamicDecodeLayer::prepareIdsPtrs (ITensor::slice(mOutputIdsPtrDevice, 0, batchSize))
340+
// and must not be used as a stride for the [MSL, maxBatchSize, BM]-shaped logProbsTiled buffer.
341+
bh.nMaxBatchSize = static_cast<std::int32_t>(op->outputIds->getDimension<0>());
339342
bh.nBatchSize = ip->localBatchSize;
340343
bh.nBeamWidth = op->outputIds->getDimension<1>();
341344
bh.nMaxSeqLen = op->outputIds->getDimension<2>();
@@ -397,7 +400,7 @@ void BeamSearchLayer<T>::forwardAsync(std::shared_ptr<BaseDecodingOutputs> const
397400
T const* bias = static_cast<T const*>(nullptr);
398401
TLLM_CHECK_WITH_INFO(getWorkspaceSize() >= 2 * bh.nBatchSize * bh.nBeamWidth * bh.nBeamWidth * 2,
399402
"Workspace size (%lu) is not enough for topk softmax required (%lu).", (uint64_t) getWorkspaceSize(),
400-
(uint64_t) (2 * bh.nMaxBatchSize * bh.nBeamWidth * bh.nBeamWidth * 2));
403+
(uint64_t) (2 * bh.nBatchSize * bh.nBeamWidth * bh.nBeamWidth * 2));
401404

402405
if (this->mV2 || this->mVBWS)
403406
{

cpp/tensorrt_llm/nanobind/thop/bindings.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,8 @@ void initBindings(nb::module_& m)
174174
nb::arg("sage_attn_num_elts_per_blk_k") = 0, nb::arg("sage_attn_num_elts_per_blk_v") = 0,
175175
nb::arg("sage_attn_qk_int8") = false, nb::arg("num_contexts") = 0, nb::arg("num_ctx_tokens") = 0,
176176
nb::arg("trtllm_gen_jit_warmup") = false, nb::arg("compressed_kv_cache_pool_ptr") = std::nullopt,
177+
nb::arg("is_cross") = false, nb::arg("cross_kv") = std::nullopt,
178+
nb::arg("relative_attention_bias") = std::nullopt, nb::arg("relative_attention_max_distance") = 0,
177179
nb::arg("spec_decoding_target_max_draft_tokens") = std::nullopt, "Multi-head attention operation",
178180
nb::call_guard<nb::gil_scoped_release>());
179181

cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -247,7 +247,7 @@ CudaEvent GptDecoderBatched::finalize(decoder::DecoderState const& decoderState,
247247

248248
auto [dInput, dOutput] = prepareGatherTree(decoderState, batchSlot, streaming, *mRuntimeStream);
249249

250-
kernels::gatherTree(dOutput, dInput, samplingConfig, *mRuntimeStream);
250+
kernels::gatherTree(dOutput, dInput, samplingConfig, *mRuntimeStream, batchSlot);
251251

252252
CudaEvent event{};
253253
mRuntimeStream->record(event);

0 commit comments

Comments
 (0)