[None][fix] Fix encoder-decoder beam search corruption via per-slot fragmentPointerDevice (#15461)

achartier · web-flow · commit beb922f98acd · 2026-06-23T08:33:15.000-07:00
Signed-off-by: Aurelien Chartier <2567591+achartier@users.noreply.github.com>
diff --git a/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h b/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -208,7 +208,9 @@ class RuntimeBuffers
         //! Temporarily store the transposed results of multiple fragment logits, [maxBeamWidth, kCACHE_LENGTH]
         TensorPtr transposedLogits;
 
-        //! Temporarily store logits buffer address during the transposing, [kCACHE_LENGTH]
+        //! Temporarily store logits buffer address during the transposing, [maxBatchSize, kCACHE_LENGTH]
+        //! One row per batch slot (same layout as fragmentPointerHost) so concurrent flushes for
+        //! different requests in the same batch never clobber each other's pointer arrays.
         TensorPtr fragmentPointerDevice;
 
         //! Temporarily store logits buffer address during the transposing, [maxBatchSize, kCACHE_LENGTH]
@@ -222,11 +224,14 @@ class RuntimeBuffers
             workIdx = (workIdx + 1) % (fragmentPointerHost->getShape().d[0]);
         }
 
-        [[nodiscard]] TensorPtr getFragmentPointerHost()
+        //! Returns the host and device pointer rows for the same (current) workIdx, then advances
+        //! workIdx.  Use this instead of slicing the two tensors separately so the rows always match.
+        [[nodiscard]] std::pair<TensorPtr, TensorPtr> getFragmentPointerSlot()
         {
-            TensorPtr slice = runtime::ITensor::slice(fragmentPointerHost, workIdx, 1);
+            TensorPtr host = runtime::ITensor::slice(fragmentPointerHost, workIdx, 1);
+            TensorPtr device = runtime::ITensor::slice(fragmentPointerDevice, workIdx, 1);
             cycleWorkIdx();
-            return slice;
+            return {std::move(host), std::move(device)};
         };
     };
 
diff --git a/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp b/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -152,8 +152,8 @@ void RuntimeBuffers::create(SizeType32 maxBatchSize, SizeType32 maxBeamWidth,
             ITensor::makeShape({GenerationLogitsCache::kCACHE_LENGTH, maxBatchSize * maxBeamWidth, vocabSizePadded}),
             logitsType);
 
-        generationLogitsCache.fragmentPointerDevice
-            = manager.gpu(ITensor::makeShape({GenerationLogitsCache::kCACHE_LENGTH}), nvinfer1::DataType::kINT64);
+        generationLogitsCache.fragmentPointerDevice = manager.gpu(
+            ITensor::makeShape({maxBatchSize, GenerationLogitsCache::kCACHE_LENGTH}), nvinfer1::DataType::kINT64);
         generationLogitsCache.fragmentPointerHost = tensorrt_llm::runtime::BufferManager::pinnedPool(
             ITensor::makeShape({maxBatchSize, GenerationLogitsCache::kCACHE_LENGTH}), nvinfer1::DataType::kINT64);
     }
diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
@@ -1216,8 +1216,21 @@ void TrtGptModelInflightBatching::forwardAsync(RequestList const& activeRequests
         {
             for (auto const& llmReq : activeRequests)
             {
+                // Remove from mInflightReqIds so changeBeamWidth can proceed on the next iteration.
+                // terminateRequest frees seqSlot/KV cache but does not clean up mInflightReqIds.
+                mInflightReqIds.erase(llmReq->mRequestId);
                 terminateRequest(llmReq);
             }
+            // Force buffer/decoder reset to clean up any partial state from the aborted batch
+            // (e.g. partially-filled cross-KV block offsets from mid-context-chunk processing).
+            // Guard on mInflightReqIds.empty(): in pipeline-parallel multi-micro-batch mode,
+            // other micro-batches may still have requests tracked here; changeBeamWidth asserts
+            // emptiness so we skip the reset and let the next successful forwardAsync iteration
+            // perform it when the set is clear.
+            if (mWorldConfig.isLastPipelineParallelRank() && mInflightReqIds.empty())
+            {
+                changeBeamWidth(mOperatingBeamWidth);
+            }
         }
         catch (std::exception const& e)
         {
diff --git a/cpp/tensorrt_llm/batch_manager/utils/inflightBatchingUtils.cpp b/cpp/tensorrt_llm/batch_manager/utils/inflightBatchingUtils.cpp
@@ -103,10 +103,11 @@ void copyGenerationLogits(RuntimeBuffers::GenerationLogitsCache& generationLogit
 
     auto const fragmentSize = llmReq.getGenerationLogitsFragmentsSize();
 
-    // Merge logits fragments on device
+    // Merge logits fragments on device.  getFragmentPointerSlot() returns the matching host and
+    // device rows for the current workIdx and advances the index in the same call (no locking is
+    // involved), so flushes for different requests in the same batch use separate pointer rows.
     auto const& transposeBufferPtr = generationLogitsCache.transposedLogits;
-    auto const& cachePointerDevice = generationLogitsCache.fragmentPointerDevice;
-    auto const& cachePointerHost = generationLogitsCache.getFragmentPointerHost();
+    auto [cachePointerHost, cachePointerDevice] = generationLogitsCache.getFragmentPointerSlot();
     tensorrt_llm::runtime::kernels::mergeLogitsFragments(bufferManager, *transposeBufferPtr,
         llmReq.getGenerationLogitsFragments(), *cachePointerDevice, *cachePointerHost, 0, 1, reqBeamWidth,
         bufferManager.getStream(), 0);
diff --git a/cpp/tests/unit_tests/batch_manager/CMakeLists.txt b/cpp/tests/unit_tests/batch_manager/CMakeLists.txt
@@ -31,3 +31,4 @@ add_gtest(rnnCacheFormatterTest rnnCacheFormatterTest.cpp)
 add_gtest(cudaGraphExecutorCacheTest cudaGraphExecutorCacheTest.cpp)
 add_gtest(agentTreeTest agentTreeTest.cpp)
 add_gtest(truncateBlocksTest truncateBlocksTest.cpp)
+add_gtest(encDecBeamSearchTest encDecBeamSearchTest.cpp)
diff --git a/cpp/tests/unit_tests/batch_manager/encDecBeamSearchTest.cpp b/cpp/tests/unit_tests/batch_manager/encDecBeamSearchTest.cpp
@@ -0,0 +1,154 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/batch_manager/kvCacheManager.h"
+#include "tensorrt_llm/batch_manager/llmRequest.h"
+#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
+#include "tensorrt_llm/batch_manager/utils/inflightBatchingUtils.h"
+#include "tensorrt_llm/common/memoryUtils.h"
+#include "tensorrt_llm/kernels/kvCacheIndex.h"
+#include "tensorrt_llm/runtime/bufferManager.h"
+#include "tensorrt_llm/runtime/cudaStream.h"
+#include "tensorrt_llm/runtime/iTensor.h"
+#include "tensorrt_llm/runtime/samplingConfig.h"
+#include "gtest/gtest.h"
+#include <memory>
+
+using namespace tensorrt_llm::batch_manager;
+using namespace tensorrt_llm::batch_manager::kv_cache_manager;
+namespace tr = tensorrt_llm::runtime;
+namespace tc = tensorrt_llm::common;
+namespace tk = tensorrt_llm::kernels;
+using SizeType32 = tr::SizeType32;
+
+// Verify that copyGenerationLogits assembles the per-request host logits buffer
+// through the real kernel merge path, for two back-to-back calls that simulate
+// two requests flushing in the same batch. Slot isolation is checked indirectly:
+// each request's host buffer must contain only that request's sentinel values.
+TEST(CopyGenerationLogitsTest, KernelMergePathProducesCorrectHostLayoutAndSlotsAreIsolated)
+{
+    SizeType32 constexpr beamWidth = 2;
+    SizeType32 constexpr numSteps = RuntimeBuffers::GenerationLogitsCache::kCACHE_LENGTH; // one fragment per cache step (full flush)
+    SizeType32 constexpr vocabSize = 8;
+    SizeType32 constexpr promptLen = 1;
+    SizeType32 constexpr maxBatchSize = 4; // must be >= 2 so the two flushes can get different pointer rows
+
+    auto stream = std::make_shared<tr::CudaStream>();
+    tr::BufferManager bufferMgr{stream};
+
+    // Populate a GenerationLogitsCache by hand so that logits, transposedLogits,
+    // fragmentPointerDevice and fragmentPointerHost are all allocated.
+    // cache.logits uses pinned host memory so the test can fill it from the CPU;
+    // the merge kernel is assumed to read it directly from there (TODO confirm).
+    RuntimeBuffers::GenerationLogitsCache cache;
+    cache.logits = tr::BufferManager::pinnedPool(
+        tr::ITensor::makeShape({numSteps, maxBatchSize * beamWidth, vocabSize}), nvinfer1::DataType::kFLOAT);
+    cache.transposedLogits
+        = bufferMgr.gpu(tr::ITensor::makeShape({beamWidth, numSteps, vocabSize}), nvinfer1::DataType::kFLOAT);
+    cache.fragmentPointerDevice
+        = bufferMgr.gpu(tr::ITensor::makeShape({maxBatchSize, numSteps}), nvinfer1::DataType::kINT64);
+    cache.fragmentPointerHost
+        = tr::BufferManager::pinnedPool(tr::ITensor::makeShape({maxBatchSize, numSteps}), nvinfer1::DataType::kINT64);
+
+    // Helper: build one LlmRequest that has numSteps fragments pointing into
+    // cache.logits[0..numSteps-1][logitsIndex:logitsIndex+beamWidth].
+    // Every vocab entry of fragment (step, beam) holds the sentinel reqOffset + step*100 + beam.
+    auto makeRequest = [&](RequestIdType reqId, SizeType32 logitsIndex, float reqOffset) -> std::shared_ptr<LlmRequest>
+    {
+        auto tokens = std::make_shared<VecTokens>(promptLen, 0);
+        tr::SamplingConfig sc{beamWidth};
+        auto req = std::make_shared<LlmRequest>(reqId, numSteps, tokens, sc, false);
+
+        LlmRequest::BeamTokens gen(beamWidth, VecTokens(numSteps, 1));
+        req->setGeneratedTokens(gen);
+        req->allocGenerationLogitsHost(vocabSize, nvinfer1::DataType::kFLOAT);
+
+        // For every step, write the sentinel values into this request's rows of
+        // cache.logits and register a fragment view over exactly those rows.
+        for (SizeType32 step = 0; step < numSteps; ++step)
+        {
+            // cache.logits shape: [numSteps, maxBatchSize*beamWidth, vocabSize]
+            // Slice out one step -> [1, maxBS*bw, vocab], then squeeze to [maxBS*bw, vocab].
+            tr::ITensor::SharedPtr slot = tr::ITensor::slice(cache.logits, step, 1);
+            slot->squeeze(0); // [maxBS*bw, vocab]
+            auto* slotPtr = tr::bufferCast<float>(*slot);
+            for (SizeType32 beam = 0; beam < beamWidth; ++beam)
+            {
+                float const val = reqOffset + static_cast<float>(step * 100 + beam);
+                for (SizeType32 v = 0; v < vocabSize; ++v)
+                {
+                    slotPtr[(logitsIndex + beam) * vocabSize + v] = val;
+                }
+            }
+
+            // Fragment is meant to mirror HandleGenerationLogits (not verifiable from this file):
+            // slice rows [logitsIndex:logitsIndex+beamWidth] of the step slot, then unsqueeze(0).
+            tr::ITensor::SharedPtr fragView = tr::ITensor::slice(slot, logitsIndex, beamWidth);
+            fragView->unsqueeze(0); // [1, beamWidth, vocab]
+            req->addGenerationLogitsFragment(fragView);
+        }
+        return req;
+    };
+
+    // req0 (request id 1) owns rows [0, beamWidth) of every step slot; sentinel offset 0.
+    auto req0 = makeRequest(1, /*logitsIndex=*/0, /*reqOffset=*/0.0f);
+    // req1 (request id 2) owns rows [beamWidth, 2*beamWidth) of every step slot; sentinel offset 1000.
+    auto req1 = makeRequest(2, /*logitsIndex=*/beamWidth, /*reqOffset=*/1000.0f);
+
+    // Flush req0: expected to take pointer row workIdx=0 (assumes workIdx starts at 0; TODO confirm).
+    utils::copyGenerationLogits(cache, bufferMgr, *req0, /*beforeDecoder=*/false, {});
+    // Flush req1: expected to take the next pointer row, so it does not reuse the row used for req0.
+    utils::copyGenerationLogits(cache, bufferMgr, *req1, /*beforeDecoder=*/false, {});
+
+    ASSERT_EQ(cudaStreamSynchronize(stream->get()), cudaSuccess);
+
+    // Verify req0 host buffer, indexed as [beam, step, v]: every entry == step*100 + beam
+    auto const* host0 = tr::bufferCast<float>(*req0->getGenerationLogitsHost());
+    for (SizeType32 beam = 0; beam < beamWidth; ++beam)
+    {
+        for (SizeType32 step = 0; step < numSteps; ++step)
+        {
+            float const expected = static_cast<float>(step * 100 + beam);
+            for (SizeType32 v = 0; v < vocabSize; ++v)
+            {
+                SizeType32 const idx = (beam * numSteps + step) * vocabSize + v;
+                EXPECT_FLOAT_EQ(host0[idx], expected) << "req0 host[beam=" << beam << ",step=" << step << ",v=" << v
+                                                      << "]=" << host0[idx] << " expected " << expected;
+            }
+        }
+    }
+
+    // Verify req1 host buffer, indexed as [beam, step, v]: every entry == 1000 + step*100 + beam
+    auto const* host1 = tr::bufferCast<float>(*req1->getGenerationLogitsHost());
+    for (SizeType32 beam = 0; beam < beamWidth; ++beam)
+    {
+        for (SizeType32 step = 0; step < numSteps; ++step)
+        {
+            float const expected = 1000.0f + static_cast<float>(step * 100 + beam);
+            for (SizeType32 v = 0; v < vocabSize; ++v)
+            {
+                SizeType32 const idx = (beam * numSteps + step) * vocabSize + v;
+                EXPECT_FLOAT_EQ(host1[idx], expected) << "req1 host[beam=" << beam << ",step=" << step << ",v=" << v
+                                                      << "]=" << host1[idx] << " expected " << expected;
+            }
+        }
+    }
+
+    // After the flush, neither request may still hold pending logits fragments.
+    EXPECT_EQ(req0->getGenerationLogitsFragmentsSize(), 0);
+    EXPECT_EQ(req1->getGenerationLogitsFragmentsSize(), 0);
+}

Original file line number	Diff line number	Diff line change
`@@ -1216,8 +1216,21 @@ void TrtGptModelInflightBatching::forwardAsync(RequestList const& activeRequests`
`1216`	`1216`	`{`
`1217`	`1217`	`for (auto const& llmReq : activeRequests)`
`1218`	`1218`	`{`
	`1219`	`+ // Remove from mInflightReqIds so changeBeamWidth can proceed on the next iteration.`
	`1220`	`+ // terminateRequest frees seqSlot/KV cache but does not clean up mInflightReqIds.`
	`1221`	`+ mInflightReqIds.erase(llmReq->mRequestId);`
`1219`	`1222`	`terminateRequest(llmReq);`
`1220`	`1223`	`}`
	`1224`	`+ // Force buffer/decoder reset to clean up any partial state from the aborted batch`
	`1225`	`+ // (e.g. partially-filled cross-KV block offsets from mid-context-chunk processing).`
	`1226`	`+ // Guard on mInflightReqIds.empty(): in pipeline-parallel multi-micro-batch mode,`
	`1227`	`+ // other micro-batches may still have requests tracked here; changeBeamWidth asserts`
	`1228`	`+ // emptiness so we skip the reset and let the next successful forwardAsync iteration`
	`1229`	`+ // perform it when the set is clear.`
	`1230`	`+ if (mWorldConfig.isLastPipelineParallelRank() && mInflightReqIds.empty())`
	`1231`	`+ {`
	`1232`	`+ changeBeamWidth(mOperatingBeamWidth);`
	`1233`	`+ }`
`1221`	`1234`	`}`
`1222`	`1235`	`catch (std::exception const& e)`
`1223`	`1236`	`{`