[None][test] Strengthen enc-dec beam-search regression tests

achartier · achartier · commit 92ff8bd5dbee · 2026-06-17T11:36:42.000-07:00
Fix-1 test (CrossKvBeamSharingTest): previously the allocator happened to
share blocks across beams, making equality across beam slots a tautology that
passed both with and without the fix.  Now the test explicitly writes distinct
per-beam sentinel values into the source cacheBlockIndices tensor before calling
copyBlockOffsets.  The old code (srcBeamIdx = beamIdx) leaves beams 1..N-1 with
their own different values; the fixed code (srcBeamIdx = 0 when isCrossKv())
normalises all slots to beam-0.  Confirmed: test fails when the fix is reverted.

Fix-3 test (CopyGenerationLogitsTest): the previous test used a null dummy cache
and did not exercise the mergeLogitsFragmentsKernel path.  Replaced with a test
that allocates a real GenerationLogitsCache (transposedLogits, fragmentPointerDevice,
fragmentPointerHost), creates fragments as pinned-memory slices of cache.logits with
known per-(step, beam) values, and verifies both requests produce the correct host
layout via the actual kernel merge path.  Running two back-to-back flushes also
validates that each uses a separate fragmentPointerDevice slot (slot isolation).

Signed-off-by: Aurelien Chartier <2567591+achartier@users.noreply.github.com>
diff --git a/cpp/tests/unit_tests/batch_manager/encDecBeamSearchTest.cpp b/cpp/tests/unit_tests/batch_manager/encDecBeamSearchTest.cpp
@@ -57,21 +57,14 @@ using SizeType32 = tr::SizeType32;
 // Fix 1: KVCacheManager::copyBlockOffsets cross-KV beam sharing
 // ============================================================================
 
-// Verify that for a cross-KV cache with beam width > 1, copyBlockOffsets
-// places the same physical block IDs in every beam slot.
-//
-// This tests the isCrossKv() branch added by fix 1: when all beams share the
-// same encoder output, the output offset table must reflect that.  In the
-// production Whisper case the allocator gives each beam its own physical
-// blocks, so beams 1..N-1 would reference uninitialised GPU memory without
-// the fix.  This unit test uses a simple context-only sequence where the
-// allocator happens to share blocks across beams; the observable property
-// (all beam slots equal beam-0) still holds and constitutes a sanity check.
-TEST(CrossKvBeamSharingTest, CopyBlockOffsetsAllBeamsShareBeam0Blocks)
+// Verify that copyBlockOffsets normalises all beam slots to beam-0's value on the
+// cross-KV path even when per-beam source rows differ.  The test writes distinct
+// sentinel values into each beam row of the source tensor before calling
+// copyBlockOffsets so that the old bug (srcBeamIdx = beamIdx) would leave beams
+// 1..N-1 with different values, while the fix (srcBeamIdx = 0 for isCrossKv())
+// produces equal values across all beams.
+TEST(CrossKvBeamSharingTest, CopyBlockOffsetsNormalisesAllBeamsToBeam0)
 {
-    // Encoder-decoder setup: 1 layer, 1 KV head, sizePerHead=4,
-    // tokensPerBlock=8, encoder output length=16 → 2 blocks per sequence,
-    // beam width=3.
     auto stream = std::make_shared<tr::CudaStream>();
 
     SizeType32 constexpr numLayers = 1;
@@ -83,7 +76,6 @@ TEST(CrossKvBeamSharingTest, CopyBlockOffsetsAllBeamsShareBeam0Blocks)
     SizeType32 constexpr maxAttentionWindow = 16;
     SizeType32 constexpr encoderLen = 16; // 2 blocks
 
-    // Reserve enough blocks for 1 sequence × beamWidth × (encoderLen / tokensPerBlock).
     SizeType32 constexpr numBlocks = maxNumSequences * beamWidth * (encoderLen / tokensPerBlock);
     BlocksPerWindow const blocksPerWindow{{maxAttentionWindow, {numBlocks, 0}}};
 
@@ -93,30 +85,45 @@ TEST(CrossKvBeamSharingTest, CopyBlockOffsetsAllBeamsShareBeam0Blocks)
         /*enableBlockReuse=*/false, CacheType::kCROSS);
     crossKvMgr.allocatePools(false);
 
-    // Build a minimal LlmRequest and allocate a cross-KV sequence.
     RequestIdType constexpr requestId = 1;
     auto inputTokens = std::make_shared<VecTokens>(encoderLen, 0);
     tr::SamplingConfig const samplingConfig{beamWidth};
     auto llmReq = std::make_shared<LlmRequest>(requestId, /*maxNewTokens=*/0, inputTokens, samplingConfig, false);
     crossKvMgr.addSequenceBatch({{{requestId, encoderLen, beamWidth}}}, {std::ref(*llmReq)});
 
-    // Allocate CPU output tensor: [numPools, maxNumSeq*beamWidth, 2, maxBlocksPerSeq].
+    // Write distinct per-beam values into the source cacheBlockIndices tensor so
+    // that the old code (copying each beam's own row) would produce different
+    // outputs, while the fixed code (always copying beam-0's row) produces equal ones.
+    auto& seq = crossKvMgr.getSequence(requestId);
+    auto& srcTensor = seq.getCacheBlockIndices(maxAttentionWindow);
+    auto const& srcShape = srcTensor.getShape();
+    auto* const srcPtr = tr::bufferCast<tk::KVCacheIndex>(srcTensor);
+    for (SizeType32 beam = 0; beam < beamWidth; ++beam)
+    {
+        for (SizeType32 kv = 0; kv < 2; ++kv)
+        {
+            for (SizeType32 block = 0; block < srcShape.d[3]; ++block)
+            {
+                auto const idx = tc::flat_index(srcShape.d, /*pool=*/0, beam, kv, block);
+                // Beam b gets value (b*100 + kv*10 + block + 1), all non-zero and distinct.
+                srcPtr[idx]
+                    = tk::KVCacheIndex{static_cast<tk::KVCacheIndex::UnderlyingType>(beam * 100 + kv * 10 + block + 1)};
+            }
+        }
+    }
+
     auto const dims = crossKvMgr.getOffsetTableDimensions();
     SizeType32 const numPools = dims.numPools;
     SizeType32 const maxBlocksPerSeq = dims.maxBlocksPerSeq;
     auto blockOffsets
         = tr::BufferManager::cpu(tr::ITensor::makeShape({numPools, maxNumSequences * beamWidth, 2, maxBlocksPerSeq}),
             tr::TRTDataType<tk::KVCacheIndex>::value);
 
-    // Fill with sentinel so we can detect un-written slots.
     auto* const raw = tr::bufferCast<tk::KVCacheIndex>(*blockOffsets);
     std::fill(raw, raw + blockOffsets->getSize(), tk::KVCacheIndex{tk::KVCacheIndex::kInvalidPoolIndex});
 
     crossKvMgr.copyBlockOffsets(*blockOffsets, /*outputSlotOffset=*/0, requestId);
 
-    // Post-condition: for every (pool, K/V, block), beams 1..beamWidth-1
-    // must hold the same physical block index as beam 0, and beam 0 itself
-    // must be a valid (non-sentinel) index.
     auto const& shape = blockOffsets->getShape();
     for (SizeType32 pool = 0; pool < numPools; ++pool)
     {
@@ -126,9 +133,11 @@ TEST(CrossKvBeamSharingTest, CopyBlockOffsetsAllBeamsShareBeam0Blocks)
             {
                 auto idx = [&](SizeType32 beam) { return tc::flat_index(shape.d, pool, beam, kv, block); };
 
+                // Beam 0 must have been written (non-sentinel); its exact value is not asserted here.
                 EXPECT_NE(raw[idx(0)].get(), tk::KVCacheIndex::kInvalidPoolIndex)
-                    << "pool=" << pool << " beam=0 kv=" << kv << " block=" << block << ": not initialised";
+                    << "pool=" << pool << " beam=0 kv=" << kv << " block=" << block << ": not written";
 
+                // All other beams must equal beam 0 — the fix normalises them.
                 for (SizeType32 beam = 1; beam < beamWidth; ++beam)
                 {
                     EXPECT_EQ(raw[idx(beam)].get(), raw[idx(0)].get())
@@ -141,80 +150,123 @@ TEST(CrossKvBeamSharingTest, CopyBlockOffsetsAllBeamsShareBeam0Blocks)
 }
 
 // ============================================================================
-// Fix 3: copyGenerationLogits direct-copy correctness
+// Fix 3: copyGenerationLogits per-slot fragmentPointerDevice
 // ============================================================================
 
-// Verify that the direct-copy implementation of copyGenerationLogits writes
-// each step's logits for each beam to the correct slot in the host buffer.
-TEST(CopyGenerationLogitsTest, DirectCopyPlacesEachBeamStepAtCorrectHostOffset)
+// Verify that copyGenerationLogits correctly assembles the host logits buffer
+// using the real kernel merge path, and that two back-to-back calls (simulating
+// two requests flushing in the same batch) use distinct fragmentPointerDevice
+// slots so their pointer arrays do not clobber each other.
+TEST(CopyGenerationLogitsTest, KernelMergePathProducesCorrectHostLayoutAndSlotsAreIsolated)
 {
-    // Parameters.
     SizeType32 constexpr beamWidth = 2;
-    SizeType32 constexpr numSteps = 4; // one full cache-length flush
+    SizeType32 constexpr numSteps = RuntimeBuffers::GenerationLogitsCache::kCACHE_LENGTH; // full flush
     SizeType32 constexpr vocabSize = 8;
     SizeType32 constexpr promptLen = 1;
+    SizeType32 constexpr maxBatchSize = 4; // must be >= 2 to test slot isolation
 
     auto stream = std::make_shared<tr::CudaStream>();
     tr::BufferManager bufferMgr{stream};
 
-    // Create a request: promptLen=1, maxNewTokens=numSteps.
-    RequestIdType constexpr requestId = 1;
-    auto inputTokens = std::make_shared<VecTokens>(promptLen, 0);
-    tr::SamplingConfig const samplingConfig{beamWidth};
-    auto llmReq = std::make_shared<LlmRequest>(requestId, numSteps, inputTokens, samplingConfig, /*isStreaming=*/false);
-
-    // Advance internal token count to simulate numSteps tokens generated so
-    // that (with beforeDecoder=false):
-    //   numGenerationToken = getNumTokens(beam) - mPromptLen = numSteps
-    //   hostOffset         = numGenerationToken - fragmentSize = 0
-    LlmRequest::BeamTokens const generatedTokens(beamWidth, VecTokens(numSteps, /*token=*/1));
-    llmReq->setGeneratedTokens(generatedTokens);
-    llmReq->allocGenerationLogitsHost(vocabSize, nvinfer1::DataType::kFLOAT);
-
-    // Build numSteps logit fragments, each of shape [1, beamWidth, vocabSize],
-    // filled with a unique per-(step, beam) sentinel value: step*100 + beam.
-    for (SizeType32 step = 0; step < numSteps; ++step)
+    // Build a real GenerationLogitsCache so that transposedLogits,
+    // fragmentPointerDevice and fragmentPointerHost are all properly allocated.
+    // cache.logits uses pinned memory so the test can fill it from the CPU while
+    // the GPU kernel can still read from it via DMA.
+    RuntimeBuffers::GenerationLogitsCache cache;
+    cache.logits = tr::BufferManager::pinnedPool(
+        tr::ITensor::makeShape({numSteps, maxBatchSize * beamWidth, vocabSize}), nvinfer1::DataType::kFLOAT);
+    cache.transposedLogits
+        = bufferMgr.gpu(tr::ITensor::makeShape({beamWidth, numSteps, vocabSize}), nvinfer1::DataType::kFLOAT);
+    cache.fragmentPointerDevice
+        = bufferMgr.gpu(tr::ITensor::makeShape({maxBatchSize, numSteps}), nvinfer1::DataType::kINT64);
+    cache.fragmentPointerHost
+        = tr::BufferManager::pinnedPool(tr::ITensor::makeShape({maxBatchSize, numSteps}), nvinfer1::DataType::kINT64);
+
+    // Helper: build one LlmRequest that has numSteps fragments pointing into
+    // cache.logits[0..numSteps-1][logitsIndex:logitsIndex+beamWidth].
+    // Each fragment is filled with sentinel value (step*100 + beam + reqOffset).
+    auto makeRequest = [&](RequestIdType reqId, SizeType32 logitsIndex, float reqOffset) -> std::shared_ptr<LlmRequest>
     {
-        tr::ITensor::SharedPtr frag = tr::BufferManager::pinnedPool(
-            tr::ITensor::makeShape({1, beamWidth, vocabSize}), nvinfer1::DataType::kFLOAT);
-        auto* const fragData = tr::bufferCast<float>(*frag);
-        for (SizeType32 beam = 0; beam < beamWidth; ++beam)
+        auto tokens = std::make_shared<VecTokens>(promptLen, 0);
+        tr::SamplingConfig sc{beamWidth};
+        auto req = std::make_shared<LlmRequest>(reqId, numSteps, tokens, sc, false);
+
+        LlmRequest::BeamTokens gen(beamWidth, VecTokens(numSteps, 1));
+        req->setGeneratedTokens(gen);
+        req->allocGenerationLogitsHost(vocabSize, nvinfer1::DataType::kFLOAT);
+
+        // Write known values into the logits cache slots for this request and
+        // create matching fragment slice views.
+        for (SizeType32 step = 0; step < numSteps; ++step)
         {
-            float const val = static_cast<float>(step * 100 + beam);
-            for (SizeType32 v = 0; v < vocabSize; ++v)
+            // cache.logits shape: [numSteps, maxBatchSize*beamWidth, vocabSize]
+            // Slice to [1, maxBS*bw, vocab], squeeze to [maxBS*bw, vocab].
+            tr::ITensor::SharedPtr slot = tr::ITensor::slice(cache.logits, step, 1);
+            slot->squeeze(0); // [maxBS*bw, vocab]
+            auto* slotPtr = tr::bufferCast<float>(*slot);
+            for (SizeType32 beam = 0; beam < beamWidth; ++beam)
             {
-                // Flat layout: [1][beam][v] → beam * vocabSize + v
-                fragData[beam * vocabSize + v] = val;
+                float const val = reqOffset + static_cast<float>(step * 100 + beam);
+                for (SizeType32 v = 0; v < vocabSize; ++v)
+                {
+                    slotPtr[(logitsIndex + beam) * vocabSize + v] = val;
+                }
             }
+
+            // Fragment matches HandleGenerationLogits: slice [logitsIndex:logitsIndex+beamWidth]
+            // from the step slot, then unsqueeze(0) → [1, beamWidth, vocab].
+            tr::ITensor::SharedPtr fragView = tr::ITensor::slice(slot, logitsIndex, beamWidth);
+            fragView->unsqueeze(0); // [1, beamWidth, vocab]
+            req->addGenerationLogitsFragment(fragView);
         }
-        llmReq->addGenerationLogitsFragment(frag);
-    }
-    ASSERT_EQ(llmReq->getGenerationLogitsFragmentsSize(), numSteps);
+        return req;
+    };
 
-    // Dummy cache — not accessed by the direct-copy implementation.
-    RuntimeBuffers::GenerationLogitsCache dummyCache;
+    // Request 0 occupies logitsIndex=0 in the batch slot.
+    auto req0 = makeRequest(1, /*logitsIndex=*/0, /*reqOffset=*/0.0f);
+    // Request 1 occupies logitsIndex=beamWidth in the batch slot.
+    auto req1 = makeRequest(2, /*logitsIndex=*/beamWidth, /*reqOffset=*/1000.0f);
 
-    utils::copyGenerationLogits(dummyCache, bufferMgr, *llmReq, /*beforeDecoder=*/false, /*numDroppedTokens=*/{});
+    // Flush request 0 — uses workIdx=0.
+    utils::copyGenerationLogits(cache, bufferMgr, *req0, /*beforeDecoder=*/false, {});
+    // Flush request 1 — uses workIdx=1 (different slot → no pointer clobbering).
+    utils::copyGenerationLogits(cache, bufferMgr, *req1, /*beforeDecoder=*/false, {});
 
     ASSERT_EQ(cudaStreamSynchronize(stream->get()), cudaSuccess);
 
-    // Post-condition: generationLogitsHost[beam, step, v] == step*100 + beam
-    // for all (beam, step, v).  Host shape: [beamWidth, maxNewTokens, vocab].
-    auto const* const hostData = tr::bufferCast<float>(*llmReq->getGenerationLogitsHost());
+    // Verify req0 host buffer: host[beam, step, v] == step*100 + beam
+    auto const* host0 = tr::bufferCast<float>(*req0->getGenerationLogitsHost());
     for (SizeType32 beam = 0; beam < beamWidth; ++beam)
     {
         for (SizeType32 step = 0; step < numSteps; ++step)
         {
             float const expected = static_cast<float>(step * 100 + beam);
             for (SizeType32 v = 0; v < vocabSize; ++v)
             {
-                SizeType32 const flatIdx = (beam * numSteps + step) * vocabSize + v;
-                EXPECT_FLOAT_EQ(hostData[flatIdx], expected) << "host[beam=" << beam << ", step=" << step << ", v=" << v
-                                                             << "]=" << hostData[flatIdx] << " expected " << expected;
+                SizeType32 const idx = (beam * numSteps + step) * vocabSize + v;
+                EXPECT_FLOAT_EQ(host0[idx], expected) << "req0 host[beam=" << beam << ",step=" << step << ",v=" << v
+                                                      << "]=" << host0[idx] << " expected " << expected;
+            }
+        }
+    }
+
+    // Verify req1 host buffer: host[beam, step, v] == 1000 + step*100 + beam
+    auto const* host1 = tr::bufferCast<float>(*req1->getGenerationLogitsHost());
+    for (SizeType32 beam = 0; beam < beamWidth; ++beam)
+    {
+        for (SizeType32 step = 0; step < numSteps; ++step)
+        {
+            float const expected = 1000.0f + static_cast<float>(step * 100 + beam);
+            for (SizeType32 v = 0; v < vocabSize; ++v)
+            {
+                SizeType32 const idx = (beam * numSteps + step) * vocabSize + v;
+                EXPECT_FLOAT_EQ(host1[idx], expected) << "req1 host[beam=" << beam << ",step=" << step << ",v=" << v
+                                                      << "]=" << host1[idx] << " expected " << expected;
             }
         }
     }
 
-    // copyGenerationLogits must clear fragments after flushing.
-    EXPECT_EQ(llmReq->getGenerationLogitsFragmentsSize(), 0);
+    // Both requests must have had their fragments cleared.
+    EXPECT_EQ(req0->getGenerationLogitsFragmentsSize(), 0);
+    EXPECT_EQ(req1->getGenerationLogitsFragmentsSize(), 0);
 }