Skip to content

Commit 92ff8bd

Browse files
committed
[None][test] Strengthen enc-dec beam-search regression tests
Fix-1 test (CrossKvBeamSharingTest): previously the allocator happened to share blocks across beams, making equality across beam slots a tautology that passed both with and without the fix. Now the test explicitly writes distinct per-beam sentinel values into the source cacheBlockIndices tensor before calling copyBlockOffsets. The old code (srcBeamIdx = beamIdx) leaves beams 1..N-1 with their own different values; the fixed code (srcBeamIdx = 0 when isCrossKv()) normalises all slots to beam-0. Confirmed: test fails when the fix is reverted. Fix-3 test (CopyGenerationLogitsTest): the previous test used a null dummy cache and did not exercise the mergeLogitsFragmentsKernel path. Replaced with a test that allocates a real GenerationLogitsCache (transposedLogits, fragmentPointerDevice, fragmentPointerHost), creates fragments as pinned-memory slices of cache.logits with known per-(step, beam) values, and verifies both requests produce the correct host layout via the actual kernel merge path. Running two back-to-back flushes also validates that each uses a separate fragmentPointerDevice slot (slot isolation). Signed-off-by: Aurelien Chartier <2567591+achartier@users.noreply.github.com>
1 parent 687af83 commit 92ff8bd

1 file changed

Lines changed: 119 additions & 67 deletions

File tree

cpp/tests/unit_tests/batch_manager/encDecBeamSearchTest.cpp

Lines changed: 119 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -57,21 +57,14 @@ using SizeType32 = tr::SizeType32;
5757
// Fix 1: KVCacheManager::copyBlockOffsets cross-KV beam sharing
5858
// ============================================================================
5959

60-
// Verify that for a cross-KV cache with beam width > 1, copyBlockOffsets
61-
// places the same physical block IDs in every beam slot.
62-
//
63-
// This tests the isCrossKv() branch added by fix 1: when all beams share the
64-
// same encoder output, the output offset table must reflect that. In the
65-
// production Whisper case the allocator gives each beam its own physical
66-
// blocks, so beams 1..N-1 would reference uninitialised GPU memory without
67-
// the fix. This unit test uses a simple context-only sequence where the
68-
// allocator happens to share blocks across beams; the observable property
69-
// (all beam slots equal beam-0) still holds and constitutes a sanity check.
70-
TEST(CrossKvBeamSharingTest, CopyBlockOffsetsAllBeamsShareBeam0Blocks)
60+
// Verify that copyBlockOffsets normalises all beam slots to beam-0's value on the
61+
// cross-KV path even when per-beam source rows differ. The test writes distinct
62+
// sentinel values into each beam row of the source tensor before calling
63+
// copyBlockOffsets so that the old bug (srcBeamIdx = beamIdx) would leave beams
64+
// 1..N-1 with different values, while the fix (srcBeamIdx = 0 for isCrossKv())
65+
// produces equal values across all beams.
66+
TEST(CrossKvBeamSharingTest, CopyBlockOffsetsNormalisesAllBeamsToBeam0)
7167
{
72-
// Encoder-decoder setup: 1 layer, 1 KV head, sizePerHead=4,
73-
// tokensPerBlock=8, encoder output length=16 → 2 blocks per sequence,
74-
// beam width=3.
7568
auto stream = std::make_shared<tr::CudaStream>();
7669

7770
SizeType32 constexpr numLayers = 1;
@@ -83,7 +76,6 @@ TEST(CrossKvBeamSharingTest, CopyBlockOffsetsAllBeamsShareBeam0Blocks)
8376
SizeType32 constexpr maxAttentionWindow = 16;
8477
SizeType32 constexpr encoderLen = 16; // 2 blocks
8578

86-
// Reserve enough blocks for 1 sequence × beamWidth × (encoderLen / tokensPerBlock).
8779
SizeType32 constexpr numBlocks = maxNumSequences * beamWidth * (encoderLen / tokensPerBlock);
8880
BlocksPerWindow const blocksPerWindow{{maxAttentionWindow, {numBlocks, 0}}};
8981

@@ -93,30 +85,45 @@ TEST(CrossKvBeamSharingTest, CopyBlockOffsetsAllBeamsShareBeam0Blocks)
9385
/*enableBlockReuse=*/false, CacheType::kCROSS);
9486
crossKvMgr.allocatePools(false);
9587

96-
// Build a minimal LlmRequest and allocate a cross-KV sequence.
9788
RequestIdType constexpr requestId = 1;
9889
auto inputTokens = std::make_shared<VecTokens>(encoderLen, 0);
9990
tr::SamplingConfig const samplingConfig{beamWidth};
10091
auto llmReq = std::make_shared<LlmRequest>(requestId, /*maxNewTokens=*/0, inputTokens, samplingConfig, false);
10192
crossKvMgr.addSequenceBatch({{{requestId, encoderLen, beamWidth}}}, {std::ref(*llmReq)});
10293

103-
// Allocate CPU output tensor: [numPools, maxNumSeq*beamWidth, 2, maxBlocksPerSeq].
94+
// Write distinct per-beam values into the source cacheBlockIndices tensor so
95+
// that the old code (copying each beam's own row) would produce different
96+
// outputs, while the fixed code (always copying beam-0's row) produces equal.
97+
auto& seq = crossKvMgr.getSequence(requestId);
98+
auto& srcTensor = seq.getCacheBlockIndices(maxAttentionWindow);
99+
auto const& srcShape = srcTensor.getShape();
100+
auto* const srcPtr = tr::bufferCast<tk::KVCacheIndex>(srcTensor);
101+
for (SizeType32 beam = 0; beam < beamWidth; ++beam)
102+
{
103+
for (SizeType32 kv = 0; kv < 2; ++kv)
104+
{
105+
for (SizeType32 block = 0; block < srcShape.d[3]; ++block)
106+
{
107+
auto const idx = tc::flat_index(srcShape.d, /*pool=*/0, beam, kv, block);
108+
// Beam b gets value (b*100 + kv*10 + block), all non-zero and distinct.
109+
srcPtr[idx]
110+
= tk::KVCacheIndex{static_cast<tk::KVCacheIndex::UnderlyingType>(beam * 100 + kv * 10 + block + 1)};
111+
}
112+
}
113+
}
114+
104115
auto const dims = crossKvMgr.getOffsetTableDimensions();
105116
SizeType32 const numPools = dims.numPools;
106117
SizeType32 const maxBlocksPerSeq = dims.maxBlocksPerSeq;
107118
auto blockOffsets
108119
= tr::BufferManager::cpu(tr::ITensor::makeShape({numPools, maxNumSequences * beamWidth, 2, maxBlocksPerSeq}),
109120
tr::TRTDataType<tk::KVCacheIndex>::value);
110121

111-
// Fill with sentinel so we can detect un-written slots.
112122
auto* const raw = tr::bufferCast<tk::KVCacheIndex>(*blockOffsets);
113123
std::fill(raw, raw + blockOffsets->getSize(), tk::KVCacheIndex{tk::KVCacheIndex::kInvalidPoolIndex});
114124

115125
crossKvMgr.copyBlockOffsets(*blockOffsets, /*outputSlotOffset=*/0, requestId);
116126

117-
// Post-condition: for every (pool, K/V, block), beams 1..beamWidth-1
118-
// must hold the same physical block index as beam 0, and beam 0 itself
119-
// must be a valid (non-sentinel) index.
120127
auto const& shape = blockOffsets->getShape();
121128
for (SizeType32 pool = 0; pool < numPools; ++pool)
122129
{
@@ -126,9 +133,11 @@ TEST(CrossKvBeamSharingTest, CopyBlockOffsetsAllBeamsShareBeam0Blocks)
126133
{
127134
auto idx = [&](SizeType32 beam) { return tc::flat_index(shape.d, pool, beam, kv, block); };
128135

136+
// Beam 0 must have been written (non-sentinel) and reflect its source value.
129137
EXPECT_NE(raw[idx(0)].get(), tk::KVCacheIndex::kInvalidPoolIndex)
130-
<< "pool=" << pool << " beam=0 kv=" << kv << " block=" << block << ": not initialised";
138+
<< "pool=" << pool << " beam=0 kv=" << kv << " block=" << block << ": not written";
131139

140+
// All other beams must equal beam 0 — the fix normalises them.
132141
for (SizeType32 beam = 1; beam < beamWidth; ++beam)
133142
{
134143
EXPECT_EQ(raw[idx(beam)].get(), raw[idx(0)].get())
@@ -141,80 +150,123 @@ TEST(CrossKvBeamSharingTest, CopyBlockOffsetsAllBeamsShareBeam0Blocks)
141150
}
142151

143152
// ============================================================================
144-
// Fix 3: copyGenerationLogits direct-copy correctness
153+
// Fix 3: copyGenerationLogits per-slot fragmentPointerDevice
145154
// ============================================================================
146155

147-
// Verify that the direct-copy implementation of copyGenerationLogits writes
148-
// each step's logits for each beam to the correct slot in the host buffer.
149-
TEST(CopyGenerationLogitsTest, DirectCopyPlacesEachBeamStepAtCorrectHostOffset)
156+
// Verify that copyGenerationLogits correctly assembles the host logits buffer
157+
// using the real kernel merge path, and that two back-to-back calls (simulating
158+
// two requests flushing in the same batch) use distinct fragmentPointerDevice
159+
// slots so their pointer arrays do not clobber each other.
160+
TEST(CopyGenerationLogitsTest, KernelMergePathProducesCorrectHostLayoutAndSlotsAreIsolated)
150161
{
151-
// Parameters.
152162
SizeType32 constexpr beamWidth = 2;
153-
SizeType32 constexpr numSteps = 4; // one full cache-length flush
163+
SizeType32 constexpr numSteps = RuntimeBuffers::GenerationLogitsCache::kCACHE_LENGTH; // full flush
154164
SizeType32 constexpr vocabSize = 8;
155165
SizeType32 constexpr promptLen = 1;
166+
SizeType32 constexpr maxBatchSize = 4; // must be >= 2 to test slot isolation
156167

157168
auto stream = std::make_shared<tr::CudaStream>();
158169
tr::BufferManager bufferMgr{stream};
159170

160-
// Create a request: promptLen=1, maxNewTokens=numSteps.
161-
RequestIdType constexpr requestId = 1;
162-
auto inputTokens = std::make_shared<VecTokens>(promptLen, 0);
163-
tr::SamplingConfig const samplingConfig{beamWidth};
164-
auto llmReq = std::make_shared<LlmRequest>(requestId, numSteps, inputTokens, samplingConfig, /*isStreaming=*/false);
165-
166-
// Advance internal token count to simulate numSteps tokens generated so
167-
// that (with beforeDecoder=false):
168-
// numGenerationToken = getNumTokens(beam) - mPromptLen = numSteps
169-
// hostOffset = numGenerationToken - fragmentSize = 0
170-
LlmRequest::BeamTokens const generatedTokens(beamWidth, VecTokens(numSteps, /*token=*/1));
171-
llmReq->setGeneratedTokens(generatedTokens);
172-
llmReq->allocGenerationLogitsHost(vocabSize, nvinfer1::DataType::kFLOAT);
173-
174-
// Build numSteps logit fragments, each of shape [1, beamWidth, vocabSize],
175-
// filled with a unique per-(step, beam) sentinel value: step*100 + beam.
176-
for (SizeType32 step = 0; step < numSteps; ++step)
171+
// Build a real GenerationLogitsCache so that transposedLogits,
172+
// fragmentPointerDevice and fragmentPointerHost are all properly allocated.
173+
// cache.logits uses pinned memory so the test can fill it from the CPU while
174+
// the GPU kernel can still read from it via DMA.
175+
RuntimeBuffers::GenerationLogitsCache cache;
176+
cache.logits = tr::BufferManager::pinnedPool(
177+
tr::ITensor::makeShape({numSteps, maxBatchSize * beamWidth, vocabSize}), nvinfer1::DataType::kFLOAT);
178+
cache.transposedLogits
179+
= bufferMgr.gpu(tr::ITensor::makeShape({beamWidth, numSteps, vocabSize}), nvinfer1::DataType::kFLOAT);
180+
cache.fragmentPointerDevice
181+
= bufferMgr.gpu(tr::ITensor::makeShape({maxBatchSize, numSteps}), nvinfer1::DataType::kINT64);
182+
cache.fragmentPointerHost
183+
= tr::BufferManager::pinnedPool(tr::ITensor::makeShape({maxBatchSize, numSteps}), nvinfer1::DataType::kINT64);
184+
185+
// Helper: build one LlmRequest that has numSteps fragments pointing into
186+
// cache.logits[0..numSteps-1][logitsIndex:logitsIndex+beamWidth].
187+
// Each fragment is filled with sentinel value (step*100 + beam + reqOffset).
188+
auto makeRequest = [&](RequestIdType reqId, SizeType32 logitsIndex, float reqOffset) -> std::shared_ptr<LlmRequest>
177189
{
178-
tr::ITensor::SharedPtr frag = tr::BufferManager::pinnedPool(
179-
tr::ITensor::makeShape({1, beamWidth, vocabSize}), nvinfer1::DataType::kFLOAT);
180-
auto* const fragData = tr::bufferCast<float>(*frag);
181-
for (SizeType32 beam = 0; beam < beamWidth; ++beam)
190+
auto tokens = std::make_shared<VecTokens>(promptLen, 0);
191+
tr::SamplingConfig sc{beamWidth};
192+
auto req = std::make_shared<LlmRequest>(reqId, numSteps, tokens, sc, false);
193+
194+
LlmRequest::BeamTokens gen(beamWidth, VecTokens(numSteps, 1));
195+
req->setGeneratedTokens(gen);
196+
req->allocGenerationLogitsHost(vocabSize, nvinfer1::DataType::kFLOAT);
197+
198+
// Write known values into the logits cache slots for this request and
199+
// create matching fragment slice views.
200+
for (SizeType32 step = 0; step < numSteps; ++step)
182201
{
183-
float const val = static_cast<float>(step * 100 + beam);
184-
for (SizeType32 v = 0; v < vocabSize; ++v)
202+
// cache.logits shape: [numSteps, maxBatchSize*beamWidth, vocabSize]
203+
// Slice to [1, maxBS*bw, vocab], squeeze to [maxBS*bw, vocab].
204+
tr::ITensor::SharedPtr slot = tr::ITensor::slice(cache.logits, step, 1);
205+
slot->squeeze(0); // [maxBS*bw, vocab]
206+
auto* slotPtr = tr::bufferCast<float>(*slot);
207+
for (SizeType32 beam = 0; beam < beamWidth; ++beam)
185208
{
186-
// Flat layout: [1][beam][v] → beam * vocabSize + v
187-
fragData[beam * vocabSize + v] = val;
209+
float const val = reqOffset + static_cast<float>(step * 100 + beam);
210+
for (SizeType32 v = 0; v < vocabSize; ++v)
211+
{
212+
slotPtr[(logitsIndex + beam) * vocabSize + v] = val;
213+
}
188214
}
215+
216+
// Fragment matches HandleGenerationLogits: slice [logitsIndex:logitsIndex+beamWidth]
217+
// from the step slot, then unsqueeze(0) → [1, beamWidth, vocab].
218+
tr::ITensor::SharedPtr fragView = tr::ITensor::slice(slot, logitsIndex, beamWidth);
219+
fragView->unsqueeze(0); // [1, beamWidth, vocab]
220+
req->addGenerationLogitsFragment(fragView);
189221
}
190-
llmReq->addGenerationLogitsFragment(frag);
191-
}
192-
ASSERT_EQ(llmReq->getGenerationLogitsFragmentsSize(), numSteps);
222+
return req;
223+
};
193224

194-
// Dummy cache — not accessed by the direct-copy implementation.
195-
RuntimeBuffers::GenerationLogitsCache dummyCache;
225+
// Request 0 occupies logitsIndex=0 in the batch slot.
226+
auto req0 = makeRequest(1, /*logitsIndex=*/0, /*reqOffset=*/0.0f);
227+
// Request 1 occupies logitsIndex=beamWidth in the batch slot.
228+
auto req1 = makeRequest(2, /*logitsIndex=*/beamWidth, /*reqOffset=*/1000.0f);
196229

197-
utils::copyGenerationLogits(dummyCache, bufferMgr, *llmReq, /*beforeDecoder=*/false, /*numDroppedTokens=*/{});
230+
// Flush request 0 — uses workIdx=0.
231+
utils::copyGenerationLogits(cache, bufferMgr, *req0, /*beforeDecoder=*/false, {});
232+
// Flush request 1 — uses workIdx=1 (different slot → no pointer clobbering).
233+
utils::copyGenerationLogits(cache, bufferMgr, *req1, /*beforeDecoder=*/false, {});
198234

199235
ASSERT_EQ(cudaStreamSynchronize(stream->get()), cudaSuccess);
200236

201-
// Post-condition: generationLogitsHost[beam, step, v] == step*100 + beam
202-
// for all (beam, step, v). Host shape: [beamWidth, maxNewTokens, vocab].
203-
auto const* const hostData = tr::bufferCast<float>(*llmReq->getGenerationLogitsHost());
237+
// Verify req0 host buffer: host[beam, step, v] == step*100 + beam
238+
auto const* host0 = tr::bufferCast<float>(*req0->getGenerationLogitsHost());
204239
for (SizeType32 beam = 0; beam < beamWidth; ++beam)
205240
{
206241
for (SizeType32 step = 0; step < numSteps; ++step)
207242
{
208243
float const expected = static_cast<float>(step * 100 + beam);
209244
for (SizeType32 v = 0; v < vocabSize; ++v)
210245
{
211-
SizeType32 const flatIdx = (beam * numSteps + step) * vocabSize + v;
212-
EXPECT_FLOAT_EQ(hostData[flatIdx], expected) << "host[beam=" << beam << ", step=" << step << ", v=" << v
213-
<< "]=" << hostData[flatIdx] << " expected " << expected;
246+
SizeType32 const idx = (beam * numSteps + step) * vocabSize + v;
247+
EXPECT_FLOAT_EQ(host0[idx], expected) << "req0 host[beam=" << beam << ",step=" << step << ",v=" << v
248+
<< "]=" << host0[idx] << " expected " << expected;
249+
}
250+
}
251+
}
252+
253+
// Verify req1 host buffer: host[beam, step, v] == 1000 + step*100 + beam
254+
auto const* host1 = tr::bufferCast<float>(*req1->getGenerationLogitsHost());
255+
for (SizeType32 beam = 0; beam < beamWidth; ++beam)
256+
{
257+
for (SizeType32 step = 0; step < numSteps; ++step)
258+
{
259+
float const expected = 1000.0f + static_cast<float>(step * 100 + beam);
260+
for (SizeType32 v = 0; v < vocabSize; ++v)
261+
{
262+
SizeType32 const idx = (beam * numSteps + step) * vocabSize + v;
263+
EXPECT_FLOAT_EQ(host1[idx], expected) << "req1 host[beam=" << beam << ",step=" << step << ",v=" << v
264+
<< "]=" << host1[idx] << " expected " << expected;
214265
}
215266
}
216267
}
217268

218-
// copyGenerationLogits must clear fragments after flushing.
219-
EXPECT_EQ(llmReq->getGenerationLogitsFragmentsSize(), 0);
269+
// Both requests must have had their fragments cleared.
270+
EXPECT_EQ(req0->getGenerationLogitsFragmentsSize(), 0);
271+
EXPECT_EQ(req1->getGenerationLogitsFragmentsSize(), 0);
220272
}

0 commit comments

Comments
 (0)