@@ -57,21 +57,14 @@ using SizeType32 = tr::SizeType32;
5757// Fix 1: KVCacheManager::copyBlockOffsets cross-KV beam sharing
5858// ============================================================================
5959
60- // Verify that for a cross-KV cache with beam width > 1, copyBlockOffsets
61- // places the same physical block IDs in every beam slot.
62- //
63- // This tests the isCrossKv() branch added by fix 1: when all beams share the
64- // same encoder output, the output offset table must reflect that. In the
65- // production Whisper case the allocator gives each beam its own physical
66- // blocks, so beams 1..N-1 would reference uninitialised GPU memory without
67- // the fix. This unit test uses a simple context-only sequence where the
68- // allocator happens to share blocks across beams; the observable property
69- // (all beam slots equal beam-0) still holds and constitutes a sanity check.
70- TEST (CrossKvBeamSharingTest, CopyBlockOffsetsAllBeamsShareBeam0Blocks)
60+ // Verify that copyBlockOffsets normalises all beam slots to beam-0's value on the
61+ // cross-KV path even when per-beam source rows differ. The test writes distinct
62+ // sentinel values into each beam row of the source tensor before calling
63+ // copyBlockOffsets so that the old bug (srcBeamIdx = beamIdx) would leave beams
64+ // 1..N-1 with different values, while the fix (srcBeamIdx = 0 for isCrossKv())
65+ // produces equal values across all beams.
66+ TEST (CrossKvBeamSharingTest, CopyBlockOffsetsNormalisesAllBeamsToBeam0)
7167{
72- // Encoder-decoder setup: 1 layer, 1 KV head, sizePerHead=4,
73- // tokensPerBlock=8, encoder output length=16 → 2 blocks per sequence,
74- // beam width=3.
7568 auto stream = std::make_shared<tr::CudaStream>();
7669
7770 SizeType32 constexpr numLayers = 1 ;
@@ -83,7 +76,6 @@ TEST(CrossKvBeamSharingTest, CopyBlockOffsetsAllBeamsShareBeam0Blocks)
8376 SizeType32 constexpr maxAttentionWindow = 16 ;
8477 SizeType32 constexpr encoderLen = 16 ; // 2 blocks
8578
86- // Reserve enough blocks for 1 sequence × beamWidth × (encoderLen / tokensPerBlock).
8779 SizeType32 constexpr numBlocks = maxNumSequences * beamWidth * (encoderLen / tokensPerBlock);
8880 BlocksPerWindow const blocksPerWindow{{maxAttentionWindow, {numBlocks, 0 }}};
8981
@@ -93,30 +85,45 @@ TEST(CrossKvBeamSharingTest, CopyBlockOffsetsAllBeamsShareBeam0Blocks)
9385 /* enableBlockReuse=*/ false , CacheType::kCROSS );
9486 crossKvMgr.allocatePools (false );
9587
96- // Build a minimal LlmRequest and allocate a cross-KV sequence.
9788 RequestIdType constexpr requestId = 1 ;
9889 auto inputTokens = std::make_shared<VecTokens>(encoderLen, 0 );
9990 tr::SamplingConfig const samplingConfig{beamWidth};
10091 auto llmReq = std::make_shared<LlmRequest>(requestId, /* maxNewTokens=*/ 0 , inputTokens, samplingConfig, false );
10192 crossKvMgr.addSequenceBatch ({{{requestId, encoderLen, beamWidth}}}, {std::ref (*llmReq)});
10293
103- // Allocate CPU output tensor: [numPools, maxNumSeq*beamWidth, 2, maxBlocksPerSeq].
94+ // Write distinct per-beam values into the source cacheBlockIndices tensor so
95+ // that the old code (copying each beam's own row) would produce different
96+ // outputs, while the fixed code (always copying beam-0's row) produces equal.
97+ auto & seq = crossKvMgr.getSequence (requestId);
98+ auto & srcTensor = seq.getCacheBlockIndices (maxAttentionWindow);
99+ auto const & srcShape = srcTensor.getShape ();
100+ auto * const srcPtr = tr::bufferCast<tk::KVCacheIndex>(srcTensor);
101+ for (SizeType32 beam = 0 ; beam < beamWidth; ++beam)
102+ {
103+ for (SizeType32 kv = 0 ; kv < 2 ; ++kv)
104+ {
105+ for (SizeType32 block = 0 ; block < srcShape.d [3 ]; ++block)
106+ {
107+ auto const idx = tc::flat_index (srcShape.d , /* pool=*/ 0 , beam, kv, block);
108+ // Beam b gets value (b*100 + kv*10 + block), all non-zero and distinct.
109+ srcPtr[idx]
110+ = tk::KVCacheIndex{static_cast <tk::KVCacheIndex::UnderlyingType>(beam * 100 + kv * 10 + block + 1 )};
111+ }
112+ }
113+ }
114+
104115 auto const dims = crossKvMgr.getOffsetTableDimensions ();
105116 SizeType32 const numPools = dims.numPools ;
106117 SizeType32 const maxBlocksPerSeq = dims.maxBlocksPerSeq ;
107118 auto blockOffsets
108119 = tr::BufferManager::cpu (tr::ITensor::makeShape ({numPools, maxNumSequences * beamWidth, 2 , maxBlocksPerSeq}),
109120 tr::TRTDataType<tk::KVCacheIndex>::value);
110121
111- // Fill with sentinel so we can detect un-written slots.
112122 auto * const raw = tr::bufferCast<tk::KVCacheIndex>(*blockOffsets);
113123 std::fill (raw, raw + blockOffsets->getSize (), tk::KVCacheIndex{tk::KVCacheIndex::kInvalidPoolIndex });
114124
115125 crossKvMgr.copyBlockOffsets (*blockOffsets, /* outputSlotOffset=*/ 0 , requestId);
116126
117- // Post-condition: for every (pool, K/V, block), beams 1..beamWidth-1
118- // must hold the same physical block index as beam 0, and beam 0 itself
119- // must be a valid (non-sentinel) index.
120127 auto const & shape = blockOffsets->getShape ();
121128 for (SizeType32 pool = 0 ; pool < numPools; ++pool)
122129 {
@@ -126,9 +133,11 @@ TEST(CrossKvBeamSharingTest, CopyBlockOffsetsAllBeamsShareBeam0Blocks)
126133 {
127134 auto idx = [&](SizeType32 beam) { return tc::flat_index (shape.d , pool, beam, kv, block); };
128135
136+ // Beam 0 must have been written (non-sentinel) and reflect its source value.
129137 EXPECT_NE (raw[idx (0 )].get (), tk::KVCacheIndex::kInvalidPoolIndex )
130- << " pool=" << pool << " beam=0 kv=" << kv << " block=" << block << " : not initialised " ;
138+ << " pool=" << pool << " beam=0 kv=" << kv << " block=" << block << " : not written " ;
131139
140+ // All other beams must equal beam 0 — the fix normalises them.
132141 for (SizeType32 beam = 1 ; beam < beamWidth; ++beam)
133142 {
134143 EXPECT_EQ (raw[idx (beam)].get (), raw[idx (0 )].get ())
@@ -141,80 +150,123 @@ TEST(CrossKvBeamSharingTest, CopyBlockOffsetsAllBeamsShareBeam0Blocks)
141150}
142151
143152// ============================================================================
144- // Fix 3: copyGenerationLogits direct-copy correctness
153+ // Fix 3: copyGenerationLogits per-slot fragmentPointerDevice
145154// ============================================================================
146155
147- // Verify that the direct-copy implementation of copyGenerationLogits writes
148- // each step's logits for each beam to the correct slot in the host buffer.
149- TEST (CopyGenerationLogitsTest, DirectCopyPlacesEachBeamStepAtCorrectHostOffset)
156+ // Verify that copyGenerationLogits correctly assembles the host logits buffer
157+ // using the real kernel merge path, and that two back-to-back calls (simulating
158+ // two requests flushing in the same batch) use distinct fragmentPointerDevice
159+ // slots so their pointer arrays do not clobber each other.
160+ TEST (CopyGenerationLogitsTest, KernelMergePathProducesCorrectHostLayoutAndSlotsAreIsolated)
150161{
151- // Parameters.
152162 SizeType32 constexpr beamWidth = 2 ;
153- SizeType32 constexpr numSteps = 4 ; // one full cache-length flush
163+ SizeType32 constexpr numSteps = RuntimeBuffers::GenerationLogitsCache:: kCACHE_LENGTH ; // full flush
154164 SizeType32 constexpr vocabSize = 8 ;
155165 SizeType32 constexpr promptLen = 1 ;
166+ SizeType32 constexpr maxBatchSize = 4 ; // must be >= 2 to test slot isolation
156167
157168 auto stream = std::make_shared<tr::CudaStream>();
158169 tr::BufferManager bufferMgr{stream};
159170
160- // Create a request: promptLen=1, maxNewTokens=numSteps.
161- RequestIdType constexpr requestId = 1 ;
162- auto inputTokens = std::make_shared<VecTokens>(promptLen, 0 );
163- tr::SamplingConfig const samplingConfig{beamWidth};
164- auto llmReq = std::make_shared<LlmRequest>(requestId, numSteps, inputTokens, samplingConfig, /* isStreaming=*/ false );
165-
166- // Advance internal token count to simulate numSteps tokens generated so
167- // that (with beforeDecoder=false):
168- // numGenerationToken = getNumTokens(beam) - mPromptLen = numSteps
169- // hostOffset = numGenerationToken - fragmentSize = 0
170- LlmRequest::BeamTokens const generatedTokens (beamWidth, VecTokens (numSteps, /* token=*/ 1 ));
171- llmReq->setGeneratedTokens (generatedTokens);
172- llmReq->allocGenerationLogitsHost (vocabSize, nvinfer1::DataType::kFLOAT );
173-
174- // Build numSteps logit fragments, each of shape [1, beamWidth, vocabSize],
175- // filled with a unique per-(step, beam) sentinel value: step*100 + beam.
176- for (SizeType32 step = 0 ; step < numSteps; ++step)
171+ // Build a real GenerationLogitsCache so that transposedLogits,
172+ // fragmentPointerDevice and fragmentPointerHost are all properly allocated.
173+ // cache.logits uses pinned memory so the test can fill it from the CPU while
174+ // the GPU kernel can still read from it via DMA.
175+ RuntimeBuffers::GenerationLogitsCache cache;
176+ cache.logits = tr::BufferManager::pinnedPool (
177+ tr::ITensor::makeShape ({numSteps, maxBatchSize * beamWidth, vocabSize}), nvinfer1::DataType::kFLOAT );
178+ cache.transposedLogits
179+ = bufferMgr.gpu (tr::ITensor::makeShape ({beamWidth, numSteps, vocabSize}), nvinfer1::DataType::kFLOAT );
180+ cache.fragmentPointerDevice
181+ = bufferMgr.gpu (tr::ITensor::makeShape ({maxBatchSize, numSteps}), nvinfer1::DataType::kINT64 );
182+ cache.fragmentPointerHost
183+ = tr::BufferManager::pinnedPool (tr::ITensor::makeShape ({maxBatchSize, numSteps}), nvinfer1::DataType::kINT64 );
184+
185+ // Helper: build one LlmRequest that has numSteps fragments pointing into
186+ // cache.logits[0..numSteps-1][logitsIndex:logitsIndex+beamWidth].
187+ // Each fragment is filled with sentinel value (step*100 + beam + reqOffset).
188+ auto makeRequest = [&](RequestIdType reqId, SizeType32 logitsIndex, float reqOffset) -> std::shared_ptr<LlmRequest>
177189 {
178- tr::ITensor::SharedPtr frag = tr::BufferManager::pinnedPool (
179- tr::ITensor::makeShape ({1 , beamWidth, vocabSize}), nvinfer1::DataType::kFLOAT );
180- auto * const fragData = tr::bufferCast<float >(*frag);
181- for (SizeType32 beam = 0 ; beam < beamWidth; ++beam)
190+ auto tokens = std::make_shared<VecTokens>(promptLen, 0 );
191+ tr::SamplingConfig sc{beamWidth};
192+ auto req = std::make_shared<LlmRequest>(reqId, numSteps, tokens, sc, false );
193+
194+ LlmRequest::BeamTokens gen (beamWidth, VecTokens (numSteps, 1 ));
195+ req->setGeneratedTokens (gen);
196+ req->allocGenerationLogitsHost (vocabSize, nvinfer1::DataType::kFLOAT );
197+
198+ // Write known values into the logits cache slots for this request and
199+ // create matching fragment slice views.
200+ for (SizeType32 step = 0 ; step < numSteps; ++step)
182201 {
183- float const val = static_cast <float >(step * 100 + beam);
184- for (SizeType32 v = 0 ; v < vocabSize; ++v)
202+ // cache.logits shape: [numSteps, maxBatchSize*beamWidth, vocabSize]
203+ // Slice to [1, maxBS*bw, vocab], squeeze to [maxBS*bw, vocab].
204+ tr::ITensor::SharedPtr slot = tr::ITensor::slice (cache.logits , step, 1 );
205+ slot->squeeze (0 ); // [maxBS*bw, vocab]
206+ auto * slotPtr = tr::bufferCast<float >(*slot);
207+ for (SizeType32 beam = 0 ; beam < beamWidth; ++beam)
185208 {
186- // Flat layout: [1][beam][v] → beam * vocabSize + v
187- fragData[beam * vocabSize + v] = val;
209+ float const val = reqOffset + static_cast <float >(step * 100 + beam);
210+ for (SizeType32 v = 0 ; v < vocabSize; ++v)
211+ {
212+ slotPtr[(logitsIndex + beam) * vocabSize + v] = val;
213+ }
188214 }
215+
216+ // Fragment matches HandleGenerationLogits: slice [logitsIndex:logitsIndex+beamWidth]
217+ // from the step slot, then unsqueeze(0) → [1, beamWidth, vocab].
218+ tr::ITensor::SharedPtr fragView = tr::ITensor::slice (slot, logitsIndex, beamWidth);
219+ fragView->unsqueeze (0 ); // [1, beamWidth, vocab]
220+ req->addGenerationLogitsFragment (fragView);
189221 }
190- llmReq->addGenerationLogitsFragment (frag);
191- }
192- ASSERT_EQ (llmReq->getGenerationLogitsFragmentsSize (), numSteps);
222+ return req;
223+ };
193224
194- // Dummy cache — not accessed by the direct-copy implementation.
195- RuntimeBuffers::GenerationLogitsCache dummyCache;
225+ // Request 0 occupies logitsIndex=0 in the batch slot.
226+ auto req0 = makeRequest (1 , /* logitsIndex=*/ 0 , /* reqOffset=*/ 0 .0f );
227+ // Request 1 occupies logitsIndex=beamWidth in the batch slot.
228+ auto req1 = makeRequest (2 , /* logitsIndex=*/ beamWidth, /* reqOffset=*/ 1000 .0f );
196229
197- utils::copyGenerationLogits (dummyCache, bufferMgr, *llmReq, /* beforeDecoder=*/ false , /* numDroppedTokens=*/ {});
230+ // Flush request 0 — uses workIdx=0.
231+ utils::copyGenerationLogits (cache, bufferMgr, *req0, /* beforeDecoder=*/ false , {});
232+ // Flush request 1 — uses workIdx=1 (different slot → no pointer clobbering).
233+ utils::copyGenerationLogits (cache, bufferMgr, *req1, /* beforeDecoder=*/ false , {});
198234
199235 ASSERT_EQ (cudaStreamSynchronize (stream->get ()), cudaSuccess);
200236
201- // Post-condition: generationLogitsHost[beam, step, v] == step*100 + beam
202- // for all (beam, step, v). Host shape: [beamWidth, maxNewTokens, vocab].
203- auto const * const hostData = tr::bufferCast<float >(*llmReq->getGenerationLogitsHost ());
237+ // Verify req0 host buffer: host[beam, step, v] == step*100 + beam
238+ auto const * host0 = tr::bufferCast<float >(*req0->getGenerationLogitsHost ());
204239 for (SizeType32 beam = 0 ; beam < beamWidth; ++beam)
205240 {
206241 for (SizeType32 step = 0 ; step < numSteps; ++step)
207242 {
208243 float const expected = static_cast <float >(step * 100 + beam);
209244 for (SizeType32 v = 0 ; v < vocabSize; ++v)
210245 {
211- SizeType32 const flatIdx = (beam * numSteps + step) * vocabSize + v;
212- EXPECT_FLOAT_EQ (hostData[flatIdx], expected) << " host[beam=" << beam << " , step=" << step << " , v=" << v
213- << " ]=" << hostData[flatIdx] << " expected " << expected;
246+ SizeType32 const idx = (beam * numSteps + step) * vocabSize + v;
247+ EXPECT_FLOAT_EQ (host0[idx], expected) << " req0 host[beam=" << beam << " ,step=" << step << " ,v=" << v
248+ << " ]=" << host0[idx] << " expected " << expected;
249+ }
250+ }
251+ }
252+
253+ // Verify req1 host buffer: host[beam, step, v] == 1000 + step*100 + beam
254+ auto const * host1 = tr::bufferCast<float >(*req1->getGenerationLogitsHost ());
255+ for (SizeType32 beam = 0 ; beam < beamWidth; ++beam)
256+ {
257+ for (SizeType32 step = 0 ; step < numSteps; ++step)
258+ {
259+ float const expected = 1000 .0f + static_cast <float >(step * 100 + beam);
260+ for (SizeType32 v = 0 ; v < vocabSize; ++v)
261+ {
262+ SizeType32 const idx = (beam * numSteps + step) * vocabSize + v;
263+ EXPECT_FLOAT_EQ (host1[idx], expected) << " req1 host[beam=" << beam << " ,step=" << step << " ,v=" << v
264+ << " ]=" << host1[idx] << " expected " << expected;
214265 }
215266 }
216267 }
217268
218- // copyGenerationLogits must clear fragments after flushing.
219- EXPECT_EQ (llmReq->getGenerationLogitsFragmentsSize (), 0 );
269+ // Both requests must have had their fragments cleared.
270+ EXPECT_EQ (req0->getGenerationLogitsFragmentsSize (), 0 );
271+ EXPECT_EQ (req1->getGenerationLogitsFragmentsSize (), 0 );
220272}
0 commit comments