@@ -151,23 +151,26 @@ MaxRequestsScheduler::MaxRequestsScheduler(
151151}
152152
153153MaxUtilizationScheduler::MaxUtilizationScheduler (SizeType32 maxNumRequests, bool twoStepsLookAhead,
154- LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
154+ LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling )
155155 : BaseCapacityScheduler(noScheduleUntilState, noScheduleAfterState)
156156 , mMaxNumRequests (maxNumRequests)
157157 , mTwoStepsLookAhead {twoStepsLookAhead}
158+ , mEnablePrefixAwareScheduling {enablePrefixAwareScheduling}
158159{
159160}
160161
161- GuaranteedNoEvictScheduler::GuaranteedNoEvictScheduler (
162- SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState )
162+ GuaranteedNoEvictScheduler::GuaranteedNoEvictScheduler (SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState,
163+ LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling )
163164 : BaseCapacityScheduler(noScheduleUntilState, noScheduleAfterState)
164165 , mMaxNumRequests (maxNumRequests)
166+ , mEnablePrefixAwareScheduling {enablePrefixAwareScheduling}
165167{
166168}
167169
168- StaticBatchScheduler::StaticBatchScheduler (
169- SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
170- : GuaranteedNoEvictScheduler(maxNumRequests, noScheduleUntilState, noScheduleAfterState)
170+ StaticBatchScheduler::StaticBatchScheduler (SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState,
171+ LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)
172+ : GuaranteedNoEvictScheduler(
173+ maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling)
171174{
172175}
173176
@@ -226,7 +229,7 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(
226229 = peftCacheManager ? peftCacheManager->getMaxDevicePages () : std::numeric_limits<SizeType32>::max ();
227230
228231 // The optimization of delaying requests won't work for variable window attention
229- bool skippingIsRelevant = (!kvCacheManager.getBlockManager ().isVariableWindow ())
232+ bool skippingIsRelevant = mEnablePrefixAwareScheduling && (!kvCacheManager.getBlockManager ().isVariableWindow ())
230233 && (!crossKvCacheManager || !crossKvCacheManager->getBlockManager ().isVariableWindow ());
231234
232235 // Keep track of blocks contributed by requests in context phase
@@ -323,28 +326,39 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(
323326 bool const isEncoderInit = req->isEncoderInitState ();
324327 std::optional<kv_cache_manager::PrefixReuseSummary> summary;
325328 std::optional<kv_cache_manager::PrefixReuseSummary> crossSummary;
326- if (isFirstChunkContext )
329+ if (mEnablePrefixAwareScheduling )
327330 {
328- // analyzePrefixReuse asserts on variable-window managers; skip the walk there
329- // and let downstream callers fall back to their fresh tree-walk path.
330- if (kvCacheManager.isEnableBlockReuse () && !kvCacheManager.getBlockManager ().isVariableWindow ())
331+ if (isFirstChunkContext)
331332 {
332- auto uniqueTokens = req->getUniqueTokens (0 );
333- summary = kvCacheManager.analyzePrefixReuse (uniqueTokens, *req);
333+ // analyzePrefixReuse asserts on variable-window managers; skip the walk there
334+ // and let downstream callers fall back to their fresh tree-walk path.
335+ if (kvCacheManager.isEnableBlockReuse () && !kvCacheManager.getBlockManager ().isVariableWindow ())
336+ {
337+ auto uniqueTokens = req->getUniqueTokens (0 );
338+ summary = kvCacheManager.analyzePrefixReuse (uniqueTokens, *req);
339+ }
340+ if (crossKvCacheManager && crossKvCacheManager->isEnableBlockReuse ()
341+ && !crossKvCacheManager->getBlockManager ().isVariableWindow ())
342+ {
343+ auto uniqueTokens = *(req->getEncoderUniqueTokens ().value ());
344+ crossSummary = crossKvCacheManager->analyzePrefixReuse (uniqueTokens, *req);
345+ }
334346 }
335- if (crossKvCacheManager && crossKvCacheManager->isEnableBlockReuse ()
347+ else if (isEncoderInit && crossKvCacheManager && crossKvCacheManager->isEnableBlockReuse ()
336348 && !crossKvCacheManager->getBlockManager ().isVariableWindow ())
337349 {
350+ // Encoder admission only needs the cross summary for reuse ordering.
338351 auto uniqueTokens = *(req->getEncoderUniqueTokens ().value ());
339352 crossSummary = crossKvCacheManager->analyzePrefixReuse (uniqueTokens, *req);
340353 }
341354 }
342- else if (isEncoderInit && crossKvCacheManager && crossKvCacheManager->isEnableBlockReuse ()
343- && !crossKvCacheManager->getBlockManager ().isVariableWindow ())
355+ else if (isFirstChunkContext)
344356 {
345- // Encoder admission only needs the cross summary for reuse ordering.
346- auto uniqueTokens = *(req->getEncoderUniqueTokens ().value ());
347- crossSummary = crossKvCacheManager->analyzePrefixReuse (uniqueTokens, *req);
357+ summary = kv_cache_manager::PrefixReuseSummary{};
358+ if (crossKvCacheManager)
359+ {
360+ crossSummary = kv_cache_manager::PrefixReuseSummary{};
361+ }
348362 }
349363 // Beneficial-to-skip check using the cached summary
350364 if (!StaticBatchScheduling && skippingIsRelevant && (isFirstChunkContext || isEncoderInit)
@@ -442,7 +456,7 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(
442456 }
443457
444458 // The optimization of delaying requests won't work for variable window attention
445- bool skippingIsRelevant = !kvCacheManager.getBlockManager ().isVariableWindow ();
459+ bool skippingIsRelevant = mEnablePrefixAwareScheduling && !kvCacheManager.getBlockManager ().isVariableWindow ();
446460
447461 // Keep track of number of requests and block needed for the scheduled requests
448462 auto scheduledBlocksManager
@@ -459,8 +473,13 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(
459473 std::unordered_set<uint64_t > seenTaskIds;
460474
461475 // Keep track of blocks contributed by requests in context phase
462- auto [newlyContributedContextBlocks, newlyContributedCrossContextBlocks]
463- = prefillWithChunkedContextsAlreadyExecuting (activeRequests, kvCacheManager);
476+ std::unordered_set<BlockKey, BlockKeyHasher> newlyContributedContextBlocks;
477+ std::unordered_set<BlockKey, BlockKeyHasher> newlyContributedCrossContextBlocks;
478+ if (skippingIsRelevant)
479+ {
480+ std::tie (newlyContributedContextBlocks, newlyContributedCrossContextBlocks)
481+ = prefillWithChunkedContextsAlreadyExecuting (activeRequests, kvCacheManager);
482+ }
464483
465484 // Find last active in case we need to evict. Encoder-init requests are
466485 // intentionally excluded here: they hold no started self- or cross-pool
@@ -511,7 +530,11 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(
511530 std::optional<kv_cache_manager::PrefixReuseSummary> summary;
512531 // analyzePrefixReuse asserts on variable-window managers; skip the walk there
513532 // and let downstream callers fall back to their fresh tree-walk path.
514- if (isFirstChunkContext && kvCacheManager.isEnableBlockReuse ()
533+ if (isFirstChunkContext && !mEnablePrefixAwareScheduling )
534+ {
535+ summary = kv_cache_manager::PrefixReuseSummary{};
536+ }
537+ else if (isFirstChunkContext && kvCacheManager.isEnableBlockReuse ()
515538 && !kvCacheManager.getBlockManager ().isVariableWindow ())
516539 {
517540 auto uniqueTokens = req->getUniqueTokens (0 );
@@ -644,24 +667,26 @@ bool trySchedulingRequestMaxUtilization(std::shared_ptr<LlmRequest> const& req,
644667
645668CapacityScheduler::CapacityScheduler (SizeType32 maxNumRequests,
646669 executor::CapacitySchedulerPolicy capacitySchedulerPolicy, bool hasKvCacheManager, bool twoStepsLookAhead,
647- LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
670+ LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling )
648671{
649672 if (!hasKvCacheManager)
650673 {
651674 mScheduler = MaxRequestsScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};
652675 }
653676 else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kMAX_UTILIZATION )
654677 {
655- mScheduler
656- = MaxUtilizationScheduler{ maxNumRequests, twoStepsLookAhead, noScheduleUntilState, noScheduleAfterState};
678+ mScheduler = MaxUtilizationScheduler{
679+ maxNumRequests, twoStepsLookAhead, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling };
657680 }
658681 else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT )
659682 {
660- mScheduler = GuaranteedNoEvictScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};
683+ mScheduler = GuaranteedNoEvictScheduler{
684+ maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling};
661685 }
662686 else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kSTATIC_BATCH )
663687 {
664- mScheduler = StaticBatchScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};
688+ mScheduler = StaticBatchScheduler{
689+ maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling};
665690 }
666691 else
667692 {
0 commit comments