@@ -138,23 +138,26 @@ MaxRequestsScheduler::MaxRequestsScheduler(
138138}
139139
140140MaxUtilizationScheduler::MaxUtilizationScheduler (SizeType32 maxNumRequests, bool twoStepsLookAhead,
141- LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
141+ LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling )
142142 : BaseCapacityScheduler(noScheduleUntilState, noScheduleAfterState)
143143 , mMaxNumRequests (maxNumRequests)
144144 , mTwoStepsLookAhead {twoStepsLookAhead}
145+ , mEnablePrefixAwareScheduling {enablePrefixAwareScheduling}
145146{
146147}
147148
148- GuaranteedNoEvictScheduler::GuaranteedNoEvictScheduler (
149- SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState )
149+ GuaranteedNoEvictScheduler::GuaranteedNoEvictScheduler (SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState,
150+ LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling )
150151 : BaseCapacityScheduler(noScheduleUntilState, noScheduleAfterState)
151152 , mMaxNumRequests (maxNumRequests)
153+ , mEnablePrefixAwareScheduling {enablePrefixAwareScheduling}
152154{
153155}
154156
155- StaticBatchScheduler::StaticBatchScheduler (
156- SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
157- : GuaranteedNoEvictScheduler(maxNumRequests, noScheduleUntilState, noScheduleAfterState)
157+ StaticBatchScheduler::StaticBatchScheduler (SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState,
158+ LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)
159+ : GuaranteedNoEvictScheduler(
160+ maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling)
158161{
159162}
160163
@@ -213,7 +216,7 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(
213216 = peftCacheManager ? peftCacheManager->getMaxDevicePages () : std::numeric_limits<SizeType32>::max ();
214217
215218 // The optimization of delaying requests won't work for variable window attention
216- bool skippingIsRelevant = (!kvCacheManager.getBlockManager ().isVariableWindow ())
219+ bool skippingIsRelevant = mEnablePrefixAwareScheduling && (!kvCacheManager.getBlockManager ().isVariableWindow ())
217220 && (!crossKvCacheManager || !crossKvCacheManager->getBlockManager ().isVariableWindow ());
218221
219222 // Keep track of blocks contributed by requests in context phase
@@ -315,12 +318,21 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(
315318 {
316319 // analyzePrefixReuse asserts on variable-window managers; skip the walk there
317320 // and let downstream callers fall back to their fresh tree-walk path.
318- if (kvCacheManager.isEnableBlockReuse () && !kvCacheManager.getBlockManager ().isVariableWindow ())
321+ if (!mEnablePrefixAwareScheduling )
322+ {
323+ summary = kv_cache_manager::PrefixReuseSummary{};
324+ if (crossKvCacheManager)
325+ {
326+ crossSummary = kv_cache_manager::PrefixReuseSummary{};
327+ }
328+ }
329+ else if (kvCacheManager.isEnableBlockReuse ()
330+ && !kvCacheManager.getBlockManager ().isVariableWindow ())
319331 {
320332 auto uniqueTokens = req->getUniqueTokens (0 );
321333 summary = kvCacheManager.analyzePrefixReuse (uniqueTokens, *req);
322334 }
323- if (crossKvCacheManager && crossKvCacheManager->isEnableBlockReuse ()
335+ if (mEnablePrefixAwareScheduling && crossKvCacheManager && crossKvCacheManager->isEnableBlockReuse ()
324336 && !crossKvCacheManager->getBlockManager ().isVariableWindow ())
325337 {
326338 auto uniqueTokens = *(req->getEncoderUniqueTokens ().value ());
@@ -427,7 +439,7 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(
427439 }
428440
429441 // The optimization of delaying requests won't work for variable window attention
430- bool skippingIsRelevant = !kvCacheManager.getBlockManager ().isVariableWindow ();
442+ bool skippingIsRelevant = mEnablePrefixAwareScheduling && !kvCacheManager.getBlockManager ().isVariableWindow ();
431443
432444 // Keep track of number of requests and block needed for the scheduled requests
433445 auto scheduledBlocksManager
@@ -444,8 +456,13 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(
444456 std::unordered_set<uint64_t > seenTaskIds;
445457
446458 // Keep track of blocks contributed by requests in context phase
447- auto [newlyContributedContextBlocks, newlyContributedCrossContextBlocks]
448- = prefillWithChunkedContextsAlreadyExecuting (activeRequests, kvCacheManager);
459+ std::unordered_set<BlockKey, BlockKeyHasher> newlyContributedContextBlocks;
460+ std::unordered_set<BlockKey, BlockKeyHasher> newlyContributedCrossContextBlocks;
461+ if (skippingIsRelevant)
462+ {
463+ std::tie (newlyContributedContextBlocks, newlyContributedCrossContextBlocks)
464+ = prefillWithChunkedContextsAlreadyExecuting (activeRequests, kvCacheManager);
465+ }
449466
450467 // Find last active in case we need to evict. Encoder-init requests are
451468 // intentionally excluded here: they hold no started self- or cross-pool
@@ -483,7 +500,11 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(
483500 std::optional<kv_cache_manager::PrefixReuseSummary> summary;
484501 // analyzePrefixReuse asserts on variable-window managers; skip the walk there
485502 // and let downstream callers fall back to their fresh tree-walk path.
486- if (isFirstChunkContext && kvCacheManager.isEnableBlockReuse ()
503+ if (isFirstChunkContext && !mEnablePrefixAwareScheduling )
504+ {
505+ summary = kv_cache_manager::PrefixReuseSummary{};
506+ }
507+ else if (isFirstChunkContext && kvCacheManager.isEnableBlockReuse ()
487508 && !kvCacheManager.getBlockManager ().isVariableWindow ())
488509 {
489510 auto uniqueTokens = req->getUniqueTokens (0 );
@@ -613,24 +634,26 @@ bool trySchedulingRequestMaxUtilization(std::shared_ptr<LlmRequest> const& req,
613634
614635CapacityScheduler::CapacityScheduler (SizeType32 maxNumRequests,
615636 executor::CapacitySchedulerPolicy capacitySchedulerPolicy, bool hasKvCacheManager, bool twoStepsLookAhead,
616- LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
637+ LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling )
617638{
618639 if (!hasKvCacheManager)
619640 {
620641 mScheduler = MaxRequestsScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};
621642 }
622643 else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kMAX_UTILIZATION )
623644 {
624- mScheduler
625- = MaxUtilizationScheduler{ maxNumRequests, twoStepsLookAhead, noScheduleUntilState, noScheduleAfterState};
645+ mScheduler = MaxUtilizationScheduler{
646+ maxNumRequests, twoStepsLookAhead, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling };
626647 }
627648 else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT )
628649 {
629- mScheduler = GuaranteedNoEvictScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};
650+ mScheduler = GuaranteedNoEvictScheduler{
651+ maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling};
630652 }
631653 else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kSTATIC_BATCH )
632654 {
633- mScheduler = StaticBatchScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};
655+ mScheduler = StaticBatchScheduler{
656+ maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling};
634657 }
635658 else
636659 {
0 commit comments