Skip to content

Commit 61321d5

Browse files
committed
Add prefix-aware scheduling config flag
Signed-off-by: Simeng Liu <simengl@nvidia.com>
1 parent f7dd7ec commit 61321d5

25 files changed

Lines changed: 863 additions & 120 deletions

File tree

cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2023-2026, NVIDIA CORPORATION. All rights reserved.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -97,7 +97,8 @@ class MaxUtilizationScheduler : public BaseCapacityScheduler
9797
public:
9898
MaxUtilizationScheduler(SizeType32 maxNumRequests, bool twoStepsLookAhead,
9999
LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
100-
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
100+
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE,
101+
bool enablePrefixAwareScheduling = true);
101102

102103
[[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(
103104
kv_cache_manager::BaseKVCacheManager& kvCacheManager,
@@ -108,6 +109,8 @@ class MaxUtilizationScheduler : public BaseCapacityScheduler
108109
SizeType32 mMaxNumRequests;
109110
/// @brief Boolean that indicates if two step lookahead is enabled
110111
bool mTwoStepsLookAhead;
112+
/// @brief Whether to use KV prefix-reuse estimates in scheduling decisions.
113+
bool mEnablePrefixAwareScheduling;
111114
};
112115

113116
/// @brief Schedule requests using the GUARANTEED_NO_EVICT policy
@@ -120,7 +123,8 @@ class GuaranteedNoEvictScheduler : public BaseCapacityScheduler
120123
public:
121124
GuaranteedNoEvictScheduler(SizeType32 maxNumRequests,
122125
LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
123-
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
126+
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE,
127+
bool enablePrefixAwareScheduling = true);
124128

125129
[[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(
126130
kv_cache_manager::BaseKVCacheManager const& kvCacheManager,
@@ -136,6 +140,8 @@ class GuaranteedNoEvictScheduler : public BaseCapacityScheduler
136140

137141
private:
138142
SizeType32 mMaxNumRequests;
143+
/// @brief Whether to use KV prefix-reuse estimates in scheduling decisions.
144+
bool mEnablePrefixAwareScheduling;
139145
};
140146

141147
/// @brief Schedule requests using the STATIC_BATCH policy
@@ -144,7 +150,8 @@ class StaticBatchScheduler : public GuaranteedNoEvictScheduler
144150
public:
145151
StaticBatchScheduler(SizeType32 maxNumRequests,
146152
LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
147-
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
153+
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE,
154+
bool enablePrefixAwareScheduling = true);
148155

149156
[[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(
150157
kv_cache_manager::BaseKVCacheManager const& kvCacheManager,
@@ -160,7 +167,8 @@ class CapacityScheduler : public Algorithm
160167
explicit CapacityScheduler(SizeType32 maxNumRequests, executor::CapacitySchedulerPolicy capacitySchedulerPolicy,
161168
bool hasKvCacheManager, bool twoStepsLookAhead = false,
162169
LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
163-
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
170+
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE,
171+
bool enablePrefixAwareScheduling = true);
164172

165173
/**
166174
* @brief Schedules requests following the selected policy.

cpp/include/tensorrt_llm/executor/executor.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1019,7 +1019,7 @@ class SchedulerConfig
10191019
explicit SchedulerConfig(
10201020
CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT,
10211021
std::optional<ContextChunkingPolicy> contextChunkingPolicy = std::nullopt,
1022-
std::optional<DynamicBatchConfig> dynamicBatchConfig = std::nullopt);
1022+
std::optional<DynamicBatchConfig> dynamicBatchConfig = std::nullopt, bool enablePrefixAwareScheduling = true);
10231023

10241024
bool operator==(SchedulerConfig const& other) const;
10251025

@@ -1029,6 +1029,8 @@ class SchedulerConfig
10291029

10301030
[[nodiscard]] std::optional<DynamicBatchConfig> getDynamicBatchConfig() const;
10311031

1032+
[[nodiscard]] bool getEnablePrefixAwareScheduling() const;
1033+
10321034
private:
10331035
friend class Serialization;
10341036

@@ -1040,6 +1042,9 @@ class SchedulerConfig
10401042

10411043
/// @brief The config for tuning batch size dynamically. See DynamicBatchConfig.
10421044
std::optional<DynamicBatchConfig> mDynamicBatchConfig;
1045+
1046+
/// @brief Whether schedulers use KV prefix-reuse estimates for admission and token-budget decisions.
1047+
bool mEnablePrefixAwareScheduling;
10431048
};
10441049

10451050
/// @brief Configuration class for the KV cache

cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -138,23 +138,26 @@ MaxRequestsScheduler::MaxRequestsScheduler(
138138
}
139139

140140
MaxUtilizationScheduler::MaxUtilizationScheduler(SizeType32 maxNumRequests, bool twoStepsLookAhead,
141-
LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
141+
LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)
142142
: BaseCapacityScheduler(noScheduleUntilState, noScheduleAfterState)
143143
, mMaxNumRequests(maxNumRequests)
144144
, mTwoStepsLookAhead{twoStepsLookAhead}
145+
, mEnablePrefixAwareScheduling{enablePrefixAwareScheduling}
145146
{
146147
}
147148

148-
GuaranteedNoEvictScheduler::GuaranteedNoEvictScheduler(
149-
SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
149+
GuaranteedNoEvictScheduler::GuaranteedNoEvictScheduler(SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState,
150+
LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)
150151
: BaseCapacityScheduler(noScheduleUntilState, noScheduleAfterState)
151152
, mMaxNumRequests(maxNumRequests)
153+
, mEnablePrefixAwareScheduling{enablePrefixAwareScheduling}
152154
{
153155
}
154156

155-
StaticBatchScheduler::StaticBatchScheduler(
156-
SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
157-
: GuaranteedNoEvictScheduler(maxNumRequests, noScheduleUntilState, noScheduleAfterState)
157+
StaticBatchScheduler::StaticBatchScheduler(SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState,
158+
LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)
159+
: GuaranteedNoEvictScheduler(
160+
maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling)
158161
{
159162
}
160163

@@ -213,7 +216,7 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(
213216
= peftCacheManager ? peftCacheManager->getMaxDevicePages() : std::numeric_limits<SizeType32>::max();
214217

215218
// The optimization of delaying requests only applies when prefix-aware scheduling is enabled, and it won't work for variable window attention
216-
bool skippingIsRelevant = (!kvCacheManager.getBlockManager().isVariableWindow())
219+
bool skippingIsRelevant = mEnablePrefixAwareScheduling && (!kvCacheManager.getBlockManager().isVariableWindow())
217220
&& (!crossKvCacheManager || !crossKvCacheManager->getBlockManager().isVariableWindow());
218221

219222
// Keep track of blocks contributed by requests in context phase
@@ -315,12 +318,21 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(
315318
{
316319
// analyzePrefixReuse asserts on variable-window managers; skip the walk there
317320
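// and let downstream callers fall back to their fresh tree-walk path.
// With prefix-aware scheduling disabled, default-constructed summaries are used here instead of the analyzePrefixReuse walk.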
318-
if (kvCacheManager.isEnableBlockReuse() && !kvCacheManager.getBlockManager().isVariableWindow())
321+
if (!mEnablePrefixAwareScheduling)
322+
{
323+
summary = kv_cache_manager::PrefixReuseSummary{};
324+
if (crossKvCacheManager)
325+
{
326+
crossSummary = kv_cache_manager::PrefixReuseSummary{};
327+
}
328+
}
329+
else if (kvCacheManager.isEnableBlockReuse()
330+
&& !kvCacheManager.getBlockManager().isVariableWindow())
319331
{
320332
auto uniqueTokens = req->getUniqueTokens(0);
321333
summary = kvCacheManager.analyzePrefixReuse(uniqueTokens, *req);
322334
}
323-
if (crossKvCacheManager && crossKvCacheManager->isEnableBlockReuse()
335+
if (mEnablePrefixAwareScheduling && crossKvCacheManager && crossKvCacheManager->isEnableBlockReuse()
324336
&& !crossKvCacheManager->getBlockManager().isVariableWindow())
325337
{
326338
auto uniqueTokens = *(req->getEncoderUniqueTokens().value());
@@ -427,7 +439,7 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(
427439
}
428440

429441
// The optimization of delaying requests only applies when prefix-aware scheduling is enabled, and it won't work for variable window attention
430-
bool skippingIsRelevant = !kvCacheManager.getBlockManager().isVariableWindow();
442+
bool skippingIsRelevant = mEnablePrefixAwareScheduling && !kvCacheManager.getBlockManager().isVariableWindow();
431443

432444
// Keep track of number of requests and block needed for the scheduled requests
433445
auto scheduledBlocksManager
@@ -444,8 +456,13 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(
444456
std::unordered_set<uint64_t> seenTaskIds;
445457

446458
// Keep track of blocks contributed by requests in context phase
447-
auto [newlyContributedContextBlocks, newlyContributedCrossContextBlocks]
448-
= prefillWithChunkedContextsAlreadyExecuting(activeRequests, kvCacheManager);
459+
std::unordered_set<BlockKey, BlockKeyHasher> newlyContributedContextBlocks;
460+
std::unordered_set<BlockKey, BlockKeyHasher> newlyContributedCrossContextBlocks;
461+
if (skippingIsRelevant)
462+
{
463+
std::tie(newlyContributedContextBlocks, newlyContributedCrossContextBlocks)
464+
= prefillWithChunkedContextsAlreadyExecuting(activeRequests, kvCacheManager);
465+
}
449466

450467
// Find last active in case we need to evict. Encoder-init requests are
451468
// intentionally excluded here: they hold no started self- or cross-pool
@@ -483,7 +500,11 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(
483500
std::optional<kv_cache_manager::PrefixReuseSummary> summary;
484501
// analyzePrefixReuse asserts on variable-window managers; skip the walk there
485502
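// and let downstream callers fall back to their fresh tree-walk path.
// With prefix-aware scheduling disabled, a first-chunk context request gets a default-constructed summary here instead of the analyzePrefixReuse walk.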
486-
if (isFirstChunkContext && kvCacheManager.isEnableBlockReuse()
503+
if (isFirstChunkContext && !mEnablePrefixAwareScheduling)
504+
{
505+
summary = kv_cache_manager::PrefixReuseSummary{};
506+
}
507+
else if (isFirstChunkContext && kvCacheManager.isEnableBlockReuse()
487508
&& !kvCacheManager.getBlockManager().isVariableWindow())
488509
{
489510
auto uniqueTokens = req->getUniqueTokens(0);
@@ -613,24 +634,26 @@ bool trySchedulingRequestMaxUtilization(std::shared_ptr<LlmRequest> const& req,
613634

614635
CapacityScheduler::CapacityScheduler(SizeType32 maxNumRequests,
615636
executor::CapacitySchedulerPolicy capacitySchedulerPolicy, bool hasKvCacheManager, bool twoStepsLookAhead,
616-
LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
637+
LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)
617638
{
618639
if (!hasKvCacheManager)
619640
{
620641
mScheduler = MaxRequestsScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};
621642
}
622643
else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kMAX_UTILIZATION)
623644
{
624-
mScheduler
625-
= MaxUtilizationScheduler{maxNumRequests, twoStepsLookAhead, noScheduleUntilState, noScheduleAfterState};
645+
mScheduler = MaxUtilizationScheduler{
646+
maxNumRequests, twoStepsLookAhead, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling};
626647
}
627648
else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT)
628649
{
629-
mScheduler = GuaranteedNoEvictScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};
650+
mScheduler = GuaranteedNoEvictScheduler{
651+
maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling};
630652
}
631653
else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kSTATIC_BATCH)
632654
{
633-
mScheduler = StaticBatchScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};
655+
mScheduler = StaticBatchScheduler{
656+
maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling};
634657
}
635658
else
636659
{

cpp/tensorrt_llm/batch_manager/trtEncoderModel.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
* SPDX-License-Identifier: Apache-2.0
44
*
55
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -76,8 +76,11 @@ TrtEncoderModel::TrtEncoderModel(runtime::ModelConfig const& modelConfig, WorldC
7676
// handling of maximizing utilization or pause/evict
7777
// TODO: finer control on encoder requests scheduling
7878
mCapacityScheduler = std::make_unique<tensorrt_llm::batch_manager::CapacityScheduler>(
79-
getMaxBatchSize() * mNumMicroBatches, executorConfig.getSchedulerConfig().getCapacitySchedulerPolicy(), false,
80-
false, LlmRequestState::kENCODER_INIT, LlmRequestState::kCONTEXT_INIT);
79+
getMaxBatchSize() * mNumMicroBatches, executorConfig.getSchedulerConfig().getCapacitySchedulerPolicy(),
80+
/*hasKvCacheManager=*/false, /*twoStepsLookAhead=*/false,
81+
/*noScheduleUntilState=*/LlmRequestState::kENCODER_INIT,
82+
/*noScheduleAfterState=*/LlmRequestState::kCONTEXT_INIT,
83+
/*enablePrefixAwareScheduling=*/executorConfig.getSchedulerConfig().getEnablePrefixAwareScheduling());
8184

8285
mMicroBatchScheduler = std::make_unique<tensorrt_llm::batch_manager::MicroBatchScheduler>(
8386
std::nullopt, mModelConfig.getMaxInputLen(), LlmRequestState::kENCODER_INIT, LlmRequestState::kCONTEXT_INIT);

cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -442,7 +442,10 @@ TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr<nvinfer
442442

443443
mCapacityScheduler = std::make_unique<CapacityScheduler>(getMaxNumSequences(),
444444
executorConfig.getSchedulerConfig().getCapacitySchedulerPolicy(), mKvCacheManager != nullptr,
445-
mWorldConfig.isPipelineParallel());
445+
/*twoStepsLookAhead=*/mWorldConfig.isPipelineParallel(),
446+
/*noScheduleUntilState=*/LlmRequestState::kCONTEXT_INIT,
447+
/*noScheduleAfterState=*/LlmRequestState::kGENERATION_COMPLETE,
448+
/*enablePrefixAwareScheduling=*/executorConfig.getSchedulerConfig().getEnablePrefixAwareScheduling());
446449

447450
mMicroBatchScheduler = std::make_unique<MicroBatchScheduler>(ctxChunkConfig, maxContextLength);
448451

cpp/tensorrt_llm/executor/schedulerConfig.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
* SPDX-License-Identifier: Apache-2.0
44
*
55
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -21,17 +21,20 @@ namespace tensorrt_llm::executor
2121
{
2222

2323
SchedulerConfig::SchedulerConfig(CapacitySchedulerPolicy capacitySchedulerPolicy,
24-
std::optional<ContextChunkingPolicy> contextChunkingPolicy, std::optional<DynamicBatchConfig> dynamicBatchConfig)
24+
std::optional<ContextChunkingPolicy> contextChunkingPolicy, std::optional<DynamicBatchConfig> dynamicBatchConfig,
25+
bool enablePrefixAwareScheduling)
2526
: mCapacitySchedulerPolicy(capacitySchedulerPolicy)
2627
, mContextChunkingPolicy(std::move(contextChunkingPolicy))
2728
, mDynamicBatchConfig(std::move(dynamicBatchConfig))
29+
, mEnablePrefixAwareScheduling(enablePrefixAwareScheduling)
2830
{
2931
}
3032

3133
bool SchedulerConfig::operator==(SchedulerConfig const& other) const
3234
{
3335
return mCapacitySchedulerPolicy == other.mCapacitySchedulerPolicy
34-
&& mContextChunkingPolicy == other.mContextChunkingPolicy;
36+
&& mContextChunkingPolicy == other.mContextChunkingPolicy
37+
&& mEnablePrefixAwareScheduling == other.mEnablePrefixAwareScheduling;
3538
}
3639

3740
[[nodiscard]] CapacitySchedulerPolicy SchedulerConfig::getCapacitySchedulerPolicy() const
@@ -49,4 +52,9 @@ bool SchedulerConfig::operator==(SchedulerConfig const& other) const
4952
return mDynamicBatchConfig;
5053
}
5154

55+
[[nodiscard]] bool SchedulerConfig::getEnablePrefixAwareScheduling() const
56+
{
57+
return mEnablePrefixAwareScheduling;
58+
}
59+
5260
} // namespace tensorrt_llm::executor

cpp/tensorrt_llm/executor/serialization.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1427,14 +1427,17 @@ SchedulerConfig Serialization::deserializeSchedulerConfig(std::istream& is)
14271427
auto capacitySchedulerPolicy = su::deserialize<CapacitySchedulerPolicy>(is);
14281428
auto contextChunkingPolicy = su::deserialize<std::optional<ContextChunkingPolicy>>(is);
14291429
auto dynamicBatchConfig = su::deserialize<std::optional<DynamicBatchConfig>>(is);
1430-
return SchedulerConfig{capacitySchedulerPolicy, contextChunkingPolicy, dynamicBatchConfig};
1430+
auto enablePrefixAwareScheduling = su::deserialize<bool>(is);
1431+
return SchedulerConfig{
1432+
capacitySchedulerPolicy, contextChunkingPolicy, dynamicBatchConfig, enablePrefixAwareScheduling};
14311433
}
14321434

14331435
void Serialization::serialize(SchedulerConfig const& schedulerConfig, std::ostream& os)
14341436
{
14351437
su::serialize(schedulerConfig.getCapacitySchedulerPolicy(), os);
14361438
su::serialize(schedulerConfig.getContextChunkingPolicy(), os);
14371439
su::serialize(schedulerConfig.getDynamicBatchConfig(), os);
1440+
su::serialize(schedulerConfig.getEnablePrefixAwareScheduling(), os);
14381441
}
14391442

14401443
size_t Serialization::serializedSize(SchedulerConfig const& schedulerConfig)
@@ -1443,6 +1446,7 @@ size_t Serialization::serializedSize(SchedulerConfig const& schedulerConfig)
14431446
totalSize += su::serializedSize(schedulerConfig.getCapacitySchedulerPolicy());
14441447
totalSize += su::serializedSize(schedulerConfig.getContextChunkingPolicy());
14451448
totalSize += su::serializedSize(schedulerConfig.getDynamicBatchConfig());
1449+
totalSize += su::serializedSize(schedulerConfig.getEnablePrefixAwareScheduling());
14461450
return totalSize;
14471451
}
14481452

cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
* SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
* SPDX-License-Identifier: Apache-2.0
44
*
55
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -85,10 +85,12 @@ void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_
8585
.def("__setstate__", agentTreeConfigSetstate);
8686

8787
nb::class_<CapacityScheduler>(m, CapacityScheduler::name)
88-
.def(nb::init<SizeType32, executor::CapacitySchedulerPolicy, bool, bool, LlmRequestState, LlmRequestState>(),
88+
.def(nb::init<SizeType32, executor::CapacitySchedulerPolicy, bool, bool, LlmRequestState, LlmRequestState,
89+
bool>(),
8990
nb::arg("max_num_requests"), nb::arg("capacity_scheduler_policy"), nb::arg("has_kv_cache_manager"),
9091
nb::arg("two_step_lookahead") = false, nb::arg("no_schedule_until_state") = LlmRequestState::kCONTEXT_INIT,
91-
nb::arg("no_schedule_after_state") = LlmRequestState::kGENERATION_COMPLETE)
92+
nb::arg("no_schedule_after_state") = LlmRequestState::kGENERATION_COMPLETE,
93+
nb::arg("enable_prefix_aware_scheduling") = true)
9294
.def("__call__", &CapacityScheduler::operator(), nb::arg("active_requests"),
9395
nb::arg("kv_cache_manager") = nullptr, nb::arg("peft_cache_manager") = nullptr,
9496
nb::arg("cross_kv_cache_manager") = nullptr)

0 commit comments

Comments
 (0)