Skip to content

Commit 2967da3

Browse files
SimengLiu-nv authored and MrGeva committed
[None][feat] Add prefix-aware scheduling config flag to support opt-out (NVIDIA#15526)
Signed-off-by: Simeng Liu <simengl@nvidia.com>
1 parent 5178e13 commit 2967da3

25 files changed

Lines changed: 503 additions & 110 deletions

File tree

cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h

Lines changed: 13 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
2+
* Copyright (c) 2023-2026, NVIDIA CORPORATION. All rights reserved.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -97,7 +97,8 @@ class MaxUtilizationScheduler : public BaseCapacityScheduler
9797
public:
9898
MaxUtilizationScheduler(SizeType32 maxNumRequests, bool twoStepsLookAhead,
9999
LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
100-
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
100+
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE,
101+
bool enablePrefixAwareScheduling = true);
101102

102103
[[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(
103104
kv_cache_manager::BaseKVCacheManager& kvCacheManager,
@@ -108,6 +109,8 @@ class MaxUtilizationScheduler : public BaseCapacityScheduler
108109
SizeType32 mMaxNumRequests;
109110
/// @brief Boolean that indicates if two step lookahead is enabled
110111
bool mTwoStepsLookAhead;
112+
/// @brief Whether to use KV prefix-reuse estimates in scheduling decisions.
113+
bool mEnablePrefixAwareScheduling;
111114
};
112115

113116
/// @brief Schedule requests using the GUARANTEED_NO_EVICT policy
@@ -120,7 +123,8 @@ class GuaranteedNoEvictScheduler : public BaseCapacityScheduler
120123
public:
121124
GuaranteedNoEvictScheduler(SizeType32 maxNumRequests,
122125
LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
123-
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
126+
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE,
127+
bool enablePrefixAwareScheduling = true);
124128

125129
[[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(
126130
kv_cache_manager::BaseKVCacheManager const& kvCacheManager,
@@ -136,6 +140,8 @@ class GuaranteedNoEvictScheduler : public BaseCapacityScheduler
136140

137141
private:
138142
SizeType32 mMaxNumRequests;
143+
/// @brief Whether to use KV prefix-reuse estimates in scheduling decisions.
144+
bool mEnablePrefixAwareScheduling;
139145
};
140146

141147
/// @brief Schedule requests using the STATIC_BATCH policy
@@ -144,7 +150,8 @@ class StaticBatchScheduler : public GuaranteedNoEvictScheduler
144150
public:
145151
StaticBatchScheduler(SizeType32 maxNumRequests,
146152
LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
147-
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
153+
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE,
154+
bool enablePrefixAwareScheduling = true);
148155

149156
[[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(
150157
kv_cache_manager::BaseKVCacheManager const& kvCacheManager,
@@ -160,7 +167,8 @@ class CapacityScheduler : public Algorithm
160167
explicit CapacityScheduler(SizeType32 maxNumRequests, executor::CapacitySchedulerPolicy capacitySchedulerPolicy,
161168
bool hasKvCacheManager, bool twoStepsLookAhead = false,
162169
LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
163-
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
170+
LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE,
171+
bool enablePrefixAwareScheduling = true);
164172

165173
/**
166174
* @brief Schedules requests following the selected policy.

cpp/include/tensorrt_llm/executor/executor.h

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -989,6 +989,8 @@ class DynamicBatchConfig
989989

990990
[[nodiscard]] std::vector<std::pair<SizeType32, SizeType32>> getBatchSizeTable() const;
991991

992+
bool operator==(DynamicBatchConfig const& other) const;
993+
992994
/// @brief The default value of batch size table
993995
static std::vector<std::pair<SizeType32, SizeType32>> const kDefaultBatchSizeTable;
994996

@@ -1019,7 +1021,7 @@ class SchedulerConfig
10191021
explicit SchedulerConfig(
10201022
CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT,
10211023
std::optional<ContextChunkingPolicy> contextChunkingPolicy = std::nullopt,
1022-
std::optional<DynamicBatchConfig> dynamicBatchConfig = std::nullopt);
1024+
std::optional<DynamicBatchConfig> dynamicBatchConfig = std::nullopt, bool enablePrefixAwareScheduling = true);
10231025

10241026
bool operator==(SchedulerConfig const& other) const;
10251027

@@ -1029,6 +1031,8 @@ class SchedulerConfig
10291031

10301032
[[nodiscard]] std::optional<DynamicBatchConfig> getDynamicBatchConfig() const;
10311033

1034+
[[nodiscard]] bool getEnablePrefixAwareScheduling() const;
1035+
10321036
private:
10331037
friend class Serialization;
10341038

@@ -1040,6 +1044,9 @@ class SchedulerConfig
10401044

10411045
/// @brief The config for tuning batch size dynamically. See DynamicBatchConfig.
10421046
std::optional<DynamicBatchConfig> mDynamicBatchConfig;
1047+
1048+
/// @brief Whether schedulers use KV prefix-reuse estimates for admission and token-budget decisions.
1049+
bool mEnablePrefixAwareScheduling;
10431050
};
10441051

10451052
/// @brief Configuration class for the KV cache

cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp

Lines changed: 53 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -151,23 +151,26 @@ MaxRequestsScheduler::MaxRequestsScheduler(
151151
}
152152

153153
MaxUtilizationScheduler::MaxUtilizationScheduler(SizeType32 maxNumRequests, bool twoStepsLookAhead,
154-
LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
154+
LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)
155155
: BaseCapacityScheduler(noScheduleUntilState, noScheduleAfterState)
156156
, mMaxNumRequests(maxNumRequests)
157157
, mTwoStepsLookAhead{twoStepsLookAhead}
158+
, mEnablePrefixAwareScheduling{enablePrefixAwareScheduling}
158159
{
159160
}
160161

161-
GuaranteedNoEvictScheduler::GuaranteedNoEvictScheduler(
162-
SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
162+
GuaranteedNoEvictScheduler::GuaranteedNoEvictScheduler(SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState,
163+
LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)
163164
: BaseCapacityScheduler(noScheduleUntilState, noScheduleAfterState)
164165
, mMaxNumRequests(maxNumRequests)
166+
, mEnablePrefixAwareScheduling{enablePrefixAwareScheduling}
165167
{
166168
}
167169

168-
StaticBatchScheduler::StaticBatchScheduler(
169-
SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
170-
: GuaranteedNoEvictScheduler(maxNumRequests, noScheduleUntilState, noScheduleAfterState)
170+
StaticBatchScheduler::StaticBatchScheduler(SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState,
171+
LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)
172+
: GuaranteedNoEvictScheduler(
173+
maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling)
171174
{
172175
}
173176

@@ -226,7 +229,7 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(
226229
= peftCacheManager ? peftCacheManager->getMaxDevicePages() : std::numeric_limits<SizeType32>::max();
227230

228231
// The optimization of delaying requests won't work for variable window attention
229-
bool skippingIsRelevant = (!kvCacheManager.getBlockManager().isVariableWindow())
232+
bool skippingIsRelevant = mEnablePrefixAwareScheduling && (!kvCacheManager.getBlockManager().isVariableWindow())
230233
&& (!crossKvCacheManager || !crossKvCacheManager->getBlockManager().isVariableWindow());
231234

232235
// Keep track of blocks contributed by requests in context phase
@@ -323,28 +326,39 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(
323326
bool const isEncoderInit = req->isEncoderInitState();
324327
std::optional<kv_cache_manager::PrefixReuseSummary> summary;
325328
std::optional<kv_cache_manager::PrefixReuseSummary> crossSummary;
326-
if (isFirstChunkContext)
329+
if (mEnablePrefixAwareScheduling)
327330
{
328-
// analyzePrefixReuse asserts on variable-window managers; skip the walk there
329-
// and let downstream callers fall back to their fresh tree-walk path.
330-
if (kvCacheManager.isEnableBlockReuse() && !kvCacheManager.getBlockManager().isVariableWindow())
331+
if (isFirstChunkContext)
331332
{
332-
auto uniqueTokens = req->getUniqueTokens(0);
333-
summary = kvCacheManager.analyzePrefixReuse(uniqueTokens, *req);
333+
// analyzePrefixReuse asserts on variable-window managers; skip the walk there
334+
// and let downstream callers fall back to their fresh tree-walk path.
335+
if (kvCacheManager.isEnableBlockReuse() && !kvCacheManager.getBlockManager().isVariableWindow())
336+
{
337+
auto uniqueTokens = req->getUniqueTokens(0);
338+
summary = kvCacheManager.analyzePrefixReuse(uniqueTokens, *req);
339+
}
340+
if (crossKvCacheManager && crossKvCacheManager->isEnableBlockReuse()
341+
&& !crossKvCacheManager->getBlockManager().isVariableWindow())
342+
{
343+
auto uniqueTokens = *(req->getEncoderUniqueTokens().value());
344+
crossSummary = crossKvCacheManager->analyzePrefixReuse(uniqueTokens, *req);
345+
}
334346
}
335-
if (crossKvCacheManager && crossKvCacheManager->isEnableBlockReuse()
347+
else if (isEncoderInit && crossKvCacheManager && crossKvCacheManager->isEnableBlockReuse()
336348
&& !crossKvCacheManager->getBlockManager().isVariableWindow())
337349
{
350+
// Encoder admission only needs the cross summary for reuse ordering.
338351
auto uniqueTokens = *(req->getEncoderUniqueTokens().value());
339352
crossSummary = crossKvCacheManager->analyzePrefixReuse(uniqueTokens, *req);
340353
}
341354
}
342-
else if (isEncoderInit && crossKvCacheManager && crossKvCacheManager->isEnableBlockReuse()
343-
&& !crossKvCacheManager->getBlockManager().isVariableWindow())
355+
else if (isFirstChunkContext)
344356
{
345-
// Encoder admission only needs the cross summary for reuse ordering.
346-
auto uniqueTokens = *(req->getEncoderUniqueTokens().value());
347-
crossSummary = crossKvCacheManager->analyzePrefixReuse(uniqueTokens, *req);
357+
summary = kv_cache_manager::PrefixReuseSummary{};
358+
if (crossKvCacheManager)
359+
{
360+
crossSummary = kv_cache_manager::PrefixReuseSummary{};
361+
}
348362
}
349363
// Beneficial-to-skip check using the cached summary
350364
if (!StaticBatchScheduling && skippingIsRelevant && (isFirstChunkContext || isEncoderInit)
@@ -442,7 +456,7 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(
442456
}
443457

444458
// The optimization of delaying requests won't work for variable window attention
445-
bool skippingIsRelevant = !kvCacheManager.getBlockManager().isVariableWindow();
459+
bool skippingIsRelevant = mEnablePrefixAwareScheduling && !kvCacheManager.getBlockManager().isVariableWindow();
446460

447461
// Keep track of number of requests and block needed for the scheduled requests
448462
auto scheduledBlocksManager
@@ -459,8 +473,13 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(
459473
std::unordered_set<uint64_t> seenTaskIds;
460474

461475
// Keep track of blocks contributed by requests in context phase
462-
auto [newlyContributedContextBlocks, newlyContributedCrossContextBlocks]
463-
= prefillWithChunkedContextsAlreadyExecuting(activeRequests, kvCacheManager);
476+
std::unordered_set<BlockKey, BlockKeyHasher> newlyContributedContextBlocks;
477+
std::unordered_set<BlockKey, BlockKeyHasher> newlyContributedCrossContextBlocks;
478+
if (skippingIsRelevant)
479+
{
480+
std::tie(newlyContributedContextBlocks, newlyContributedCrossContextBlocks)
481+
= prefillWithChunkedContextsAlreadyExecuting(activeRequests, kvCacheManager);
482+
}
464483

465484
// Find last active in case we need to evict. Encoder-init requests are
466485
// intentionally excluded here: they hold no started self- or cross-pool
@@ -511,7 +530,11 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(
511530
std::optional<kv_cache_manager::PrefixReuseSummary> summary;
512531
// analyzePrefixReuse asserts on variable-window managers; skip the walk there
513532
// and let downstream callers fall back to their fresh tree-walk path.
514-
if (isFirstChunkContext && kvCacheManager.isEnableBlockReuse()
533+
if (isFirstChunkContext && !mEnablePrefixAwareScheduling)
534+
{
535+
summary = kv_cache_manager::PrefixReuseSummary{};
536+
}
537+
else if (isFirstChunkContext && kvCacheManager.isEnableBlockReuse()
515538
&& !kvCacheManager.getBlockManager().isVariableWindow())
516539
{
517540
auto uniqueTokens = req->getUniqueTokens(0);
@@ -644,24 +667,26 @@ bool trySchedulingRequestMaxUtilization(std::shared_ptr<LlmRequest> const& req,
644667

645668
CapacityScheduler::CapacityScheduler(SizeType32 maxNumRequests,
646669
executor::CapacitySchedulerPolicy capacitySchedulerPolicy, bool hasKvCacheManager, bool twoStepsLookAhead,
647-
LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
670+
LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)
648671
{
649672
if (!hasKvCacheManager)
650673
{
651674
mScheduler = MaxRequestsScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};
652675
}
653676
else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kMAX_UTILIZATION)
654677
{
655-
mScheduler
656-
= MaxUtilizationScheduler{maxNumRequests, twoStepsLookAhead, noScheduleUntilState, noScheduleAfterState};
678+
mScheduler = MaxUtilizationScheduler{
679+
maxNumRequests, twoStepsLookAhead, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling};
657680
}
658681
else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT)
659682
{
660-
mScheduler = GuaranteedNoEvictScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};
683+
mScheduler = GuaranteedNoEvictScheduler{
684+
maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling};
661685
}
662686
else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kSTATIC_BATCH)
663687
{
664-
mScheduler = StaticBatchScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};
688+
mScheduler = StaticBatchScheduler{
689+
maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling};
665690
}
666691
else
667692
{

cpp/tensorrt_llm/batch_manager/trtEncoderModel.cpp

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
* SPDX-License-Identifier: Apache-2.0
44
*
55
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -76,8 +76,11 @@ TrtEncoderModel::TrtEncoderModel(runtime::ModelConfig const& modelConfig, WorldC
7676
// handling of maximizing utilization or pause/evict
7777
// TODO: finer control on encoder requests scheduling
7878
mCapacityScheduler = std::make_unique<tensorrt_llm::batch_manager::CapacityScheduler>(
79-
getMaxBatchSize() * mNumMicroBatches, executorConfig.getSchedulerConfig().getCapacitySchedulerPolicy(), false,
80-
false, LlmRequestState::kENCODER_INIT, LlmRequestState::kCONTEXT_INIT);
79+
getMaxBatchSize() * mNumMicroBatches, executorConfig.getSchedulerConfig().getCapacitySchedulerPolicy(),
80+
/*hasKvCacheManager=*/false, /*twoStepsLookAhead=*/false,
81+
/*noScheduleUntilState=*/LlmRequestState::kENCODER_INIT,
82+
/*noScheduleAfterState=*/LlmRequestState::kCONTEXT_INIT,
83+
/*enablePrefixAwareScheduling=*/executorConfig.getSchedulerConfig().getEnablePrefixAwareScheduling());
8184

8285
mMicroBatchScheduler = std::make_unique<tensorrt_llm::batch_manager::MicroBatchScheduler>(
8386
std::nullopt, mModelConfig.getMaxInputLen(), LlmRequestState::kENCODER_INIT, LlmRequestState::kCONTEXT_INIT);

cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -445,7 +445,10 @@ TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr<nvinfer
445445

446446
mCapacityScheduler = std::make_unique<CapacityScheduler>(getMaxNumSequences(),
447447
executorConfig.getSchedulerConfig().getCapacitySchedulerPolicy(), mKvCacheManager != nullptr,
448-
mWorldConfig.isPipelineParallel());
448+
/*twoStepsLookAhead=*/mWorldConfig.isPipelineParallel(),
449+
/*noScheduleUntilState=*/LlmRequestState::kCONTEXT_INIT,
450+
/*noScheduleAfterState=*/LlmRequestState::kGENERATION_COMPLETE,
451+
/*enablePrefixAwareScheduling=*/executorConfig.getSchedulerConfig().getEnablePrefixAwareScheduling());
449452

450453
mMicroBatchScheduler = std::make_unique<MicroBatchScheduler>(ctxChunkConfig, maxContextLength);
451454

cpp/tensorrt_llm/executor/dynamicBatchConfig.cpp

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
* SPDX-License-Identifier: Apache-2.0
44
*
55
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -49,6 +49,14 @@ std::vector<std::pair<SizeType32, SizeType32>> DynamicBatchConfig::getBatchSizeT
4949
return mBatchSizeTable;
5050
}
5151

52+
bool DynamicBatchConfig::operator==(DynamicBatchConfig const& other) const
53+
{
54+
return mEnableBatchSizeTuning == other.mEnableBatchSizeTuning
55+
&& mEnableMaxNumTokensTuning == other.mEnableMaxNumTokensTuning
56+
&& mDynamicBatchMovingAverageWindow == other.mDynamicBatchMovingAverageWindow
57+
&& mBatchSizeTable == other.mBatchSizeTable;
58+
}
59+
5260
std::vector<std::pair<SizeType32, SizeType32>> const DynamicBatchConfig::kDefaultBatchSizeTable{
5361
{144, 128},
5462
{336, 256},

cpp/tensorrt_llm/executor/schedulerConfig.cpp

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
* SPDX-License-Identifier: Apache-2.0
44
*
55
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -21,17 +21,20 @@ namespace tensorrt_llm::executor
2121
{
2222

2323
SchedulerConfig::SchedulerConfig(CapacitySchedulerPolicy capacitySchedulerPolicy,
24-
std::optional<ContextChunkingPolicy> contextChunkingPolicy, std::optional<DynamicBatchConfig> dynamicBatchConfig)
24+
std::optional<ContextChunkingPolicy> contextChunkingPolicy, std::optional<DynamicBatchConfig> dynamicBatchConfig,
25+
bool enablePrefixAwareScheduling)
2526
: mCapacitySchedulerPolicy(capacitySchedulerPolicy)
2627
, mContextChunkingPolicy(std::move(contextChunkingPolicy))
2728
, mDynamicBatchConfig(std::move(dynamicBatchConfig))
29+
, mEnablePrefixAwareScheduling(enablePrefixAwareScheduling)
2830
{
2931
}
3032

3133
bool SchedulerConfig::operator==(SchedulerConfig const& other) const
3234
{
3335
return mCapacitySchedulerPolicy == other.mCapacitySchedulerPolicy
34-
&& mContextChunkingPolicy == other.mContextChunkingPolicy;
36+
&& mContextChunkingPolicy == other.mContextChunkingPolicy && mDynamicBatchConfig == other.mDynamicBatchConfig
37+
&& mEnablePrefixAwareScheduling == other.mEnablePrefixAwareScheduling;
3538
}
3639

3740
[[nodiscard]] CapacitySchedulerPolicy SchedulerConfig::getCapacitySchedulerPolicy() const
@@ -49,4 +52,9 @@ bool SchedulerConfig::operator==(SchedulerConfig const& other) const
4952
return mDynamicBatchConfig;
5053
}
5154

55+
[[nodiscard]] bool SchedulerConfig::getEnablePrefixAwareScheduling() const
56+
{
57+
return mEnablePrefixAwareScheduling;
58+
}
59+
5260
} // namespace tensorrt_llm::executor

0 commit comments

Comments
 (0)