NVIDIA
diff --git a/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h b/cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h
Lines changed: 13 additions & 5 deletions
diff --git a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h
Lines changed: 6 additions & 1 deletion
diff --git a/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp b/cpp/tensorrt_llm/batch_manager/capacityScheduler.cpp
Lines changed: 41 additions & 18 deletions
diff --git a/cpp/tensorrt_llm/batch_manager/trtEncoderModel.cpp b/cpp/tensorrt_llm/batch_manager/trtEncoderModel.cpp
Lines changed: 6 additions & 3 deletions
diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
Lines changed: 4 additions & 1 deletion
diff --git a/cpp/tensorrt_llm/executor/schedulerConfig.cpp b/cpp/tensorrt_llm/executor/schedulerConfig.cpp
Lines changed: 11 additions & 3 deletions
diff --git a/cpp/tensorrt_llm/executor/serialization.cpp b/cpp/tensorrt_llm/executor/serialization.cpp
Lines changed: 5 additions & 1 deletion
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp
Lines changed: 5 additions & 3 deletions
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -97,7 +97,8 @@ class MaxUtilizationScheduler : public BaseCapacityScheduler
 public:
     MaxUtilizationScheduler(SizeType32 maxNumRequests, bool twoStepsLookAhead,
         LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
-        LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
+        LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE,
+        bool enablePrefixAwareScheduling = true);
 
     [[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(
         kv_cache_manager::BaseKVCacheManager& kvCacheManager,
@@ -108,6 +109,8 @@ class MaxUtilizationScheduler : public BaseCapacityScheduler
     SizeType32 mMaxNumRequests;
     /// @brief Boolean that indicates if two step lookahead is enabled
     bool mTwoStepsLookAhead;
+    /// @brief Whether to use KV prefix-reuse estimates in scheduling decisions.
+    bool mEnablePrefixAwareScheduling;
 };
 
 /// @brief Schedule requests using the GUARANTEED_NO_EVICT policy
@@ -120,7 +123,8 @@ class GuaranteedNoEvictScheduler : public BaseCapacityScheduler
 public:
     GuaranteedNoEvictScheduler(SizeType32 maxNumRequests,
         LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
-        LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
+        LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE,
+        bool enablePrefixAwareScheduling = true);
 
     [[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(
         kv_cache_manager::BaseKVCacheManager const& kvCacheManager,
@@ -136,6 +140,8 @@ class GuaranteedNoEvictScheduler : public BaseCapacityScheduler
 
 private:
     SizeType32 mMaxNumRequests;
+    /// @brief Whether to use KV prefix-reuse estimates in scheduling decisions.
+    bool mEnablePrefixAwareScheduling;
 };
 
 /// @brief Schedule requests using the STATIC_BATCH policy
@@ -144,7 +150,8 @@ class StaticBatchScheduler : public GuaranteedNoEvictScheduler
 public:
     StaticBatchScheduler(SizeType32 maxNumRequests,
         LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
-        LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
+        LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE,
+        bool enablePrefixAwareScheduling = true);
 
     [[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(
         kv_cache_manager::BaseKVCacheManager const& kvCacheManager,
@@ -160,7 +167,8 @@ class CapacityScheduler : public Algorithm
     explicit CapacityScheduler(SizeType32 maxNumRequests, executor::CapacitySchedulerPolicy capacitySchedulerPolicy,
         bool hasKvCacheManager, bool twoStepsLookAhead = false,
         LlmRequestState noScheduleUntilState = LlmRequestState::kCONTEXT_INIT,
-        LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
+        LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE,
+        bool enablePrefixAwareScheduling = true);
 
     /**
      * @brief Schedules requests following the selected policy.
 
@@ -1019,7 +1019,7 @@ class SchedulerConfig
     explicit SchedulerConfig(
         CapacitySchedulerPolicy capacitySchedulerPolicy = CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT,
         std::optional<ContextChunkingPolicy> contextChunkingPolicy = std::nullopt,
-        std::optional<DynamicBatchConfig> dynamicBatchConfig = std::nullopt);
+        std::optional<DynamicBatchConfig> dynamicBatchConfig = std::nullopt, bool enablePrefixAwareScheduling = true);
 
     bool operator==(SchedulerConfig const& other) const;
 
@@ -1029,6 +1029,8 @@ class SchedulerConfig
 
     [[nodiscard]] std::optional<DynamicBatchConfig> getDynamicBatchConfig() const;
 
+    [[nodiscard]] bool getEnablePrefixAwareScheduling() const;
+
 private:
     friend class Serialization;
 
@@ -1040,6 +1042,9 @@ class SchedulerConfig
 
     /// @brief The config for tuning batch size dynamically. See DynamicBatchSizeConfig.
     std::optional<DynamicBatchConfig> mDynamicBatchConfig;
+
+    /// @brief Whether schedulers use KV prefix-reuse estimates for admission and token-budget decisions.
+    bool mEnablePrefixAwareScheduling;
 };
 
 /// @brief Configuration class for the KV cache
 
@@ -138,23 +138,26 @@ MaxRequestsScheduler::MaxRequestsScheduler(
 }
 
 MaxUtilizationScheduler::MaxUtilizationScheduler(SizeType32 maxNumRequests, bool twoStepsLookAhead,
-    LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
+    LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)
     : BaseCapacityScheduler(noScheduleUntilState, noScheduleAfterState)
     , mMaxNumRequests(maxNumRequests)
     , mTwoStepsLookAhead{twoStepsLookAhead}
+    , mEnablePrefixAwareScheduling{enablePrefixAwareScheduling}
 {
 }
 
-GuaranteedNoEvictScheduler::GuaranteedNoEvictScheduler(
-    SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
+GuaranteedNoEvictScheduler::GuaranteedNoEvictScheduler(SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState,
+    LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)
     : BaseCapacityScheduler(noScheduleUntilState, noScheduleAfterState)
     , mMaxNumRequests(maxNumRequests)
+    , mEnablePrefixAwareScheduling{enablePrefixAwareScheduling}
 {
 }
 
-StaticBatchScheduler::StaticBatchScheduler(
-    SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
-    : GuaranteedNoEvictScheduler(maxNumRequests, noScheduleUntilState, noScheduleAfterState)
+StaticBatchScheduler::StaticBatchScheduler(SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState,
+    LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)
+    : GuaranteedNoEvictScheduler(
+        maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling)
 {
 }
 
@@ -213,7 +216,7 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(
         = peftCacheManager ? peftCacheManager->getMaxDevicePages() : std::numeric_limits<SizeType32>::max();
 
     // The optimization of delaying requests won't work for variable window attention
-    bool skippingIsRelevant = (!kvCacheManager.getBlockManager().isVariableWindow())
+    bool skippingIsRelevant = mEnablePrefixAwareScheduling && (!kvCacheManager.getBlockManager().isVariableWindow())
         && (!crossKvCacheManager || !crossKvCacheManager->getBlockManager().isVariableWindow());
 
     // Keep track of blocks contributed by requests in context phase
@@ -315,12 +318,21 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(
                 {
                     // analyzePrefixReuse asserts on variable-window managers; skip the walk there
                     // and let downstream callers fall back to their fresh tree-walk path.
-                    if (kvCacheManager.isEnableBlockReuse() && !kvCacheManager.getBlockManager().isVariableWindow())
+                    if (!mEnablePrefixAwareScheduling)
+                    {
+                        summary = kv_cache_manager::PrefixReuseSummary{};
+                        if (crossKvCacheManager)
+                        {
+                            crossSummary = kv_cache_manager::PrefixReuseSummary{};
+                        }
+                    }
+                    else if (kvCacheManager.isEnableBlockReuse()
+                        && !kvCacheManager.getBlockManager().isVariableWindow())
                     {
                         auto uniqueTokens = req->getUniqueTokens(0);
                         summary = kvCacheManager.analyzePrefixReuse(uniqueTokens, *req);
                     }
-                    if (crossKvCacheManager && crossKvCacheManager->isEnableBlockReuse()
+                    if (mEnablePrefixAwareScheduling && crossKvCacheManager && crossKvCacheManager->isEnableBlockReuse()
                         && !crossKvCacheManager->getBlockManager().isVariableWindow())
                     {
                         auto uniqueTokens = *(req->getEncoderUniqueTokens().value());
@@ -427,7 +439,7 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(
     }
 
     // The optimization of delaying requests won't work for variable window attention
-    bool skippingIsRelevant = !kvCacheManager.getBlockManager().isVariableWindow();
+    bool skippingIsRelevant = mEnablePrefixAwareScheduling && !kvCacheManager.getBlockManager().isVariableWindow();
 
     // Keep track of number of requests and block needed for the scheduled requests
     auto scheduledBlocksManager
@@ -444,8 +456,13 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(
     std::unordered_set<uint64_t> seenTaskIds;
 
     // Keep track of blocks contributed by requests in context phase
-    auto [newlyContributedContextBlocks, newlyContributedCrossContextBlocks]
-        = prefillWithChunkedContextsAlreadyExecuting(activeRequests, kvCacheManager);
+    std::unordered_set<BlockKey, BlockKeyHasher> newlyContributedContextBlocks;
+    std::unordered_set<BlockKey, BlockKeyHasher> newlyContributedCrossContextBlocks;
+    if (skippingIsRelevant)
+    {
+        std::tie(newlyContributedContextBlocks, newlyContributedCrossContextBlocks)
+            = prefillWithChunkedContextsAlreadyExecuting(activeRequests, kvCacheManager);
+    }
 
     // Find last active in case we need to evict.  Encoder-init requests are
     // intentionally excluded here: they hold no started self- or cross-pool
@@ -483,7 +500,11 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(
         std::optional<kv_cache_manager::PrefixReuseSummary> summary;
         // analyzePrefixReuse asserts on variable-window managers; skip the walk there
         // and let downstream callers fall back to their fresh tree-walk path.
-        if (isFirstChunkContext && kvCacheManager.isEnableBlockReuse()
+        if (isFirstChunkContext && !mEnablePrefixAwareScheduling)
+        {
+            summary = kv_cache_manager::PrefixReuseSummary{};
+        }
+        else if (isFirstChunkContext && kvCacheManager.isEnableBlockReuse()
             && !kvCacheManager.getBlockManager().isVariableWindow())
         {
             auto uniqueTokens = req->getUniqueTokens(0);
@@ -613,24 +634,26 @@ bool trySchedulingRequestMaxUtilization(std::shared_ptr<LlmRequest> const& req,
 
 CapacityScheduler::CapacityScheduler(SizeType32 maxNumRequests,
     executor::CapacitySchedulerPolicy capacitySchedulerPolicy, bool hasKvCacheManager, bool twoStepsLookAhead,
-    LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)
+    LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)
 {
     if (!hasKvCacheManager)
     {
         mScheduler = MaxRequestsScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};
     }
     else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kMAX_UTILIZATION)
     {
-        mScheduler
-            = MaxUtilizationScheduler{maxNumRequests, twoStepsLookAhead, noScheduleUntilState, noScheduleAfterState};
+        mScheduler = MaxUtilizationScheduler{
+            maxNumRequests, twoStepsLookAhead, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling};
     }
     else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT)
     {
-        mScheduler = GuaranteedNoEvictScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};
+        mScheduler = GuaranteedNoEvictScheduler{
+            maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling};
     }
     else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kSTATIC_BATCH)
     {
-        mScheduler = StaticBatchScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};
+        mScheduler = StaticBatchScheduler{
+            maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling};
     }
     else
     {
 
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -76,8 +76,11 @@ TrtEncoderModel::TrtEncoderModel(runtime::ModelConfig const& modelConfig, WorldC
     // handling of maximizing utilization or pause/evict
     // TODO: finer control on encoder requests scheduling
     mCapacityScheduler = std::make_unique<tensorrt_llm::batch_manager::CapacityScheduler>(
-        getMaxBatchSize() * mNumMicroBatches, executorConfig.getSchedulerConfig().getCapacitySchedulerPolicy(), false,
-        false, LlmRequestState::kENCODER_INIT, LlmRequestState::kCONTEXT_INIT);
+        getMaxBatchSize() * mNumMicroBatches, executorConfig.getSchedulerConfig().getCapacitySchedulerPolicy(),
+        /*hasKvCacheManager=*/false, /*twoStepsLookAhead=*/false,
+        /*noScheduleUntilState=*/LlmRequestState::kENCODER_INIT,
+        /*noScheduleAfterState=*/LlmRequestState::kCONTEXT_INIT,
+        /*enablePrefixAwareScheduling=*/executorConfig.getSchedulerConfig().getEnablePrefixAwareScheduling());
 
     mMicroBatchScheduler = std::make_unique<tensorrt_llm::batch_manager::MicroBatchScheduler>(
         std::nullopt, mModelConfig.getMaxInputLen(), LlmRequestState::kENCODER_INIT, LlmRequestState::kCONTEXT_INIT);
 
@@ -442,7 +442,10 @@ TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr<nvinfer
 
     mCapacityScheduler = std::make_unique<CapacityScheduler>(getMaxNumSequences(),
         executorConfig.getSchedulerConfig().getCapacitySchedulerPolicy(), mKvCacheManager != nullptr,
-        mWorldConfig.isPipelineParallel());
+        /*twoStepsLookAhead=*/mWorldConfig.isPipelineParallel(),
+        /*noScheduleUntilState=*/LlmRequestState::kCONTEXT_INIT,
+        /*noScheduleAfterState=*/LlmRequestState::kGENERATION_COMPLETE,
+        /*enablePrefixAwareScheduling=*/executorConfig.getSchedulerConfig().getEnablePrefixAwareScheduling());
 
     mMicroBatchScheduler = std::make_unique<MicroBatchScheduler>(ctxChunkConfig, maxContextLength);
 
 
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -21,17 +21,20 @@ namespace tensorrt_llm::executor
 {
 
 SchedulerConfig::SchedulerConfig(CapacitySchedulerPolicy capacitySchedulerPolicy,
-    std::optional<ContextChunkingPolicy> contextChunkingPolicy, std::optional<DynamicBatchConfig> dynamicBatchConfig)
+    std::optional<ContextChunkingPolicy> contextChunkingPolicy, std::optional<DynamicBatchConfig> dynamicBatchConfig,
+    bool enablePrefixAwareScheduling)
     : mCapacitySchedulerPolicy(capacitySchedulerPolicy)
     , mContextChunkingPolicy(std::move(contextChunkingPolicy))
     , mDynamicBatchConfig(std::move(dynamicBatchConfig))
+    , mEnablePrefixAwareScheduling(enablePrefixAwareScheduling)
 {
 }
 
 bool SchedulerConfig::operator==(SchedulerConfig const& other) const
 {
     return mCapacitySchedulerPolicy == other.mCapacitySchedulerPolicy
-        && mContextChunkingPolicy == other.mContextChunkingPolicy;
+        && mContextChunkingPolicy == other.mContextChunkingPolicy
+        && mEnablePrefixAwareScheduling == other.mEnablePrefixAwareScheduling;
 }
 
 [[nodiscard]] CapacitySchedulerPolicy SchedulerConfig::getCapacitySchedulerPolicy() const
@@ -49,4 +52,9 @@ bool SchedulerConfig::operator==(SchedulerConfig const& other) const
     return mDynamicBatchConfig;
 }
 
+[[nodiscard]] bool SchedulerConfig::getEnablePrefixAwareScheduling() const
+{
+    return mEnablePrefixAwareScheduling;
+}
+
 } // namespace tensorrt_llm::executor
@@ -1427,14 +1427,17 @@ SchedulerConfig Serialization::deserializeSchedulerConfig(std::istream& is)
     auto capacitySchedulerPolicy = su::deserialize<CapacitySchedulerPolicy>(is);
     auto contextChunkingPolicy = su::deserialize<std::optional<ContextChunkingPolicy>>(is);
     auto dynamicBatchConfig = su::deserialize<std::optional<DynamicBatchConfig>>(is);
-    return SchedulerConfig{capacitySchedulerPolicy, contextChunkingPolicy, dynamicBatchConfig};
+    auto enablePrefixAwareScheduling = su::deserialize<bool>(is);
+    return SchedulerConfig{
+        capacitySchedulerPolicy, contextChunkingPolicy, dynamicBatchConfig, enablePrefixAwareScheduling};
 }
 
 void Serialization::serialize(SchedulerConfig const& schedulerConfig, std::ostream& os)
 {
     su::serialize(schedulerConfig.getCapacitySchedulerPolicy(), os);
     su::serialize(schedulerConfig.getContextChunkingPolicy(), os);
     su::serialize(schedulerConfig.getDynamicBatchConfig(), os);
+    su::serialize(schedulerConfig.getEnablePrefixAwareScheduling(), os);
 }
 
 size_t Serialization::serializedSize(SchedulerConfig const& schedulerConfig)
@@ -1443,6 +1446,7 @@ size_t Serialization::serializedSize(SchedulerConfig const& schedulerConfig)
     totalSize += su::serializedSize(schedulerConfig.getCapacitySchedulerPolicy());
     totalSize += su::serializedSize(schedulerConfig.getContextChunkingPolicy());
     totalSize += su::serializedSize(schedulerConfig.getDynamicBatchConfig());
+    totalSize += su::serializedSize(schedulerConfig.getEnablePrefixAwareScheduling());
     return totalSize;
 }
 
 
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -85,10 +85,12 @@ void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_
         .def("__setstate__", agentTreeConfigSetstate);
 
     nb::class_<CapacityScheduler>(m, CapacityScheduler::name)
-        .def(nb::init<SizeType32, executor::CapacitySchedulerPolicy, bool, bool, LlmRequestState, LlmRequestState>(),
+        .def(nb::init<SizeType32, executor::CapacitySchedulerPolicy, bool, bool, LlmRequestState, LlmRequestState,
+                 bool>(),
             nb::arg("max_num_requests"), nb::arg("capacity_scheduler_policy"), nb::arg("has_kv_cache_manager"),
             nb::arg("two_step_lookahead") = false, nb::arg("no_schedule_until_state") = LlmRequestState::kCONTEXT_INIT,
-            nb::arg("no_schedule_after_state") = LlmRequestState::kGENERATION_COMPLETE)
+            nb::arg("no_schedule_after_state") = LlmRequestState::kGENERATION_COMPLETE,
+            nb::arg("enable_prefix_aware_scheduling") = true)
         .def("__call__", &CapacityScheduler::operator(), nb::arg("active_requests"),
             nb::arg("kv_cache_manager") = nullptr, nb::arg("peft_cache_manager") = nullptr,
             nb::arg("cross_kv_cache_manager") = nullptr)
Original file line number	Diff line number	Diff line change
`@@ -138,23 +138,26 @@ MaxRequestsScheduler::MaxRequestsScheduler(`
`138`	`138`	`}`
`139`	`139`
`140`	`140`	`MaxUtilizationScheduler::MaxUtilizationScheduler(SizeType32 maxNumRequests, bool twoStepsLookAhead,`
`141`		`- LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)`
	`141`	`+ LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)`
`142`	`142`	`: BaseCapacityScheduler(noScheduleUntilState, noScheduleAfterState)`
`143`	`143`	`, mMaxNumRequests(maxNumRequests)`
`144`	`144`	`, mTwoStepsLookAhead{twoStepsLookAhead}`
	`145`	`+ , mEnablePrefixAwareScheduling{enablePrefixAwareScheduling}`
`145`	`146`	`{`
`146`	`147`	`}`
`147`	`148`
`148`		`-GuaranteedNoEvictScheduler::GuaranteedNoEvictScheduler(`
`149`		`- SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)`
	`149`	`+GuaranteedNoEvictScheduler::GuaranteedNoEvictScheduler(SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState,`
	`150`	`+ LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)`
`150`	`151`	`: BaseCapacityScheduler(noScheduleUntilState, noScheduleAfterState)`
`151`	`152`	`, mMaxNumRequests(maxNumRequests)`
	`153`	`+ , mEnablePrefixAwareScheduling{enablePrefixAwareScheduling}`
`152`	`154`	`{`
`153`	`155`	`}`
`154`	`156`
`155`		`-StaticBatchScheduler::StaticBatchScheduler(`
`156`		`- SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)`
`157`		`- : GuaranteedNoEvictScheduler(maxNumRequests, noScheduleUntilState, noScheduleAfterState)`
	`157`	`+StaticBatchScheduler::StaticBatchScheduler(SizeType32 maxNumRequests, LlmRequestState noScheduleUntilState,`
	`158`	`+ LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)`
	`159`	`+ : GuaranteedNoEvictScheduler(`
	`160`	`+ maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling)`
`158`	`161`	`{`
`159`	`162`	`}`
`160`	`163`
`@@ -213,7 +216,7 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(`
`213`	`216`	`= peftCacheManager ? peftCacheManager->getMaxDevicePages() : std::numeric_limits<SizeType32>::max();`
`214`	`217`
`215`	`218`	`// The optimization of delaying requests won't work for variable window attention`
`216`		`- bool skippingIsRelevant = (!kvCacheManager.getBlockManager().isVariableWindow())`
	`219`	`+ bool skippingIsRelevant = mEnablePrefixAwareScheduling && (!kvCacheManager.getBlockManager().isVariableWindow())`
`217`	`220`	`&& (!crossKvCacheManager \|\| !crossKvCacheManager->getBlockManager().isVariableWindow());`
`218`	`221`
`219`	`222`	`// Keep track of blocks contributed by requests in context phase`
`@@ -315,12 +318,21 @@ std::tuple<RequestVector, RequestVector> GuaranteedNoEvictScheduler::impl(`
`315`	`318`	`{`
`316`	`319`	`// analyzePrefixReuse asserts on variable-window managers; skip the walk there`
`317`	`320`	`// and let downstream callers fall back to their fresh tree-walk path.`
`318`		`- if (kvCacheManager.isEnableBlockReuse() && !kvCacheManager.getBlockManager().isVariableWindow())`
	`321`	`+ if (!mEnablePrefixAwareScheduling)`
	`322`	`+ {`
	`323`	`+ summary = kv_cache_manager::PrefixReuseSummary{};`
	`324`	`+ if (crossKvCacheManager)`
	`325`	`+ {`
	`326`	`+ crossSummary = kv_cache_manager::PrefixReuseSummary{};`
	`327`	`+ }`
	`328`	`+ }`
	`329`	`+ else if (kvCacheManager.isEnableBlockReuse()`
	`330`	`+ && !kvCacheManager.getBlockManager().isVariableWindow())`
`319`	`331`	`{`
`320`	`332`	`auto uniqueTokens = req->getUniqueTokens(0);`
`321`	`333`	`summary = kvCacheManager.analyzePrefixReuse(uniqueTokens, *req);`
`322`	`334`	`}`
`323`		`- if (crossKvCacheManager && crossKvCacheManager->isEnableBlockReuse()`
	`335`	`+ if (mEnablePrefixAwareScheduling && crossKvCacheManager && crossKvCacheManager->isEnableBlockReuse()`
`324`	`336`	`&& !crossKvCacheManager->getBlockManager().isVariableWindow())`
`325`	`337`	`{`
`326`	`338`	`auto uniqueTokens = *(req->getEncoderUniqueTokens().value());`
`@@ -427,7 +439,7 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(`
`427`	`439`	`}`
`428`	`440`
`429`	`441`	`// The optimization of delaying requests won't work for variable window attention`
`430`		`- bool skippingIsRelevant = !kvCacheManager.getBlockManager().isVariableWindow();`
	`442`	`+ bool skippingIsRelevant = mEnablePrefixAwareScheduling && !kvCacheManager.getBlockManager().isVariableWindow();`
`431`	`443`
`432`	`444`	`// Keep track of number of requests and block needed for the scheduled requests`
`433`	`445`	`auto scheduledBlocksManager`
`@@ -444,8 +456,13 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(`
`444`	`456`	`std::unordered_set<uint64_t> seenTaskIds;`
`445`	`457`
`446`	`458`	`// Keep track of blocks contributed by requests in context phase`
`447`		`- auto [newlyContributedContextBlocks, newlyContributedCrossContextBlocks]`
`448`		`- = prefillWithChunkedContextsAlreadyExecuting(activeRequests, kvCacheManager);`
	`459`	`+ std::unordered_set<BlockKey, BlockKeyHasher> newlyContributedContextBlocks;`
	`460`	`+ std::unordered_set<BlockKey, BlockKeyHasher> newlyContributedCrossContextBlocks;`
	`461`	`+ if (skippingIsRelevant)`
	`462`	`+ {`
	`463`	`+ std::tie(newlyContributedContextBlocks, newlyContributedCrossContextBlocks)`
	`464`	`+ = prefillWithChunkedContextsAlreadyExecuting(activeRequests, kvCacheManager);`
	`465`	`+ }`
`449`	`466`
`450`	`467`	`// Find last active in case we need to evict. Encoder-init requests are`
`451`	`468`	`// intentionally excluded here: they hold no started self- or cross-pool`
`@@ -483,7 +500,11 @@ std::tuple<RequestVector, RequestVector> MaxUtilizationScheduler::operator()(`
`483`	`500`	`std::optional<kv_cache_manager::PrefixReuseSummary> summary;`
`484`	`501`	`// analyzePrefixReuse asserts on variable-window managers; skip the walk there`
`485`	`502`	`// and let downstream callers fall back to their fresh tree-walk path.`
`486`		`- if (isFirstChunkContext && kvCacheManager.isEnableBlockReuse()`
	`503`	`+ if (isFirstChunkContext && !mEnablePrefixAwareScheduling)`
	`504`	`+ {`
	`505`	`+ summary = kv_cache_manager::PrefixReuseSummary{};`
	`506`	`+ }`
	`507`	`+ else if (isFirstChunkContext && kvCacheManager.isEnableBlockReuse()`
`487`	`508`	`&& !kvCacheManager.getBlockManager().isVariableWindow())`
`488`	`509`	`{`
`489`	`510`	`auto uniqueTokens = req->getUniqueTokens(0);`
`@@ -613,24 +634,26 @@ bool trySchedulingRequestMaxUtilization(std::shared_ptr<LlmRequest> const& req,`
`613`	`634`
`614`	`635`	`CapacityScheduler::CapacityScheduler(SizeType32 maxNumRequests,`
`615`	`636`	`executor::CapacitySchedulerPolicy capacitySchedulerPolicy, bool hasKvCacheManager, bool twoStepsLookAhead,`
`616`		`- LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState)`
	`637`	`+ LlmRequestState noScheduleUntilState, LlmRequestState noScheduleAfterState, bool enablePrefixAwareScheduling)`
`617`	`638`	`{`
`618`	`639`	`if (!hasKvCacheManager)`
`619`	`640`	`{`
`620`	`641`	`mScheduler = MaxRequestsScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};`
`621`	`642`	`}`
`622`	`643`	`else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kMAX_UTILIZATION)`
`623`	`644`	`{`
`624`		`- mScheduler`
`625`		`- = MaxUtilizationScheduler{maxNumRequests, twoStepsLookAhead, noScheduleUntilState, noScheduleAfterState};`
	`645`	`+ mScheduler = MaxUtilizationScheduler{`
	`646`	`+ maxNumRequests, twoStepsLookAhead, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling};`
`626`	`647`	`}`
`627`	`648`	`else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT)`
`628`	`649`	`{`
`629`		`- mScheduler = GuaranteedNoEvictScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};`
	`650`	`+ mScheduler = GuaranteedNoEvictScheduler{`
	`651`	`+ maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling};`
`630`	`652`	`}`
`631`	`653`	`else if (capacitySchedulerPolicy == executor::CapacitySchedulerPolicy::kSTATIC_BATCH)`
`632`	`654`	`{`
`633`		`- mScheduler = StaticBatchScheduler{maxNumRequests, noScheduleUntilState, noScheduleAfterState};`
	`655`	`+ mScheduler = StaticBatchScheduler{`
	`656`	`+ maxNumRequests, noScheduleUntilState, noScheduleAfterState, enablePrefixAwareScheduling};`
`634`	`657`	`}`
`635`	`658`	`else`
`636`	`659`	`{`
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`/*`
`2`		`- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.`
	`2`	`+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.`
`3`	`3`	`* SPDX-License-Identifier: Apache-2.0`
`4`	`4`	`*`
`5`	`5`	`* Licensed under the Apache License, Version 2.0 (the "License");`
`@@ -21,17 +21,20 @@ namespace tensorrt_llm::executor`
`21`	`21`	`{`
`22`	`22`
`23`	`23`	`SchedulerConfig::SchedulerConfig(CapacitySchedulerPolicy capacitySchedulerPolicy,`
`24`		`- std::optional<ContextChunkingPolicy> contextChunkingPolicy, std::optional<DynamicBatchConfig> dynamicBatchConfig)`
	`24`	`+ std::optional<ContextChunkingPolicy> contextChunkingPolicy, std::optional<DynamicBatchConfig> dynamicBatchConfig,`
	`25`	`+ bool enablePrefixAwareScheduling)`
`25`	`26`	`: mCapacitySchedulerPolicy(capacitySchedulerPolicy)`
`26`	`27`	`, mContextChunkingPolicy(std::move(contextChunkingPolicy))`
`27`	`28`	`, mDynamicBatchConfig(std::move(dynamicBatchConfig))`
	`29`	`+ , mEnablePrefixAwareScheduling(enablePrefixAwareScheduling)`
`28`	`30`	`{`
`29`	`31`	`}`
`30`	`32`
`31`	`33`	`bool SchedulerConfig::operator==(SchedulerConfig const& other) const`
`32`	`34`	`{`
`33`	`35`	`return mCapacitySchedulerPolicy == other.mCapacitySchedulerPolicy`
`34`		`- && mContextChunkingPolicy == other.mContextChunkingPolicy;`
	`36`	`+ && mContextChunkingPolicy == other.mContextChunkingPolicy`
	`37`	`+ && mEnablePrefixAwareScheduling == other.mEnablePrefixAwareScheduling;`
`35`	`38`	`}`
`36`	`39`
`37`	`40`	`[[nodiscard]] CapacitySchedulerPolicy SchedulerConfig::getCapacitySchedulerPolicy() const`
`@@ -49,4 +52,9 @@ bool SchedulerConfig::operator==(SchedulerConfig const& other) const`
`49`	`52`	`return mDynamicBatchConfig;`
`50`	`53`	`}`
`51`	`54`
	`55`	`+[[nodiscard]] bool SchedulerConfig::getEnablePrefixAwareScheduling() const`
	`56`	`+{`
	`57`	`+ return mEnablePrefixAwareScheduling;`
	`58`	`+}`
	`59`	`+`
`52`	`60`	`} // namespace tensorrt_llm::executor`
Original file line number	Diff line number	Diff line change
`@@ -1427,14 +1427,17 @@ SchedulerConfig Serialization::deserializeSchedulerConfig(std::istream& is)`
`1427`	`1427`	`auto capacitySchedulerPolicy = su::deserialize<CapacitySchedulerPolicy>(is);`
`1428`	`1428`	`auto contextChunkingPolicy = su::deserialize<std::optional<ContextChunkingPolicy>>(is);`
`1429`	`1429`	`auto dynamicBatchConfig = su::deserialize<std::optional<DynamicBatchConfig>>(is);`
`1430`		`- return SchedulerConfig{capacitySchedulerPolicy, contextChunkingPolicy, dynamicBatchConfig};`
	`1430`	`+ auto enablePrefixAwareScheduling = su::deserialize<bool>(is);`
	`1431`	`+ return SchedulerConfig{`
	`1432`	`+ capacitySchedulerPolicy, contextChunkingPolicy, dynamicBatchConfig, enablePrefixAwareScheduling};`
`1431`	`1433`	`}`
`1432`	`1434`
`1433`	`1435`	`void Serialization::serialize(SchedulerConfig const& schedulerConfig, std::ostream& os)`
`1434`	`1436`	`{`
`1435`	`1437`	`su::serialize(schedulerConfig.getCapacitySchedulerPolicy(), os);`
`1436`	`1438`	`su::serialize(schedulerConfig.getContextChunkingPolicy(), os);`
`1437`	`1439`	`su::serialize(schedulerConfig.getDynamicBatchConfig(), os);`
	`1440`	`+ su::serialize(schedulerConfig.getEnablePrefixAwareScheduling(), os);`
`1438`	`1441`	`}`
`1439`	`1442`
`1440`	`1443`	`size_t Serialization::serializedSize(SchedulerConfig const& schedulerConfig)`
`@@ -1443,6 +1446,7 @@ size_t Serialization::serializedSize(SchedulerConfig const& schedulerConfig)`
`1443`	`1446`	`totalSize += su::serializedSize(schedulerConfig.getCapacitySchedulerPolicy());`
`1444`	`1447`	`totalSize += su::serializedSize(schedulerConfig.getContextChunkingPolicy());`
`1445`	`1448`	`totalSize += su::serializedSize(schedulerConfig.getDynamicBatchConfig());`
	`1449`	`+ totalSize += su::serializedSize(schedulerConfig.getEnablePrefixAwareScheduling());`
`1446`	`1450`	`return totalSize;`
`1447`	`1451`	`}`
`1448`	`1452`