NVIDIA
diff --git a/‎tensorrt_llm/_torch/pyexecutor/_util.py‎
Lines changed: 133 additions & 59 deletions b/‎tensorrt_llm/_torch/pyexecutor/_util.py‎
Lines changed: 133 additions & 59 deletions
@@ -925,21 +925,9 @@ def _create_one_model_draft_kv_cache_manager(
             is_disagg=self._is_disagg,
         )
 
-    def _split_kv_cache_budget_for_draft(self) -> Optional[KvCacheConfig]:
-        """Split KV cache budgets between target and draft KV caches.
-
-        When using KVCacheManagerV2 with a separate draft KV cache,
-        max_gpu_total_bytes and host_cache_size each represent the total
-        budget for both target and draft combined.  This method splits both
-        budgets proportionally based on their per-token KV cache sizes.
-
-        Returns a cloned KvCacheConfig for the draft, or None if no split is
-        needed.  Also modifies self._kv_cache_config in-place for the target.
-        """
-        total_budget = self._kv_cache_config.max_gpu_total_bytes
-        if total_budget is None or total_budget <= 0:
-            return None
-
+    def _get_target_and_draft_cache_costs(
+        self, ) -> Optional[tuple[CacheCost, CacheCost]]:
+        """Per-manager KV cache costs for target and draft layers."""
         total_kv = self._get_kv_size_per_token()
         target_kv = self._per_manager_cache_cost(
             self._kv_cache_manager_cls, self._model_engine.model.model_config)
@@ -949,10 +937,15 @@ def _split_kv_cache_budget_for_draft(self) -> Optional[KvCacheConfig]:
                              intercept=total_kv.intercept - target_kv.intercept)
         if target_kv.slope <= 0 or draft_kv.slope <= 0:
             return None
+        return target_kv, draft_kv
 
-        # Cover both managers' fixed costs first, then split the remaining
-        # budget by per-token slope. With zero intercepts this reduces to the
-        # original proportional split.
+    def _compute_draft_budget_shares(
+        self,
+        total_budget: int,
+        target_kv: CacheCost,
+        draft_kv: CacheCost,
+    ) -> Optional[tuple[int, int]]:
+        """Split *total_budget* into (target_budget, draft_budget) byte shares."""
         intercept_total = target_kv.intercept + draft_kv.intercept
         slope_budget = total_budget - intercept_total
         if slope_budget <= 0:
@@ -965,62 +958,138 @@ def _split_kv_cache_budget_for_draft(self) -> Optional[KvCacheConfig]:
         draft_slope_share = int(slope_budget * draft_kv.slope / slope_total)
         draft_budget = draft_kv.intercept + draft_slope_share
         target_budget = total_budget - draft_budget
+        return target_budget, draft_budget
+
+    def _split_kv_cache_budget_for_draft(
+        self,
+        budget_attr: str,
+        draft_kv_cache_config: Optional[KvCacheConfig] = None,
+    ) -> Optional[KvCacheConfig]:
+        """Split a byte budget (attribute on ``KvCacheConfig``) between target
+        and draft KV caches.
+
+        Splits the value of ``self._kv_cache_config.<budget_attr>`` using the
+        affine target/draft cache costs, updates the target config in-place,
+        and merges the draft share into ``draft_kv_cache_config`` (cloning the
+        target config if needed).
+
+        Returns the (possibly newly created) draft config. The input
+        ``draft_kv_cache_config`` is returned unchanged when the split is not
+        applicable (the budget is not set, or the per-manager cache costs are
+        unavailable) — in those cases sharing ``self._kv_cache_config`` is
+        correct.
+
+        The affine fixed (intercept) cost models GPU-resident state (e.g. mamba
+        SSM state). It is only charged against ``max_gpu_total_bytes``; for any
+        other budget (e.g. ``host_cache_size``, which is host offload memory the
+        GPU-resident state never occupies) the intercept is dropped so the split
+        stays proportional to the per-token cost.
+
+        When the split is *infeasible* (the combined fixed cost meets or exceeds
+        the budget — only possible for ``max_gpu_total_bytes`` after the above)
+        the shortfall is fatal: both managers need their fixed state resident in
+        GPU memory, so the run would OOM. It raises ``ValueError`` rather than
+        silently producing an unusable config. A defensive degrade-to-zero path
+        for non-GPU budgets remains so the draft never silently inherits the full
+        budget and double-allocates it.
+        """
+        total_budget = getattr(self._kv_cache_config, budget_attr) or 0
+        if total_budget <= 0:
+            return draft_kv_cache_config
+
+        cache_costs = self._get_target_and_draft_cache_costs()
+        if cache_costs is None:
+            return draft_kv_cache_config
+        target_kv, draft_kv = cache_costs
+
+        # The fixed (intercept) cost models GPU-resident state such as mamba SSM
+        # state; it does not consume host offload memory. When splitting a
+        # non-GPU budget (e.g. host_cache_size), drop the intercept so the split
+        # stays proportional to the per-token (slope) cost instead of being
+        # spuriously starved by a GPU-only fixed cost.
+        if budget_attr != "max_gpu_total_bytes":
+            target_kv = CacheCost(slope=target_kv.slope)
+            draft_kv = CacheCost(slope=draft_kv.slope)
+
+        shares = self._compute_draft_budget_shares(total_budget, target_kv,
+                                                   draft_kv)
+        if shares is None:
+            # The split is infeasible (combined fixed cost >= total budget).
+            intercept_total = target_kv.intercept + draft_kv.intercept
+            if budget_attr == "max_gpu_total_bytes":
+                # A GPU budget that cannot even fit the combined fixed cost is
+                # fatal: both managers need their fixed state resident in GPU
+                # memory, so the run would OOM. Fail fast with actionable
+                # guidance rather than producing an unusable zero-budget draft.
+                raise ValueError(
+                    f"KV cache GPU budget ({total_budget / GB:.2f} GiB) is "
+                    f"smaller than the combined fixed cost "
+                    f"({intercept_total / GB:.2f} GiB, e.g. mamba SSM state) "
+                    f"for target+draft. Increase free_gpu_memory_fraction or "
+                    f"max_gpu_total_bytes, or reduce max_batch_size (the fixed "
+                    f"cost scales with batch size).")
+            # Defensive: non-GPU budgets zero out the intercept above, so with a
+            # positive budget this branch is currently unreachable for them. It
+            # remains as a safety net guaranteeing that, should a non-GPU budget
+            # ever carry a fixed cost it cannot fit, we degrade gracefully rather
+            # than letting both managers inherit the full budget and
+            # double-allocate it: keep the full budget on the target and zero the
+            # draft's share for this attribute.
+            logger.warning(
+                f"Cannot split KV cache {budget_attr} between target and draft; "
+                f"assigning the draft a zero {budget_attr} budget to avoid "
+                f"double-allocating the full budget.")
+            if draft_kv_cache_config is None:
+                draft_kv_cache_config = self._kv_cache_config.model_copy()
+            setattr(draft_kv_cache_config, budget_attr, 0)
+            return draft_kv_cache_config
+        target_budget, draft_budget = shares
 
         logger.info(
-            f"Splitting KV cache budget: total={total_budget / GB:.2f} GiB, "
+            f"Splitting KV cache {budget_attr}: total={total_budget / GB:.2f} GiB, "
             f"target={target_budget / GB:.2f} GiB ({target_kv}), "
             f"draft={draft_budget / GB:.2f} GiB ({draft_kv})")
 
-        self._kv_cache_config.max_gpu_total_bytes = target_budget
-
-        draft_kv_cache_config = self._kv_cache_config.model_copy()
-        draft_kv_cache_config.max_gpu_total_bytes = draft_budget
-
-        host_budget = self._kv_cache_config.host_cache_size
-        if host_budget is not None and host_budget > 0:
-            draft_ratio = draft_budget / total_budget
-            draft_host_budget = int(host_budget * draft_ratio)
-            target_host_budget = host_budget - draft_host_budget
-            self._kv_cache_config.host_cache_size = target_host_budget
-            draft_kv_cache_config.host_cache_size = draft_host_budget
-            logger.info(
-                f"Splitting KV cache host budget: total={host_budget / GB:.2f} GiB, "
-                f"target={target_host_budget / GB:.2f} GiB, "
-                f"draft={draft_host_budget / GB:.2f} GiB")
-
+        setattr(self._kv_cache_config, budget_attr, target_budget)
+        if draft_kv_cache_config is None:
+            draft_kv_cache_config = self._kv_cache_config.model_copy()
+        setattr(draft_kv_cache_config, budget_attr, draft_budget)
         return draft_kv_cache_config
 
+    def _needs_gpu_kv_cache_budget_split(self) -> bool:
+        """Whether max_gpu_total_bytes must be split per manager."""
+        if issubclass(self._kv_cache_manager_cls, KVCacheManagerV2):
+            return self._should_create_separate_draft_kv_cache()
+        return is_vswa_enabled(self._kv_cache_config)
+
     def build_managers(self,
                        resources: Dict,
                        estimating_kv_cache: bool = False) -> None:
         """Construct KV caches for model and draft model (if applicable)."""
         if self._skip_est:
             self.configure_kv_cache_capacity()
 
-        # For V2 with separate one-model draft KV cache, split the total budget
-        # between target and draft before creating either manager.
-        # Only split for the final managers, not during estimation — estimation
-        # uses max_tokens-based logic and must not have its config mutated.
-        # Two-model draft is excluded: V2 does not support two-model mode.
-        draft_kv_cache_config = None
-        if (not estimating_kv_cache
-                and self._should_create_separate_draft_kv_cache()
-                and issubclass(self._kv_cache_manager_cls, KVCacheManagerV2)):
-            draft_kv_cache_config = self._split_kv_cache_budget_for_draft()
-
-        # Also split for V1 VSWA. The VSWA pool is sized directly from
-        # max_gpu_total_bytes and ignores max_tokens, so without splitting
-        # both target and draft each allocate the full combined budget.
-        # V1 non-VSWA does not need this: max_tokens caps the block count
-        # per model, giving each a proportional share of the budget.
+        # Split combined KV cache budgets before creating managers. Skip during
+        # estimation — estimation uses max_tokens-based logic and must not
+        # mutate the config.
         has_draft = (
             self._draft_model_engine is not None  # two-model
             or self._should_create_separate_draft_kv_cache())  # one-model
-        if (not estimating_kv_cache and has_draft
-                and draft_kv_cache_config is None
-                and not issubclass(self._kv_cache_manager_cls, KVCacheManagerV2)
-                and is_vswa_enabled(self._kv_cache_config)):
-            draft_kv_cache_config = self._split_kv_cache_budget_for_draft()
+        draft_kv_cache_config = None
+        if not estimating_kv_cache and has_draft:
+            # Used when each manager sizes pools from max_gpu_total_bytes (V2
+            # and V1 VSWA). V1 non-VSWA GPU uses shared max_tokens instead.
+            if self._needs_gpu_kv_cache_budget_split():
+                draft_kv_cache_config = self._split_kv_cache_budget_for_draft(
+                    "max_gpu_total_bytes", draft_kv_cache_config)
+            # KVCacheManagerV2 does not support two-model draft budget splitting.
+            v2_two_model = (issubclass(self._kv_cache_manager_cls,
+                                       KVCacheManagerV2)
+                            and self._draft_model_engine is not None)
+            if not v2_two_model:
+                # Each manager sizes its host pool from host_cache_size directly.
+                draft_kv_cache_config = self._split_kv_cache_budget_for_draft(
+                    "host_cache_size", draft_kv_cache_config)
 
         kv_cache_manager = self._create_kv_cache_manager(
             self._model_engine, estimating_kv_cache)
@@ -1037,14 +1106,19 @@ def build_managers(self,
                 assert draft_kv_cache_config is None, (
                     "KVCacheManagerV2 does not support two-model speculative "
                     "decoding with separate draft KV cache budget splitting.")
-            # For V1 VSWA, apply the draft's split budget temporarily
+            # For V1, apply the draft's split budgets temporarily.
             if draft_kv_cache_config is not None:
                 saved_budget = self._kv_cache_config.max_gpu_total_bytes
-                self._kv_cache_config.max_gpu_total_bytes = draft_kv_cache_config.max_gpu_total_bytes
+                saved_host = self._kv_cache_config.host_cache_size
+                self._kv_cache_config.max_gpu_total_bytes = (
+                    draft_kv_cache_config.max_gpu_total_bytes)
+                self._kv_cache_config.host_cache_size = (
+                    draft_kv_cache_config.host_cache_size)
             draft_kv_cache_manager = self._create_kv_cache_manager(
                 self._draft_model_engine, estimating_kv_cache)
             if draft_kv_cache_config is not None:
                 self._kv_cache_config.max_gpu_total_bytes = saved_budget
+                self._kv_cache_config.host_cache_size = saved_host
         # One-model speculative decoding with different KV layouts
         elif self._should_create_separate_draft_kv_cache():
             draft_kv_cache_manager = self._create_one_model_draft_kv_cache_manager(