[None][fix] Draft KV cache should not allocate host memory

shang-pin-tmate · shang-pin-tmate · commit 79148bab6ba6 · 2026-04-03T16:42:15.000-07:00
When using one-model speculative decoding with a separate draft KV cache
(e.g. EAGLE3), the draft cache inherits the target's KvCacheConfig, which
may have a non-zero host_cache_size. This causes unnecessary host memory
allocation for the draft cache. Only the target model should use host
offloading, since draft tokens are transient and may be rejected during
verification.

Fix: set host_cache_size=0 on a copy of the draft KV cache config (via
model_copy) before creating the draft KV cache manager, so the original
config object is not mutated.

Signed-off-by: Shang-Pin Sheng <shang-pin@tmatehq.com>
diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py
@@ -695,6 +695,11 @@ def _create_one_model_draft_kv_cache_manager(
         # falls back to the target model's config for MTP mode.
         sparse_attn_config = effective_draft_config.sparse_attention_config
         draft_kv_config = kv_cache_config_override if kv_cache_config_override is not None else self._kv_cache_config
+        # Draft KV cache should not allocate host memory — only the target
+        # model uses host offloading.  Zero out host_cache_size to prevent
+        # unnecessary host memory allocation for the draft cache.
+        draft_kv_config = draft_kv_config.model_copy(
+            update={'host_cache_size': 0})
         return _create_kv_cache_manager(
             model_engine=None,
             kv_cache_manager_cls=draft_kv_cache_manager_cls,