[None][fix] Fix Mamba cache correctness under MTP + CUDA-graph padding (#13151)

Wanli-Jiang · web-flow · commit 450122e4622d · 2026-04-27T21:32:37.000+08:00
Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/batch_manager/rnnStateManager.cpp b/cpp/tensorrt_llm/batch_manager/rnnStateManager.cpp
@@ -20,8 +20,6 @@
 #include "tensorrt_llm/runtime/cudaStream.h"
 #include "tensorrt_llm/runtime/utils/runtimeUtils.h"
 
-#include <unordered_set>
-
 using namespace tensorrt_llm::runtime;
 
 namespace tensorrt_llm::batch_manager::rnn_state_manager
@@ -258,40 +256,16 @@ std::vector<RnnStateManager::SizeType32> RnnStateManager::getStateIndices(
     std::vector<RequestIdType> const& requestIds, std::vector<bool> const& isPadding)
 {
     TLLM_CHECK_WITH_INFO(requestIds.size() == isPadding.size(), "requestIds and isPadding must have the same size");
-
-    std::unordered_set<SizeType32> availableSlots;
-    availableSlots.reserve(mMaxNumSequences);
-    for (SizeType32 i = 0; i < mMaxNumSequences; ++i)
-    {
-        availableSlots.insert(i);
-    }
-
-    for (size_t i = 0; i < requestIds.size(); ++i)
-    {
-        if (!isPadding[i])
-        {
-            availableSlots.erase(getCacheIndex(requestIds[i]));
-        }
-    }
-
+    // Every id (real or CUDA-graph padding sentinel) has a permanent slot
+    // allocated by allocateCacheBlocks; padding entries all share their
+    // sentinel's slot, so they never alias a live request and never
+    // consume free-pool slots.
     std::vector<SizeType32> result;
     result.reserve(requestIds.size());
-    auto availableIt = availableSlots.begin();
-
-    for (size_t i = 0; i < requestIds.size(); ++i)
+    for (auto const& rid : requestIds)
     {
-        if (isPadding[i])
-        {
-            TLLM_CHECK_WITH_INFO(availableIt != availableSlots.end(), "Run out of available slots for padding");
-            result.push_back(*availableIt);
-            ++availableIt;
-        }
-        else
-        {
-            result.push_back(getCacheIndex(requestIds[i]));
-        }
+        result.push_back(getCacheIndex(rid));
     }
-
     return result;
 }
 
diff --git a/cpp/tensorrt_llm/thop/mamba2MTPSSMCacheOp.cpp b/cpp/tensorrt_llm/thop/mamba2MTPSSMCacheOp.cpp
@@ -55,7 +55,12 @@ void mamba2_mtp_ssm_cache_update(th::Tensor ssm, th::Tensor x, th::Tensor dt, th
     int const head_dim = ssm.size(2);
     int const ssm_dim = ssm.size(3);
 
-    TORCH_CHECK(intermediate_states.dim() == 5 && intermediate_states.size(0) == ssm.size(0)
+    // ssm.size(0) is the Mamba cache capacity — independent of the
+    // current batch (may include parked requests or reserved dummy
+    // slots). intermediate_states is per-step scratch indexed by
+    // intermediate_states_indices in [0, bs), so it only needs to
+    // fit the forward batch.
+    TORCH_CHECK(intermediate_states.dim() == 5 && intermediate_states.size(0) >= bs
             && intermediate_states.size(1) == cache_steps && intermediate_states.size(2) == nheads
             && intermediate_states.size(3) == head_dim && intermediate_states.size(4) == ssm_dim,
         "intermediate_states shape check failed");
diff --git a/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_eagle.py b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_eagle.py
@@ -1025,10 +1025,12 @@ def _forward_with_kv_cache(self, csi: CachedSequenceInterface):
         kv_cache_manager = csi.kv_cache_manager
         if num_extend > 0 and isinstance(kv_cache_manager, MambaHybridCacheManager):
             if kv_cache_manager.is_speculative():
+                state_indices = csi.get_arg("slot_idx", truncate=True)
                 _ctx = SimpleNamespace(num_seqs=num_sequences, num_contexts=num_prefill)
                 kv_cache_manager.update_mamba_states(
                     attn_metadata=_ctx,
                     num_accepted_tokens=new_tokens_lens,
+                    state_indices=state_indices,
                 )
 
         # compute the cache and position offset based on the number of new tokens compared to the
diff --git a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
@@ -305,15 +305,6 @@ def _generate_dummy_request(
     dummy_request = kv_cache_manager.add_dummy_requests([request_id], **request_kwargs)[0]
     dummy_request.is_cuda_graph_dummy = True
 
-    # generate a dummy scheduled requests object
-    dummy_scheduled_requests = ScheduledRequests()
-    dummy_scheduled_requests.generation_requests.append(dummy_request)
-
-    # if it's a hybrid kv-cache manager, we need to manually call prepare_resources again (not done
-    # in add_dummy_requests)
-    if is_hybrid_cache:
-        kv_cache_manager.prepare_resources(dummy_scheduled_requests)
-
     # add to spec resource manager
     if spec_res_mgr:
         spec_res_mgr.add_dummy_requests([request_id])
diff --git a/tensorrt_llm/_torch/pyexecutor/mamba_cache_manager.py b/tensorrt_llm/_torch/pyexecutor/mamba_cache_manager.py
@@ -32,8 +32,7 @@
 from tensorrt_llm._torch.pyexecutor.resource_manager import (
     BaseResourceManager, CacheTypeCpp, DataType, KVCacheManager, get_pp_layers)
 from tensorrt_llm._torch.pyexecutor.scheduler import ScheduledRequests
-from tensorrt_llm._utils import (nvtx_range, prefer_pinned,
-                                 torch_dtype_to_binding)
+from tensorrt_llm._utils import nvtx_range, torch_dtype_to_binding
 from tensorrt_llm.bindings.internal.batch_manager import (
     KvCacheConnectorManager, LinearAttentionMetadata, LinearCacheType)
 from tensorrt_llm.llmapi.llm_args import KvCacheConfig
@@ -191,12 +190,10 @@ def free_resources(self, request: LlmRequest):
         self.mamba_impl.free_cache_block(request.py_request_id)
 
     def add_dummy_requests(self, request_ids: List[int], **kwargs):
-        # For CUDA graph dummy requests, the blocks will be allocated
-        # when get_state_indices is called.
-        from .cuda_graph_runner import CUDA_GRAPH_DUMMY_REQUEST_ID
-        request_ids = [
-            rid for rid in request_ids if rid != CUDA_GRAPH_DUMMY_REQUEST_ID
-        ]
+        # Allocate a permanent slot for every id, including CUDA-graph
+        # padding sentinels (matches PythonMambaCacheManager). Padding
+        # entries in get_state_indices then resolve via mCacheIndex to
+        # the sentinel's reserved slot and never alias a live request.
         if request_ids:
             self.mamba_impl.allocate_cache_blocks(request_ids)
 
@@ -375,12 +372,6 @@ def __init__(
         # mamba cache index, maps request_id -> state indices
         self.mamba_cache_index: Dict[int, int] = {}
 
-        # mamba cache state indices
-        self.state_indices: torch.Tensor = torch.arange(max_batch_size,
-                                                        device=device,
-                                                        dtype=torch.int32)
-        # save mamba state indices for requests
-        self.state_indices_list: List[int] = []
         # save intermediate state indices for requests
         self.intermediate_state_indices = torch.arange(max_batch_size,
                                                        dtype=torch.int32,
@@ -399,23 +390,13 @@ def get_needed_resource_to_completion(self, request: LlmRequest) -> int:
 
     @torch.inference_mode()
     def _prepare_mamba_cache_blocks(self, request_ids: List[int]):
-        self.state_indices_list.clear()
         for r in request_ids:
-            # cache hit
             if r in self.mamba_cache_index:
-                self.state_indices_list.append(self.mamba_cache_index[r])
-            # cache miss
-            else:
-                if len(self.mamba_cache_free_blocks) == 0:
-                    raise RuntimeError("run out of mamba cache blocks")
-                block = self.mamba_cache_free_blocks.pop()
-                self.mamba_cache_index[r] = block
-                self.state_indices_list.append(block)
-        self.state_indices[:len(self.state_indices_list)].copy_(
-            torch.tensor(self.state_indices_list,
-                         dtype=torch.int32,
-                         pin_memory=prefer_pinned()),
-            non_blocking=True)
+                continue
+            if len(self.mamba_cache_free_blocks) == 0:
+                raise RuntimeError("run out of mamba cache blocks")
+            block = self.mamba_cache_free_blocks.pop()
+            self.mamba_cache_index[r] = block
 
     def prepare_resources(self, scheduled_batch: ScheduledRequests):
         context_ids = [
@@ -428,10 +409,16 @@ def prepare_resources(self, scheduled_batch: ScheduledRequests):
         self._prepare_mamba_cache_blocks(request_ids)
 
     def add_dummy_requests(self, request_ids: List[int], **kwargs):
-        from .cuda_graph_runner import CUDA_GRAPH_DUMMY_REQUEST_ID
-        request_ids = [
-            rid for rid in request_ids if rid != CUDA_GRAPH_DUMMY_REQUEST_ID
-        ]
+        # Allocate a permanent slot for every dummy request ID, including
+        # the CUDA-graph padding sentinel. Padding entries in a batch all
+        # reference the same dummy request ID, so they share one slot via
+        # mamba_cache_index lookup in get_state_indices. This mirrors how
+        # MTP's per-draft-len padding dummies already behave (they use
+        # CUDA_GRAPH_DUMMY_REQUEST_ID - draft_len, which was never
+        # filtered here) and keeps padding writes off every live
+        # request's slot, even under the overlap scheduler where a prior
+        # batch's completed requests linger in mamba_cache_index until
+        # _process_previous_batch runs.
         if request_ids:
             for r in request_ids:
                 if r not in self.mamba_cache_index:
@@ -448,29 +435,10 @@ def free_resources(self, request: LlmRequest):
 
     def get_state_indices(self, request_ids: List[int],
                           is_padding: List[bool]) -> List[int]:
-        assert len(request_ids) == len(is_padding), (
-            "request_ids and is_padding must have the same size")
-
-        used_slots = {
-            self.mamba_cache_index[req_id]
-            for req_id, pad in zip(request_ids, is_padding) if not pad
-        }
-        available_slots = iter(
-            sorted(set(range(self.state_indices.numel())) - used_slots))
-
-        def slot_for(req_id: int, pad: bool):
-            if pad:
-                try:
-                    return next(available_slots)
-                except StopIteration:
-                    raise RuntimeError(
-                        "Run out of available slots for padding") from None
-            return self.mamba_cache_index[req_id]
-
-        result = [
-            slot_for(rid, pad) for rid, pad in zip(request_ids, is_padding)
-        ]
-        return result
+        # Padding entries reuse the slot pre-allocated by their dummy
+        # request in add_dummy_requests; see that method for the
+        # overlap-scheduler rationale.
+        return [self.mamba_cache_index[rid] for rid in request_ids]
 
     def get_conv_states(self, layer_idx: int) -> torch.Tensor:
         layer_offset = self.mamba_layer_offsets[layer_idx]
@@ -509,9 +477,6 @@ def get_mamba_ssm_cache_dtype(self) -> torch.dtype:
 
     def shutdown(self):
         """Release tensor memory."""
-        # Clear state indices
-        self.state_indices = torch.tensor([])
-
         # Clear mamba cache states
         if isinstance(self.mamba_cache, self.SpeculativeState):
             self.mamba_cache = self.SpeculativeState(
@@ -530,14 +495,14 @@ def shutdown(self):
 
     @torch.compile(options={"max-autotune": True})
     def update_mamba_states(self, attn_metadata: "AttentionMetadata",
-                            num_accepted_tokens: torch.Tensor):
+                            num_accepted_tokens: torch.Tensor,
+                            state_indices: torch.Tensor):
         batch_size = attn_metadata.num_seqs
         num_contexts = attn_metadata.num_contexts
         num_gens = batch_size - num_contexts
         num_accepted_draft_tokens = num_accepted_tokens[
             num_contexts:num_contexts + num_gens] - 1
-        state_indices_d = self.state_indices[num_contexts:num_contexts +
-                                             num_gens]
+        state_indices_d = state_indices[num_contexts:num_contexts + num_gens]
 
         conv_states = self.mamba_cache.conv
         ssm_states = self.mamba_cache.temporal
@@ -684,9 +649,18 @@ def shutdown(self):
         self._impl.shutdown()
 
     def update_mamba_states(self, attn_metadata: "AttentionMetadata",
-                            num_accepted_tokens: torch.Tensor):
+                            num_accepted_tokens: torch.Tensor,
+                            state_indices: torch.Tensor):
+        # Non-speculative configs don't allocate intermediate state; the
+        # promotion is a clean no-op.
+        if not self._impl.is_speculative():
+            return
+        # Belt-and-suspenders: C++ is non-speculative today so this is
+        # unreachable. Fires if C++ ever grows speculative support
+        # without also implementing the scatter there.
         assert not self._use_cpp, "update_mamba_states is not supported in CppMambaCacheManager"
-        self._impl.update_mamba_states(attn_metadata, num_accepted_tokens)
+        self._impl.update_mamba_states(attn_metadata, num_accepted_tokens,
+                                       state_indices)
 
 
 class MixedMambaHybridCacheManager(KVCacheManager, MambaCacheManager):
@@ -733,7 +707,13 @@ def __init__(
         # mamba hybrid cache requires block reuse to be disabled in KV cache config
         assert not kv_cache_config.enable_block_reuse, "mamba hybrid cache requires block reuse to be disabled in KV cache config"
 
-        # initialize mamba cache manager
+        # Reserve one Mamba slot per possible CUDA-graph padding dummy
+        # (one per runtime_draft_len in 0..max_draft_len) so a full
+        # max_batch_size of real requests still leaves room for padding.
+        max_draft_len = (spec_config.max_draft_len
+                         if spec_config is not None else 0)
+        pool_size = max_batch_size + max_draft_len + 1
+
         MambaCacheManager.__init__(
             self,
             mamba_d_state,
@@ -742,7 +722,7 @@ def __init__(
             mamba_n_groups,
             mamba_head_dim,
             mamba_num_layers,
-            max_batch_size,
+            pool_size,
             max_batch_size,
             mapping,
             mamba_cache_dtype,
@@ -796,11 +776,6 @@ def update_resources(self,
         KVCacheManager.update_resources(self, scheduled_batch, attn_metadata,
                                         kv_cache_dtype_byte_size)
 
-    def update_mamba_states(self, attn_metadata: "AttentionMetadata",
-                            num_accepted_tokens: torch.Tensor):
-        MambaCacheManager.update_mamba_states(self, attn_metadata,
-                                              num_accepted_tokens)
-
 
 def calc_context_stop_positions(prompt_len: int,
                                 tokens_per_block: int,
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -904,6 +904,16 @@ def drafting_loop_wrapper(model):
                 py_executor.kv_cache_transceiver.shutdown()
         finally:
             kv_cache_creator.teardown_managers(resources)
+
+        # Release Phase-1 CUDA graph pools before final KV allocation to avoid overshoot.
+        for eng in [model_engine, draft_model_engine]:
+            if eng is None:
+                continue
+            if eng.attn_metadata is not None:
+                if llm_args.cuda_graph_config is not None:
+                    eng._release_cuda_graphs()
+                eng.attn_metadata = None
+
         del py_executor  # free before constructing new
         gc.collect()
 
@@ -918,13 +928,6 @@ def drafting_loop_wrapper(model):
             max_seq_len = kv_cache_creator._max_seq_len
             update_sampler_max_seq_len(max_seq_len, sampler)
 
-            for eng in [model_engine, draft_model_engine]:
-                if eng is None:
-                    continue
-                if eng.attn_metadata is not None:
-                    if llm_args.cuda_graph_config is not None:
-                        eng._release_cuda_graphs()
-                    eng.attn_metadata = None
         with allocation_scope(ExecutorMemoryType.EXTRA_RESOURCES):
 
             # run gc.collect() to free memory of the previous py_executor, avoid cudaFree overlap with cuda graph capture
diff --git a/tensorrt_llm/_torch/speculative/mtp.py b/tensorrt_llm/_torch/speculative/mtp.py
@@ -1169,7 +1169,8 @@ def forward(
         if num_gens > 0 and self._is_mamba_hybrid_cache:
             attn_metadata.kv_cache_manager.update_mamba_states(
                 attn_metadata=attn_metadata,
-                num_accepted_tokens=num_accepted_tokens)
+                num_accepted_tokens=num_accepted_tokens,
+                state_indices=attn_metadata.mamba_metadata.state_indices)
 
         # Save the old attn_metadata and spec_metadata
         self._prepare_attn_metadata_for_spec_dec(attn_metadata)
diff --git a/tests/unittest/_torch/executor/test_mamba_cache_manager.py b/tests/unittest/_torch/executor/test_mamba_cache_manager.py
diff --git a/tests/unittest/others/test_kv_cache_transceiver.py b/tests/unittest/others/test_kv_cache_transceiver.py