[#15565][fix] AutoDeploy: Fix Super MTP IMA introduced by checkpointing replay (#15622)

galagam · web-flow · commit 2e33221d6dcb · 2026-06-26T07:07:30.000+03:00
Signed-off-by: Gal Hubara Agam <96368689+galagam@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
@@ -993,7 +993,6 @@ def _prepare_inputs(
             _ungathered_new_lens=new_tokens_lens,
             **extra_args,
         )
-        self.cache_seq_interface.prepare_replay_metadata()
 
         self.iter_states["num_ctx_requests"] = num_prefill
         self.iter_states["num_ctx_tokens"] = num_prefill_tokens
diff --git a/tensorrt_llm/_torch/auto_deploy/shim/interface.py b/tensorrt_llm/_torch/auto_deploy/shim/interface.py
@@ -904,7 +904,12 @@ def _create_and_assign_state_views(
                     self._caches[buf_name] = global_tensor
 
             if replay_work_items:
-                self._replay_work_items = torch.empty(
+                # Zero-init as an extra precaution (not torch.empty). The
+                # prepare_replay_metadata host-prepare hook (registered in
+                # initialize_resources) populates this buffer on every
+                # nest_sequences -- runtime and cudagraph capture alike -- so the
+                # replay SSM kernel never reads it unprepared.
+                self._replay_work_items = torch.zeros(
                     self.info.max_num_state_slots,
                     REPLAY_WORK_ITEM_WIDTH,
                     device=self.info.device,
@@ -1417,6 +1422,15 @@ def initialize_resources(self) -> int:
             f"max_tokens={s['max_tokens']}"
         )
 
+        if self.info.batch_info.is_use_replay():
+            # Wrapper takes **kwargs to satisfy the host-prepare callable protocol;
+            # prepare_replay_metadata reads everything it needs from self, so no
+            # graph-input args are requested (empty arg list).
+            def _replay_metadata_hook(**_sequence_info_args) -> None:
+                self.prepare_replay_metadata()
+
+            self.info.register_host_prepare_for_attention_forward(_replay_metadata_hook, [])
+
         return len(self._caches)
 
     def _requires_token_estimate(self) -> bool:
diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -825,10 +825,7 @@ def test_mtp(self, world_size, attn_backend, model_id):
 
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
-            # bf16 acceptance is stable; fp8/nvfp4 have higher variance due to
-            # arithmetic rounding, so use a lower threshold for quantized models.
-            min_rate = 0.50 if model_id == "bf16" else 0.40
-            self.check_acceptance_rate(llm, min_acceptance_rate=min_rate)
+            self.check_acceptance_rate(llm, min_acceptance_rate=0.50)
 
         print_memory_usage("after evaluation")
 
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -408,5 +408,6 @@ l0_dgx_b200:
   - accuracy/test_llm_api_autodeploy.py::TestModelRegistryAccuracy::test_autodeploy_from_registry[deepseek-ai_DeepSeek-R1-0528-True]
   - accuracy/test_llm_api_autodeploy.py::TestQwen3_5_397B_MoE::test_nvfp4[8]
   - accuracy/test_llm_api_autodeploy.py::TestNemotronUltraV3::test_accuracy[nvfp4-8]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_mtp[nvfp4_ws8_80gb-trtllm]
   # ------------- AutoDeploy Perf Sanity ---------------
   - perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_ad_blackwell-r1_fp8_ad_ws8_1k1k] TIMEOUT (120)
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
@@ -6,7 +6,6 @@ accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xg
 accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[bf16-4-attn_dp_off-trtllm] SKIP (https://nvbugs/6367792)
 accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[fp8-4-attn_dp_off-trtllm] SKIP (https://nvbugs/6367792)
 accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[nvfp4-4-attn_dp_off-trtllm] SKIP (https://nvbugs/6367792)
-accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_mtp[fp8_ws4_80gb-trtllm] SKIP (https://nvbugs/6336682)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] SKIP (https://nvbugs/6281818)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] SKIP (https://nvbugs/6281818)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[latency] SKIP (https://nvbugs/6276981)
@@ -157,20 +156,15 @@ full:B300/accuracy/test_llm_api_pytorch.py::TestKimiK25::test_nvfp4[ep8] SKIP (h
 full:B300/disaggregated/test_disaggregated.py::test_disaggregated_ctxpp2_genpp2[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/6322073)
 full:B300/unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "TRTLLM" SKIP (https://nvbugs/6165866)
 full:DGX_B200/unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "TRTLLM" SKIP (https://nvbugs/6165866)
-full:DGX_H100/accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_mtp[fp8_ws4_80gb-trtllm] SKIP (https://nvbugs/6336682)
 full:GB200/accuracy/test_dwdp_disaggregated_serving.py::TestDwdpDeepSeekV3Lite::test_dwdp_accuracy SKIP (https://nvbugs/6276923)
 full:GB200/accuracy/test_dwdp_disaggregated_serving.py::TestDwdpDeepSeekV3Lite::test_dwdp_accuracy_contention_opt SKIP (https://nvbugs/6276923)
 full:GB200/accuracy/test_dwdp_disaggregated_serving.py::TestDwdpDeepSeekV3Lite::test_dwdp_accuracy_mode_b_overlap SKIP (https://nvbugs/6276923)
-full:GB200/accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_mtp[fp8_ws4_80gb-trtllm] SKIP (https://nvbugs/6316981)
 full:GB200/accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8_moe_dflash SKIP (https://nvbugs/6316985)
 full:GB200/accuracy/test_llm_api_pytorch.py::TestQwen3_5_4B::test_dflash SKIP (https://nvbugs/6344883)
 full:GB200/accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype SKIP (https://nvbugs/6316983)
 full:GB200/disaggregated/test_ad_disagg.py::test_async_eagle3_full_model_handoff SKIP (https://nvbugs/6369254)
 full:GB300/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-False-False-False] SKIP (https://nvbugs/6316984)
 full:GB300/accuracy/test_llm_api_autodeploy.py::TestNemotronNanoV3::test_accuracy[nvfp4-1-attn_dp_off-trtllm] SKIP (https://nvbugs/6329165)
-full:GB300/accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_mtp[bf16_ws4_180gb-trtllm] SKIP (https://nvbugs/6316981)
-full:GB300/accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_mtp[fp8_ws4_80gb-trtllm] SKIP (https://nvbugs/6316981)
-full:GB300/accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_mtp[nvfp4_ws4_80gb-trtllm] SKIP (https://nvbugs/6316981)
 full:GB300/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-ep4-trtllm-fp8] SKIP (https://nvbugs/6316980)
 full:GB300/accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache_no_reuse-tp4-trtllm-fp8] SKIP (https://nvbugs/6316980)
 full:GB300/accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8_moe_dflash SKIP (https://nvbugs/6316985)
diff --git a/tests/unittest/auto_deploy/singlegpu/custom_ops/mamba/test_flashinfer_mamba_cached_op.py b/tests/unittest/auto_deploy/singlegpu/custom_ops/mamba/test_flashinfer_mamba_cached_op.py
@@ -19,7 +19,22 @@
 from test_triton_mamba_cached_op import _random_params
 
 import tensorrt_llm._torch.auto_deploy  # noqa: F401
-from tensorrt_llm._torch.auto_deploy.custom_ops.attention_interface import BatchInfo
+from tensorrt_llm._torch.auto_deploy._compat import KvCacheConfig
+from tensorrt_llm._torch.auto_deploy.custom_ops.attention_interface import (
+    BatchInfo,
+    CausalConvResourceHandler,
+    IntermediateConvStateHandler,
+    ReplayCacheBufIdxHandler,
+    ReplayNWritesHandler,
+    ReplayOldBHandler,
+    ReplayOldDAcumsumHandler,
+    ReplayOldDtHandler,
+    ReplayOldXHandler,
+    ReplayPrevNumAcceptedHandler,
+    ReplayWorkItemsHandler,
+    SSMResourceHandler,
+)
+from tensorrt_llm._torch.auto_deploy.shim.interface import CachedSequenceInterface
 from tensorrt_llm._torch.modules.mamba.mamba2_metadata import (
     REPLAY_WORK_CACHE_BUF_IDX,
     REPLAY_WORK_CACHE_SLOT,
@@ -265,3 +280,191 @@ def test_flashinfer_extend_replay_calls_replay_kernel(mamba_env, head_dim):
     )
     assert out.shape == hidden_states.shape
     assert torch.isfinite(out).all()
+
+
+class _SpecDecModeForReplayTest:
+    def use_one_engine(self):
+        return True
+
+
+class _SpecConfigForReplayTest:
+    def __init__(self, max_draft_len: int):
+        self.max_draft_len = max_draft_len
+        self.tokens_per_gen_step = max_draft_len + 1
+        self.spec_dec_mode = _SpecDecModeForReplayTest()
+
+
+def _build_interface_with_replay_buffers(num_heads, head_dim, d_state, n_groups, max_batch_size):
+    """Allocate replay buffers through the real production path (CachedSequenceInterface).
+
+    Registers the Mamba + replay-buffer resource bundle for one layer and runs
+    initialize_resources(), which is where the cache-manager-bound replay
+    work-items buffer (interface._replay_work_items -- the tensor the replay SSM
+    kernel actually reads) is allocated.
+    """
+    conv_dim = head_dim * num_heads + 2 * n_groups * d_state
+    interface = CachedSequenceInterface(
+        max_seq_len=128,
+        max_batch_size=max_batch_size,
+        max_num_tokens=(128 + 1) * max_batch_size,
+        device="cuda",
+        kv_cache_config=KvCacheConfig(
+            tokens_per_block=32, max_tokens=1024, free_gpu_memory_fraction=0.0
+        ),
+        spec_config=_SpecConfigForReplayTest(max_draft_len=2),
+    )
+    interface.add_resource(
+        "ssm_state_0",
+        SSMResourceHandler(
+            num_heads=num_heads, head_dim=head_dim, d_state=d_state, dtype=torch.bfloat16
+        ),
+    )
+    interface.add_resource(
+        "conv_state_0", CausalConvResourceHandler(conv_dim=conv_dim, d_conv=4, dtype=torch.float32)
+    )
+    interface.add_resource(
+        "intermediate_conv_state_0",
+        IntermediateConvStateHandler(conv_dim=conv_dim, d_conv=4, dtype=torch.float32),
+    )
+    interface.add_resource(
+        "replay_old_x_0",
+        ReplayOldXHandler(num_heads=num_heads, head_dim=head_dim, dtype=torch.bfloat16),
+    )
+    interface.add_resource(
+        "replay_old_B_0",
+        ReplayOldBHandler(n_groups=n_groups, d_state=d_state, dtype=torch.bfloat16),
+    )
+    interface.add_resource("replay_old_dt_0", ReplayOldDtHandler(num_heads=num_heads))
+    interface.add_resource("replay_old_dA_cumsum_0", ReplayOldDAcumsumHandler(num_heads=num_heads))
+    interface.add_resource("replay_cache_buf_idx_0", ReplayCacheBufIdxHandler())
+    interface.add_resource("replay_prev_num_accepted_0", ReplayPrevNumAcceptedHandler())
+    interface.add_resource("replay_work_items_0", ReplayWorkItemsHandler())
+    interface.add_resource("replay_n_writes_0", ReplayNWritesHandler())
+    return interface
+
+
+def test_extend_replay_init_buffers(mamba_env):
+    """The replay path must not cause an out-of-bounds access on the replay buffers.
+
+    Behavioral guard for the replay path: every buffer the prepare hook populates (the
+    work-items buffer and the n-writes count) is filled with garbage (out-of-bounds
+    values, simulating uninitialized memory), then the production metadata-prep path runs
+    and the real replay op executes; the test asserts no CUDA fault. With the fix, prep
+    populates the buffers before the kernel reads them, so the garbage never reaches the
+    kernel; without it the out-of-bounds values survive and fault.
+
+    Filling the buffers directly keeps the poison confined to them and makes the failure
+    deterministic: fresh CUDA memory is often benign, so a poison-free run cannot
+    reliably reproduce the bug.
+    """
+    device = mamba_env["device"]
+    dtype = mamba_env["dtype"]
+
+    # Production SuperV3 Mamba2 shape (AutoDeploy replicates mamba -> full heads/groups),
+    # large enough that the replay kernel runs its persistent_main path, which reads the
+    # cache slot from the replay work-items buffer.
+    num_extend = 8
+    tokens_per_extend = 7  # num_nextn_predict_layers (6) + 1
+    num_heads = 128
+    head_dim = 64
+    n_groups, ssm_state_size = 8, 128
+
+    interface = _build_interface_with_replay_buffers(
+        num_heads, head_dim, ssm_state_size, n_groups, max_batch_size=num_extend
+    )
+    interface.initialize_resources()
+
+    # Poison every buffer the prepare hook populates -- the work-items buffer and the
+    # n-writes count -- with out-of-bounds values, simulating garbage / uninitialized
+    # memory. The production metadata-prep below must overwrite them before the kernel
+    # reads them; if prep is missing (the bug) the poison survives and faults.
+    interface._replay_work_items.fill_(0x7FFFFFFF)  # int32-max: out-of-bounds cache slot
+    interface._replay_n_writes.fill_(0x7FFFFFFF)  # int32-max: out-of-bounds write count
+
+    # Drive the production metadata-prep path -- the same one cudagraph capture uses --
+    # so the replay work-items / n-writes buffers are populated exactly as in real runs
+    # (set_capture_batch -> nest_sequences -> prepare_replay_metadata host-prepare hook).
+    interface.info.set_capture_batch(max_draft_len=tokens_per_extend - 1, batch_size=num_extend)
+    replay_work_items = interface._replay_work_items
+    replay_n_writes = interface._replay_n_writes
+
+    # Per-token inputs and the remaining replay caches for the same extend batch.
+    (hidden_states, A, B, C, D, dt, dt_bias, time_step_limit, chunk_size) = _random_params(
+        device, dtype, num_extend, tokens_per_extend, num_heads, head_dim, n_groups, ssm_state_size
+    )
+    ssm_state_cache = torch.zeros(
+        num_extend, num_heads, head_dim, ssm_state_size, device=device, dtype=dtype
+    )
+    slot_idx = torch.arange(num_extend, device=device, dtype=torch.int32)
+
+    replay_history_size = 16
+    replay_old_x = torch.zeros(
+        num_extend, 2, replay_history_size, num_heads, head_dim, device=device, dtype=torch.bfloat16
+    )
+    replay_old_b = torch.zeros(
+        num_extend,
+        2,
+        replay_history_size,
+        n_groups,
+        ssm_state_size,
+        device=device,
+        dtype=torch.bfloat16,
+    )
+    replay_old_dt = torch.zeros(
+        num_extend, 2, num_heads, replay_history_size, device=device, dtype=torch.float32
+    )
+    replay_old_da_cumsum = torch.zeros(
+        num_extend, 2, num_heads, replay_history_size, device=device, dtype=torch.float32
+    )
+    replay_cache_buf_idx = torch.zeros(num_extend, device=device, dtype=torch.int32)
+    replay_prev_num_accepted = torch.zeros(num_extend, device=device, dtype=torch.int32)
+
+    _bi = BatchInfo()
+    _bi.update([0, 0, num_extend, num_extend * tokens_per_extend, 0, 0])
+    _bi.update_use_replay(True)
+    batch_info_host = _bi.serialize()
+    cu_seqlen = torch.arange(
+        0, (num_extend + 1) * tokens_per_extend, tokens_per_extend, device=device, dtype=torch.int32
+    )
+    use_initial_states = torch.zeros(num_extend, device=device, dtype=torch.bool)
+    any_prefill_use_initial_states_host = torch.tensor([False], device=device, dtype=torch.bool)
+
+    out = torch.ops.auto_deploy.flashinfer_cached_ssm(
+        hidden_states,
+        A,
+        B,
+        C,
+        D,
+        dt,
+        dt_bias,
+        # STANDARD METADATA
+        batch_info_host,
+        cu_seqlen,
+        slot_idx,
+        use_initial_states,
+        any_prefill_use_initial_states_host,
+        # EXTRA METADATA
+        None,
+        None,
+        None,  # chunk_indices, chunk_offsets, seq_idx_prefill
+        # CACHES
+        ssm_state_cache,
+        None,  # intermediate_ssm_state_cache (None in replay mode)
+        replay_old_x,
+        replay_old_b,
+        replay_old_dt,
+        replay_old_da_cumsum,
+        replay_cache_buf_idx,
+        replay_prev_num_accepted,
+        replay_work_items,
+        replay_n_writes,
+        # CONSTANTS
+        time_step_limit,
+        chunk_size,
+    )
+
+    # Synchronize so any out-of-bounds access on the replay buffers surfaces here as a
+    # CUDA error rather than asynchronously later.
+    torch.cuda.synchronize()
+    assert out.shape == hidden_states.shape
+    assert torch.isfinite(out).all()

Original file line number	Diff line number	Diff line change
`@@ -993,7 +993,6 @@ def _prepare_inputs(`
`993`	`993`	`_ungathered_new_lens=new_tokens_lens,`
`994`	`994`	`**extra_args,`
`995`	`995`	`)`
`996`		`- self.cache_seq_interface.prepare_replay_metadata()`
`997`	`996`
`998`	`997`	`self.iter_states["num_ctx_requests"] = num_prefill`
`999`	`998`	`self.iter_states["num_ctx_tokens"] = num_prefill_tokens`