[None][fix] Stabilize Mamba replay state update (#14841)

sunnyqgg · web-flow · commit 2dd5c67358ae · 2026-06-12T09:04:37.000+08:00
Signed-off-by: qgai <qgai@nvidia.com>
diff --git a/tensorrt_llm/_torch/modules/mamba/causal_conv1d_triton.py b/tensorrt_llm/_torch/modules/mamba/causal_conv1d_triton.py
@@ -158,7 +158,7 @@ def _causal_conv1d_fwd_kernel(  # continuous batching
                 conv_states_ptrs = prior_tokens - 3 * stride_conv_state_tok  # [BLOCK_N]
                 col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
         else:
-            # prior-tokens are zeros
+            # No cached prefix: start the convolution window from zeros.
             if KERNEL_WIDTH >= 2:  # STRATEGY1
                 # first chunk and does not have prior-token, so just set to 0
                 col0 = tl.zeros((BLOCK_N,), dtype=x_ptr.dtype.element_ty)
diff --git a/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py b/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py
@@ -348,8 +348,6 @@ def forward(
             has_initial_states = mamba_metadata.has_initial_states[:
                                                                    num_prefills]
 
-            has_initial_states_p = has_initial_states[:num_prefills]
-            conv_states[state_indices_p[~has_initial_states_p]].zero_()
             # Fused kernel to avoid expensive .contiguous() call in causal_conv1d_fn.
             xbc_p_t = extract_transpose_xbc_prefill(zxbcdt, num_prefill_tokens,
                                                     self.tp_d_inner,
@@ -376,6 +374,7 @@ def forward(
 
             initial_states = None
             if mamba_metadata.use_initial_states:
+                # Rows without cached prefix state start SSM from zero.
                 initial_states = torch.where(
                     has_initial_states[:, None, None, None],
                     ssm_states[state_indices_p], 0)
diff --git a/tensorrt_llm/_torch/modules/mamba/replay_selective_state_update.py b/tensorrt_llm/_torch/modules/mamba/replay_selective_state_update.py
@@ -226,7 +226,7 @@ def _replay_precompute_kernel(
         other=0.0,
     )
 
-    # Compute raw CB once — shared across all heads in this block
+    # Compute raw CB once, shared across all heads in this block.
     raw_CB = tl.dot(C_all.to(tl.bfloat16), tl.trans(B_all).to(tl.bfloat16))
 
     # Store B to cache (once per group, only if this block covers the first heads)
@@ -458,17 +458,18 @@ def _replay_state_update_kernel(
     # two back).  coeff is all-zero (offs_t < 0), total_decay is 1.0, so the
     # replay leaves `state` unchanged — cache contents don't matter on step 0.
     coeff = tl.exp(total_dA_cumsum - old_dA_cumsum_all) * old_dt_all
-    coeff = tl.where(offs_t < prev_num_accepted_tokens, coeff, 0.0)
+    accepted_mask = t_mask & (offs_t < prev_num_accepted_tokens)
+    coeff = tl.where(accepted_mask, coeff, 0.0)
 
-    # Load old_x: (BLOCK_SIZE_T, BLOCK_SIZE_M) — single-buffered
+    # Zero stale rows beyond PNAT to prevent Inf/NaN from reaching tl.dot.
     old_x_base = old_x_ptr + cache_batch_idx * stride_old_x_cache + pid_h * stride_old_x_head
     old_x_all = tl.load(
         old_x_base + offs_t[:, None] * stride_old_x_T + offs_m[None, :] * stride_old_x_dim,
-        mask=t_mask[:, None] & m_mask[None, :],
+        mask=accepted_mask[:, None] & m_mask[None, :],
         other=0.0,
     )
 
-    # Load old_B from READ buffer: (BLOCK_SIZE_T, BLOCK_SIZE_DSTATE)
+    # Apply the same accepted-row mask to old_B.
     old_B_base = (
         old_B_ptr
         + cache_batch_idx * stride_old_B_cache
@@ -477,7 +478,7 @@ def _replay_state_update_kernel(
     )
     old_B_all = tl.load(
         old_B_base + offs_t[:, None] * stride_old_B_T + offs_n[None, :] * stride_old_B_dstate,
-        mask=t_mask[:, None] & n_mask[None, :],
+        mask=accepted_mask[:, None] & n_mask[None, :],
         other=0.0,
     ).to(tl.float32)
 
@@ -488,7 +489,7 @@ def _replay_state_update_kernel(
     total_decay = tl.where(prev_num_accepted_tokens > 0, tl.exp(total_dA_cumsum), 1.0)
     state *= total_decay
 
-    # tl.dot fast-forward: old_x^T @ dB_scaled → (M, dstate)
+    # tl.dot fast-forward: old_x^T @ dB_scaled -> (M, dstate)
     state += tl.dot(tl.trans(old_x_all).to(tl.bfloat16), dB_scaled.to(tl.bfloat16))
 
     # Write post-replay state
@@ -771,7 +772,7 @@ def replay_selective_state_update(
     device = x.device
     BLOCK_SIZE_T = max(triton.next_power_of_2(T), 16)
 
-    # Allocate precomputed intermediates (per-call, not cached)
+    # Allocate precomputed intermediates (per-call, not cached).
     cb_scaled = torch.empty(
         batch, nheads, BLOCK_SIZE_T, BLOCK_SIZE_T, device=device, dtype=torch.float32
     )