Skip to content

Commit dc3a468

Browse files
committed
Optimize qwen3.5 prefill kernels
- update chunked Gated Delta Rule prefill to use indexed in-kernel state updates
- remove explicit Qwen3Next prefill state gather/scatter in forward_extend
- retune causalConv1d forward launch selection for varlen and short sequences

Signed-off-by: Anthony Chang <27950904+rosenrodt@users.noreply.github.com>
1 parent 9a2c39d commit dc3a468

4 files changed

Lines changed: 91 additions & 19 deletions

File tree

cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.cu

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* and https://github.com/Dao-AILab/causal-conv1d/blob/main/csrc/causal_conv1d_update.cu
55
* Copyright (c) 2024, Tri Dao.
66
*
7-
* Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
7+
* Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.
88
*
99
* Licensed under the Apache License, Version 2.0 (the "License");
1010
* you may not use this file except in compliance with the License.
@@ -349,20 +349,45 @@ void causal_conv1d_fwd_launch(ConvParamsBase& params, cudaStream_t stream)
349349
});
350350
}
351351

352+
template <int kWidth, typename input_t, typename weight_t>
353+
void causal_conv1d_fwd_dispatch(ConvParamsBase& params, cudaStream_t stream)
354+
{
355+
bool const isVarlen = params.query_start_loc_ptr != nullptr;
356+
constexpr int kNarrowThreads = 64;
357+
constexpr int kWideThreads = 128;
358+
constexpr int kNElts = sizeof(input_t) == 4 ? 4 : 8;
359+
constexpr int kShortSeqThreshold = kNarrowThreads * kNElts;
360+
// Varlen prefill launches one block per sequence/channel pair, so the per-sequence
361+
// work is usually much smaller than params.seqlen suggests. That path also disables
362+
// the wide vector-load specialization, so the 128-thread kernel tends to overprovision
363+
// threads for many short chunks. Prefer the narrower launch for varlen and for short
364+
// fixed-length inputs; keep the wider launch for long dense sequences.
365+
bool const preferNarrowKernel = isVarlen || params.seqlen <= kShortSeqThreshold;
366+
367+
if (preferNarrowKernel)
368+
{
369+
causal_conv1d_fwd_launch<kNarrowThreads, kWidth, input_t, weight_t>(params, stream);
370+
}
371+
else
372+
{
373+
causal_conv1d_fwd_launch<kWideThreads, kWidth, input_t, weight_t>(params, stream);
374+
}
375+
}
376+
352377
template <typename input_t, typename weight_t>
353378
void causal_conv1d_fwd_cuda(ConvParamsBase& params, cudaStream_t stream)
354379
{
355380
if (params.width == 2)
356381
{
357-
causal_conv1d_fwd_launch<128, 2, input_t, weight_t>(params, stream);
382+
causal_conv1d_fwd_dispatch<2, input_t, weight_t>(params, stream);
358383
}
359384
else if (params.width == 3)
360385
{
361-
causal_conv1d_fwd_launch<128, 3, input_t, weight_t>(params, stream);
386+
causal_conv1d_fwd_dispatch<3, input_t, weight_t>(params, stream);
362387
}
363388
else if (params.width == 4)
364389
{
365-
causal_conv1d_fwd_launch<128, 4, input_t, weight_t>(params, stream);
390+
causal_conv1d_fwd_dispatch<4, input_t, weight_t>(params, stream);
366391
}
367392
}
368393

tensorrt_llm/_torch/models/modeling_qwen3_next.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -747,23 +747,22 @@ def forward_extend(
747747
g = g.unsqueeze(0)
748748
beta = beta.unsqueeze(0)
749749

750-
recurrent_state = ssm_states[cache_indices]
751-
752-
core_attn_out, last_recurrent_state = chunk_gated_delta_rule(
750+
core_attn_out, _ = chunk_gated_delta_rule(
753751
q=query,
754752
k=key,
755753
v=value,
756754
g=g,
757755
beta=beta,
758-
initial_state=recurrent_state,
759-
output_final_state=True,
756+
initial_state=ssm_states,
757+
initial_state_indices=cache_indices,
758+
# This path writes recurrent state directly back into the shared
759+
# pool; callers **must** ensure cache_indices do not alias live slots.
760+
inplace_indexed_state_update=True,
761+
output_final_state=False,
760762
cu_seqlens=query_start_loc_long,
761763
head_first=False,
762764
use_qk_l2norm_in_kernel=True,
763765
)
764-
last_recurrent_state = last_recurrent_state.to(ssm_states.dtype,
765-
copy=False)
766-
ssm_states[cache_indices] = last_recurrent_state
767766

768767
return core_attn_out
769768

tensorrt_llm/_torch/modules/fla/chunk.py

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ def chunk_gated_delta_rule_fwd(
2929
beta: torch.Tensor,
3030
scale: float,
3131
initial_state: torch.Tensor,
32+
initial_state_indices: Optional[torch.Tensor],
33+
inplace_indexed_state_update: bool,
3234
output_final_state: bool,
3335
cu_seqlens: Optional[torch.LongTensor] = None,
3436
):
@@ -54,7 +56,9 @@ def chunk_gated_delta_rule_fwd(
5456
u=u,
5557
g=g,
5658
initial_state=initial_state,
59+
initial_state_indices=initial_state_indices,
5760
output_final_state=output_final_state,
61+
inplace_indexed_state_update=inplace_indexed_state_update,
5862
cu_seqlens=cu_seqlens,
5963
)
6064
o = chunk_fwd_o(
@@ -86,6 +90,8 @@ def forward(
8690
beta: torch.Tensor,
8791
scale: float,
8892
initial_state: torch.Tensor,
93+
initial_state_indices: Optional[torch.Tensor],
94+
inplace_indexed_state_update: bool,
8995
output_final_state: bool,
9096
cu_seqlens: Optional[torch.LongTensor] = None,
9197
use_qk_l2norm_in_kernel: bool = False,
@@ -102,6 +108,8 @@ def forward(
102108
beta=beta,
103109
scale=scale,
104110
initial_state=initial_state,
111+
initial_state_indices=initial_state_indices,
112+
inplace_indexed_state_update=inplace_indexed_state_update,
105113
output_final_state=output_final_state,
106114
cu_seqlens=cu_seqlens,
107115
)
@@ -117,6 +125,8 @@ def chunk_gated_delta_rule(
117125
beta: torch.Tensor,
118126
scale: float = None,
119127
initial_state: torch.Tensor = None,
128+
initial_state_indices: Optional[torch.Tensor] = None,
129+
inplace_indexed_state_update: bool = False,
120130
output_final_state: bool = False,
121131
cu_seqlens: Optional[torch.LongTensor] = None,
122132
head_first: bool = False,
@@ -141,6 +151,13 @@ def chunk_gated_delta_rule(
141151
Initial state of shape `[N, H, K, V]` for `N` input sequences.
142152
For equal-length input sequences, `N` equals the batch size `B`.
143153
Default: `None`.
154+
initial_state_indices (Optional[torch.Tensor]):
155+
Optional state-pool indices of shape `[N]` selecting the slots to
156+
read from `initial_state`.
157+
inplace_indexed_state_update (Optional[bool]):
158+
Explicit opt-in for writing indexed final states back into
159+
`initial_state` in-place. Callers are responsible for ensuring the
160+
selected slots are safe to update without aliasing races.
144161
output_final_state (Optional[bool]):
145162
Whether to output the final state of shape `[N, H, K, V]`. Default: `False`.
146163
cu_seqlens (torch.LongTensor):
@@ -211,12 +228,18 @@ def chunk_gated_delta_rule(
211228
raise ValueError(
212229
f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
213230
f"Please flatten variable-length inputs before processing.")
214-
if initial_state is not None and initial_state.shape[0] != len(
215-
cu_seqlens) - 1:
231+
num_sequences = len(cu_seqlens) - 1
232+
if initial_state_indices is not None:
233+
if initial_state_indices.shape[0] != num_sequences:
234+
raise ValueError(
235+
f"The number of initial-state indices is expected to be equal to the number of input "
236+
f"sequences, i.e., {num_sequences} rather than {initial_state_indices.shape[0]}."
237+
)
238+
elif initial_state is not None and initial_state.shape[
239+
0] != num_sequences:
216240
raise ValueError(
217241
f"The number of initial states is expected to be equal to the number of input sequences, "
218-
f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}."
219-
)
242+
f"i.e., {num_sequences} rather than {initial_state.shape[0]}.")
220243
if scale is None:
221244
scale = k.shape[-1]**-0.5
222245
o, final_state = ChunkGatedDeltaRuleFunction.apply(
@@ -227,6 +250,8 @@ def chunk_gated_delta_rule(
227250
beta,
228251
scale,
229252
initial_state,
253+
initial_state_indices,
254+
inplace_indexed_state_update,
230255
output_final_state,
231256
cu_seqlens,
232257
use_qk_l2norm_in_kernel,

tensorrt_llm/_torch/modules/fla/chunk_delta_h.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
@triton.heuristics({
2020
"USE_G": lambda args: args["g"] is not None,
2121
"USE_INITIAL_STATE": lambda args: args["h0"] is not None,
22+
"USE_INDEXED_STATE": lambda args: args["h0_i"] is not None,
2223
"STORE_FINAL_STATE": lambda args: args["ht"] is not None,
2324
"SAVE_NEW_VALUE": lambda args: args["v_new"] is not None,
2425
"IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
@@ -42,6 +43,7 @@ def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
4243
g,
4344
h,
4445
h0,
46+
h0_i,
4547
ht,
4648
cu_seqlens,
4749
chunk_offsets,
@@ -54,6 +56,7 @@ def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
5456
BV: tl.constexpr,
5557
USE_G: tl.constexpr,
5658
USE_INITIAL_STATE: tl.constexpr,
59+
USE_INDEXED_STATE: tl.constexpr,
5760
STORE_FINAL_STATE: tl.constexpr,
5861
SAVE_NEW_VALUE: tl.constexpr,
5962
IS_VARLEN: tl.constexpr,
@@ -91,10 +94,16 @@ def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
9194
stride_h = H * K * V
9295
stride_k = Hg * K
9396
stride_w = H * K
97+
if USE_INDEXED_STATE:
98+
state_index = tl.load(h0_i + i_n).to(tl.int64)
99+
h0 = h0 + state_index * stride_h
100+
ht = h0
94101
if USE_INITIAL_STATE:
95-
h0 = h0 + i_nh * K * V
102+
h0 = h0 + ((i_h if USE_INDEXED_STATE else i_nh) * K * V)
96103
if STORE_FINAL_STATE:
97104
ht = ht + i_nh * K * V
105+
elif USE_INDEXED_STATE:
106+
ht = ht + i_h * K * V
98107

99108
# load initial state
100109
if USE_INITIAL_STATE:
@@ -209,7 +218,7 @@ def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
209218
b_h4 += tl.dot(b_k, b_v_new)
210219

211220
# epilogue
212-
if STORE_FINAL_STATE:
221+
if STORE_FINAL_STATE or USE_INDEXED_STATE:
213222
p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (0, i_v * BV), (64, BV),
214223
(1, 0))
215224
tl.store(p_ht, b_h1.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
@@ -239,7 +248,9 @@ def chunk_gated_delta_rule_fwd_h(
239248
u: torch.Tensor,
240249
g: Optional[torch.Tensor] = None,
241250
initial_state: Optional[torch.Tensor] = None,
251+
initial_state_indices: Optional[torch.Tensor] = None,
242252
output_final_state: bool = False,
253+
inplace_indexed_state_update: bool = False,
243254
chunk_size: int = 64, # SY: remove this argument and force chunk size 64?
244255
save_new_value: bool = True,
245256
cu_seqlens: Optional[torch.LongTensor] = None,
@@ -262,8 +273,14 @@ def chunk_gated_delta_rule_fwd_h(
262273
assert K <= 256, "current kernel does not support head dimension larger than 256."
263274

264275
h = k.new_empty(B, NT, H, K, V)
276+
use_indexed_state = initial_state is not None and initial_state_indices is not None
277+
if use_indexed_state and not inplace_indexed_state_update:
278+
raise ValueError(
279+
"Indexed chunk state updates require inplace_indexed_state_update=True."
280+
)
281+
store_final_state_in_kernel = output_final_state and not use_indexed_state
265282
final_state = (k.new_empty(N, H, K, V, dtype=torch.float32)
266-
if output_final_state else None)
283+
if store_final_state_in_kernel else None)
267284

268285
v_new = torch.empty_like(u) if save_new_value else None
269286

@@ -278,6 +295,7 @@ def grid(meta):
278295
g=g,
279296
h=h,
280297
h0=initial_state,
298+
h0_i=initial_state_indices,
281299
ht=final_state,
282300
cu_seqlens=cu_seqlens,
283301
chunk_offsets=chunk_offsets,
@@ -291,4 +309,9 @@ def grid(meta):
291309
num_warps=4,
292310
num_stages=2,
293311
)
312+
if output_final_state and use_indexed_state:
313+
# The indexed kernel path updates h0 in-place, so returning
314+
# the final state means gathering those updated slots back out.
315+
final_state = initial_state.index_select(
316+
0, initial_state_indices.to(torch.long))
294317
return h, v_new, final_state

0 commit comments

Comments (0)