Skip to content

Commit ff207ea

Browse files
committed
Add split-K decode SDPA dispatch to Qwen3.5 MoE attention
Dual-method export (decode T=1, prefill T>=2) lets the model use a simple if/else on T instead of torch.cond, eliminating the GPU-to-CPU sync overhead that torch.cond's predicate evaluation requires. Decode calls sdpa_decode_splitk (split-K flash-decoding for high KV occupancy), prefill calls tiled sdpa. Guard sdpa_decode_splitk validation behind isinstance(L_q, int) so AOTI tracing with symbolic shapes doesn't trip the L_q==1 check. Align sdpa_decode_splitk signature with sdpa (dropout_p, is_causal, enable_gqa) for consistent API; unsupported args fail with clear messages. This PR was authored with the assistance of Claude
1 parent f2bcffd commit ff207ea

2 files changed

Lines changed: 61 additions & 19 deletions

File tree

backends/cuda/triton/kernels/sdpa.py

Lines changed: 47 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1390,39 +1390,67 @@ def sdpa_decode_splitk(
13901390
key: torch.Tensor,
13911391
value: torch.Tensor,
13921392
attn_mask: Optional[torch.Tensor] = None,
1393+
dropout_p: float = 0.0,
1394+
is_causal: bool = False,
13931395
scale: float = 0.0,
1396+
enable_gqa: bool = False,
13941397
) -> torch.Tensor:
1398+
"""Split-K flash-decoding SDPA for L_q=1 (decode step).
1399+
1400+
Signature mirrors sdpa() for drop-in use with torch.cond dispatch.
1401+
enable_gqa is accepted but ignored — GQA is handled natively via
1402+
H_q // H_kv grouping; no packed-GQA tradeoff exists at L_q=1.
1403+
"""
1404+
_validate_sdpa_inputs(query, key, value, dropout_p, enable_gqa)
1405+
13951406
B, H_q, L_q, D = query.shape
13961407
_, H_kv, L_kv, _ = key.shape
13971408

1409+
out = torch.empty((B, H_q, L_q, D), device=query.device, dtype=query.dtype)
1410+
13981411
# is_causal is a no-op at L_q=1 (single query can't attend to future
13991412
# positions), so we accept it silently for API compatibility with callers
14001413
# that always pass is_causal=True for decode.
14011414

1402-
if L_q != 1:
1403-
raise RuntimeError(
1404-
f"sdpa_decode_splitk requires L_q == 1 (decode); got L_q={L_q}"
1405-
)
1406-
if H_q % H_kv != 0:
1407-
raise RuntimeError(
1408-
f"H_q must be divisible by H_kv; got H_q={H_q}, H_kv={H_kv}"
1409-
)
1410-
if not _is_power_of_2(D):
1411-
raise RuntimeError(
1412-
f"sdpa_decode_splitk requires power-of-2 head dim; got D={D}"
1413-
)
1415+
# Validation — only check at runtime (concrete shapes), not during AOTI
1416+
# tracing where shapes are symbolic. torch.cond traces both branches with
1417+
# the same symbolic L_q, so L_q is not necessarily 1 during tracing.
1418+
if isinstance(L_q, int):
1419+
if L_q != 1:
1420+
raise RuntimeError(
1421+
f"sdpa_decode_splitk requires L_q == 1 (decode); got L_q={L_q}"
1422+
)
1423+
if H_q % H_kv != 0:
1424+
raise RuntimeError(
1425+
f"H_q must be divisible by H_kv; got H_q={H_q}, H_kv={H_kv}"
1426+
)
1427+
if not _is_power_of_2(D):
1428+
raise RuntimeError(
1429+
f"sdpa_decode_splitk requires power-of-2 head dim; got D={D}"
1430+
)
14141431

14151432
num_groups = H_q // H_kv
1416-
out = torch.empty((B, H_q, L_q, D), device=query.device, dtype=query.dtype)
14171433
sm_scale = 1.0 / math.sqrt(D) if scale == 0.0 else scale
14181434
HAS_MASK, Mask_ptr, stride_mb, stride_mq, stride_mk = _prepare_mask_params(
14191435
attn_mask, B, L_q, L_kv
14201436
)
14211437

14221438
_launch_decode_splitk(
1423-
query, key, value, out,
1424-
B, H_q, H_kv, L_kv, D, sm_scale,
1425-
HAS_MASK, Mask_ptr, stride_mb, stride_mq, stride_mk,
1439+
query,
1440+
key,
1441+
value,
1442+
out,
1443+
B,
1444+
H_q,
1445+
H_kv,
1446+
L_kv,
1447+
D,
1448+
sm_scale,
1449+
HAS_MASK,
1450+
Mask_ptr,
1451+
stride_mb,
1452+
stride_mq,
1453+
stride_mk,
14261454
num_groups,
14271455
)
14281456
return out
@@ -1434,7 +1462,10 @@ def _sdpa_decode_splitk_abstract(
14341462
key: torch.Tensor,
14351463
value: torch.Tensor,
14361464
attn_mask: Optional[torch.Tensor] = None,
1465+
dropout_p: float = 0.0,
1466+
is_causal: bool = False,
14371467
scale: float = 0.0,
1468+
enable_gqa: bool = False,
14381469
) -> torch.Tensor:
14391470
assert query.dtype == key.dtype == value.dtype, "Q, K, V must have the same dtype"
14401471
B, H_q, L_q, D = query.shape

examples/models/qwen3_5_moe/model.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import torch
2222
import torch.nn as nn
23+
2324
from torch.nn import functional as F
2425

2526

@@ -285,9 +286,19 @@ def forward(self, x, input_pos):
285286
)
286287
else:
287288
k, v = self.kv_cache.update(input_pos, k, v)
288-
y = F.scaled_dot_product_attention(
289-
q, k, v, attn_mask=attn_mask, enable_gqa=True
290-
)
289+
# The export produces two methods — decode (T=1, static) and
290+
# prefill (T>=2, dynamic). Each traces only one branch, so no
291+
# torch.cond is needed and we avoid GPU→CPU sync overhead.
292+
if T == 1:
293+
from executorch.backends.cuda.triton.kernels.sdpa import (
294+
sdpa_decode_splitk,
295+
)
296+
297+
y = sdpa_decode_splitk(q, k, v, attn_mask=attn_mask)
298+
else:
299+
from executorch.backends.cuda.triton.kernels.sdpa import sdpa
300+
301+
y = sdpa(q, k, v, attn_mask=attn_mask, enable_gqa=True)
291302

292303
y = y.transpose(1, 2).contiguous().view(B, T, -1)
293304

0 commit comments

Comments (0)