Commit ebe61e8

digantdesai authored and committed
Add torch.cond split-K decode dispatch to Qwen3.5 MoE attention
Runtime dispatch via torch.cond in FullAttention: split-K flash-decoding for decode (L_q==1) and standard tiled SDPA for prefill (L_q>1). Guard sdpa_decode_splitk validation behind isinstance(L_q, int) so AOTI tracing with symbolic shapes doesn't trip the L_q==1 check. Align sdpa_decode_splitk signature with sdpa (dropout_p, is_causal, enable_gqa) for drop-in use with torch.cond; unsupported args fail with clear messages.
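Why the isinstance guard works, as a standalone sketch (not part of the diff; the tensor shapes are illustrative): in eager mode query.shape[2] is a plain Python int, but when torch.export / AOTI traces a dynamic sequence dimension it is a torch.SymInt, which is not an int subclass, so the decode-only checks are skipped during tracing and still run on concrete eager inputs.

    import torch

    def check_decode_shape(query: torch.Tensor) -> None:
        # Mirrors the guard in sdpa_decode_splitk: in eager mode L_q is a
        # plain Python int and the check runs; under torch.export / AOTI
        # tracing with a dynamic sequence dim, L_q is a torch.SymInt (not
        # an int subclass), so isinstance(L_q, int) is False and the
        # L_q == 1 check never fires while torch.cond traces both branches.
        L_q = query.shape[2]
        if isinstance(L_q, int) and L_q != 1:
            raise RuntimeError(f"decode kernel requires L_q == 1; got L_q={L_q}")

    check_decode_shape(torch.randn(1, 8, 1, 64))    # ok: L_q == 1
    # check_decode_shape(torch.randn(1, 8, 17, 64))  # raises in eager mode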
1 parent 35c7a18 commit ebe61e8

2 files changed

Lines changed: 46 additions & 12 deletions

backends/cuda/triton/kernels/sdpa.py

Lines changed: 36 additions & 9 deletions
@@ -1372,26 +1372,50 @@ def sdpa_decode_splitk(
     key: torch.Tensor,
     value: torch.Tensor,
     attn_mask: Optional[torch.Tensor] = None,
+    dropout_p: float = 0.0,
+    is_causal: bool = False,
     scale: float = 0.0,
+    enable_gqa: bool = False,
 ) -> torch.Tensor:
+    """Split-K flash-decoding SDPA for L_q=1 (decode step).
+
+    Signature mirrors sdpa() for drop-in use with torch.cond dispatch.
+    enable_gqa is accepted but ignored — GQA is handled natively via
+    H_q // H_kv grouping; no packed-GQA tradeoff exists at L_q=1.
+    """
     B, H_q, L_q, D = query.shape
     _, H_kv, L_kv, _ = key.shape
 
-    if L_q != 1:
-        raise RuntimeError(
-            f"sdpa_decode_splitk requires L_q == 1 (decode); got L_q={L_q}"
-        )
-    if H_q % H_kv != 0:
+    out = torch.empty((B, H_q, L_q, D), device=query.device, dtype=query.dtype)
+
+    if dropout_p != 0.0:
         raise RuntimeError(
-            f"H_q must be divisible by H_kv; got H_q={H_q}, H_kv={H_kv}"
+            f"sdpa_decode_splitk does not support dropout; got dropout_p={dropout_p}"
         )
-    if not _is_power_of_2(D):
+    if is_causal:
         raise RuntimeError(
-            f"sdpa_decode_splitk requires power-of-2 head dim; got D={D}"
+            "sdpa_decode_splitk does not support is_causal=True "
+            "(causal masking is a no-op at L_q=1; pass attn_mask instead)"
         )
 
+    # Validation — only check at runtime (concrete shapes), not during AOTI
+    # tracing where shapes are symbolic. torch.cond traces both branches with
+    # the same symbolic L_q, so L_q is not necessarily 1 during tracing.
+    if isinstance(L_q, int):
+        if L_q != 1:
+            raise RuntimeError(
+                f"sdpa_decode_splitk requires L_q == 1 (decode); got L_q={L_q}"
+            )
+        if H_q % H_kv != 0:
+            raise RuntimeError(
+                f"H_q must be divisible by H_kv; got H_q={H_q}, H_kv={H_kv}"
+            )
+        if not _is_power_of_2(D):
+            raise RuntimeError(
+                f"sdpa_decode_splitk requires power-of-2 head dim; got D={D}"
+            )
+
     num_groups = H_q // H_kv
-    out = torch.empty((B, H_q, L_q, D), device=query.device, dtype=query.dtype)
     sm_scale = 1.0 / math.sqrt(D) if scale == 0.0 else scale
     HAS_MASK, Mask_ptr, stride_mb, stride_mq, stride_mk = _prepare_mask_params(
         attn_mask, B, L_q, L_kv

@@ -1412,7 +1436,10 @@ def _sdpa_decode_splitk_abstract(
     key: torch.Tensor,
     value: torch.Tensor,
     attn_mask: Optional[torch.Tensor] = None,
+    dropout_p: float = 0.0,
+    is_causal: bool = False,
     scale: float = 0.0,
+    enable_gqa: bool = False,
 ) -> torch.Tensor:
     assert query.dtype == key.dtype == value.dtype, "Q, K, V must have the same dtype"
     B, H_q, L_q, D = query.shape
examples/models/qwen3_5_moe/model.py

Lines changed: 10 additions & 3 deletions
@@ -22,6 +22,8 @@
 import torch.nn as nn
 from torch.nn import functional as F
 
+from executorch.backends.cuda.triton.kernels.sdpa import sdpa, sdpa_decode_splitk
+
 
 # ---------------------------------------------------------------------------
 # Config
@@ -267,10 +269,15 @@ def forward(self, x, input_pos):
         # KV cache
         k, v = self.kv_cache.update(input_pos, k, v)
 
-        # SDPA with GQA — kernel maps Q heads to KV heads internally
+        # SDPA with GQA — runtime dispatch via torch.cond:
+        #   decode (L_q==1): split-K flash-decoding for high KV occupancy
+        #   prefill (L_q>1): standard tiled SDPA (m32/m64)
         attn_mask = self.mask[input_pos].unsqueeze(0).unsqueeze(0)
-        y = F.scaled_dot_product_attention(
-            q, k, v, attn_mask=attn_mask, enable_gqa=True
+        y = torch.cond(
+            q.shape[2] == 1,
+            lambda q, k, v, mask: sdpa_decode_splitk(q, k, v, attn_mask=mask),
+            lambda q, k, v, mask: sdpa(q, k, v, attn_mask=mask, enable_gqa=True),
+            [q, k, v, attn_mask],
         )
 
         y = y.transpose(1, 2).contiguous().view(B, T, -1)
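How the dispatch plays out across a generation step, as a hypothetical driver (the model handle, vocab size, and prompt length are illustrative, not from the diff):

    import torch

    def greedy_step(model: torch.nn.Module, device: str = "cuda") -> None:
        # Prefill: q.shape[2] == 17 > 1, so torch.cond takes the tiled sdpa branch.
        tokens = torch.randint(0, 32000, (1, 17), device=device)  # vocab size illustrative
        logits = model(tokens, torch.arange(17, device=device))

        # Decode: q.shape[2] == 1, so torch.cond takes the sdpa_decode_splitk
        # branch; the same program serves both phases, chosen per call.
        next_tok = logits[:, -1:].argmax(-1)
        model(next_tok, torch.tensor([17], device=device))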
