Commit 4838f40 (parent 89a1639)

Add causal mask to SDPA kernels and use backend SDPA directly

4 files changed: 29 additions & 19 deletions

modules/attention.py

Lines changed: 2 additions & 10 deletions
@@ -99,16 +99,8 @@ def forward(
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)

-        # TODO: NineToothed SDPA kernel lacks causal masking support, which is
-        # required by autoregressive inference. Fall back to torch so end-to-end
-        # generation produces coherent output.
-        attn_output = F.scaled_dot_product_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_mask=attention_mask,
-            is_causal=attention_mask is None and query_states.shape[-2] > 1,
-            scale=self.scaling,
+        attn_output = type(self).scaled_dot_product_attention(
+            query_states, key_states, value_states, scale=self.scaling
         )
         attn_output = attn_output.transpose(1, 2)
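
The replacement call resolves scaled_dot_product_attention on the class via type(self), so whichever SDPA implementation the active backend has installed on the attention class is invoked directly; the torch fallback and its attn_mask / is_causal bookkeeping become unnecessary once the kernels mask causally themselves. A minimal sketch of that dispatch pattern (the Attention class and the backend assignment below are illustrative, not the repo's actual wiring):

import torch
import torch.nn.functional as F


class Attention(torch.nn.Module):
    # Backends are expected to install their own SDPA on the class, e.g. the
    # NineToothed or Triton wrapper, which after this commit applies the
    # causal mask internally. The torch default here is for illustration only.
    scaled_dot_product_attention = staticmethod(F.scaled_dot_product_attention)


# Backend selection (hypothetical):
# Attention.scaled_dot_product_attention = staticmethod(
#     ops.ninetoothed.torch.scaled_dot_product_attention
# )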

ops/ninetoothed/kernels/scaled_dot_product_attention.py

Lines changed: 18 additions & 7 deletions
@@ -7,7 +7,7 @@


 def arrangement(
-    q, k, v, scale, o, BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N
+    q, k, v, scale, q_start, o, BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N
 ):
     def arrange_q_or_o(input):
         arranged = input.tile((1, 1, BLOCK_SIZE_M, -1))

@@ -26,10 +26,17 @@ def arrange_k_or_v(input):

     q_arranged = arrange_q_or_o(q)

-    return q_arranged, arrange_k_or_v(k), arrange_k_or_v(v), scale, arrange_q_or_o(o)
+    return (
+        q_arranged,
+        arrange_k_or_v(k),
+        arrange_k_or_v(v),
+        scale,
+        q_start,
+        arrange_q_or_o(o),
+    )


-def application(q, k, v, scale, o):
+def application(q, k, v, scale, q_start, o):
     q_loaded = (q * scale * 1.44269504089).to(q.dtype)

     acc = ntl.zeros((q.shape[-2], q.shape[-1]), dtype=ntl.float32)

@@ -38,7 +45,11 @@ def application(q, k, v, scale, o):

     for i in range(k.shape[0]):
         qk = ntl.dot(q_loaded, ntl.trans(k[i]))
-        qk = ntl.where(k[i].offsets(-2) < k.source.shape[-2], qk, float("-inf"))
+        qk = ntl.where(
+            (q.offsets(-2) + q_start)[:, None] >= k[i].offsets(-2),
+            qk,
+            float("-inf"),
+        )

         m_ij = ntl.maximum(m_i, ntl.max(qk, 1))
         p = ntl.exp2(qk - m_ij[:, None])

@@ -53,8 +64,8 @@ def application(q, k, v, scale, o):
     o = acc.to(o.dtype)  # noqa: F841


-shape_options = (None, None, None, {"constexpr": True, "upper_bound": 128})
-q, k, v, o = (Tensor(4, shape_options=shape_options) for _ in range(4))
-tensors = (q, k, v, Tensor(0), o)
+_shape_options = (None, None, None, {"constexpr": True, "upper_bound": 128})
+_q, _k, _v, _o = (Tensor(4, shape_options=_shape_options) for _ in range(4))
+tensors = (_q, _k, _v, Tensor(0), Tensor(0), _o)

 kernel = ninetoothed.make(arrangement, application, tensors)
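
The rewritten mask encodes causality rather than just key bounds: each query's row offset plus q_start gives its absolute position in the full sequence, and a query may only attend to keys at or before that position. Out-of-range key offsets are at least the key sequence length, which no query position ever reaches, so the causal predicate also subsumes the bounds check the old line performed. (The 1.44269504089 factor in the surrounding context is log2(e); scores are pre-scaled so the online softmax can use exp2.) A plain-PyTorch sketch of the semantics the kernel now implements, assuming queries occupy the last L_q positions of an L_kv-long sequence, as in KV-cache decoding:

import math

import torch


def causal_sdpa_reference(q, k, v, scale=None):
    """Bottom-right-aligned causal SDPA; mirrors the kernel's mask."""
    if scale is None:
        scale = 1 / math.sqrt(q.shape[-1])
    q_start = k.shape[-2] - q.shape[-2]  # same offset the wrapper computes
    q_pos = torch.arange(q.shape[-2]) + q_start  # absolute query positions
    k_pos = torch.arange(k.shape[-2])
    allowed = q_pos[:, None] >= k_pos[None, :]  # query i sees keys 0..i
    scores = (q @ k.transpose(-2, -1)) * scale
    scores = scores.masked_fill(~allowed, float("-inf"))
    return torch.softmax(scores, dim=-1) @ v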

ops/ninetoothed/torch.py

Lines changed: 5 additions & 1 deletion
@@ -149,9 +149,13 @@ def scaled_dot_product_attention(q, k, v, scale=None):
     if scale is None:
         scale = 1 / math.sqrt(q.shape[-1])

+    q_start = k.shape[-2] - q.shape[-2]
+
     o = torch.empty_like(q)

-    ops.ninetoothed.kernels.scaled_dot_product_attention.kernel(q, k, v, scale, o)
+    ops.ninetoothed.kernels.scaled_dot_product_attention.kernel(
+        q, k, v, scale, q_start, o
+    )

     return o
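
The wrapper derives q_start from the shapes alone: it is the absolute position of the first query row. During prefill, q and k cover the same tokens, so q_start is 0 and the mask is the usual lower triangle; during cached decoding, q holds only the newest tokens while k and v span the whole sequence. A small arithmetic check with made-up lengths:

# Prefill: 10 prompt tokens, no cache, q and k the same length.
q_len, kv_len = 10, 10
assert kv_len - q_len == 0  # q_start = 0: row i attends to keys 0..i

# Decode step: 1 new token against a 10-long KV sequence.
q_len, kv_len = 1, 10
assert kv_len - q_len == 9  # q_start = 9: the token attends to keys 0..9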

ops/triton/kernels/scaled_dot_product_attention.py

Lines changed: 4 additions & 1 deletion
@@ -96,10 +96,13 @@ def kernel(
     l_i = tl.full((BLOCK_SIZE_M,), 1, dtype=tl.float32)
     m_i = tl.full((BLOCK_SIZE_M,), float("-inf"), dtype=tl.float32)

+    q_offsets = seq_len_k_v - seq_len_q + offs_m_start + tl.arange(0, BLOCK_SIZE_M)
+
     for i in range(0, tl.cdiv(seq_len_k_v, BLOCK_SIZE_N)):
         k = tl.load(k_block_ptr, boundary_check=(0, 1))

-        mask = i * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) < seq_len_k_v
+        k_offsets = i * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+        mask = q_offsets[:, None] >= k_offsets[None, :]
         qk = tl.where(mask, tl.dot(q, k), float("-inf"))

         m_ij = tl.maximum(m_i, tl.max(qk, 1))
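
In the Triton kernel, q_offsets maps each query row of this program's tile to its absolute position: seq_len_k_v - seq_len_q shifts past the cached prefix and offs_m_start past earlier query tiles. Comparing it against each key tile's k_offsets yields the causal mask, and, as in the NineToothed kernel, keys beyond seq_len_k_v get offsets larger than any valid query position, so the old length check is covered for free. A NumPy sketch of the same index algebra with small, made-up sizes:

import numpy as np

# Illustrative sizes: 4 queries over an 8-long KV sequence, one query tile.
seq_len_q, seq_len_k_v = 4, 8
BLOCK_SIZE_M = BLOCK_SIZE_N = 4
offs_m_start = 0

q_offsets = seq_len_k_v - seq_len_q + offs_m_start + np.arange(BLOCK_SIZE_M)
# q_offsets == [4, 5, 6, 7]: the queries are the last four positions.

for i in range(-(-seq_len_k_v // BLOCK_SIZE_N)):  # same as tl.cdiv
    k_offsets = i * BLOCK_SIZE_N + np.arange(BLOCK_SIZE_N)
    mask = q_offsets[:, None] >= k_offsets[None, :]
    # i == 0: all True, since keys 0..3 precede every query.
    # i == 1: lower triangle over keys 4..7 -- the causal part.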
