Add scaled_dot_product_attention operator

voltjia · voltjia · commit c14e60d8f5a1 · 2025-06-20T09:57:49.000+08:00
diff --git a/src/ntops/kernels/scaled_dot_product_attention.py b/src/ntops/kernels/scaled_dot_product_attention.py
@@ -0,0 +1,182 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+# Block sizes used by `arrangement`: `BLOCK_SIZE_M` tiles dim -2 of the
+# query, output and mask; `BLOCK_SIZE_N` tiles dim -2 of the key and value
+# and dim -1 of the mask.
+# NOTE(review): presumably symbolic meta-parameters that ninetoothed resolves
+# when the kernel is compiled/tuned — confirm against `ninetoothed.block_size`.
+BLOCK_SIZE_M = ninetoothed.block_size()
+BLOCK_SIZE_N = ninetoothed.block_size()
+
+
+def arrangement(
+    query,
+    key,
+    value,
+    present_key,
+    present_value,
+    present_key_slot,
+    present_value_slot,
+    attn_mask,
+    scale,
+    output,
+    with_kv_cache,
+    BLOCK_SIZE_M=BLOCK_SIZE_M,
+    BLOCK_SIZE_N=BLOCK_SIZE_N,
+):
+    """Tile the attention operands for the kernel application functions.
+
+    All tensors are arranged regardless of ``with_kv_cache``; the flag only
+    decides whether the four KV-cache arrangements are part of the result.
+    ``scale`` is passed through unchanged.
+
+    Returns:
+        With ``with_kv_cache`` set, a 10-tuple in the parameter order of
+        ``application_with_kv_cache``; otherwise the 6-tuple
+        ``(query, key, value, attn_mask, scale, output)`` in arranged form.
+    """
+
+    def _tile_rows_by_group(tensor):
+        # Blocks of `BLOCK_SIZE_M` rows spanning the whole last dimension.
+        row_blocks = tensor.tile((1, 1, BLOCK_SIZE_M, -1))
+        # Bundle the blocks along dim -3 by the ratio of the query's and the
+        # key's dim -3 extents (query heads per key/value head, assuming the
+        # `(batch, heads, seq, dim)` layout — confirm against callers).
+        group_shape = (1, query.shape[-3] // key.shape[-3], 1, 1)
+        grouped = row_blocks.tile(group_shape)
+        # Drop the axes both tile shapes fixed to size 1.
+        grouped.dtype = grouped.dtype.squeeze((0, 2, 3))
+        grouped.dtype.dtype = grouped.dtype.dtype.squeeze((0, 1))
+
+        return grouped
+
+    def _tile_key_value_blocks(tensor):
+        # Blocks of `BLOCK_SIZE_N` rows, then every block along the last two
+        # dimensions gathered into one tile.
+        row_blocks = tensor.tile((1, 1, BLOCK_SIZE_N, -1))
+        gathered = row_blocks.tile((1, 1, -1, -1))
+        # Expand dim -2 to the extent of the arranged query there.
+        repeated = gathered.expand((-1, -1, arranged_query.shape[-2], -1))
+        repeated.dtype = repeated.dtype.squeeze((0, 1, 3))
+        repeated.dtype.dtype = repeated.dtype.dtype.squeeze((0, 1))
+
+        return repeated
+
+    def _tile_cache_entry(tensor):
+        # One tile covering the full extent of the last two dimensions.
+        whole = tensor.tile((1, 1, -1, -1))
+        whole.dtype = whole.dtype.squeeze((0, 1))
+
+        return whole
+
+    def _tile_mask(tensor):
+        # `BLOCK_SIZE_M` x `BLOCK_SIZE_N` blocks, gathered along the last
+        # dimension.
+        # NOTE(review): dim -3 of the mask is left untiled while the query
+        # and output are grouped along it — confirm the arranged shapes
+        # still line up when `query` and `key` differ in dim -3.
+        blocks = tensor.tile((1, 1, BLOCK_SIZE_M, BLOCK_SIZE_N))
+        block_rows = blocks.tile((1, 1, 1, -1))
+        block_rows.dtype = block_rows.dtype.squeeze((0, 1, 2))
+        block_rows.dtype.dtype = block_rows.dtype.dtype.squeeze((0, 1))
+
+        return block_rows
+
+    # `_tile_key_value_blocks` reads `arranged_query`, so the query has to be
+    # arranged first.
+    arranged_query = _tile_rows_by_group(query)
+    arranged_key = _tile_key_value_blocks(key)
+    arranged_value = _tile_key_value_blocks(value)
+    arranged_cache = tuple(
+        _tile_cache_entry(entry)
+        for entry in (
+            present_key,
+            present_value,
+            present_key_slot,
+            present_value_slot,
+        )
+    )
+    arranged_mask = _tile_mask(attn_mask)
+    arranged_output = _tile_rows_by_group(output)
+
+    if not with_kv_cache:
+        arranged_cache = ()
+
+    return (
+        arranged_query,
+        arranged_key,
+        arranged_value,
+        *arranged_cache,
+        arranged_mask,
+        scale,
+        arranged_output,
+    )
+
+
+def application_with_kv_cache(
+    query,
+    key,
+    value,
+    present_key,
+    present_value,
+    present_key_slot,
+    present_value_slot,
+    attn_mask,
+    scale,
+    output,
+):
+    # Kernel body selected by `make` when `with_kv_cache` is true.
+    #
+    # NOTE(review): in plain Python the two assignments below are dead
+    # rebindings (hence `noqa: F841`); presumably ninetoothed compiles an
+    # assignment to a tensor parameter into a store, i.e. `present_key` and
+    # `present_value` are written into their slots — confirm against the
+    # ninetoothed documentation.
+    present_key_slot = present_key  # noqa: F841
+    present_value_slot = present_value  # noqa: F841
+
+    # The attention computation itself is shared with the cache-less path.
+    application_without_kv_cache(query, key, value, attn_mask, scale, output)
+
+
+def application_without_kv_cache(query, key, value, attn_mask, scale, output):
+    # Kernel body selected by `make` when `with_kv_cache` is false.
+    #
+    # Block-wise attention with a running softmax evaluated in base 2:
+    # exp(x) == exp2(log2(e) * x), so every logit term is multiplied by
+    # log2(e) == 1.4426950408889634 before `exp2` is applied.
+    for i in range(query.shape[0]):
+        # Fold both the softmax scale and log2(e) into the query once.
+        query_i = (1.4426950408889634 * scale * query[i]).to(query[i].dtype)
+
+        # Running state per query row: weighted value sum, softmax
+        # denominator and row maximum. The initial `lse` is multiplied by
+        # `alpha == exp2(-inf - next_max)`, which is 0 as soon as a row has
+        # a finite logit, so its starting value does not leak into the
+        # result.
+        acc = ntl.zeros((query_i.shape[-2], query_i.shape[-1]), dtype=ntl.float32)
+        lse = ntl.full((query_i.shape[-2],), 1, dtype=ntl.float32)
+        max = ntl.full((query_i.shape[-2],), float("-inf"), dtype=ntl.float32)
+
+        for j in range(key.shape[0]):
+            # The additive mask is expressed in the natural-log domain, so
+            # it needs the same log2(e) factor as the scaled query;
+            # otherwise finite mask values would effectively be multiplied
+            # by ln(2). Masks made of 0 and -inf are unaffected.
+            qk = (
+                ntl.dot(query_i, ntl.trans(key[j]))
+                + 1.4426950408889634 * attn_mask[j]
+            )
+            # Rows of the key block that lie past the end of the source
+            # tensor must not contribute to the softmax.
+            qk = ntl.where(key[j].offsets(-2) < key.source.shape[-2], qk, float("-inf"))
+
+            next_max = ntl.maximum(max, ntl.max(qk, 1))
+            stable_qk = ntl.exp2(qk - next_max[:, None])
+
+            # Rescale what has been accumulated so far to the new maximum.
+            alpha = ntl.exp2(max - next_max)
+            # Cast the probabilities to the dtype of the value block they
+            # are multiplied with (`j`, not the query-group index `i`).
+            acc = acc * alpha[:, None] + ntl.dot(stable_qk.to(value[j].dtype), value[j])
+            max = next_max
+            lse = lse * alpha + ntl.sum(stable_qk, 1)
+
+        # Normalise by the softmax denominator and write the result.
+        acc /= lse[:, None]
+        output[i] = acc  # noqa: F841
+
+
+@functools.cache
+def make(with_kv_cache):
+    """Build the attention kernel, cached per value of ``with_kv_cache``.
+
+    Ten symbolic tensors are always declared and handed to
+    ``ninetoothed.make``; ``with_kv_cache`` selects the application function
+    and is bound into ``arrangement`` so that only the matching arranged
+    tensors are returned to it.
+    """
+
+    def _declare(second_to_last_options=None):
+        # Rank-4 tensor whose last dimension is a constexpr bounded by 128.
+        return Tensor(
+            4,
+            shape_options=(
+                None,
+                None,
+                second_to_last_options,
+                {"constexpr": True, "upper_bound": 128},
+            ),
+        )
+
+    query, key, value, attn_mask, output = [_declare() for _ in range(5)]
+    # The cache tensors additionally bound dim -2 by 1.
+    present_key, present_value, present_key_slot, present_value_slot = [
+        _declare({"constexpr": True, "upper_bound": 1}) for _ in range(4)
+    ]
+    scale = Tensor(0)
+
+    application = (
+        application_with_kv_cache if with_kv_cache else application_without_kv_cache
+    )
+    arrange = functools.partial(arrangement, with_kv_cache=with_kv_cache)
+
+    return ninetoothed.make(
+        arrange,
+        application,
+        (
+            query,
+            key,
+            value,
+            present_key,
+            present_value,
+            present_key_slot,
+            present_value_slot,
+            attn_mask,
+            scale,
+            output,
+        ),
+    )
diff --git a/src/ntops/torch.py b/src/ntops/torch.py
@@ -1,3 +1,5 @@
+import math
+
 import torch
 
 import ntops.kernels.abs
@@ -25,6 +27,7 @@
 import ntops.kernels.neg
 import ntops.kernels.relu
 import ntops.kernels.rsqrt
+import ntops.kernels.scaled_dot_product_attention
 import ntops.kernels.sigmoid
 import ntops.kernels.sin
 import ntops.kernels.softmax
@@ -314,6 +317,69 @@ def rsqrt(input, *, out=None):
     return out
 
 
+def scaled_dot_product_attention(
+    query,
+    key,
+    value,
+    attn_mask=None,
+    dropout_p=0,
+    is_causal=False,
+    scale=None,
+    # The default value here differs from that of
+    # `torch.nn.functional.scaled_dot_product_attention`
+    # because GQA cannot be disabled at the moment.
+    enable_gqa=True,
+    present_key=None,
+    present_value=None,
+    present_key_slot=None,
+    present_value_slot=None,
+):
+    """Compute scaled dot-product attention with the ninetoothed kernel.
+
+    Args:
+        query: Query tensor; the output takes its shape.
+        key: Key tensor.
+        value: Value tensor; the output takes its dtype.
+        attn_mask: Optional mask expandable to
+            ``query.shape[:-1] + (key.shape[-2],)``. A boolean mask is
+            converted to an additive one (``True`` -> 0, ``False`` -> -inf);
+            any other mask is passed to the kernel as an additive mask.
+            ``None`` is replaced by an all-zero mask.
+        dropout_p: Must be 0; dropout is not supported yet.
+        is_causal: Must be false; causal masking is not supported yet.
+        scale: Softmax scale, ``1 / sqrt(query.shape[-1])`` when ``None``.
+        enable_gqa: Must be true.
+        present_key: Passing a tensor here selects the KV-cache kernel, which
+            also requires the three arguments below.
+        present_value: See ``present_key``.
+        present_key_slot: See ``present_key``.
+        present_value_slot: See ``present_key``.
+
+    Returns:
+        A new tensor shaped like ``query`` with the dtype of ``value``.
+
+    Raises:
+        AssertionError: If an unsupported option is requested or the KV-cache
+            arguments are only partially provided.
+    """
+    # TODO: Support `dropout_p`.
+    assert dropout_p == 0, "`dropout_p` is not supported yet."
+    # TODO: Support `is_causal`.
+    assert not is_causal, "`is_causal` is not supported yet."
+    assert enable_gqa, "GQA must be enabled for now."
+
+    with_kv_cache = present_key is not None
+
+    if with_kv_cache:
+        # Fail early instead of handing `None` to the compiled kernel.
+        assert (
+            present_value is not None
+            and present_key_slot is not None
+            and present_value_slot is not None
+        ), (
+            "`present_value`, `present_key_slot`, and `present_value_slot` "
+            "are required when `present_key` is given."
+        )
+
+    mask_shape = query.shape[:-1] + (key.shape[-2],)
+
+    if attn_mask is None:
+        attn_mask = torch.zeros(mask_shape, dtype=query.dtype, device=query.device)
+    elif attn_mask.dtype == torch.bool:
+        attn_mask = torch.where(attn_mask, 0, float("-inf"))
+
+    # The kernel expects a mask of the full shape.
+    attn_mask = attn_mask.expand(mask_shape)
+
+    if scale is None:
+        scale = 1 / math.sqrt(query.shape[-1])
+
+    output = torch.empty_like(query, dtype=value.dtype)
+
+    kernel = ntops.kernels.scaled_dot_product_attention.make(with_kv_cache)
+
+    if with_kv_cache:
+        kernel(
+            query,
+            key,
+            value,
+            present_key,
+            present_value,
+            present_key_slot,
+            present_value_slot,
+            attn_mask,
+            scale,
+            output,
+        )
+    else:
+        kernel(query, key, value, attn_mask, scale, output)
+
+    return output
+
+
 def sigmoid(input, *, out=None):
     if out is None:
         out = torch.empty_like(input)
diff --git a/tests/test_scaled_dot_product_attention.py b/tests/test_scaled_dot_product_attention.py
@@ -0,0 +1,130 @@
+import itertools
+import math
+import random
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+import ntops.torch
+from tests.skippers import skip_if_cuda_not_available
+
+
+def generate_arguments():
+    """Build the ``pytest.mark.parametrize`` arguments for the attention test.
+
+    One random case is generated for every combination of mask type, scale,
+    dtype and KV-cache flag.
+
+    Returns:
+        A ``(argnames, argvalues)`` pair suitable for unpacking into
+        ``pytest.mark.parametrize``.
+    """
+
+    def _generate_random_qkv(dtype, device):
+        def _generate_random_size():
+            return random.randint(1, 512)
+
+        batch_size = random.randint(1, 4)
+        num_heads_q = 2 ** random.randint(1, 5)
+        seq_len_q = _generate_random_size()
+        head_dim = random.choice([32, 64])
+        # A power of two no larger than `num_heads_q`, so it divides it.
+        num_heads_kv = 2 ** random.randint(1, math.floor(math.log2(num_heads_q)))
+        seq_len_kv = _generate_random_size()
+
+        shape_q = (batch_size, num_heads_q, seq_len_q, head_dim)
+        shape_kv = (batch_size, num_heads_kv, seq_len_kv, head_dim)
+
+        query = torch.randn(shape_q, dtype=dtype, device=device)
+        key = torch.randn(shape_kv, dtype=dtype, device=device)
+        value = torch.randn(shape_kv, dtype=dtype, device=device)
+
+        return query, key, value
+
+    device = "cuda"
+
+    arguments = []
+
+    attn_mask_types = (None, torch.bool, torch.float32)
+    scales = (None, random.uniform(0.05, 0.5))
+    dtypes = (torch.float32, torch.float16)
+    with_kv_cache_values = (False, True)
+
+    for attn_mask_type, scale, dtype, with_kv_cache in itertools.product(
+        attn_mask_types, scales, dtypes, with_kv_cache_values
+    ):
+        query, key, value = _generate_random_qkv(dtype, device)
+
+        if attn_mask_type is not None:
+            keep = (
+                torch.rand(
+                    (query.shape[-2], key.shape[-2]),
+                    dtype=query.dtype,
+                    device=query.device,
+                )
+                > 0.5
+            )
+            # Keep at least one key visible per query row: the softmax of a
+            # fully masked row is undefined (NaN or backend-dependent), which
+            # would make the comparison fail spuriously, most likely for
+            # short key sequences.
+            keep[:, 0] = True
+
+            if attn_mask_type is torch.bool:
+                attn_mask = keep
+            # TODO: Non-infinite floating-point masks may cause
+            # precision issues. Revisit here later.
+            else:
+                attn_mask = torch.where(keep, 0, float("-inf"))
+                attn_mask = attn_mask.to(query.dtype)
+        else:
+            attn_mask = None
+
+        enable_gqa = True
+
+        if dtype is torch.float32:
+            atol = 0.01
+            rtol = 0.01
+        else:
+            atol = 0.025
+            rtol = 0.025
+
+        arguments.append(
+            (query, key, value, attn_mask, scale, enable_gqa, with_kv_cache, atol, rtol)
+        )
+
+    return (
+        "query, key, value, attn_mask, scale, enable_gqa, with_kv_cache, atol, rtol",
+        arguments,
+    )
+
+
+@skip_if_cuda_not_available
+class TestScaledDotProductAttention:
+    """Compare the ninetoothed attention operator with the PyTorch one."""
+
+    @pytest.mark.parametrize(*generate_arguments())
+    def test_cuda(
+        self, query, key, value, attn_mask, scale, enable_gqa, with_kv_cache, atol, rtol
+    ):
+        # Snapshot the inputs for the reference call before the cache slots
+        # are zeroed in place below.
+        reference_key = key.clone()
+        reference_value = value.clone()
+
+        def _split_off_last_position(tensor):
+            # Copy the last position along dim -2, then blank it in the
+            # original tensor so it acts as the empty cache slot.
+            slot = tensor[:, :, -1:, :]
+            entry = slot.clone()
+            slot[...] = 0
+
+            return entry, slot
+
+        cache_arguments = dict.fromkeys(
+            ("present_key", "present_value", "present_key_slot", "present_value_slot")
+        )
+
+        if with_kv_cache:
+            (
+                cache_arguments["present_key"],
+                cache_arguments["present_key_slot"],
+            ) = _split_off_last_position(key)
+            (
+                cache_arguments["present_value"],
+                cache_arguments["present_value_slot"],
+            ) = _split_off_last_position(value)
+
+        ninetoothed_output = ntops.torch.scaled_dot_product_attention(
+            query,
+            key,
+            value,
+            attn_mask=attn_mask,
+            scale=scale,
+            enable_gqa=enable_gqa,
+            **cache_arguments,
+        )
+        reference_output = F.scaled_dot_product_attention(
+            query,
+            reference_key,
+            reference_value,
+            attn_mask=attn_mask,
+            scale=scale,
+            enable_gqa=enable_gqa,
+        )
+
+        outputs_match = torch.allclose(
+            ninetoothed_output, reference_output, atol=atol, rtol=rtol
+        )
+
+        assert outputs_match