issue/889 - optimize flash attention performance over the default setup

wooway777 · wooway777 · commit 4201ea720bd7 · 2026-01-22T11:01:17.000Z
diff --git a/src/infiniop/ops/flash_attention/ninetoothed/build.py b/src/infiniop/ops/flash_attention/ninetoothed/build.py
@@ -12,7 +12,7 @@ def build():
     with_attn_mask_values = (0,)
     causal_variant_values = (CausalVariant.UPPER_LEFT, CausalVariant.LOWER_RIGHT)
     dtype_values = (ninetoothed.float16, ninetoothed.bfloat16, ninetoothed.float32)
-    block_size_m_values = (64,)
+    block_size_m_values = (256,)
     block_size_n_values = (64,)
 
     constexpr_param_grid = {
diff --git a/src/infiniop/ops/flash_attention/ninetoothed/descriptor.h b/src/infiniop/ops/flash_attention/ninetoothed/descriptor.h
@@ -67,10 +67,10 @@ class Descriptor final : public InfiniopDescriptor {
         const auto emb_dim_{_query_shape[3]};
         const auto is_causal_{_is_causal};
         const auto with_attn_mask_{0};
-        const auto causal_variant_{1};
+        const auto causal_variant_{2};
         const auto dtype_{_dtype};
 
-        constexpr auto block_size_m_{64};
+        constexpr auto block_size_m_{256};
         constexpr auto block_size_n_{64};
 
         if (launch_flash_attention(stream,
diff --git a/src/infiniop/ops/flash_attention/ninetoothed/flash_attention.py b/src/infiniop/ops/flash_attention/ninetoothed/flash_attention.py
@@ -183,7 +183,7 @@ def application_without_kv_cache(
         lse = ntl.full((query_i.shape[-2],), 1, dtype=ntl.float32)
         max = ntl.full((query_i.shape[-2],), float("-inf"), dtype=ntl.float32)
 
-        for j in range(min(key.shape[0], actual_kv_len)):
+        for j in range(key.shape[0]):
 
             qk = ntl.dot(query_i, ntl.trans(key[j]))
 
@@ -196,7 +196,7 @@ def application_without_kv_cache(
             if is_causal:
                 query_pos = query[i].offsets(-2)
 
-                if causal_variant == 2:
+                if causal_variant == 2:  # CausalVariant.LOWER_RIGHT:
                     mask = (
                         query_pos[:, None] + actual_kv_len - query.source.shape[-2]
                         >= key_pos[None, :]