issue/889 - optimize flash attention performance relative to the default setup

wooway777 · wooway777 · commit 7f2123a6eaaa · 2026-01-22T09:49:41.000Z
diff --git a/src/infiniop/ops/flash_attention/ninetoothed/build.py b/src/infiniop/ops/flash_attention/ninetoothed/build.py
@@ -12,7 +12,7 @@ def build():
     with_attn_mask_values = (0,)
     causal_variant_values = (CausalVariant.UPPER_LEFT, CausalVariant.LOWER_RIGHT)
     dtype_values = (ninetoothed.float16, ninetoothed.bfloat16, ninetoothed.float32)
-    block_size_m_values = (64,)
+    block_size_m_values = (256,)
     block_size_n_values = (64,)
 
     constexpr_param_grid = {
diff --git a/src/infiniop/ops/flash_attention/ninetoothed/descriptor.h b/src/infiniop/ops/flash_attention/ninetoothed/descriptor.h
@@ -67,10 +67,10 @@ class Descriptor final : public InfiniopDescriptor {
         const auto emb_dim_{_query_shape[3]};
         const auto is_causal_{_is_causal};
         const auto with_attn_mask_{0};
-        const auto causal_variant_{1};
+        const auto causal_variant_{2};
         const auto dtype_{_dtype};
 
-        constexpr auto block_size_m_{64};
+        constexpr auto block_size_m_{256};
         constexpr auto block_size_n_{64};
 
         if (launch_flash_attention(stream,