@@ -632,10 +632,22 @@ mha_fwd_kvcache(
         TORCH_CHECK(leftpad_k.dtype() == torch::kInt32, "leftpad_k must have dtype int32");
     }
 
-    // Write new K/V to cache in-place
-    // Non-paged without padding: fused in kernel (knew/vnew passed to dispatch)
-    // Paged or needs-padding: API-layer scatter (kernel fusion not applicable)
-    bool fuse_knew = k_.has_value() && seqlen_new > 0 && !paged_KV && !needs_padding;
+    // Write new K/V to cache.
+    //
+    // Strategy:
+    // - Always prefer the kernel-fused scatter (knew/vnew are passed to the
+    //   kernel, which writes them in-place during the prologue). This avoids
+    //   any host sync and works for both contiguous and paged caches.
+    // - Fall back to the API-layer scatter only when fusion is impossible:
+    //   * needs_padding: the cache pad is a separate buffer, so the in-kernel
+    //     writer would write to the padded copy, not the user tensor; do the
+    //     scatter on the user tensor and re-pad.
+    //   * rotary_cos: the rotary embedding was applied on the padded buffer,
+    //     so the padding must be sliced off before scattering to the user
+    //     cache. (The kernel-fused scatter would copy the padded buffer
+    //     instead, which is wrong.)
+    bool fuse_knew = k_.has_value() && seqlen_new > 0
+        && !needs_padding && !rotary_cos_.has_value();
     if (k_.has_value() && seqlen_new > 0 && !fuse_knew) {
         auto seqlens_cpu = seqlens_k.to(torch::kCPU);
         auto seqlens_accessor = seqlens_cpu.accessor<int32_t, 1>();
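
The hunk above ends inside the fallback branch. As a rough illustration of what the API-layer scatter amounts to for a contiguous cache, here is a hedged sketch (the helper name, shapes, and loop body are assumptions, not the elided code, which additionally handles paged caches and re-padding):

// Hypothetical standalone helper, not part of the patch. Assumed shapes:
// caches are (batch, seqlen_max, num_heads_k, head_dim); the new K/V are
// (batch, seqlen_new, num_heads_k, head_dim); seqlens_k is int32 (batch,).
#include <torch/torch.h>

static void append_kv_host_scatter(at::Tensor& kcache, at::Tensor& vcache,
                                   const at::Tensor& k_new, const at::Tensor& v_new,
                                   const at::Tensor& seqlens_k) {
    using torch::indexing::Slice;
    auto lens_cpu = seqlens_k.to(torch::kCPU);   // host sync: the reason the fused path is preferred
    auto lens = lens_cpu.accessor<int32_t, 1>();
    const int64_t seqlen_new = k_new.size(1);
    for (int64_t b = 0; b < k_new.size(0); ++b) {
        const int64_t start = lens[b];           // current cached length of sequence b
        // Copy the new rows into positions [start, start + seqlen_new) of batch b's cache.
        kcache.index({b, Slice(start, start + seqlen_new)})
              .copy_(k_new.index({b, Slice(0, seqlen_new)}));
        vcache.index({b, Slice(start, start + seqlen_new)})
              .copy_(v_new.index({b, Slice(0, seqlen_new)}));
    }
}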
@@ -683,28 +695,8 @@ mha_fwd_kvcache(
         seqlens_k = seqlens_k + seqlen_new;
     }
 
-    // For paged KV, gather to contiguous format
-    if (paged_KV) {
-        int num_pages_needed = (seqlen_k + page_block_size - 1) / page_block_size;
-        auto block_indices = block_table.index({
-            torch::indexing::Slice(),
-            torch::indexing::Slice(0, num_pages_needed)
-        }).flatten();
-        auto k_gathered = kcache_padded.index_select(0, block_indices.to(torch::kLong));
-        auto v_gathered = vcache_padded.index_select(0, block_indices.to(torch::kLong));
-        k_gathered = k_gathered.view({batch_size, num_pages_needed, page_block_size, num_heads_k, head_size_padded});
-        v_gathered = v_gathered.view({batch_size, num_pages_needed, page_block_size, num_heads_k, head_size_padded});
-        k_gathered = k_gathered.view({batch_size, num_pages_needed * page_block_size, num_heads_k, head_size_padded});
-        v_gathered = v_gathered.view({batch_size, num_pages_needed * page_block_size, num_heads_k, head_size_padded});
-        kcache_padded = k_gathered.index({
-            torch::indexing::Slice(), torch::indexing::Slice(0, seqlen_k)
-        }).contiguous();
-        vcache_padded = v_gathered.index({
-            torch::indexing::Slice(), torch::indexing::Slice(0, seqlen_k)
-        }).contiguous();
-    }
-
-    // Dispatch to kernel
+    // Dispatch to kernel. Paged caches are now passed natively (block_table
+    // is routed straight through to the kernel; no host-side gather).
     auto queue = c10::xpu::getCurrentXPUStream(device_idx).queue();
     const bool is_local = (window_size_left >= 0);
 
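For context on what replaces the gather deleted above: with block_table handed to the kernel, each K/V row's physical location is resolved in-kernel rather than materialized on the host. A minimal sketch of that logical-to-physical lookup (the helper and its signature are illustrative assumptions, not code from this patch):

#include <cstdint>

// block_table_row: one batch element's row of block_table, i.e. the physical
// page id of each logical page (int32). page_block_size: tokens per page.
struct PagedSlot {
    int32_t page;      // index into the paged cache's first (page) dimension
    int32_t in_page;   // token offset within that page
};

inline PagedSlot paged_lookup(const int32_t* block_table_row,
                              int32_t token_idx,
                              int32_t page_block_size) {
    return { block_table_row[token_idx / page_block_size],
             token_idx % page_block_size };
}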
@@ -718,19 +710,25 @@ mha_fwd_kvcache(
         leftpad_k_opt = leftpad_k;
     }
 
-    // For non-paged path with new KV, pass knew/vnew for fused scatter in kernel
+    // For paths where new KV is appended in-kernel, pass knew/vnew through.
     std::optional<at::Tensor> knew_opt, vnew_opt;
     if (fuse_knew) {
         knew_opt = k_padded;
         vnew_opt = v_padded;
     }
 
+    std::optional<at::Tensor> block_table_opt;
+    if (paged_KV) {
+        block_table_opt = block_table;
+    }
+
     cutlass_fmha_fwd_kvcache_impl(
         queue,
         q_padded, kcache_padded, vcache_padded,
         out, softmax_lse,
         seqlens_k, cache_batch_idx_opt, leftpad_k_opt,
         knew_opt, vnew_opt,
+        block_table_opt, seqlen_k,
         softmax_scale, window_size_left, window_size_right,
         is_causal, is_local);
 
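A caller-side sketch of the paged-cache layout this block_table path expects (shapes and the helper are assumptions for illustration, not the extension's documented contract; real tensors would live on the XPU device, CPU is used here only to keep the sketch device-neutral):

#include <torch/torch.h>

struct PagedCache {
    at::Tensor kcache, vcache;   // (num_pages_total, page_block_size, num_heads_k, head_dim)
    at::Tensor block_table;      // int32 (batch, max_pages_per_seq), physical page ids
    at::Tensor seqlens_k;        // int32 (batch,), current cached length per sequence
};

PagedCache make_paged_cache(int64_t batch, int64_t max_pages_per_seq,
                            int64_t page_block_size, int64_t num_heads_k,
                            int64_t head_dim) {
    const int64_t num_pages_total = batch * max_pages_per_seq;
    PagedCache c;
    c.kcache = torch::zeros({num_pages_total, page_block_size, num_heads_k, head_dim},
                            torch::dtype(torch::kFloat16));
    c.vcache = torch::zeros_like(c.kcache);
    // Trivial identity mapping: sequence b owns pages [b*max_pages_per_seq, (b+1)*max_pages_per_seq).
    c.block_table = torch::arange(num_pages_total, torch::kInt32)
                        .view({batch, max_pages_per_seq});
    c.seqlens_k = torch::zeros({batch}, torch::kInt32);   // caches start empty
    return c;
}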