Fix: enable paged KV cache dispatch and leftpad_k handling in the XPU kvcache forward kernel

YangKai0616 · YangKai0616 · commit ab787775c8b1 · 2026-04-20T10:00:39.000Z
diff --git a/flash-attn2/build.toml b/flash-attn2/build.toml
@@ -206,6 +206,15 @@ src = [
     "flash_attn_xpu/src/flash_bwd_hdim192_fix.cpp",
     "flash_attn_xpu/src/flash_bwd_hdim256_fix.cpp",
     "flash_attn_xpu/src/flash_bwd_hdim512_fix.cpp",
+    "flash_attn_xpu/src/fmha_fwd_kvcache_impl.hpp",
+    "flash_attn_xpu/src/flash_kvcache_hdim32_fix.cpp",
+    "flash_attn_xpu/src/flash_kvcache_hdim64_fix.cpp",
+    "flash_attn_xpu/src/flash_kvcache_hdim96_fix.cpp",
+    "flash_attn_xpu/src/flash_kvcache_hdim128_fix.cpp",
+    "flash_attn_xpu/src/flash_kvcache_hdim160_fix.cpp",
+    "flash_attn_xpu/src/flash_kvcache_hdim192_fix.cpp",
+    "flash_attn_xpu/src/flash_kvcache_hdim256_fix.cpp",
+    "flash_attn_xpu/src/kernel/fmha_fwd_kvcache_kernel.hpp",
     "flash_attn_xpu/src/fmha_utils.hpp",
     "flash_attn_xpu/src/collective/fmha_fusion.hpp",
     "flash_attn_xpu/src/collective/copy_block_slm.hpp",
diff --git a/flash-attn2/flash_attn_xpu/flash_api.cpp b/flash-attn2/flash_attn_xpu/flash_api.cpp
diff --git a/flash-attn2/flash_attn_xpu/src/flash_kvcache_hdim128_fix.cpp b/flash-attn2/flash_attn_xpu/src/flash_kvcache_hdim128_fix.cpp
@@ -11,3 +11,13 @@ template void kvcache_policy_dispatch<
     sycl::queue& queue, 
     CutlassType cuType, 
     const fmha_fwd_kvcache_args_t& args);
+
+// KVCache paged mode: IsVarLen=0, IsPaged=1
+template void kvcache_policy_dispatch<
+    prefill_policy_head128,
+    1,  // PipelineStages
+    0,  // IsVarLen=0 (fixed length)
+    1>( // IsPaged=1 (paged)
+    sycl::queue& queue,
+    CutlassType cuType,
+    const fmha_fwd_kvcache_args_t& args);
diff --git a/flash-attn2/flash_attn_xpu/src/flash_kvcache_hdim160_fix.cpp b/flash-attn2/flash_attn_xpu/src/flash_kvcache_hdim160_fix.cpp
@@ -11,3 +11,13 @@ template void kvcache_policy_dispatch<
     sycl::queue& queue, 
     CutlassType cuType, 
     const fmha_fwd_kvcache_args_t& args);
+
+// KVCache paged mode: IsVarLen=0, IsPaged=1
+template void kvcache_policy_dispatch<
+    prefill_policy_head160,
+    1,  // PipelineStages
+    0,  // IsVarLen=0 (fixed length)
+    1>( // IsPaged=1 (paged)
+    sycl::queue& queue,
+    CutlassType cuType,
+    const fmha_fwd_kvcache_args_t& args);
diff --git a/flash-attn2/flash_attn_xpu/src/flash_kvcache_hdim192_fix.cpp b/flash-attn2/flash_attn_xpu/src/flash_kvcache_hdim192_fix.cpp
@@ -11,3 +11,13 @@ template void kvcache_policy_dispatch<
     sycl::queue& queue, 
     CutlassType cuType, 
     const fmha_fwd_kvcache_args_t& args);
+
+// KVCache paged mode: IsVarLen=0, IsPaged=1
+template void kvcache_policy_dispatch<
+    prefill_policy_head192,
+    1,  // PipelineStages
+    0,  // IsVarLen=0 (fixed length)
+    1>( // IsPaged=1 (paged)
+    sycl::queue& queue,
+    CutlassType cuType,
+    const fmha_fwd_kvcache_args_t& args);
diff --git a/flash-attn2/flash_attn_xpu/src/flash_kvcache_hdim256_fix.cpp b/flash-attn2/flash_attn_xpu/src/flash_kvcache_hdim256_fix.cpp
@@ -11,3 +11,13 @@ template void kvcache_policy_dispatch<
     sycl::queue& queue, 
     CutlassType cuType, 
     const fmha_fwd_kvcache_args_t& args);
+
+// KVCache paged mode: IsVarLen=0, IsPaged=1
+template void kvcache_policy_dispatch<
+    prefill_policy_head256,
+    1,  // PipelineStages
+    0,  // IsVarLen=0 (fixed length)
+    1>( // IsPaged=1 (paged)
+    sycl::queue& queue,
+    CutlassType cuType,
+    const fmha_fwd_kvcache_args_t& args);
diff --git a/flash-attn2/flash_attn_xpu/src/flash_kvcache_hdim32_fix.cpp b/flash-attn2/flash_attn_xpu/src/flash_kvcache_hdim32_fix.cpp
@@ -11,3 +11,13 @@ template void kvcache_policy_dispatch<
     sycl::queue& queue, 
     CutlassType cuType, 
     const fmha_fwd_kvcache_args_t& args);
+
+// KVCache paged mode: IsVarLen=0, IsPaged=1
+template void kvcache_policy_dispatch<
+    prefill_policy_head32,
+    1,  // PipelineStages
+    0,  // IsVarLen=0 (fixed length)
+    1>( // IsPaged=1 (paged)
+    sycl::queue& queue,
+    CutlassType cuType,
+    const fmha_fwd_kvcache_args_t& args);
diff --git a/flash-attn2/flash_attn_xpu/src/flash_kvcache_hdim64_fix.cpp b/flash-attn2/flash_attn_xpu/src/flash_kvcache_hdim64_fix.cpp
@@ -11,3 +11,13 @@ template void kvcache_policy_dispatch<
     sycl::queue& queue, 
     CutlassType cuType, 
     const fmha_fwd_kvcache_args_t& args);
+
+// KVCache paged mode: IsVarLen=0, IsPaged=1
+template void kvcache_policy_dispatch<
+    prefill_policy_head64,
+    1,  // PipelineStages
+    0,  // IsVarLen=0 (fixed length)
+    1>( // IsPaged=1 (paged)
+    sycl::queue& queue,
+    CutlassType cuType,
+    const fmha_fwd_kvcache_args_t& args);
diff --git a/flash-attn2/flash_attn_xpu/src/flash_kvcache_hdim96_fix.cpp b/flash-attn2/flash_attn_xpu/src/flash_kvcache_hdim96_fix.cpp
@@ -11,3 +11,13 @@ template void kvcache_policy_dispatch<
     sycl::queue& queue, 
     CutlassType cuType, 
     const fmha_fwd_kvcache_args_t& args);
+
+// KVCache paged mode: IsVarLen=0, IsPaged=1
+template void kvcache_policy_dispatch<
+    prefill_policy_head96,
+    1,  // PipelineStages
+    0,  // IsVarLen=0 (fixed length)
+    1>( // IsPaged=1 (paged)
+    sycl::queue& queue,
+    CutlassType cuType,
+    const fmha_fwd_kvcache_args_t& args);
diff --git a/flash-attn2/flash_attn_xpu/src/fmha_fwd_kvcache_impl.hpp b/flash-attn2/flash_attn_xpu/src/fmha_fwd_kvcache_impl.hpp
@@ -140,7 +140,7 @@ struct KVCacheKernelLauncher {
         {args.sm_scale,
          static_cast<int*>(args.block_table),
          args.page_block_size,
-         0,  // max_blocks_per_seq - set below if paged
+         args.max_blocks_per_seq,  // max_blocks_per_seq (max blocks per sequence, used for paged KV)
          args.seqlen_k,  // total_seqlen_kv
          args.window_size_left,
          args.window_size_right,
@@ -372,25 +372,35 @@ inline void cutlass_fmha_fwd_kvcache_impl(
     const fmha_fwd_kvcache_args_t& args,
     CutlassType cuType) {
   
-  // Dispatch based on head size
-  // Uses explicitly instantiated kvcache_policy_dispatch<policy, stages, IsVarLen=0, IsPaged=0>
+  // Dispatch based on head size and paged KV mode
   int head_size = args.head_size;
+  bool is_paged = args.is_paged;
+  
+  // Helper macro to dispatch both paged and non-paged variants
+  #define DISPATCH_HEAD_SIZE(policy) \
+    if (is_paged) { \
+      kvcache_policy_dispatch<policy, 1, 0, 1>(queue, cuType, args); \
+    } else { \
+      kvcache_policy_dispatch<policy, 1, 0, 0>(queue, cuType, args); \
+    }
   
   if (head_size <= 32) {
-    kvcache_policy_dispatch<prefill_policy_head32, 1, 0, 0>(queue, cuType, args);
+    DISPATCH_HEAD_SIZE(prefill_policy_head32);
   } else if (head_size <= 64) {
-    kvcache_policy_dispatch<prefill_policy_head64, 1, 0, 0>(queue, cuType, args);
+    DISPATCH_HEAD_SIZE(prefill_policy_head64);
   } else if (head_size <= 96) {
-    kvcache_policy_dispatch<prefill_policy_head96, 1, 0, 0>(queue, cuType, args);
+    DISPATCH_HEAD_SIZE(prefill_policy_head96);
   } else if (head_size <= 128) {
-    kvcache_policy_dispatch<prefill_policy_head128, 1, 0, 0>(queue, cuType, args);
+    DISPATCH_HEAD_SIZE(prefill_policy_head128);
   } else if (head_size <= 160) {
-    kvcache_policy_dispatch<prefill_policy_head160, 1, 0, 0>(queue, cuType, args);
+    DISPATCH_HEAD_SIZE(prefill_policy_head160);
   } else if (head_size <= 192) {
-    kvcache_policy_dispatch<prefill_policy_head192, 1, 0, 0>(queue, cuType, args);
+    DISPATCH_HEAD_SIZE(prefill_policy_head192);
   } else if (head_size <= 256) {
-    kvcache_policy_dispatch<prefill_policy_head256, 1, 0, 0>(queue, cuType, args);
+    DISPATCH_HEAD_SIZE(prefill_policy_head256);
   } else {
     CUTLASS_ASSERT(false && "Unsupported head size for kvcache");
   }
+  
+  #undef DISPATCH_HEAD_SIZE
 }
diff --git a/flash-attn2/flash_attn_xpu/src/fmha_fwd_types.hpp b/flash-attn2/flash_attn_xpu/src/fmha_fwd_types.hpp
@@ -99,6 +99,8 @@ struct fmha_fwd_kvcache_args_t {
   int* block_table = nullptr;
   int64_t block_table_batch_stride = 0;
   int page_block_size = 1;
+  int max_blocks_per_seq = 0;  // Maximum number of blocks per sequence in paged mode
+  bool is_paged = false;  // Whether to use paged KV cache
 
   // Dimensions
   int batch_size;
diff --git a/flash-attn2/flash_attn_xpu/src/kernel/fmha_fwd_kvcache_kernel.hpp b/flash-attn2/flash_attn_xpu/src/kernel/fmha_fwd_kvcache_kernel.hpp
@@ -215,6 +215,7 @@ class XeFMHAFwdKVCacheKernel {
       int seq_len_kv_cache = 0;
       int seqlen_knew = params.mainloop.seqlen_new;
       int seq_len_kv = seq_len_kv_shape;  // Default: use shape value
+      int leftpad_k = 0;  // Leftpad offset for cache access
       
       if constexpr (CachedKV) {
         int batch_idx = params.mainloop.cache_batch_idx ? 
@@ -225,9 +226,13 @@ class XeFMHAFwdKVCacheKernel {
           // No per-batch cache seqlens provided, use full cache as the effective length
           seq_len_kv_cache = seq_len_kv_shape;
         }
-        // Handle leftpad if provided
+        // Read this batch entry's left padding: the number of padded positions that
+        // precede the valid data in the cache (valid data starts at index leftpad_k).
+        // The cache pointers are advanced by leftpad_k below, so the effective cached
+        // length becomes seq_len_kv_cache - leftpad_k (the number of valid cached tokens).
         if (params.mainloop.leftpad_k) {
-          seq_len_kv_cache -= params.mainloop.leftpad_k[batch_idx];
+          leftpad_k = params.mainloop.leftpad_k[batch_idx];
+          seq_len_kv_cache -= leftpad_k;  // Subtract leftpad from effective length
         }
         // Actual KV length = cached length + new tokens
         seq_len_kv = seq_len_kv_cache + seqlen_knew;
@@ -263,9 +268,6 @@ class XeFMHAFwdKVCacheKernel {
       const int k_blocks_causal =
           CausalMask ? (seq_coord + full_tile_offset) / get<1>(TileShapeQK{})
                      : 0;
-      
-      // Skip if there are no K blocks to process
-      if (k_blocks <= 0 || k_blocks <= k_block0) continue;
 
       auto batch_dim = is_var_len ? 1 : s.batch;
 
@@ -281,18 +283,22 @@ class XeFMHAFwdKVCacheKernel {
       auto shape_V = make_shape(
           s.head_size_vo, seqlen_knew > 0 ? seqlen_knew : 1, s.num_heads_kv, batch_dim);
 
-      // Shape for cached K/V - use cache capacity (seq_len_kv_shape) for indexing
+      // Shape for cached K/V - use remaining capacity after leftpad
+      int cache_seqlen_capacity = seq_len_kv_shape - leftpad_k;
       auto shape_K_cache = make_shape(
-          seq_len_kv_shape, s.head_size_qk, s.num_heads_kv, batch_dim);
+          cache_seqlen_capacity, s.head_size_qk, s.num_heads_kv, batch_dim);
       auto shape_V_cache = make_shape(
-          s.head_size_vo, seq_len_kv_shape, s.num_heads_kv, batch_dim);
+          s.head_size_vo, cache_seqlen_capacity, s.num_heads_kv, batch_dim);
 
       auto dcQ = const_cast<ElementQ*>(p.Q);
       auto dcK = const_cast<ElementK*>(p.K);
       auto dcV = const_cast<ElementV*>(p.V);
       auto dcO = const_cast<ElementO*>(p.O);
-      auto dcK_cache = const_cast<ElementK*>(p.K_cache);
-      auto dcV_cache = const_cast<ElementV*>(p.V_cache);
+      // Offset cache pointers by leftpad_k to skip padding
+      // K_cache layout: (seqlen, head_dim, heads, batch), stride[0] = row_stride
+      auto dcK_cache = const_cast<ElementK*>(p.K_cache) + leftpad_k * get<0>(p.dK_cache);
+      // V_cache layout: (head_dim, seqlen, heads, batch), stride[1] = row_stride  
+      auto dcV_cache = const_cast<ElementV*>(p.V_cache) + leftpad_k * get<1>(p.dV_cache);
 
       auto layout_q = make_layout(shape_Q, p.dQ);
       auto layout_k = make_layout(shape_K, p.dK);