Skip to content

Commit 7f7267d

Browse files
committed
Add fused rotary support for XPU kvcache
1 parent 468dd6d commit 7f7267d

25 files changed

Lines changed: 783 additions & 74 deletions

flash-attn2/flash_attn_xpu/flash_api.cpp

Lines changed: 35 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -592,29 +592,19 @@ mha_fwd_kvcache(
592592
CHECK_DEVICE(seqlens_k);
593593
}
594594

595-
// Handle rotary embedding (pre-process in-place before kernel)
596-
if (rotary_cos_.has_value()) {
595+
at::Tensor rotary_cos, rotary_sin;
596+
int rotary_dim = 0;
597+
const bool has_rotary = rotary_cos_.has_value();
598+
if (has_rotary) {
597599
TORCH_CHECK(k_.has_value(), "If rotary cos/sin are provided, new key/value must also be provided");
598-
auto rotary_cos = rotary_cos_.value();
599-
auto rotary_sin = rotary_sin_.value();
600+
TORCH_CHECK(rotary_sin_.has_value(), "If rotary cos is provided, rotary sin must also be provided");
601+
rotary_cos = ensure_contiguous(rotary_cos_.value());
602+
rotary_sin = ensure_contiguous(rotary_sin_.value());
600603
CHECK_DEVICE(rotary_cos); CHECK_DEVICE(rotary_sin);
601-
int rotary_dim = rotary_cos.size(1) * 2;
604+
rotary_dim = rotary_cos.size(1) * 2;
602605
TORCH_CHECK(rotary_dim <= head_size_og, "rotary_dim must be <= headdim");
603606
TORCH_CHECK(rotary_dim % 16 == 0, "Only rotary dimensions divisible by 16 are currently supported");
604607
TORCH_CHECK(rotary_cos.scalar_type() == q_dtype && rotary_sin.scalar_type() == q_dtype);
605-
606-
std::optional<at::Tensor> seqlen_offsets_opt;
607-
if (seqlens_k_.has_value()) { seqlen_offsets_opt = seqlens_k; }
608-
609-
bool is_local = (window_size_left >= 0);
610-
if (is_causal || is_local) {
611-
apply_rotary_emb_inplace(q_padded, rotary_cos, rotary_sin, seqlen_offsets_opt, is_rotary_interleaved);
612-
} else {
613-
auto q_shape = q_padded.sizes();
614-
auto q_reshaped = q_padded.view({q_shape[0], 1, q_shape[1] * q_shape[2], q_shape[3]});
615-
apply_rotary_emb_inplace(q_reshaped, rotary_cos, rotary_sin, seqlen_offsets_opt, is_rotary_interleaved);
616-
}
617-
apply_rotary_emb_inplace(k_padded, rotary_cos, rotary_sin, seqlen_offsets_opt, is_rotary_interleaved);
618608
}
619609

620610
at::Tensor cache_batch_idx;
@@ -638,21 +628,30 @@ mha_fwd_kvcache(
638628
// - Always prefer kernel-fused scatter (passes knew/vnew to the kernel,
639629
// which writes them in-place during the prologue). This avoids any
640630
// host sync and works for both contiguous and paged caches.
641-
// - Fall back to API-layer scatter only when fusion is impossible:
642-
// * needs_padding: the cache pad is a separate buffer, so the
643-
// in-kernel writer would write to the padded copy, not the user
644-
// tensor; do the scatter on the user tensor and re-pad.
645-
// * rotary_cos: the rotary application happened on the padded
646-
// buffer; we need to slice off the padding before scattering to
647-
// the user cache. (Kernel-fused scatter copies the padded buffer
648-
// instead, which is wrong.)
631+
// - Fall back to API-layer scatter only when padding is needed: the
632+
// padded cache is a separate buffer, so the in-kernel writer would
633+
// not update the user tensor.
649634
bool fuse_knew = k_.has_value() && seqlen_new > 0
650-
&& !needs_padding && !rotary_cos_.has_value();
635+
&& !needs_padding;
636+
if (has_rotary && !fuse_knew) {
637+
std::optional<at::Tensor> seqlen_offsets_opt;
638+
if (seqlens_k_.has_value()) { seqlen_offsets_opt = seqlens_k; }
639+
640+
bool is_local = (window_size_left >= 0);
641+
if (is_causal || is_local) {
642+
apply_rotary_emb_inplace(q_padded, rotary_cos, rotary_sin, seqlen_offsets_opt, is_rotary_interleaved);
643+
} else {
644+
auto q_shape = q_padded.sizes();
645+
auto q_reshaped = q_padded.view({q_shape[0], 1, q_shape[1] * q_shape[2], q_shape[3]});
646+
apply_rotary_emb_inplace(q_reshaped, rotary_cos, rotary_sin, seqlen_offsets_opt, is_rotary_interleaved);
647+
}
648+
apply_rotary_emb_inplace(k_padded, rotary_cos, rotary_sin, seqlen_offsets_opt, is_rotary_interleaved);
649+
}
651650
if (k_.has_value() && seqlen_new > 0 && !fuse_knew) {
652651
auto seqlens_cpu = seqlens_k.to(torch::kCPU);
653652
auto seqlens_accessor = seqlens_cpu.accessor<int32_t, 1>();
654653

655-
at::Tensor k_for_cache = rotary_cos_.has_value()
654+
at::Tensor k_for_cache = has_rotary
656655
? k_padded.index({torch::indexing::Slice(), torch::indexing::Slice(),
657656
torch::indexing::Slice(), torch::indexing::Slice(0, head_size_og)}).contiguous()
658657
: ensure_contiguous(k_.value());
@@ -722,13 +721,20 @@ mha_fwd_kvcache(
722721
block_table_opt = block_table;
723722
}
724723

724+
std::optional<at::Tensor> rotary_cos_opt, rotary_sin_opt;
725+
if (fuse_knew && has_rotary) {
726+
rotary_cos_opt = rotary_cos;
727+
rotary_sin_opt = rotary_sin;
728+
}
729+
725730
cutlass_fmha_fwd_kvcache_impl(
726731
queue,
727732
q_padded, kcache_padded, vcache_padded,
728733
out, softmax_lse,
729734
seqlens_k, cache_batch_idx_opt, leftpad_k_opt,
730735
knew_opt, vnew_opt,
731-
block_table_opt, seqlen_k,
736+
block_table_opt, rotary_cos_opt, rotary_sin_opt,
737+
fuse_knew ? rotary_dim : 0, is_rotary_interleaved, seqlen_k,
732738
softmax_scale, window_size_left, window_size_right,
733739
is_causal, is_local);
734740

flash-attn2/flash_attn_xpu/src/collective/fmha_fwd_common.hpp

Lines changed: 42 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,48 @@ namespace cutlass::fmha::collective {
2222

2323
using namespace cute;
2424

25+
template <typename Element, typename RotaryElement>
26+
CUTLASS_DEVICE Element apply_rotary_scalar(
27+
Element x,
28+
Element x_pair,
29+
const RotaryElement* cos,
30+
const RotaryElement* sin,
31+
int position,
32+
int dim,
33+
int rotary_dim,
34+
bool interleaved) {
35+
if (rotary_dim == 0 || dim >= rotary_dim) {
36+
return x;
37+
}
38+
39+
int half_rotary = rotary_dim / 2;
40+
int cos_sin_idx = interleaved ? dim / 2
41+
: (dim < half_rotary ? dim : dim - half_rotary);
42+
bool is_second = interleaved ? (dim % 2) : (dim >= half_rotary);
43+
44+
float x_f = static_cast<float>(x);
45+
float x_pair_f = static_cast<float>(x_pair);
46+
float c = static_cast<float>(cos[position * half_rotary + cos_sin_idx]);
47+
float s = static_cast<float>(sin[position * half_rotary + cos_sin_idx]);
48+
float rotated = is_second ? x_pair_f * s + x_f * c
49+
: x_f * c - x_pair_f * s;
50+
return static_cast<Element>(rotated);
51+
}
52+
53+
CUTLASS_DEVICE int rotary_pair_dim(
54+
int dim,
55+
int rotary_dim,
56+
bool interleaved) {
57+
if (dim >= rotary_dim) {
58+
return dim;
59+
}
60+
if (interleaved) {
61+
return dim ^ 1;
62+
}
63+
int half_rotary = rotary_dim / 2;
64+
return dim < half_rotary ? dim + half_rotary : dim - half_rotary;
65+
}
66+
2567
/////////////////////////////////////////////////////////////////////////////////////////////////
2668
//
2769
// FMHAFwdMainloopTraits: common type aliases derived from TiledMMA / VTiles.

flash-attn2/flash_attn_xpu/src/collective/fmha_fwd_mainloop_xe2.hpp

Lines changed: 60 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,8 @@ template <
4747
class TensorV_,
4848
class TiledCopyQ_ = void,
4949
class TiledCopyK_ = void,
50-
class TiledCopyV_ = void>
50+
class TiledCopyV_ = void,
51+
bool HasRotary_ = false>
5152
struct FMHAFwdMainloopXe2 {
5253
static_assert(
5354
cutlass::detail::dependent_false<DispatchPolicy_>,
@@ -70,7 +71,8 @@ template <
7071
class TensorV_,
7172
class TiledCopyQ_,
7273
class TiledCopyK_,
73-
class TiledCopyV_>
74+
class TiledCopyV_,
75+
bool HasRotary_>
7476
struct FMHAFwdMainloopXe2<
7577
Xe2<Stages>,
7678
CausalMask_,
@@ -85,7 +87,8 @@ struct FMHAFwdMainloopXe2<
8587
TensorV_,
8688
TiledCopyQ_,
8789
TiledCopyK_,
88-
TiledCopyV_> {
90+
TiledCopyV_,
91+
HasRotary_> {
8992

9093
// Pull in common type aliases from the shared traits.
9194
using Traits = FMHAFwdMainloopTraits<
@@ -123,6 +126,7 @@ struct FMHAFwdMainloopXe2<
123126
static constexpr bool LocalMask = LocalMask_;
124127
static constexpr bool HasDropout = HasDropout_;
125128
static constexpr bool PagedKV = PagedKV_;
129+
static constexpr bool HasRotary = HasRotary_;
126130

127131
// User-facing arguments
128132
struct Arguments {
@@ -142,6 +146,10 @@ struct FMHAFwdMainloopXe2<
142146
int page_size = 0;
143147
int max_pages_per_seq = 0;
144148
int total_seqlen_kv = 0;
149+
const typename TensorQ::element_type* rotary_cos = nullptr;
150+
const typename TensorQ::element_type* rotary_sin = nullptr;
151+
int rotary_dim = 0;
152+
bool is_rotary_interleaved = true;
145153
};
146154

147155
struct LocalMaskFields {
@@ -165,6 +173,14 @@ struct FMHAFwdMainloopXe2<
165173
};
166174
struct EmptyPaged {};
167175

176+
struct RotaryFields {
177+
const typename TensorQ::element_type* rotary_cos = nullptr;
178+
const typename TensorQ::element_type* rotary_sin = nullptr;
179+
int rotary_dim = 0;
180+
bool is_rotary_interleaved = true;
181+
};
182+
struct EmptyRotary {};
183+
168184
// Kernel-facing parameters
169185
struct Params {
170186
ElementS scale;
@@ -174,6 +190,8 @@ struct FMHAFwdMainloopXe2<
174190
dropout_fields;
175191
[[no_unique_address]] conditional_t<PagedKV, PagedKVFields, EmptyPaged>
176192
paged;
193+
[[no_unique_address]] conditional_t<HasRotary, RotaryFields, EmptyRotary>
194+
rotary;
177195
};
178196

179197
// SLM data
@@ -209,6 +227,10 @@ struct FMHAFwdMainloopXe2<
209227
p.paged = {args.ptr_page_table, args.page_size,
210228
args.max_pages_per_seq, args.total_seqlen_kv};
211229
}
230+
if constexpr (HasRotary) {
231+
p.rotary = {args.rotary_cos, args.rotary_sin,
232+
args.rotary_dim, args.is_rotary_interleaved};
233+
}
212234
return p;
213235
}
214236

@@ -236,7 +258,9 @@ struct FMHAFwdMainloopXe2<
236258
int& tile_row_idx,
237259
const int& rows_of_maxima,
238260
int head_q,
239-
int num_heads) {
261+
int num_heads,
262+
int q_offset_sg,
263+
int rotary_base) {
240264
using namespace sycl::ext::oneapi::this_work_item;
241265

242266
auto tile_shape_v =
@@ -387,6 +411,38 @@ struct FMHAFwdMainloopXe2<
387411
CUTLASS_PRAGMA_UNROLL
388412
for (int D = 0; D < size<4>(tKgK); D++) {
389413
copy(copy_q, tQgQ(_, _, _, D), tQrQ);
414+
if constexpr (HasRotary) {
415+
if (params.rotary.rotary_dim > 0 &&
416+
params.rotary.rotary_cos != nullptr &&
417+
params.rotary.rotary_sin != nullptr) {
418+
auto tQrQ_coords = tQrQ.tv_layout();
419+
int lane_id = static_cast<int>(get_sub_group().get_local_linear_id());
420+
int q_tile_base = get<0>(blk_qv) * get<0>(TileShapeQK{}) + q_offset_sg;
421+
int dim_tile_base = D * get<2>(TileShapeQK{});
422+
CUTLASS_PRAGMA_UNROLL
423+
for (int i = 0; i < tQrQ.size(); ++i) {
424+
auto value_coord = idx2crd(
425+
i, make_shape(
426+
get<1>(shape(tQrQ_coords)),
427+
get<2>(shape(tQrQ_coords))));
428+
auto coord = tQrQ_coords(
429+
make_coord(lane_id, get<0>(value_coord), get<1>(value_coord)));
430+
int row = q_tile_base + get<0>(coord);
431+
int dim = dim_tile_base + get<1>(coord);
432+
if (row < seq_len_qo && dim < params.rotary.rotary_dim) {
433+
int pair_dim = rotary_pair_dim(
434+
dim, params.rotary.rotary_dim,
435+
params.rotary.is_rotary_interleaved);
436+
int position = rotary_base + ((CausalMask || LocalMask) ? row : 0);
437+
tQrQ(i) = apply_rotary_scalar(
438+
tQrQ(i), Q_2D(row, pair_dim), params.rotary.rotary_cos,
439+
params.rotary.rotary_sin, position, dim,
440+
params.rotary.rotary_dim,
441+
params.rotary.is_rotary_interleaved);
442+
}
443+
}
444+
}
445+
}
390446
copy(copy_k, tKgK_cache(_, _, _, D), tKrK);
391447
reorder(tQrQ, tSrQ);
392448
reorder(tKrK, tSrK);

flash-attn2/flash_attn_xpu/src/create_instantiation_files.sh

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -113,6 +113,35 @@ template void policy_dispatch_${dtype}<
113113
0, 1>(
114114
sycl::queue& queue,
115115
const fmha_fwd_args_t& args);
116+
117+
// Rotary kvcache variants keep rotary code out of non-rotary kernels.
118+
template void policy_dispatch_${dtype}<
119+
prefill_policy_head${hdim},
120+
PipelineStages_Prefill,
121+
0, 0, true>(
122+
sycl::queue& queue,
123+
const fmha_fwd_args_t& args);
124+
125+
template void policy_dispatch_${dtype}<
126+
decode_policy_head${hdim},
127+
PipelineStages_Decode,
128+
0, 0, true>(
129+
sycl::queue& queue,
130+
const fmha_fwd_args_t& args);
131+
132+
template void policy_dispatch_${dtype}<
133+
prefill_policy_head${hdim},
134+
PipelineStages_Prefill,
135+
0, 1, true>(
136+
sycl::queue& queue,
137+
const fmha_fwd_args_t& args);
138+
139+
template void policy_dispatch_${dtype}<
140+
decode_paged_policy_head${hdim},
141+
PipelineStages_Decode,
142+
0, 1, true>(
143+
sycl::queue& queue,
144+
const fmha_fwd_args_t& args);
116145
ENDFILE
117146
echo " Created flash_fwd_hdim${hdim}_kvcache_paged_${dtype}.cpp"
118147
done

flash-attn2/flash_attn_xpu/src/flash_fwd_hdim128_kvcache_paged_bf16.cpp

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,3 +18,32 @@ template void policy_dispatch_bf16<
1818
0, 1>(
1919
sycl::queue& queue,
2020
const fmha_fwd_args_t& args);
21+
22+
// Rotary kvcache variants keep rotary code out of non-rotary kernels.
23+
template void policy_dispatch_bf16<
24+
prefill_policy_head128,
25+
PipelineStages_Prefill,
26+
0, 0, true>(
27+
sycl::queue& queue,
28+
const fmha_fwd_args_t& args);
29+
30+
template void policy_dispatch_bf16<
31+
decode_policy_head128,
32+
PipelineStages_Decode,
33+
0, 0, true>(
34+
sycl::queue& queue,
35+
const fmha_fwd_args_t& args);
36+
37+
template void policy_dispatch_bf16<
38+
prefill_policy_head128,
39+
PipelineStages_Prefill,
40+
0, 1, true>(
41+
sycl::queue& queue,
42+
const fmha_fwd_args_t& args);
43+
44+
template void policy_dispatch_bf16<
45+
decode_paged_policy_head128,
46+
PipelineStages_Decode,
47+
0, 1, true>(
48+
sycl::queue& queue,
49+
const fmha_fwd_args_t& args);

flash-attn2/flash_attn_xpu/src/flash_fwd_hdim128_kvcache_paged_fp16.cpp

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,3 +18,32 @@ template void policy_dispatch_fp16<
1818
0, 1>(
1919
sycl::queue& queue,
2020
const fmha_fwd_args_t& args);
21+
22+
// Rotary kvcache variants keep rotary code out of non-rotary kernels.
23+
template void policy_dispatch_fp16<
24+
prefill_policy_head128,
25+
PipelineStages_Prefill,
26+
0, 0, true>(
27+
sycl::queue& queue,
28+
const fmha_fwd_args_t& args);
29+
30+
template void policy_dispatch_fp16<
31+
decode_policy_head128,
32+
PipelineStages_Decode,
33+
0, 0, true>(
34+
sycl::queue& queue,
35+
const fmha_fwd_args_t& args);
36+
37+
template void policy_dispatch_fp16<
38+
prefill_policy_head128,
39+
PipelineStages_Prefill,
40+
0, 1, true>(
41+
sycl::queue& queue,
42+
const fmha_fwd_args_t& args);
43+
44+
template void policy_dispatch_fp16<
45+
decode_paged_policy_head128,
46+
PipelineStages_Decode,
47+
0, 1, true>(
48+
sycl::queue& queue,
49+
const fmha_fwd_args_t& args);

0 commit comments

Comments (0)