xiaolil1
diff --git a/CMakeLists.txt b/CMakeLists.txt
Lines changed: 10 additions & 2 deletions
diff --git a/csrc/xpu_cutlass_fusion.cpp b/csrc/xpu_cutlass_fusion.cpp
Lines changed: 153 additions & 363 deletions
diff --git a/include/cute/algorithm/cooperative_gemm.hpp b/include/cute/algorithm/cooperative_gemm.hpp
Lines changed: 49 additions & 28 deletions
diff --git a/include/cute/algorithm/tuple_algorithms.hpp b/include/cute/algorithm/tuple_algorithms.hpp
Lines changed: 4 additions & 24 deletions
diff --git a/include/cute/arch/cluster_sm90.hpp b/include/cute/arch/cluster_sm90.hpp
Lines changed: 1 addition & 0 deletions
diff --git a/include/cute/arch/config.hpp b/include/cute/arch/config.hpp
Lines changed: 60 additions & 2 deletions
@@ -28,7 +28,8 @@ set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
 set(HIP_FILES csrc/ops.hip csrc/kernels.hip)
 set(MPS_FILES csrc/mps_ops.mm)
 set(METAL_FILES csrc/mps_kernels.metal)
-set(XPU_FILES csrc/xpu_ops.cpp csrc/xpu_kernels.cpp csrc/xpu_cutlass.cpp csrc/xpu_cutlass-cute.cpp csrc/xpu_cutlass_fusion.cpp)
+# NOTE(review): csrc/xpu_cutlass.cpp and csrc/xpu_cutlass-cute.cpp are no longer listed in XPU_FILES; confirm the XPU build does not need them.
+set(XPU_FILES csrc/xpu_ops.cpp csrc/xpu_kernels.cpp csrc/xpu_cutlass_fusion.cpp)
 # C++ sources are always included
 list(APPEND SRC_FILES ${CPP_FILES})
 
@@ -312,7 +313,14 @@ if(BUILD_MPS)
     target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
 endif()
 if(BUILD_XPU)
-    set(SYCL_LINK_FLAGS "-fsycl;--offload-compress;-fsycl-targets=intel_gpu_pvc;-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier;-Xs; -options ' -cl-intel-enable-auto-large-GRF-mode -cl-poison-unsupported-fp64-kernels -cl-intel-greater-than-4GB-buffer-required'")
+    set(SYCL_FLAGS  # NOTE(review): replaces SYCL_LINK_FLAGS -- confirm every consumer now reads SYCL_FLAGS
+      -fsycl
+      --offload-compress
+      -fsycl-targets=intel_gpu_pvc
+      -Xspirv-translator -spirv-ext=+SPV_INTEL_split_barrier,+SPV_INTEL_2d_block_io,+SPV_INTEL_subgroup_matrix_multiply_accumulate  # SYCL_COMPILE_FLAGS still enables only SPV_INTEL_split_barrier -- TODO confirm that is intended
+      -Xs
+      -options "-cl-intel-enable-auto-large-GRF-mode -cl-poison-unsupported-fp64-kernels -cl-intel-greater-than-4GB-buffer-required"  # quoted so the three -cl-* options stay a single list element
+    )
     set(SYCL_COMPILE_FLAGS "-fsycl;-fhonor-nans;-fhonor-infinities;-fno-associative-math;-fno-approx-func;-fno-sycl-instrument-device-code;--offload-compress;-fsycl-targets=intel_gpu_pvc;-Xspirv-translator;-spirv-ext=+SPV_INTEL_split_barrier;")
 
     set_property(TARGET bitsandbytes PROPERTY CXX_STANDARD 20)
 
@@ -98,19 +98,23 @@ epilogue_predication(ThrMMA<Args...>    const& thr_mma,
   }
 }
 
-template<class Alpha, class TRC, class RCLayout,
+template<class ... Args, 
+         class Alpha, class TRC, class RCLayout,
          class Beta, class TSC, class SCLayout,
          class CLoadTransformOp, class CStoreTransformOp,
-         class SmemCopyOpC>
+         class SmemCopyLdOpC, class SmemCopyStOpC>
 CUTE_HOST_DEVICE
 void
-epilogue_no_predication(Alpha              const& alpha,
+epilogue_no_predication(uint32_t                   thread_idx,
+                        ThrMMA<Args...>     const& thr_mma,
+                        Alpha              const& alpha,
                         Tensor<TRC, RCLayout>   & tCrC,
                         Beta               const& beta,
-                        Tensor<TSC, SCLayout>   & tCsC,
+                        Tensor<TSC, SCLayout>   & sC,
                         CLoadTransformOp   const& sC_load_op,  // transforms C values before use in GEMM
                         CStoreTransformOp  const& sC_store_op, // transforms results before they are stored to C
-                        SmemCopyOpC        const& sC_copy_op)
+                        SmemCopyLdOpC      const& sC_copy_ld_op,
+                        SmemCopyStOpC      const& sC_copy_st_op)
 {
   using InputTypeC   = typename TSC::value_type;
   using ComputeTypeC = typename TRC::value_type;
@@ -125,18 +129,33 @@ epilogue_no_predication(Alpha              const& alpha,
     CUTE_GCC_UNREACHABLE;
   } ();
 
-  Tensor tCrDi = make_fragment_like(tCsC);
   Tensor tCrD = make_fragment_like(tCrC);
+  Tensor tCrDi = make_fragment_like<InputTypeC>(tCrD);
+
   if(!isBetaZero) {
-    copy(sC_copy_op, tCsC, tCrDi);
+    auto smem_tiled_copy_C = make_tiled_copy_C(Copy_Atom<SmemCopyLdOpC, InputTypeC>{}, thr_mma);
+    auto smem_thr_copy_C   = smem_tiled_copy_C.get_thread_slice(thread_idx);
+    Tensor tCsC            = smem_thr_copy_C.partition_S(sC);
+    Tensor tCrDi_copy_view = smem_thr_copy_C.retile_D(tCrDi);
+    CUTE_STATIC_ASSERT_V(size<1>(tCsC) == size<1>(tCrDi_copy_view));             // CPY_M
+    CUTE_STATIC_ASSERT_V(size<2>(tCsC) == size<2>(tCrDi_copy_view));             // CPY_N
+    copy(smem_tiled_copy_C, tCsC, tCrDi_copy_view);
+
     // Transform C on/after load
     cute::transform(tCrDi, tCrD, sC_load_op);
   }
   // C = alpha * (A * B) + beta * C
   axpby(alpha, tCrC, beta, tCrD);
   // Transform C before/on store
   cute::transform(tCrD, tCrDi, sC_store_op);
-  copy(sC_copy_op, tCrDi, tCsC);
+
+  auto smem_tiled_copy_C = make_tiled_copy_C(Copy_Atom<SmemCopyStOpC, InputTypeC>{}, thr_mma);
+  auto smem_thr_copy_C   = smem_tiled_copy_C.get_thread_slice(thread_idx);
+  Tensor tCsC            = smem_thr_copy_C.partition_D(sC);
+  Tensor tCrDi_copy_view = smem_thr_copy_C.retile_S(tCrDi);
+  CUTE_STATIC_ASSERT_V(size<1>(tCsC) == size<1>(tCrDi_copy_view));             // CPY_M
+  CUTE_STATIC_ASSERT_V(size<2>(tCsC) == size<2>(tCrDi_copy_view));             // CPY_N
+  copy(smem_tiled_copy_C, tCrDi_copy_view, tCsC);
 }
 
 // Predicated Cooperative GEMM
@@ -283,23 +302,23 @@ cooperative_gemm_no_predication(uint32_t                   thread_idx,
 
   // Create register tensors for the MMA to operate on
   Tensor tCrA  = thr_mma.partition_fragment_A(sA);                    // (MMA,MMA_M,MMA_K)
+  Tensor tCrAi = make_fragment_like<InputTypeA>(tCrA);
   Tensor tCrB  = thr_mma.partition_fragment_B(sB);                    // (MMA,MMA_N,MMA_K)
+  Tensor tCrBi = make_fragment_like<InputTypeB>(tCrB);
 
   using CopyOpAType = SmemCopyOpA;
   using CopyOpBType = SmemCopyOpB;
 
   auto smem_tiled_copy_A = make_tiled_copy_A(Copy_Atom<CopyOpAType, InputTypeA>{}, thr_mma);
   auto smem_thr_copy_A   = smem_tiled_copy_A.get_thread_slice(thread_idx);
   Tensor tCsA            = smem_thr_copy_A.partition_S(sA);
-  Tensor tCrAi           = make_fragment_like(tCsA);
   Tensor tCrAi_copy_view = smem_thr_copy_A.retile_D(tCrAi);
   CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrAi_copy_view));             // CPY_M
   CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrAi_copy_view));             // CPY_K
 
   auto smem_tiled_copy_B = make_tiled_copy_B(Copy_Atom<CopyOpBType, InputTypeB>{}, thr_mma);
   auto smem_thr_copy_B   = smem_tiled_copy_B.get_thread_slice(thread_idx);
   Tensor tCsB            = smem_thr_copy_B.partition_S(sB);
-  Tensor tCrBi           = make_fragment_like(tCsB);
   Tensor tCrBi_copy_view = smem_thr_copy_B.retile_D(tCrBi);
   CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrBi_copy_view));            // CPY_N
   CUTE_STATIC_ASSERT_V(size<2>(tCsB) == size<2>(tCrBi_copy_view));            // CPY_K
@@ -346,7 +365,7 @@ template <class... Args,
           class ALoadTransformOp = cute::identity, class BLoadTransformOp  = cute::identity,
           class CLoadTransformOp = cute::identity, class CStoreTransformOp = cute::identity,
           class SmemCopyOpA = DefaultCopy, class SmemCopyOpB = DefaultCopy,
-          class SmemCopyOpC = DefaultCopy>
+          class SmemCopyLdOpC = DefaultCopy, class SmemCopyStOpC = DefaultCopy>
 CUTE_HOST_DEVICE
 void
 cooperative_gemm(uint32_t                   thread_idx,
@@ -356,13 +375,14 @@ cooperative_gemm(uint32_t                   thread_idx,
                  Tensor<TB, BLayout> const& sB,
                  Beta                const& beta,
                  Tensor<TC, CLayout>      & sC,
-                 ALoadTransformOp    const& sA_load_op  = {}, // transforms A values before use in GEMM
-                 BLoadTransformOp    const& sB_load_op  = {}, // transforms B values before use in GEMM
-                 CLoadTransformOp    const& sC_load_op  = {}, // transforms C values before use in GEMM
-                 CStoreTransformOp   const& sC_store_op = {}, // transforms results before they are stored to C
-                 SmemCopyOpA         const& sA_copy_op  = {},
-                 SmemCopyOpB         const& sB_copy_op  = {},
-                 SmemCopyOpC         const& sC_copy_op  = {})
+                 ALoadTransformOp    const& sA_load_op    = {}, // transforms A values before use in GEMM
+                 BLoadTransformOp    const& sB_load_op    = {}, // transforms B values before use in GEMM
+                 CLoadTransformOp    const& sC_load_op    = {}, // transforms C values before use in GEMM
+                 CStoreTransformOp   const& sC_store_op   = {}, // transforms results before they are stored to C
+                 SmemCopyOpA         const& sA_copy_op    = {},
+                 SmemCopyOpB         const& sB_copy_op    = {},
+                 SmemCopyLdOpC       const& sC_copy_ld_op = {},
+                 SmemCopyStOpC       const& sC_copy_st_op = {})
 {
   CUTE_STATIC_ASSERT_V(rank(sA) == Int<2>{});
   CUTE_STATIC_ASSERT_V(rank(sB) == Int<2>{});
@@ -394,7 +414,7 @@ cooperative_gemm(uint32_t                   thread_idx,
         thread_idx, thr_mma, sA, sB, tCrC, sA_load_op, sB_load_op, sA_copy_op, sB_copy_op
     );
     detail::epilogue_no_predication(
-        alpha, tCrC, beta, tCsC, sC_load_op, sC_store_op, sC_copy_op
+        thread_idx, thr_mma,alpha, tCrC, beta, sC, sC_load_op, sC_store_op, sC_copy_ld_op, sC_copy_st_op
     );
   } else {
     detail::cooperative_gemm_predication(
@@ -466,7 +486,7 @@ template <class... Args,
           class ALoadTransformOp = cute::identity, class BLoadTransformOp  = cute::identity,
           class CLoadTransformOp = cute::identity, class CStoreTransformOp = cute::identity,
           class SmemCopyOpA = DefaultCopy, class SmemCopyOpB = DefaultCopy,
-          class SmemCopyOpC = DefaultCopy>
+          class SmemCopyLdOpC = DefaultCopy, class SmemCopyStOpC = DefaultCopy>
 CUTE_HOST_DEVICE
 void
 cooperative_gemm(uint32_t thread_idx,
@@ -476,17 +496,18 @@ cooperative_gemm(uint32_t thread_idx,
                  Tensor<TB, BLayout> const& sB,
                  Beta                const& beta,
                  Tensor<TC, CLayout>     && sC,
-                 ALoadTransformOp    const& sA_load_op  = {}, // transforms A values before use in GEMM
-                 BLoadTransformOp    const& sB_load_op  = {}, // transforms B values before use in GEMM
-                 CLoadTransformOp    const& sC_load_op  = {}, // transforms C values before use in GEMM
-                 CStoreTransformOp   const& sC_store_op = {}, // transforms results before they are stored to C
-                 SmemCopyOpA         const& sA_copy_op  = {},
-                 SmemCopyOpB         const& sB_copy_op  = {},
-                 SmemCopyOpC         const& sC_copy_op  = {})
+                 ALoadTransformOp    const& sA_load_op    = {}, // transforms A values before use in GEMM
+                 BLoadTransformOp    const& sB_load_op    = {}, // transforms B values before use in GEMM
+                 CLoadTransformOp    const& sC_load_op    = {}, // transforms C values before use in GEMM
+                 CStoreTransformOp   const& sC_store_op   = {}, // transforms results before they are stored to C
+                 SmemCopyOpA         const& sA_copy_op    = {},
+                 SmemCopyOpB         const& sB_copy_op    = {},
+                 SmemCopyLdOpC       const& sC_copy_ld_op = {},
+                 SmemCopyStOpC       const& sC_copy_st_op = {})
 {
   cooperative_gemm(thread_idx, tiled_mma, alpha, sA, sB, beta, sC,
                    sA_load_op, sB_load_op, sC_load_op, sC_store_op,
-                   sA_copy_op, sB_copy_op, sC_copy_op);
+                   sA_copy_op, sB_copy_op, sC_copy_ld_op, sC_copy_st_op);
 }
 
 // Legacy overload of cute::gemm for backwards-compatibility
 
@@ -33,6 +33,7 @@
 #include <cute/config.hpp>
 
 #include <cute/util/type_traits.hpp>
+#include <cute/container/type_list.hpp>
 #include <cute/container/tuple.hpp>
 #include <cute/algorithm/functional.hpp>
 #include <cute/numeric/integer_sequence.hpp>
@@ -283,34 +284,13 @@ transform_leaf(T0 const& t0, T1 const& t1, F&& f)
 // find and find_if
 //
 
-namespace detail {
-
-template <class T, class F, int I, int... Is>
-CUTE_HOST_DEVICE constexpr
-auto
-find_if(T const& t, F&& f, seq<I,Is...>)
-{
-  if constexpr (decltype(f(get<I>(t)))::value) {
-    return cute::C<I>{};
-  } else
-  if constexpr (sizeof...(Is) == 0) {
-    return cute::C<I+1>{};
-  } else {
-    return find_if(t, f, seq<Is...>{});
-  }
-
-  CUTE_GCC_UNREACHABLE;
-}
-
-} // end namespace detail
-
 template <class T, class F>
 CUTE_HOST_DEVICE constexpr
 auto
 find_if(T const& t, F&& f)
 {
   if constexpr (is_tuple<T>::value) {
-    return detail::find_if(t, f, tuple_seq<T>{});
+    return detail::tapply(t, f, [] (auto... a) { return cute::C<find_true_v<decltype(a)::value...>>{}; }, tuple_seq<T>{});
   } else {
     return cute::C<decltype(f(t))::value ? 0 : 1>{};
   }
@@ -332,7 +312,7 @@ auto
 any_of(T const& t, F&& f)
 {
   if constexpr (is_tuple<T>::value) {
-    return detail::apply(cute::transform(t, f), [&] (auto const&... a) { return (false_type{} || ... || a); }, tuple_seq<T>{});
+    return detail::tapply(t, f, [] (auto... a) { return (false_type{} || ... || a); }, tuple_seq<T>{});
   } else {
     return f(t);
   }
@@ -346,7 +326,7 @@ auto
 all_of(T const& t, F&& f)
 {
   if constexpr (is_tuple<T>::value) {
-    return detail::apply(cute::transform(t, f), [&] (auto const&... a) { return (true_type{} && ... && a); }, tuple_seq<T>{});
+    return detail::tapply(t, f, [] (auto... a) { return (true_type{} && ... && a); }, tuple_seq<T>{});
   } else {
     return f(t);
   }
 
@@ -31,6 +31,7 @@
 #pragma once
 
 #include <cute/config.hpp>
+#include <cute/numeric/numeric_types.hpp>
 
 // Config
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && \
 
@@ -72,6 +72,27 @@
 #  define CUTE_ARCH_TCGEN05_F16BF16_MMA_SCALED_ENABLED
 #endif
 
+#if (defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101F_ENABLED))
+#  define CUTE_ARCH_TMA_SM90_ENABLED 
+#  define CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED
+#  define CUTE_ARCH_STSM_SM90_ENABLED
+#  define CUTE_ARCH_TCGEN05_TF32_MMA_ENABLED
+#  define CUTE_ARCH_TCGEN05_F16F32_MMA_ENABLED
+#  define CUTE_ARCH_TCGEN05_MXF8F6F4_MMA_ENABLED
+#  define CUTE_ARCH_TCGEN05_MXF4_MMA_ENABLED
+#  define CUTE_ARCH_TCGEN05_MXF4NVF4_MMA_ENABLED
+#endif
+
+#if defined(CUTLASS_ARCH_MMA_SM100F_ENABLED)
+#  define CUTE_ARCH_TCGEN05_F16BF16_MMA_SCALED_ENABLED
+#endif
+
+#if (defined(CUTLASS_ARCH_MMA_SM120F_ENABLED))
+#  define CUTE_ARCH_TMA_SM90_ENABLED
+#  define CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED
+#  define CUTE_ARCH_STSM_SM90_ENABLED
+#endif
+
 #if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101A_ENABLED))
 #  define CUTE_ARCH_TCGEN05_S8_MMA_ENABLED
 #endif
@@ -91,8 +112,11 @@
 #endif
 
 // {add, mul, fma}.f32x2 PTX
-#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED))
-  #define CUTE_ARCH_FLOAT2_MATH_ENABLED
+#if defined(CUTLASS_ARCH_MMA_SM100_ENABLED) || defined(CUTLASS_ARCH_MMA_SM100A_ENABLED)
+   // Enable CuTe MMA Atoms
+#  define CUTE_ARCH_FFMA2_SM100_ENABLED
+   // Enable f32x2 PTX generation
+#  define CUTE_ARCH_FLOAT2_MATH_ENABLED
 #endif
 
 #if defined(CUTLASS_ARCH_MMA_SM120_ENABLED) || defined(CUTLASS_ARCH_MMA_SM120A_ENABLED)
@@ -109,3 +133,37 @@
 #  endif
 #endif
 
+#if defined(CUTLASS_ARCH_MMA_SM100F_ENABLED)
+#  define CUTE_ARCH_LDSM_SM100A_ENABLED
+#  define CUTE_ARCH_STSM_SM100A_ENABLED
+#  define CUTE_ARCH_TCGEN05_TMEM_ENABLED
+#  define CUTE_ARCH_TMA_SM100_ENABLED
+#  define CUTE_ARCH_FLOAT2_MATH_ENABLED
+#endif
+
+#if defined(CUTLASS_ARCH_MMA_SM101F_ENABLED) 
+#  define CUTE_ARCH_LDSM_SM100A_ENABLED
+#  define CUTE_ARCH_STSM_SM100A_ENABLED
+#  define CUTE_ARCH_TCGEN05_TMEM_ENABLED
+#  define CUTE_ARCH_TMA_SM100_ENABLED
+#endif
+
+#if defined(CUTLASS_ARCH_MMA_SM120F_ENABLED)
+#  define CUTE_ARCH_LDSM_SM100A_ENABLED
+#  define CUTE_ARCH_STSM_SM100A_ENABLED
+#endif
+
+#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) ||\
+     defined(CUTLASS_ARCH_MMA_SM101A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101F_ENABLED) ||\
+     defined(CUTLASS_ARCH_MMA_SM120A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM120F_ENABLED))
+#  if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9))
+#    define CUTE_ARCH_LOAD256_SM100A_ENABLED
+#    define CUTE_ARCH_STORE256_SM100A_ENABLED
+#  endif
+#endif
+
+// {add, mul, fma}.f32x2 PTX. NOTE(review): CUTE_ARCH_FLOAT2_MATH_ENABLED is also defined by the earlier SM100/SM100A f32x2 guard and by the SM100F guard above, so this guard looks redundant -- confirm before keeping it.
+#if defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM100F_ENABLED)
+  #define CUTE_ARCH_FLOAT2_MATH_ENABLED
+#endif
+