InfiniTensor
diff --git a/‎README.md‎
Lines changed: 15 additions & 0 deletions b/‎README.md‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎include/infiniccl.h‎
Lines changed: 3 additions & 3 deletions b/‎include/infiniccl.h‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎include/infinicore.h‎
Lines changed: 2 additions & 2 deletions b/‎include/infinicore.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎include/infinicore/adaptor/aten_adaptor.hpp‎
Lines changed: 48 additions & 0 deletions b/‎include/infinicore/adaptor/aten_adaptor.hpp‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎include/infinicore/adaptor/flash_attention_adaptor.hpp‎
Lines changed: 114 additions & 0 deletions b/‎include/infinicore/adaptor/flash_attention_adaptor.hpp‎
Lines changed: 114 additions & 0 deletions
diff --git a/‎include/infinicore/ops/mha_varlen.hpp‎
Lines changed: 46 additions & 0 deletions b/‎include/infinicore/ops/mha_varlen.hpp‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎include/infiniop/handle.h‎
Lines changed: 2 additions & 2 deletions b/‎include/infiniop/handle.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎include/infiniop/operator_descriptor.h‎
Lines changed: 2 additions & 2 deletions b/‎include/infiniop/operator_descriptor.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎include/infiniop/ops/add.h‎
Lines changed: 4 additions & 4 deletions b/‎include/infiniop/ops/add.h‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎include/infiniop/ops/add_rms_norm.h‎
Lines changed: 4 additions & 4 deletions b/‎include/infiniop/ops/add_rms_norm.h‎
Lines changed: 4 additions & 4 deletions
@@ -107,6 +107,7 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
 | `--ali-ppu=[y\|n]`       | 是否编译阿里 PPU 接口实现         | n
 | `--ninetoothed=[y\|n]`   | 是否编译九齿实现                 | n
 | `--ccl=[y\|n]`           | 是否编译 InfiniCCL 通信库接口实现 | n
+| `--graph=[y\|n]`         | 是否编译 cuda graph 接口实现      | n
 
 ##### 手动安装底层库
 
@@ -154,6 +155,20 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
      xmake f --ascend-npu=true -cv
      ```
 
+##### 试验功能 -- 使用flash attention库中的算子
+
+  ```shell
+
+  (1) 在third_party目录拉取cutlass和flash attn库的源码(不需要--recursive)
+
+  (2) 设置(1)中cutlass路径的环境变量CUTLASS_ROOT
+
+  (3) xmake配置环节额外打开 --aten 开关，并设置 --flash-attn 库位置，例：
+      xmake f --nv-gpu=y --ccl=y --cuda=$CUDA_HOME --aten=y --flash-attn=<path-to>/InfiniCore/third_party/flash-attention -cv
+
+  (4) flash attention库会伴随infinicore_cpp_api一同编译安装
+  ```
+
 2. 编译安装
 
    默认安装路径为 `$HOME/.infini`。
 
@@ -15,15 +15,15 @@ struct InfinicclComm;
 
 typedef struct InfinicclComm *infinicclComm_t;
 
-__C __export infiniStatus_t infinicclCommInitAll(
+__INFINI_C __export infiniStatus_t infinicclCommInitAll(
     infiniDevice_t device_type,
     infinicclComm_t *comms,
     int ndevice,
     const int *device_ids);
 
-__C __export infiniStatus_t infinicclCommDestroy(infinicclComm_t comm);
+__INFINI_C __export infiniStatus_t infinicclCommDestroy(infinicclComm_t comm);
 
-__C __export infiniStatus_t infinicclAllReduce(
+__INFINI_C __export infiniStatus_t infinicclAllReduce(
     void *sendbuf,
     void *recvbuf,
     size_t count,
 
@@ -10,10 +10,10 @@
 #endif
 
 #ifdef __cplusplus
-#define __C extern "C"
+#define __INFINI_C extern "C"
 #include <cstddef>
 #else
-#define __C
+#define __INFINI_C
 #include <stddef.h>
 #endif
 
 
@@ -0,0 +1,48 @@
+#ifdef ENABLE_ATEN
+#pragma once
+#include "../context/context.hpp"
+#include "../tensor.hpp"
+
+#include <ATen/ATen.h>
+
+#ifdef ENABLE_NVIDIA_API
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#endif
+
+namespace infinicore::adaptor {
+inline at::ScalarType to_at_dtype(DataType dtype) { // Map an infinicore DataType onto the corresponding ATen scalar type.
+    switch (dtype) {
+    case DataType::F32:
+        return at::kFloat;
+    case DataType::F16:
+        return at::kHalf;
+    case DataType::BF16:
+        return at::kBFloat16;
+    case DataType::I32:
+        return at::kInt;
+    case DataType::I64:
+        return at::kLong;
+    default: // Any dtype not listed above is rejected instead of being converted silently.
+        throw std::runtime_error("Unsupported dtype for ATen");
+    }
+}
+
+inline at::Device to_at_device(const Device &device) { // Translate an infinicore Device into an at::Device.
+    if (device.getType() == Device::Type::NVIDIA) {
+        return at::Device(at::kCUDA, device.getIndex()); // NVIDIA maps to CUDA and keeps the device index.
+    } else if (device.getType() == Device::Type::CPU) {
+        return at::Device(at::kCPU); // The CPU device is constructed without an index.
+    } else {
+        throw std::runtime_error("Unsupported device type for ATen"); // Only NVIDIA and CPU are handled here.
+    }
+}
+
+at::Tensor to_aten_tensor(const infinicore::Tensor &t);
+
+#ifdef ENABLE_NVIDIA_API
+c10::cuda::CUDAStream get_cuda_stream();
+#endif
+} // namespace infinicore::adaptor
+
+#endif // ENABLE_ATEN
@@ -0,0 +1,114 @@
+#ifdef ENABLE_FLASH_ATTN
+#pragma once
+#include "aten_adaptor.hpp"
+
+namespace flash {
+std::vector<at::Tensor>
+mha_fwd(at::Tensor &q,                            // batch_size x seqlen_q x num_heads x round_multiple(head_size, 8)
+        const at::Tensor &k,                      // batch_size x seqlen_k x num_heads_k x round_multiple(head_size, 8)
+        const at::Tensor &v,                      // batch_size x seqlen_k x num_heads_k x round_multiple(head_size, 8)
+        std::optional<at::Tensor> &out_,          // batch_size x seqlen_q x num_heads x round_multiple(head_size, 8)
+        std::optional<at::Tensor> &alibi_slopes_, // num_heads or batch_size x num_heads
+        const float p_dropout,
+        const float softmax_scale,
+        bool is_causal,
+        int window_size_left,
+        int window_size_right,
+        const float softcap,
+        const bool return_softmax,
+        std::optional<at::Generator> gen_);
+
+std::vector<at::Tensor>
+mha_varlen_fwd(at::Tensor &q,                               // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+               const at::Tensor &k,                         // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table.
+               const at::Tensor &v,                         // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table.
+               std::optional<at::Tensor> &out_,             // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+               const at::Tensor &cu_seqlens_q,              // b+1
+               const at::Tensor &cu_seqlens_k,              // b+1
+               std::optional<at::Tensor> &seqused_k,        // b. If given, only this many elements of each batch element's keys are used.
+               std::optional<const at::Tensor> &leftpad_k_, // batch_size
+               std::optional<at::Tensor> &block_table_,     // batch_size x max_num_blocks_per_seq
+               std::optional<at::Tensor> &alibi_slopes_,    // num_heads or b x num_heads
+               int max_seqlen_q,
+               const int max_seqlen_k,
+               const float p_dropout,
+               const float softmax_scale,
+               const bool zero_tensors,
+               bool is_causal,
+               int window_size_left,
+               int window_size_right,
+               const float softcap,
+               const bool return_softmax,
+               std::optional<at::Generator> gen_);
+
+std::vector<at::Tensor>
+mha_bwd(const at::Tensor &dout,                   // batch_size x seqlen_q x num_heads x multiple_of(head_size_og, 8)
+        const at::Tensor &q,                      // batch_size x seqlen_q x num_heads x head_size
+        const at::Tensor &k,                      // batch_size x seqlen_k x num_heads_k x head_size
+        const at::Tensor &v,                      // batch_size x seqlen_k x num_heads_k x head_size
+        const at::Tensor &out,                    // batch_size x seqlen_q x num_heads x head_size
+        const at::Tensor &softmax_lse,            // b x h x seqlen_q
+        std::optional<at::Tensor> &dq_,           // batch_size x seqlen_q x num_heads x head_size
+        std::optional<at::Tensor> &dk_,           // batch_size x seqlen_k x num_heads_k x head_size
+        std::optional<at::Tensor> &dv_,           // batch_size x seqlen_k x num_heads_k x head_size
+        std::optional<at::Tensor> &alibi_slopes_, // num_heads or batch_size x num_heads
+        const float p_dropout,                    // probability to drop
+        const float softmax_scale,
+        const bool is_causal,
+        int window_size_left,
+        int window_size_right,
+        const float softcap,
+        const bool deterministic,
+        std::optional<at::Generator> gen_,
+        std::optional<at::Tensor> &rng_state);
+
+std::vector<at::Tensor>
+mha_varlen_bwd(const at::Tensor &dout,                   // total_q x num_heads x head_size
+               const at::Tensor &q,                      // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+               const at::Tensor &k,                      // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
+               const at::Tensor &v,                      // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
+               const at::Tensor &out,                    // total_q x num_heads x head_size
+               const at::Tensor &softmax_lse,            // h x total_q, softmax logsumexp
+               std::optional<at::Tensor> &dq_,           // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
+               std::optional<at::Tensor> &dk_,           // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
+               std::optional<at::Tensor> &dv_,           // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i
+               const at::Tensor &cu_seqlens_q,           // b+1
+               const at::Tensor &cu_seqlens_k,           // b+1
+               std::optional<at::Tensor> &alibi_slopes_, // num_heads or b x num_heads
+               const int max_seqlen_q,
+               const int max_seqlen_k, // max sequence length to choose the kernel
+               const float p_dropout,  // probability to drop
+               const float softmax_scale,
+               const bool zero_tensors,
+               const bool is_causal,
+               int window_size_left,
+               int window_size_right,
+               const float softcap,
+               const bool deterministic,
+               std::optional<at::Generator> gen_,
+               std::optional<at::Tensor> &rng_state);
+
+std::vector<at::Tensor>
+mha_fwd_kvcache(at::Tensor &q,                                     // batch_size x seqlen_q x num_heads x head_size
+                const at::Tensor &kcache,                          // batch_size_c x seqlen_k x num_heads_k x head_size or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table.
+                const at::Tensor &vcache,                          // batch_size_c x seqlen_k x num_heads_k x head_size or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table.
+                std::optional<const at::Tensor> &k_,               // batch_size x seqlen_knew x num_heads_k x head_size
+                std::optional<const at::Tensor> &v_,               // batch_size x seqlen_knew x num_heads_k x head_size
+                std::optional<const at::Tensor> &seqlens_k_,       // batch_size
+                std::optional<const at::Tensor> &rotary_cos_,      // seqlen_ro x (rotary_dim / 2)
+                std::optional<const at::Tensor> &rotary_sin_,      // seqlen_ro x (rotary_dim / 2)
+                std::optional<const at::Tensor> &cache_batch_idx_, // indices to index into the KV cache
+                std::optional<const at::Tensor> &leftpad_k_,       // batch_size
+                std::optional<at::Tensor> &block_table_,           // batch_size x max_num_blocks_per_seq
+                std::optional<at::Tensor> &alibi_slopes_,          // num_heads or batch_size x num_heads
+                std::optional<at::Tensor> &out_,                   // batch_size x seqlen_q x num_heads x head_size
+                const float softmax_scale,
+                bool is_causal,
+                int window_size_left,
+                int window_size_right,
+                const float softcap,
+                bool is_rotary_interleaved, // if true, rotary combines indices 0 & 1, else indices 0 & rotary_dim / 2
+                int num_splits);
+
+} // namespace flash
+#endif // ENABLE_FLASH_ATTN
@@ -0,0 +1,46 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+#include <optional>
+
+namespace infinicore::op {
+
+INFINICORE_GRAPH_OP_CLASS(
+    MultiheadAttentionVarlen,
+    Tensor,
+    const Tensor &,
+    const Tensor &,
+    const Tensor &,
+    const Tensor &,
+    const Tensor &,
+    const Tensor &,
+    int,
+    int,
+    std::optional<Tensor>,
+    float);
+
+Tensor mha_varlen(const Tensor &q,
+                  const Tensor &k,
+                  const Tensor &v,
+                  const Tensor &cum_seqlens_q,
+                  const Tensor &cum_seqlens_k,
+                  const Tensor &block_table,
+                  int max_seqlen_q,
+                  int max_seqlen_k,
+                  std::optional<Tensor> alibi_slopes,
+                  float scale);
+
+void mha_varlen_(Tensor out,
+                 const Tensor &q,
+                 const Tensor &k,
+                 const Tensor &v,
+                 const Tensor &cum_seqlens_q,
+                 const Tensor &cum_seqlens_k,
+                 const Tensor &block_table,
+                 int max_seqlen_q,
+                 int max_seqlen_k,
+                 std::optional<Tensor> alibi_slopes,
+                 float scale);
+
+} // namespace infinicore::op
@@ -7,8 +7,8 @@ struct InfiniopHandle;
 
 typedef struct InfiniopHandle *infiniopHandle_t;
 
-__C __export infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr);
+__INFINI_C __export infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr);
 
-__C __export infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle);
+__INFINI_C __export infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle);
 
 #endif
@@ -7,7 +7,7 @@
 // Base descriptor for all operators
 struct InfiniopDescriptor;
 
-__C __export infiniStatus_t infiniopGetDescriptorDeviceType(const struct InfiniopDescriptor *desc_ptr, infiniDevice_t *device_type);
-__C __export infiniStatus_t infiniopGetDescriptorDeviceId(const struct InfiniopDescriptor *desc_ptr, int *device_id);
+__INFINI_C __export infiniStatus_t infiniopGetDescriptorDeviceType(const struct InfiniopDescriptor *desc_ptr, infiniDevice_t *device_type);
+__INFINI_C __export infiniStatus_t infiniopGetDescriptorDeviceId(const struct InfiniopDescriptor *desc_ptr, int *device_id);
 
 #endif //__INFINIOP_OPERATOR_DESCRIPTOR_API_H__
@@ -5,22 +5,22 @@
 
 typedef struct InfiniopDescriptor *infiniopAddDescriptor_t;
 
-__C __export infiniStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle,
+__INFINI_C __export infiniStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle,
                                                         infiniopAddDescriptor_t *desc_ptr,
                                                         infiniopTensorDescriptor_t c,
                                                         infiniopTensorDescriptor_t a,
                                                         infiniopTensorDescriptor_t b);
 
-__C __export infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, size_t *size);
+__INFINI_C __export infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, size_t *size);
 
-__C __export infiniStatus_t infiniopAdd(infiniopAddDescriptor_t desc,
+__INFINI_C __export infiniStatus_t infiniopAdd(infiniopAddDescriptor_t desc,
                                         void *workspace,
                                         size_t workspace_size,
                                         void *c,
                                         const void *a,
                                         const void *b,
                                         void *stream);
 
-__C __export infiniStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc);
+__INFINI_C __export infiniStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc);
 
 #endif
@@ -5,7 +5,7 @@
 
 typedef struct InfiniopDescriptor *infiniopAddRMSNormDescriptor_t;
 
-__C __export infiniStatus_t infiniopCreateAddRMSNormDescriptor(
+__INFINI_C __export infiniStatus_t infiniopCreateAddRMSNormDescriptor(
     infiniopHandle_t handle,
     infiniopAddRMSNormDescriptor_t *desc_ptr,
     infiniopTensorDescriptor_t y_desc,
@@ -15,9 +15,9 @@ __C __export infiniStatus_t infiniopCreateAddRMSNormDescriptor(
     infiniopTensorDescriptor_t weight_desc,
     float epsilon);
 
-__C __export infiniStatus_t infiniopGetAddRMSNormWorkspaceSize(infiniopAddRMSNormDescriptor_t desc, size_t *size);
+__INFINI_C __export infiniStatus_t infiniopGetAddRMSNormWorkspaceSize(infiniopAddRMSNormDescriptor_t desc, size_t *size);
 
-__C __export infiniStatus_t infiniopAddRMSNorm(infiniopAddRMSNormDescriptor_t desc,
+__INFINI_C __export infiniStatus_t infiniopAddRMSNorm(infiniopAddRMSNormDescriptor_t desc,
                                                 void *workspace,
                                                 size_t workspace_size,
                                                 void *y,
@@ -27,6 +27,6 @@ __C __export infiniStatus_t infiniopAddRMSNorm(infiniopAddRMSNormDescriptor_t de
                                                 const void *weight,
                                                 void *stream);
 
-__C __export infiniStatus_t infiniopDestroyAddRMSNormDescriptor(infiniopAddRMSNormDescriptor_t desc);
+__INFINI_C __export infiniStatus_t infiniopDestroyAddRMSNormDescriptor(infiniopAddRMSNormDescriptor_t desc);
 
 #endif