issue/932 - feat: add paged attention operator referencing nvidia implementation

spike-zhu · spike-zhu · commit 92b81acbcea4 · 2026-01-15T17:43:18.000+08:00
diff --git a/src/infiniop/ops/paged_attention/moore/paged_attention_moore.h b/src/infiniop/ops/paged_attention/moore/paged_attention_moore.h
@@ -0,0 +1,8 @@
+#ifndef __PAGED_ATTENTION_MOORE_H__
+#define __PAGED_ATTENTION_MOORE_H__
+
+#include "../paged_attention.h"
+
+DESCRIPTOR(moore) // expands DESCRIPTOR for the moore backend; presumably declares op::paged_attention::moore::Descriptor
+
+#endif // __PAGED_ATTENTION_MOORE_H__
diff --git a/src/infiniop/ops/paged_attention/moore/paged_attention_moore.mu b/src/infiniop/ops/paged_attention/moore/paged_attention_moore.mu
@@ -0,0 +1,145 @@
+#include <cub/block/block_reduce.cuh>
+
+#include "../../../devices/moore/moore_common.h"
+#include "../../../devices/moore/moore_kernel_common.h"
+
+#include "../../../reduce/cuda/reduce.cuh"
+#include "../cuda/kernel.cuh"
+#include "paged_attention_moore.h"
+
+// Kernel entry point: forwards every argument unchanged to op::paged_attention::cuda::pagedAttentionKernel.
+template <typename Tdata, typename Tcompute, size_t HEAD_SIZE, size_t NUM_THREADS>
+INFINIOP_MOORE_KERNEL pagedAttention(
+    Tdata *out, const Tdata *q, const Tdata *k_cache, const Tdata *v_cache,
+    const int64_t *block_tables, const int64_t *seq_lens, const float *alibi_slopes,
+    const size_t num_kv_heads, const float scale,
+    const size_t max_num_blocks_per_seq, const size_t block_size,
+    const ptrdiff_t q_stride, const ptrdiff_t kv_block_stride,
+    const ptrdiff_t kv_head_stride, const ptrdiff_t o_stride) {
+    op::paged_attention::cuda::pagedAttentionKernel<Tdata, Tcompute, HEAD_SIZE, NUM_THREADS>(
+        out, q, k_cache, v_cache, block_tables, seq_lens, alibi_slopes,
+        num_kv_heads, scale, max_num_blocks_per_seq, block_size,
+        q_stride, kv_block_stride, kv_head_stride, o_stride);
+}
+
+namespace op::paged_attention::moore {
+
+struct Descriptor::Opaque { // backend-private state attached to the descriptor
+    std::shared_ptr<device::moore::Handle::Internal> internal; // taken from the handle's internal() in Descriptor::create
+};
+
+Descriptor::~Descriptor() {
+    delete _opaque; // presumably the Opaque allocated with `new` in Descriptor::create — confirm against the constructor
+}
+
+// Builds the PagedAttentionInfo from the tensor descriptors and scale, then allocates a Descriptor that
+// carries it together with the moore handle's internal state, and stores the new Descriptor in *desc_ptr.
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle, Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t q_desc,
+    infiniopTensorDescriptor_t k_cache_desc, infiniopTensorDescriptor_t v_cache_desc,
+    infiniopTensorDescriptor_t block_tables_desc, infiniopTensorDescriptor_t seq_lens_desc,
+    const std::optional<infiniopTensorDescriptor_t> &alibi_slopes_desc, float scale) {
+    auto info = PagedAttentionInfo::create(
+        out_desc, q_desc, k_cache_desc, v_cache_desc,
+        block_tables_desc, seq_lens_desc, alibi_slopes_desc, scale);
+    CHECK_RESULT(info); // presumably returns the error status from here when info creation failed
+    auto *moore_handle = reinterpret_cast<device::moore::Handle *>(handle);
+    auto *opaque = new Opaque{moore_handle->internal()};
+    // The literal 0 is the third constructor argument (presumably the workspace size — confirm against DESCRIPTOR).
+    *desc_ptr = new Descriptor(opaque, info.take(), 0, handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Launches pagedAttention<T, float, HEAD_SIZE, NUM_THREADS> on `stream` with a (num_heads, num_seqs) grid of
+// NUM_THREADS-thread blocks. Returns BAD_TENSOR_DTYPE unless dtype is F16/BF16/F32, INTERNAL_ERROR on a launch error.
+template <size_t HEAD_SIZE, size_t NUM_THREADS>
+infiniStatus_t launchKernel(void *out, const void *q, const void *k_cache, const void *v_cache,
+                            infiniDtype_t dtype,
+                            const void *block_tables, const void *seq_lens, const void *alibi_slopes,
+                            size_t num_heads, size_t num_seqs,
+                            size_t num_kv_heads, float scale, size_t max_num_blocks_per_seq, size_t block_size,
+                            ptrdiff_t q_stride, ptrdiff_t kv_block_stride, ptrdiff_t kv_head_stride, ptrdiff_t o_stride,
+                            musaStream_t stream) {
+    dim3 grid(uint64_t(num_heads), uint64_t(num_seqs), 1);
+    dim3 block(NUM_THREADS);
+    // Dynamic shared memory per block: HEAD_SIZE + max_num_blocks_per_seq * block_size + 2 floats.
+    size_t shared_mem_size = (HEAD_SIZE + max_num_blocks_per_seq * block_size + 2) * sizeof(float);
+    if (dtype == INFINI_DTYPE_F16) {
+        pagedAttention<half, float, HEAD_SIZE, NUM_THREADS><<<grid, block, shared_mem_size, stream>>>(
+            (half *)out, (const half *)q, (const half *)k_cache, (const half *)v_cache,
+            (const int64_t *)block_tables, (const int64_t *)seq_lens, (const float *)alibi_slopes, num_kv_heads,
+            scale, max_num_blocks_per_seq, block_size, q_stride, kv_block_stride, kv_head_stride, o_stride);
+    } else if (dtype == INFINI_DTYPE_BF16) {
+        pagedAttention<__mt_bfloat16, float, HEAD_SIZE, NUM_THREADS><<<grid, block, shared_mem_size, stream>>>(
+            (__mt_bfloat16 *)out, (const __mt_bfloat16 *)q, (const __mt_bfloat16 *)k_cache, (const __mt_bfloat16 *)v_cache,
+            (const int64_t *)block_tables, (const int64_t *)seq_lens, (const float *)alibi_slopes, num_kv_heads,
+            scale, max_num_blocks_per_seq, block_size, q_stride, kv_block_stride, kv_head_stride, o_stride);
+    } else if (dtype == INFINI_DTYPE_F32) {
+        pagedAttention<float, float, HEAD_SIZE, NUM_THREADS><<<grid, block, shared_mem_size, stream>>>(
+            (float *)out, (const float *)q, (const float *)k_cache, (const float *)v_cache,
+            (const int64_t *)block_tables, (const int64_t *)seq_lens, (const float *)alibi_slopes, num_kv_heads,
+            scale, max_num_blocks_per_seq, block_size, q_stride, kv_block_stride, kv_head_stride, o_stride);
+    } else {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    // A kernel launch reports no error by itself. shared_mem_size grows with the maximum sequence length, so the
+    // runtime may reject the configuration; query the launch status instead of reporting success unconditionally.
+    if (musaGetLastError() != musaSuccess) {
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Dispatches the paged-attention launch for this descriptor on the stream passed as stream_.
+// NUM_THREADS is picked from the handle's maxThreadsPerBlock() and HEAD_SIZE from _info.head_size; both are
+// template arguments of launchKernel, so each supported combination is spelled out through the macros below.
+// Returns BAD_TENSOR_SHAPE for a head size outside {16, 32, 64, 128, 256}, DEVICE_ARCHITECTURE_NOT_SUPPORTED when
+// maxThreadsPerBlock() matches neither MOORE_BLOCK_SIZE_1024 nor MOORE_BLOCK_SIZE_512, else launchKernel's status.
+infiniStatus_t Descriptor::calculate(
+    void *workspace, size_t workspace_size,
+    void *out, const void *q, const void *k_cache, const void *v_cache,
+    const void *block_tables, const void *seq_lens, const void *alibi_slopes,
+    void *stream_) const {
+    musaStream_t stream = (musaStream_t)stream_;
+    // Holds launchKernel's result so that an error it reports (e.g. BAD_TENSOR_DTYPE) reaches the caller.
+    infiniStatus_t status = INFINI_STATUS_SUCCESS;
+
+#define LAUNCH_HEADSIZE_BLOCKSIZE(__H_SIZE, __B_SIZE)                                    \
+    status = launchKernel<__H_SIZE, __B_SIZE>(                                           \
+        out, q, k_cache, v_cache, _info.dtype, block_tables, seq_lens, alibi_slopes,     \
+        _info.num_heads, _info.num_seqs,                                                 \
+        _info.num_kv_heads, _info.scale, _info.max_num_blocks_per_seq, _info.block_size, \
+        _info.q_stride, _info.kv_block_stride, _info.kv_head_stride, _info.o_stride,     \
+        stream)
+
+// Maps the runtime head size onto the HEAD_SIZE template argument of launchKernel.
+#define SWITCH_HEAD_SIZE(__B_SIZE)                             \
+    switch (_info.head_size) {                                 \
+    case 16: LAUNCH_HEADSIZE_BLOCKSIZE(16, __B_SIZE); break;   \
+    case 32: LAUNCH_HEADSIZE_BLOCKSIZE(32, __B_SIZE); break;   \
+    case 64: LAUNCH_HEADSIZE_BLOCKSIZE(64, __B_SIZE); break;   \
+    case 128: LAUNCH_HEADSIZE_BLOCKSIZE(128, __B_SIZE); break; \
+    case 256: LAUNCH_HEADSIZE_BLOCKSIZE(256, __B_SIZE); break; \
+    default:                                                   \
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;                 \
+    }
+
+    // Only block limits that match one of the instantiated NUM_THREADS values are handled.
+    const auto max_threads = _opaque->internal->maxThreadsPerBlock();
+    if (max_threads == MOORE_BLOCK_SIZE_1024) {
+        SWITCH_HEAD_SIZE(MOORE_BLOCK_SIZE_1024)
+    } else if (max_threads == MOORE_BLOCK_SIZE_512) {
+        SWITCH_HEAD_SIZE(MOORE_BLOCK_SIZE_512)
+    } else {
+        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+    }
+
+#undef LAUNCH_HEADSIZE_BLOCKSIZE
+#undef SWITCH_HEAD_SIZE
+
+    return status;
+}
+
+} // namespace op::paged_attention::moore
diff --git a/src/infiniop/ops/paged_attention/operator.cc b/src/infiniop/ops/paged_attention/operator.cc
@@ -5,6 +5,9 @@
 #ifdef ENABLE_NVIDIA_API
 #include "nvidia/paged_attention_nvidia.cuh"
 #endif
+#ifdef ENABLE_MOORE_API
+#include "moore/paged_attention_moore.h"
+#endif
 // #ifdef ENABLE_METAX_API
 // #include "metax/paged_attention_metax.h"
 // #endif
@@ -33,6 +36,9 @@ __C infiniStatus_t infiniopCreatePagedAttentionDescriptor(
     switch (handle->device) {
 #ifdef ENABLE_NVIDIA_API
         CREATE(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore)
 #endif
     // #ifdef ENABLE_METAX_API
     //         CREATE(INFINI_DEVICE_METAX, metax)
@@ -54,6 +60,9 @@ __C infiniStatus_t infiniopGetPagedAttentionWorkspaceSize(
     switch (desc->device_type) {
 #ifdef ENABLE_NVIDIA_API
         GET(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore)
 #endif
     // #ifdef ENABLE_METAX_API
     //         GET(INFINI_DEVICE_METAX, metax)
@@ -79,6 +88,9 @@ __C infiniStatus_t infiniopPagedAttention(
     switch (desc->device_type) {
 #ifdef ENABLE_NVIDIA_API
         CALCULATE(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore)
 #endif
     // #ifdef ENABLE_METAX_API
     //         CALCULATE(INFINI_DEVICE_METAX, metax)
@@ -99,6 +111,9 @@ __C infiniStatus_t infiniopDestroyPagedAttentionDescriptor(
     switch (desc->device_type) {
 #ifdef ENABLE_NVIDIA_API
         DESTROY(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_MOORE_API
+        DESTROY(INFINI_DEVICE_MOORE, moore)
 #endif
     // #ifdef ENABLE_METAX_API
     //         DESTROY(INFINI_DEVICE_METAX, metax)