issue/932 - feat: add paged attention prefill operator referencing nvidia implementation

spike-zhu · spike-zhu · commit c23cd85a9296 · 2026-01-15T19:03:32.000+08:00
diff --git a/src/infiniop/ops/paged_attention_prefill/moore/paged_attention_prefill_moore.h b/src/infiniop/ops/paged_attention_prefill/moore/paged_attention_prefill_moore.h
@@ -0,0 +1,8 @@
+#ifndef __PAGED_ATTENTION_PREFILL_MOORE_H__
+#define __PAGED_ATTENTION_PREFILL_MOORE_H__
+
+#include "../paged_attention_prefill.h"
+
+DESCRIPTOR(moore)
+
+#endif // __PAGED_ATTENTION_PREFILL_MOORE_H__
diff --git a/src/infiniop/ops/paged_attention_prefill/moore/paged_attention_prefill_moore.mu b/src/infiniop/ops/paged_attention_prefill/moore/paged_attention_prefill_moore.mu
@@ -0,0 +1,126 @@
+#include <musa_fp16.h>
+#include <float.h>
+#include <math.h>
+#include <stdint.h>
+
+#include "../../../devices/moore/moore_common.h"
+#include "../../../devices/moore/moore_kernel_common.h"
+#include "../cuda/kernel.cuh"
+#include "paged_attention_prefill_moore.h"
+
+template <typename Tdata, typename Tcompute>
+infiniStatus_t launchPagedAttentionPrefill(
+    Tdata *out, const Tdata *q, const Tdata *k_cache, const Tdata *v_cache,
+    const int64_t *block_tables,
+    const int64_t *seq_lens,
+    const int64_t *cum_seq_lens_q,
+    const float *alibi_slopes,
+    const size_t num_heads,
+    const size_t num_seqs,
+    const size_t num_kv_heads,
+    const float scale,
+    const size_t max_num_blocks_per_seq,
+    const size_t block_size,
+    const size_t total_q_tokens,
+    const size_t head_size,
+    const ptrdiff_t kv_block_stride,
+    const ptrdiff_t kv_head_stride,
+    const ptrdiff_t q_stride,
+    const ptrdiff_t q_head_stride,
+    musaStream_t stream) {
+
+    if (total_q_tokens == 0 || num_heads == 0) {
+        return INFINI_STATUS_BAD_TENSOR_SHAPE;
+    }
+
+    dim3 grid(total_q_tokens, num_heads);
+    dim3 block(head_size);
+
+    op::paged_attention_prefill::cuda::pagedAttentionPrefillKernel<Tdata, Tcompute>
+        <<<grid, block, 0, stream>>>(
+            out, q, k_cache, v_cache,
+            block_tables, seq_lens, cum_seq_lens_q, alibi_slopes,
+            num_heads, num_kv_heads, scale,
+            max_num_blocks_per_seq, block_size,
+            kv_block_stride, kv_head_stride,
+            q_stride, q_head_stride,
+            head_size,
+            num_seqs);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+namespace op::paged_attention_prefill::moore {
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::moore::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() {
+    delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t q_desc,
+    infiniopTensorDescriptor_t k_cache_desc,
+    infiniopTensorDescriptor_t v_cache_desc,
+    infiniopTensorDescriptor_t block_tables_desc,
+    infiniopTensorDescriptor_t seq_lens_desc,
+    infiniopTensorDescriptor_t cum_seq_lens_q_desc,
+    const std::optional<infiniopTensorDescriptor_t> &alibi_slopes_desc,
+    float scale) {
+
+    auto info = PagedAttentionPrefillInfo::create(
+        out_desc, q_desc, k_cache_desc, v_cache_desc,
+        block_tables_desc, seq_lens_desc,
+        cum_seq_lens_q_desc,
+        alibi_slopes_desc, scale);
+
+    CHECK_RESULT(info);
+
+    *desc_ptr = new Descriptor(
+        new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
+        info.take(), 0, handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace, size_t workspace_size,
+    void *out, const void *q, const void *k_cache, const void *v_cache,
+    const void *block_tables,
+    const void *seq_lens,
+    const void *cum_seq_lens_q,
+    const void *alibi_slopes,
+    void *stream_) const {
+
+    musaStream_t stream = (musaStream_t)stream_;
+
+#define LAUNCH_KERNEL(Tdata, Tcompute)                                                             \
+    launchPagedAttentionPrefill<Tdata, Tcompute>(                                                  \
+        (Tdata *)out, (const Tdata *)q, (const Tdata *)k_cache, (const Tdata *)v_cache,            \
+        (const int64_t *)block_tables, (const int64_t *)seq_lens, (const int64_t *)cum_seq_lens_q, \
+        (const float *)alibi_slopes,                                                               \
+        _info.num_heads, _info.num_seqs, _info.num_kv_heads,                                       \
+        _info.scale, _info.max_num_blocks_per_seq,                                                 \
+        _info.block_size, _info.total_q_tokens,                                                    \
+        _info.head_size,                                                                           \
+        _info.kv_block_stride, _info.kv_head_stride,                                               \
+        _info.q_stride, _info.q_head_stride,                                                       \
+        stream)
+
+    if (_info.dtype == INFINI_DTYPE_F16) {
+        return LAUNCH_KERNEL(half, float);
+    } else if (_info.dtype == INFINI_DTYPE_BF16) {
+        return LAUNCH_KERNEL(__mt_bfloat16, float);
+    } else if (_info.dtype == INFINI_DTYPE_F32) {
+        return LAUNCH_KERNEL(float, float);
+    }
+
+    return INFINI_STATUS_BAD_TENSOR_DTYPE;
+}
+
+} // namespace op::paged_attention_prefill::moore
diff --git a/src/infiniop/ops/paged_attention_prefill/operator.cc b/src/infiniop/ops/paged_attention_prefill/operator.cc
@@ -5,6 +5,9 @@
 #ifdef ENABLE_NVIDIA_API
 #include "nvidia/paged_attention_prefill_nvidia.cuh"
 #endif
+#ifdef ENABLE_MOORE_API
+#include "moore/paged_attention_prefill_moore.h"
+#endif
 
 __C infiniStatus_t infiniopCreatePagedAttentionPrefillDescriptor(
     infiniopHandle_t handle,
@@ -32,6 +35,9 @@ __C infiniStatus_t infiniopCreatePagedAttentionPrefillDescriptor(
     switch (handle->device) {
 #ifdef ENABLE_NVIDIA_API
         CREATE(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore)
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -50,6 +56,9 @@ __C infiniStatus_t infiniopGetPagedAttentionPrefillWorkspaceSize(
     switch (desc->device_type) {
 #ifdef ENABLE_NVIDIA_API
         GET(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore)
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -75,6 +84,9 @@ __C infiniStatus_t infiniopPagedAttentionPrefill(
     switch (desc->device_type) {
 #ifdef ENABLE_NVIDIA_API
         CALCULATE(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore)
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -92,6 +104,9 @@ __C infiniStatus_t infiniopDestroyPagedAttentionPrefillDescriptor(
     switch (desc->device_type) {
 #ifdef ENABLE_NVIDIA_API
         DESTROY(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_MOORE_API
+        DESTROY(INFINI_DEVICE_MOORE, moore)
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;