issue/923 - ninetoothed kv_caching

wooway777 · wooway777 · commit c14e29d0564f · 2026-01-14T19:01:11.000+08:00
diff --git a/include/infinicore/ops/kv_caching.hpp b/include/infinicore/ops/kv_caching.hpp
@@ -1,4 +1,4 @@
-#pragma
+#pragma once
 
 #include "../device.hpp"
 #include "common/op.hpp"
@@ -15,11 +15,6 @@ class KVCaching {
     static common::OpDispatcher<schema> &dispatcher();
 };
 
-Tensor kv_caching(Tensor k_cache,
-                  Tensor v_cache,
-                  Tensor k,
-                  Tensor v,
-                  Tensor past_kv_lengths);
 void kv_caching_(Tensor k_cache,
                  Tensor v_cache,
                  Tensor k,
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
@@ -45,6 +45,7 @@
 from infinicore.ops.add import add
 from infinicore.ops.add_rms_norm import add_rms_norm, add_rms_norm_
 from infinicore.ops.attention import attention
+from infinicore.ops.kv_caching import kv_caching
 from infinicore.ops.matmul import matmul
 from infinicore.ops.mul import mul
 from infinicore.ops.narrow import narrow
@@ -115,6 +116,7 @@
     "add_rms_norm",
     "add_rms_norm_",
     "attention",
+    "kv_caching",
     "matmul",
     "mul",
     "narrow",
diff --git a/python/infinicore/ops/kv_caching.py b/python/infinicore/ops/kv_caching.py
@@ -0,0 +1,18 @@
+from infinicore.lib import _infinicore
+
+
+def kv_caching(k_cache, v_cache, k, v, past_kv_lengths):
+    """Write ``k`` and ``v`` into the key/value caches in place.
+
+    Thin wrapper over the native ``_infinicore.kv_caching_`` binding: each
+    argument is unwrapped through its ``_underlying`` handle and forwarded
+    positionally, in the order the arguments are listed here.
+
+    Returns:
+        The ``(k_cache, v_cache)`` objects that were passed in, so the call
+        can be used in expression position.
+    """
+    operands = (k_cache, v_cache, k, v, past_kv_lengths)
+    _infinicore.kv_caching_(*(tensor._underlying for tensor in operands))
+
+    return k_cache, v_cache
diff --git a/src/infinicore/ops/kv_caching/kv_caching.cc b/src/infinicore/ops/kv_caching/kv_caching.cc
@@ -28,15 +28,6 @@ void KVCaching::execute(Tensor k_cache,
     func(k_cache, v_cache, k, v, past_kv_lengths);
 }
 
-Tensor kv_caching(Tensor k_cache,
-                  Tensor v_cache,
-                  Tensor k,
-                  Tensor v,
-                  Tensor past_kv_lengths) {
-    KVCaching::execute(k_cache, v_cache, k, v, past_kv_lengths);
-    return k_cache; // or v_cache, depending on the intended use
-}
-
 void kv_caching_(Tensor k_cache,
                  Tensor v_cache,
                  Tensor k,
diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp
@@ -8,6 +8,7 @@
 #include "ops/causal_softmax.hpp"
 #include "ops/embedding.hpp"
 #include "ops/flash_attention.hpp"
+#include "ops/kv_caching.hpp"
 #include "ops/linear.hpp"
 #include "ops/matmul.hpp"
 #include "ops/mul.hpp"
@@ -30,20 +31,21 @@ inline void bind(py::module &m) {
     bind_add_rms_norm(m);
     bind_attention(m);
     bind_causal_softmax(m);
+    bind_embedding(m);
     bind_flash_attention(m);
-    bind_random_sample(m);
+    bind_kv_caching(m);
     bind_linear(m);
     bind_matmul(m);
     bind_mul(m);
     bind_paged_attention(m);
     bind_paged_attention_prefill(m);
     bind_paged_caching(m);
+    bind_random_sample(m);
     bind_rearrange(m);
     bind_rms_norm(m);
+    bind_rope(m);
     bind_silu(m);
     bind_swiglu(m);
-    bind_rope(m);
-    bind_embedding(m);
 }
 
 } // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/kv_caching.hpp b/src/infinicore/pybind11/ops/kv_caching.hpp
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/kv_caching.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+/// Registers the `kv_caching_` function on the Python module `m`.
+///
+/// The Python callable forwards to `op::kv_caching_` and accepts the keyword
+/// arguments `k_cache`, `v_cache`, `k`, `v` and `past_kv_lengths`, in that
+/// order.
+inline void bind_kv_caching(py::module &m) {
+    // Docstring attached to the Python-visible function.
+    constexpr const char *docstring = R"doc(In-place Key-Value Caching.
+
+Updates the KV cache in-place with new key and value tensors.
+
+Args:
+    k_cache: Key cache tensor to update in-place
+    v_cache: Value cache tensor to update in-place
+    k: New key tensor to append
+    v: New value tensor to append
+    past_kv_lengths: Tensor containing current sequence lengths for each batch
+)doc";
+
+    m.def("kv_caching_",
+          &op::kv_caching_,
+          py::arg("k_cache"),
+          py::arg("v_cache"),
+          py::arg("k"),
+          py::arg("v"),
+          py::arg("past_kv_lengths"),
+          docstring);
+}
+
+} // namespace infinicore::ops
diff --git a/src/infiniop/ops/flash_attention/ninetoothed/descriptor.h b/src/infiniop/ops/flash_attention/ninetoothed/descriptor.h
@@ -67,24 +67,26 @@ class Descriptor final : public InfiniopDescriptor {
         constexpr auto block_size_m_{64};
         constexpr auto block_size_n_{64};
 
-        launch_flash_attention(stream,
-                               query,
-                               key,
-                               value,
-                               attn_mask,
-                               is_causal,
-                               scale,
-                               output,
-                               with_attn_mask,
-                               causal_variant,
-                               with_kv_cache_,
-                               emb_dim_,
-                               is_causal_,
-                               with_attn_mask_,
-                               causal_variant_,
-                               dtype_,
-                               block_size_m_,
-                               block_size_n_);
+        if (launch_flash_attention(stream,
+                                   query,
+                                   key,
+                                   value,
+                                   attn_mask,
+                                   is_causal,
+                                   scale,
+                                   output,
+                                   with_attn_mask,
+                                   causal_variant,
+                                   with_kv_cache_,
+                                   emb_dim_,
+                                   is_causal_,
+                                   with_attn_mask_,
+                                   causal_variant_,
+                                   dtype_,
+                                   block_size_m_,
+                                   block_size_n_)) {
+            return INFINI_STATUS_NOT_IMPLEMENTED;
+        }
 
         return INFINI_STATUS_SUCCESS;
     }
diff --git a/src/infiniop/ops/kv_caching/ninetoothed/build.py b/src/infiniop/ops/kv_caching/ninetoothed/build.py
@@ -0,0 +1,28 @@
+import ninetoothed
+from ntops.kernels import kv_caching
+
+import infiniop.ninetoothed.build
+
+
+def build():
+    """Invoke the shared NineToothed builder for the ``kv_caching`` op.
+
+    The grid lists the ``emb_dim``, ``dtype``, ``block_size_m`` and
+    ``block_size_n`` values passed to ``infiniop.ninetoothed.build.build``
+    (presumably one kernel variant per combination -- confirm against that
+    helper). ``output_dir`` is the builder's ``BUILD_DIRECTORY_PATH``.
+    """
+    param_grid = dict(
+        emb_dim=(1, 16, 32, 64, 128, 256),
+        dtype=(ninetoothed.float16, ninetoothed.bfloat16, ninetoothed.float32),
+        block_size_m=(64,),
+        block_size_n=(64,),
+    )
+
+    infiniop.ninetoothed.build.build(
+        kv_caching.premake,
+        param_grid,
+        caller="cuda",
+        op_name="kv_caching",
+        output_dir=infiniop.ninetoothed.build.BUILD_DIRECTORY_PATH,
+    )
diff --git a/src/infiniop/ops/kv_caching/ninetoothed/kv_caching.h b/src/infiniop/ops/kv_caching/ninetoothed/kv_caching.h
@@ -0,0 +1,138 @@
+#ifndef KV_CACHING_H
+#define KV_CACHING_H
+
+#include <vector>
+
+#include "../../../handle.h"
+#include "../../../operator.h"
+#include "../../../tensor.h"
+
+#include "../../../../../build/ninetoothed/kv_caching.h"
+#include "../../../ninetoothed/utils.h"
+
+namespace op::kv_caching::ninetoothed {
+
+/// KV-caching descriptor that dispatches to the generated NineToothed
+/// `launch_kv_caching` entry point.
+///
+/// Shapes and strides of all operands are copied at construction; `calculate`
+/// combines them with the raw data pointers supplied on each call. The element
+/// type is read from the `k` descriptor only.
+class Descriptor final : public InfiniopDescriptor {
+public:
+    /// Copies the shape and strides of every operand into the descriptor.
+    Descriptor(
+        infiniopHandle_t handle,
+        infiniopTensorDescriptor_t k_cache_desc,
+        infiniopTensorDescriptor_t v_cache_desc,
+        infiniopTensorDescriptor_t k_desc,
+        infiniopTensorDescriptor_t v_desc,
+        infiniopTensorDescriptor_t past_kv_lengths_desc) : InfiniopDescriptor{handle->device, handle->device_id},
+                                                           k_cache_shape_{k_cache_desc->shape()},
+                                                           k_cache_strides_{k_cache_desc->strides()},
+                                                           v_cache_shape_{v_cache_desc->shape()},
+                                                           v_cache_strides_{v_cache_desc->strides()},
+                                                           k_shape_{k_desc->shape()},
+                                                           k_strides_{k_desc->strides()},
+                                                           v_shape_{v_desc->shape()},
+                                                           v_strides_{v_desc->strides()},
+                                                           past_kv_lengths_shape_{past_kv_lengths_desc->shape()},
+                                                           past_kv_lengths_strides_{past_kv_lengths_desc->strides()},
+                                                           dtype_{k_desc->dtype()} {}
+
+    ~Descriptor() = default;
+
+    /// Always 0: `calculate` never touches its workspace arguments.
+    size_t get_workspace_size() const { return 0; }
+
+    /// Allocates a descriptor for the given operands and stores it in
+    /// `*desc_ptr`.
+    ///
+    /// @return `INFINI_STATUS_BAD_TENSOR_SHAPE` if `k` has fewer than four
+    ///         dimensions (nothing is allocated in that case), otherwise
+    ///         `INFINI_STATUS_SUCCESS`.
+    static infiniStatus_t create(
+        infiniopHandle_t handle,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t k_cache,
+        infiniopTensorDescriptor_t v_cache,
+        infiniopTensorDescriptor_t k,
+        infiniopTensorDescriptor_t v,
+        infiniopTensorDescriptor_t past_kv_lengths) {
+        // `calculate` indexes k's shape at `kEmbDimAxis`; reject descriptors
+        // that are too short here instead of reading out of bounds later.
+        if (k->shape().size() <= kEmbDimAxis) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        *desc_ptr = new Descriptor{handle, k_cache, v_cache, k, v, past_kv_lengths};
+        return INFINI_STATUS_SUCCESS;
+    }
+
+    /// Wraps the raw pointers in NineToothed tensors and launches the kernel
+    /// on `stream`. `workspace` and `workspace_size` are not used.
+    ///
+    /// @return `INFINI_STATUS_NOT_IMPLEMENTED` if `launch_kv_caching` returns
+    ///         non-zero, otherwise `INFINI_STATUS_SUCCESS`.
+    infiniStatus_t calculate(
+        void *workspace, size_t workspace_size,
+        void *k_cache,
+        void *v_cache,
+        const void *k,
+        const void *v,
+        const void *past_kv_lengths,
+        void *stream) const {
+        auto k_cache_nt{::ninetoothed::Tensor{k_cache, k_cache_shape_, k_cache_strides_}};
+        auto v_cache_nt{::ninetoothed::Tensor{v_cache, v_cache_shape_, v_cache_strides_}};
+        auto k_nt{::ninetoothed::Tensor{k, k_shape_, k_strides_}};
+        auto v_nt{::ninetoothed::Tensor{v, v_shape_, v_strides_}};
+        auto past_kv_lengths_nt{::ninetoothed::Tensor{past_kv_lengths, past_kv_lengths_shape_, past_kv_lengths_strides_}};
+
+        // Tile sizes handed to the launcher; build.py lists 64 as the only
+        // value for both `block_size_m` and `block_size_n`.
+        constexpr auto block_size_m_{64};
+        constexpr auto block_size_n_{64};
+
+        if (launch_kv_caching(stream,
+                              k_cache_nt,
+                              v_cache_nt,
+                              k_nt,
+                              v_nt,
+                              past_kv_lengths_nt,
+                              k_shape_[kEmbDimAxis],
+                              dtype_,
+                              block_size_m_,
+                              block_size_n_)) {
+            return INFINI_STATUS_NOT_IMPLEMENTED;
+        }
+
+        return INFINI_STATUS_SUCCESS;
+    }
+
+private:
+    using Size = ::ninetoothed::Tensor<>::Size;
+    using Stride = ::ninetoothed::Tensor<>::Stride;
+
+    // Axis of `k` whose extent is forwarded to the launcher (presumably the
+    // `emb_dim` constexpr declared in build.py -- confirm).
+    static constexpr size_t kEmbDimAxis{3};
+
+    std::vector<Size> k_cache_shape_;
+    std::vector<Stride> k_cache_strides_;
+
+    std::vector<Size> v_cache_shape_;
+    std::vector<Stride> v_cache_strides_;
+
+    std::vector<Size> k_shape_;
+    std::vector<Stride> k_strides_;
+    std::vector<Size> v_shape_;
+    std::vector<Stride> v_strides_;
+
+    std::vector<Size> past_kv_lengths_shape_;
+    std::vector<Stride> past_kv_lengths_strides_;
+
+    infiniDtype_t dtype_;
+};
+} // namespace op::kv_caching::ninetoothed
+
+#endif // KV_CACHING_H
diff --git a/src/infiniop/ops/kv_caching/operator.cc b/src/infiniop/ops/kv_caching/operator.cc
@@ -6,7 +6,9 @@
 // #include "cpu/kv_caching_cpu.h"
 #endif
 #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
-// #include "nvidia/kv_caching_nvidia.cuh"
+#if defined(ENABLE_NINETOOTHED)
+#include "ninetoothed/kv_caching.h"
+#endif
 #endif
 
 __C infiniStatus_t infiniopCreateKVCachingDescriptor(
@@ -35,7 +37,9 @@ __C infiniStatus_t infiniopCreateKVCachingDescriptor(
         // CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
-        // CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#if defined(ENABLE_NINETOOTHED)
+        CREATE(INFINI_DEVICE_NVIDIA, ninetoothed);
+#endif
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -59,7 +63,9 @@ __C infiniStatus_t infiniopGetKVCachingWorkspaceSize(
         // GET_SIZE(INFINI_DEVICE_CPU, cpu);
 #endif
 #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
-        // GET_SIZE(INFINI_DEVICE_NVIDIA, nvidia);
+#if defined(ENABLE_NINETOOTHED)
+        GET_SIZE(INFINI_DEVICE_NVIDIA, ninetoothed);
+#endif
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -90,7 +96,9 @@ __C infiniStatus_t infiniopKVCaching(
         // CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
-        // CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#if defined(ENABLE_NINETOOTHED)
+        CALCULATE(INFINI_DEVICE_NVIDIA, ninetoothed);
+#endif
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -112,7 +120,9 @@ __C infiniStatus_t infiniopDestroyKVCachingDescriptor(
         // DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
 #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
-        // DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#if defined(ENABLE_NINETOOTHED)
+        DELETE(INFINI_DEVICE_NVIDIA, ninetoothed);
+#endif
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
diff --git a/test/infinicore/framework/base.py b/test/infinicore/framework/base.py
@@ -342,7 +342,10 @@ def prepare_infinicore_inputs_and_kwargs(self, inputs, kwargs, comparison_target
         for i, inp in enumerate(inputs):
             if isinstance(inp, torch.Tensor):
                 # Clone only if this input will be used for comparison
-                if comparison_target == i:
+                if comparison_target == i or (
+                    isinstance(comparison_target, (list, tuple))
+                    and i in comparison_target
+                ):
                     cloned_inp = clone_torch_tensor(inp)
                     infini_tensor = infinicore_tensor_from_torch(cloned_inp)
                     cloned_tensors.append(cloned_inp)
@@ -508,7 +511,9 @@ def run_test(self, device, test_case, config):
             # Handle multiple outputs comparison
 
             # Determine what to compare based on comparison_target
-            if comparison_target is None:
+            if comparison_target is None or isinstance(
+                comparison_target, (list, tuple)
+            ):
                 # Compare return values (out-of-place multiple outputs)
                 torch_comparison = torch_result
                 infini_comparison = infini_result
@@ -573,7 +578,9 @@ def run_test(self, device, test_case, config):
         # ==========================================================================
         else:
             # Determine comparison targets for single output
-            if comparison_target is None:
+            if comparison_target is None or isinstance(
+                comparison_target, (list, tuple)
+            ):
                 # Compare return values (out-of-place)
                 torch_comparison = torch_result
                 infini_comparison = infini_result
diff --git a/test/infinicore/ops/kv_caching.py b/test/infinicore/ops/kv_caching.py