InfiniTensor · voltjia · Oct 21, 2025 · Oct 11, 2025 · Oct 11, 2025 · Oct 13, 2025
diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp
@@ -1,5 +1,8 @@
 #pragma once
 
-#include "op/matmul.hpp"
-#include "op/ones.hpp"
-#include "op/rearrange.hpp"
+#include "ops/add.hpp"
+#include "ops/attention.hpp"
+#include "ops/matmul.hpp"
+#include "ops/ones.hpp"
+#include "ops/rearrange.hpp"
+#include "ops/rms_norm.hpp"
diff --git a/include/infinicore/ops/add.hpp b/include/infinicore/ops/add.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Add { // Static entry point for the add operator; kernels are looked up per device type.
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor); // Kernel signature: (c, a, b), same order as execute().
+    static void execute(Tensor c, Tensor a, Tensor b); // Runs the kernel registered for the context's current device type.
+    static common::OpDispatcher<schema> &dispatcher(); // Single kernel table (function-local static in add.cc).
+};
+
+Tensor add(Tensor a, Tensor b); // Allocates the result with a's shape, dtype and device, then calls add_.
+void add_(Tensor c, Tensor a, Tensor b); // Output-argument form: writes into caller-provided `c`.
+Tensor operator+(Tensor a, Tensor b); // NOTE(review): no definition is visible in add.cc in this change -- confirm it is defined elsewhere.
+} // namespace infinicore::op
diff --git a/include/infinicore/ops/attention.hpp b/include/infinicore/ops/attention.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Attention { // Static entry point for the attention operator; kernels are looked up per device type.
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, size_t); // Kernel signature: (out, q, k, v, k_cache, v_cache, pos).
+    static void execute(Tensor out, Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos); // Runs the kernel registered for the context's current device type.
+    static common::OpDispatcher<schema> &dispatcher(); // Single kernel table (function-local static in attention.cc).
+};
+
+Tensor attention(Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos); // Allocates `out` from q's dims reordered as {dim1, dim0, dim2}, then calls attention_.
+void attention_(Tensor out, Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos); // Writes into caller-provided `out`; `pos` is presumably the KV-cache position -- confirm.
+} // namespace infinicore::op
diff --git a/include/infinicore/op/common/cache.hpp → include/infinicore/ops/common/cache.hpp b/include/infinicore/op/common/cache.hpp → include/infinicore/ops/common/cache.hpp
diff --git a/include/infinicore/op/common/dispatcher.hpp → include/infinicore/ops/common/dispatcher.hpp b/include/infinicore/op/common/dispatcher.hpp → include/infinicore/ops/common/dispatcher.hpp
@@ -8,19 +8,19 @@ namespace infinicore::op::common {
 template <typename Fn>
 class OpDispatcher {
 public:
-    void registerDevice(Device::Type device_type, Fn fn, bool override_existing=true) {
-        if (table_[(size_t)device_type] == nullptr || override_existing){
+    void registerDevice(Device::Type device_type, Fn fn, bool override_existing = true) { // Bind fn to the table slot of one device type.
+        if (table_[(size_t)device_type] == nullptr || override_existing) { // An occupied slot is kept only when override_existing is false.
             table_[(size_t)device_type] = fn;
         }
     }
 
-    void registerDevice(std::initializer_list<Device::Type> device_types, Fn fn, bool override_existing=true) {
+    void registerDevice(std::initializer_list<Device::Type> device_types, Fn fn, bool override_existing = true) { // Apply the single-device overload to each listed type.
         for (auto device_type : device_types) {
             registerDevice(device_type, fn, override_existing);
         }
     }
 
-    void registerAll(Fn fn, bool override_existing=true) {
+    void registerAll(Fn fn, bool override_existing = true) {
         for (size_t device_type = 0; device_type < static_cast<size_t>(Device::Type::COUNT); ++device_type) {
             registerDevice((Device::Type)device_type, fn, override_existing);
         }

diff --git a/include/infinicore/op/common/op.hpp → include/infinicore/ops/common/op.hpp b/include/infinicore/op/common/op.hpp → include/infinicore/ops/common/op.hpp
diff --git a/include/infinicore/op/matmul.hpp → include/infinicore/ops/matmul.hpp b/include/infinicore/op/matmul.hpp → include/infinicore/ops/matmul.hpp
diff --git a/include/infinicore/op/ones.hpp → include/infinicore/ops/ones.hpp b/include/infinicore/op/ones.hpp → include/infinicore/ops/ones.hpp
diff --git a/include/infinicore/op/rearrange.hpp → include/infinicore/ops/rearrange.hpp b/include/infinicore/op/rearrange.hpp → include/infinicore/ops/rearrange.hpp
diff --git a/include/infinicore/ops/rms_norm.hpp b/include/infinicore/ops/rms_norm.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class RMSNorm { // Static entry point for the RMS-norm operator.
+public:
+    using schema = void (*)(Tensor, Tensor, Tensor, float); // Kernel signature: (y, x, weight, epsilon).
+    static void execute(Tensor y, Tensor x, Tensor weight, float epsilon = 1e-5f); // NOTE(review): presumably dispatches like Add/Attention; the implementation is not shown here.
+    static common::OpDispatcher<schema> &dispatcher(); // Dispatcher holding kernels of type `schema`.
+};
+
+Tensor rms_norm(Tensor x, Tensor weight, float epsilon = 1e-5f); // Returns the result Tensor; the 1e-5 default matches the Python wrapper.
+void rms_norm_(Tensor y, Tensor x, Tensor weight, float epsilon = 1e-5f); // Output-argument form: `y` is supplied by the caller.
+} // namespace infinicore::op
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
@@ -25,8 +25,11 @@
     uint8,
 )
 from infinicore.ntops import use_ntops
+from infinicore.ops.add import add
+from infinicore.ops.attention import attention
 from infinicore.ops.matmul import matmul
 from infinicore.ops.rearrange import rearrange
+from infinicore.ops.rms_norm import rms_norm
 from infinicore.tensor import (
     empty,
     from_blob,
@@ -66,8 +69,11 @@
     # `ntops` integration.
     "use_ntops",
     # Operations.
+    "add",
+    "attention",
     "matmul",
     "rearrange",
+    "rms_norm",
     "empty",
     "from_blob",
     "ones",

diff --git a/python/infinicore/dtype.py b/python/infinicore/dtype.py
@@ -4,7 +4,6 @@
 class dtype:
     def __init__(self, data_type):
         """An internal method. Please do not use this directly."""
-
         self._underlying = data_type
 
     def __repr__(self):
@@ -29,9 +28,31 @@ def __repr__(self):
             _infinicore.DataType.C128: "complex128",
             _infinicore.DataType.BF16: "bfloat16",
         }
-
         return f"infinicore.{repr_map[self._underlying]}"
 
+    def __eq__(self, other):
+        """
+        Compare two dtype objects for equality.
+
+        Args:
+            other: The object to compare with
+
+        Returns:
+            bool: True if `other` is a dtype with the same underlying data type; NotImplemented for any other type
+        """
+        if not isinstance(other, dtype):
+            return NotImplemented  # Let Python try the reflected comparison; `==` still ends up False.
+        return self._underlying == other._underlying
+
+    def __hash__(self):
+        """
+        Hash consistently with __eq__ so equal dtypes can share a dict key or set slot.
+
+        Returns:
+            int: Hash of the underlying data type, the same value __eq__ compares
+        """
+        return hash(self._underlying)  # Needed explicitly: defining __eq__ removes the inherited __hash__.
+
 
 float32 = dtype(_infinicore.DataType.F32)
 float = float32

diff --git a/python/infinicore/ops/add.py b/python/infinicore/ops/add.py
@@ -0,0 +1,9 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def add(input, other, *, out=None):  # Add two tensors through the native `add` / `add_` bindings.
+    if out is None:  # No destination supplied: wrap the tensor the binding returns.
+        return Tensor(_infinicore.add(input._underlying, other._underlying))
+    _infinicore.add_(out._underlying, input._underlying, other._underlying)  # Result is written into `out`.
+    return out  # Return the destination so both call forms yield a Tensor.
diff --git a/python/infinicore/ops/attention.py b/python/infinicore/ops/attention.py
@@ -0,0 +1,26 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def attention(q, k, v, k_cache, v_cache, pos, *, out=None):  # Attention through the native `attention` / `attention_` bindings.
+    if out is None:  # No destination supplied: wrap the tensor the binding returns.
+        return Tensor(
+            _infinicore.attention(
+                q._underlying,
+                k._underlying,
+                v._underlying,
+                k_cache._underlying,
+                v_cache._underlying,
+                pos,
+            )
+        )
+    _infinicore.attention_(  # Result is written into `out`.
+        out._underlying,
+        q._underlying,
+        k._underlying,
+        v._underlying,
+        k_cache._underlying,
+        v_cache._underlying,
+        pos,
+    )
+    return out  # Return the destination so both call forms yield a Tensor.
diff --git a/python/infinicore/ops/rms_norm.py b/python/infinicore/ops/rms_norm.py
@@ -0,0 +1,13 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def rms_norm(input, weight, epsilon=1e-5, *, out=None):  # RMS norm through the native `rms_norm` / `rms_norm_` bindings.
+    if out is None:  # No destination supplied: wrap the tensor the binding returns.
+        return Tensor(
+            _infinicore.rms_norm(input._underlying, weight._underlying, epsilon)
+        )
+    _infinicore.rms_norm_(  # Result is written into `out`.
+        out._underlying, input._underlying, weight._underlying, epsilon
+    )
+    return out  # Return the destination so both call forms yield a Tensor.
diff --git a/src/infinicore/ops/add/add.cc b/src/infinicore/ops/add/add.cc
@@ -0,0 +1,24 @@
+#include "infinicore/ops/add.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<Add::schema> &Add::dispatcher() {
+    static common::OpDispatcher<Add::schema> dispatcher_; // Function-local static: one kernel table, built on first use.
+    return dispatcher_;
+}
+
+void Add::execute(Tensor c, Tensor a, Tensor b) {
+    dispatcher().lookup(context::getDevice().getType())(c, a, b); // Kernel is chosen by the context's current device type.
+}
+
+Tensor add(Tensor a, Tensor b) {
+    auto c = Tensor::empty(a->shape(), a->dtype(), a->device()); // Result mirrors a; b's shape is not consulted here.
+    add_(c, a, b);
+    return c;
+}
+
+void add_(Tensor c, Tensor a, Tensor b) {
+    Add::execute(c, a, b); // Thin forwarder; `c` is supplied by the caller.
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/add/add_infiniop.cc b/src/infinicore/ops/add/add_infiniop.cc
@@ -0,0 +1,52 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/add.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::add_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopAddDescriptor_t> caches( // Per-thread store of add descriptors keyed by a size_t hash.
+    100, // capacity
+    [](infiniopAddDescriptor_t &desc) { // Cleanup callback handed to the cache; presumably run on eviction -- confirm in cache.hpp.
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyAddDescriptor(desc));
+            desc = nullptr; // Null the handle so a repeated call does nothing.
+        }
+    });
+
+void calculate(Tensor c, Tensor a, Tensor b) { // infiniop-backed add kernel; registered with Add::dispatcher() below.
+    size_t seed = hash_combine(c, b, a); // Cache key; the (c, b, a) order differs from the descriptor's (c, a, b) but is used consistently.
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index); // Cache selected by the current (device type, device index).
+
+    auto desc_opt = cache.get(seed);
+    infiniopAddDescriptor_t desc = nullptr;
+
+    if (!desc_opt) { // Miss: build a descriptor from the three tensor descriptors and store it.
+        INFINICORE_CHECK_ERROR(infiniopCreateAddDescriptor(
+            context::getInfiniopHandle(), &desc,
+            c->desc(), a->desc(), b->desc()));
+        cache.put(seed, desc);
+    } else { // Hit: reuse the stored descriptor.
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetAddWorkspaceSize(desc, &workspace_size)); // Queried on every call, cache hits included.
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size); // Scratch buffer held for the duration of this call.
+
+    INFINICORE_CHECK_ERROR(infiniopAdd( // Issued on the context's current stream.
+        desc, workspace->data(), workspace_size,
+        c->data(), a->data(), b->data(), context::getStream()));
+}
+
+static bool registered = []() { // Initializer lambda runs once when this variable is initialized.
+    Add::dispatcher().registerAll(&calculate, false); // override_existing=false: slots that already hold a kernel are left alone.
+    return true;
+}();
+
+} // namespace infinicore::op::add_impl::infiniop
diff --git a/src/infinicore/ops/attention/attention.cc b/src/infinicore/ops/attention/attention.cc
@@ -0,0 +1,28 @@
+#include "infinicore/ops/attention.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<Attention::schema> &Attention::dispatcher() {
+    static common::OpDispatcher<Attention::schema> dispatcher_; // Function-local static: one kernel table, built on first use.
+    return dispatcher_;
+}
+
+void Attention::execute(Tensor out, Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos) {
+    dispatcher().lookup(context::getDevice().getType())(out, q, k, v, k_cache, v_cache, pos); // Kernel is chosen by the context's current device type.
+}
+
+Tensor attention(Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos) {
+    size_t n_q_head = q->shape()[0]; // Dims 0..2 are indexed directly, so q needs at least three dims (not checked here).
+    size_t seq_len = q->shape()[1];
+    size_t head_dim = q->shape()[2];
+    Shape shape = {seq_len, n_q_head, head_dim}; // Output swaps q's first two dims.
+    auto out = Tensor::empty(shape, q->dtype(), q->device());
+    attention_(out, q, k, v, k_cache, v_cache, pos);
+    return out;
+}
+
+void attention_(Tensor out, Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos) {
+    Attention::execute(out, q, k, v, k_cache, v_cache, pos); // Thin forwarder; `out` is supplied by the caller.
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/attention/attention_infiniop.cc b/src/infinicore/ops/attention/attention_infiniop.cc
@@ -0,0 +1,54 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/attention.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::attention_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopAttentionDescriptor_t> caches( // Per-thread store of attention descriptors keyed by a size_t hash.
+    100, // capacity
+    [](infiniopAttentionDescriptor_t &desc) { // Cleanup callback handed to the cache; presumably run on eviction -- confirm in cache.hpp.
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyAttentionDescriptor(desc));
+            desc = nullptr; // Null the handle so a repeated call does nothing.
+        }
+    });
+
+void calculate(Tensor out, Tensor q, Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, size_t pos) { // infiniop-backed attention kernel; registered below.
+    size_t seed = hash_combine(out, q, k, v, k_cache, v_cache, pos); // `pos` is part of the key, so each distinct pos gets its own cache entry.
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index); // Cache selected by the current (device type, device index).
+
+    auto desc_opt = cache.get(seed);
+    infiniopAttentionDescriptor_t desc = nullptr;
+
+    if (!desc_opt) { // Miss: build a descriptor from the tensor descriptors plus pos and store it.
+        INFINICORE_CHECK_ERROR(infiniopCreateAttentionDescriptor(
+            context::getInfiniopHandle(), &desc,
+            out->desc(), q->desc(), k->desc(), v->desc(),
+            k_cache->desc(), v_cache->desc(), pos));
+        cache.put(seed, desc);
+    } else { // Hit: reuse the stored descriptor.
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetAttentionWorkspaceSize(desc, &workspace_size)); // Queried on every call, cache hits included.
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size); // Scratch buffer held for the duration of this call.
+
+    INFINICORE_CHECK_ERROR(infiniopAttention( // pos is not passed here; it was fixed when the descriptor was created.
+        desc, workspace->data(), workspace_size,
+        out->data(), q->data(), k->data(), v->data(),
+        k_cache->data(), v_cache->data(), context::getStream()));
+}
+
+static bool registered = []() { // Initializer lambda runs once when this variable is initialized.
+    Attention::dispatcher().registerAll(&calculate, false); // override_existing=false: slots that already hold a kernel are left alone.
+    return true;
+}();
+
+} // namespace infinicore::op::attention_impl::infiniop
diff --git a/src/infinicore/op/matmul/matmul.cc → src/infinicore/ops/matmul/matmul.cc b/src/infinicore/op/matmul/matmul.cc → src/infinicore/ops/matmul/matmul.cc
@@ -1,4 +1,4 @@
-#include "infinicore/op/matmul.hpp"
+#include "infinicore/ops/matmul.hpp"
 
 namespace infinicore::op {
 

diff --git a/src/infinicore/op/matmul/matmul_infiniop.cc → src/infinicore/ops/matmul/matmul_infiniop.cc b/src/infinicore/op/matmul/matmul_infiniop.cc → src/infinicore/ops/matmul/matmul_infiniop.cc
@@ -1,7 +1,7 @@
 #include "../../utils.hpp"
 #include "infinicore/common/hash.hpp"
-#include "infinicore/op/common/cache.hpp"
-#include "infinicore/op/matmul.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/matmul.hpp"
 #include <infiniop.h>
 
 namespace infinicore::op::matmul_impl::infiniop {
@@ -27,7 +27,9 @@ void calculate(Tensor c, Tensor a, Tensor b) {
     infiniopGemmDescriptor_t desc = nullptr;
 
     if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateGemmDescriptor(context::getInfiniopHandle(), &desc, c->desc(), a->desc(), b->desc()));
+        INFINICORE_CHECK_ERROR(infiniopCreateGemmDescriptor(
+            context::getInfiniopHandle(), &desc,
+            c->desc(), a->desc(), b->desc()));
         cache.put(seed, desc);
     } else {
         desc = *desc_opt;

diff --git a/src/infinicore/op/ones/ones.cc → src/infinicore/ops/ones/ones.cc b/src/infinicore/op/ones/ones.cc → src/infinicore/ops/ones/ones.cc
@@ -1,4 +1,4 @@
-#include "infinicore/op/ones.hpp"
+#include "infinicore/ops/ones.hpp"
 
 namespace infinicore::op {