InfiniTensor
diff --git a/include/infinicore/ops/axpy.hpp b/include/infinicore/ops/axpy.hpp
Lines changed: 3 additions & 7 deletions
diff --git a/include/infinicore/ops/blas_dot.hpp b/include/infinicore/ops/blas_dot.hpp
Lines changed: 4 additions & 8 deletions
diff --git a/include/infinicore/ops/nrm2.hpp b/include/infinicore/ops/nrm2.hpp
Lines changed: 4 additions & 8 deletions
diff --git a/include/infinicore/ops/rot.hpp b/include/infinicore/ops/rot.hpp
Lines changed: 3 additions & 7 deletions
diff --git a/include/infinicore/ops/scal.hpp b/include/infinicore/ops/scal.hpp
Lines changed: 3 additions & 7 deletions
diff --git a/python/infinicore/ops/blas_dot.py b/python/infinicore/ops/blas_dot.py
Lines changed: 2 additions & 1 deletion
diff --git a/python/infinicore/ops/nrm2.py b/python/infinicore/ops/nrm2.py
Lines changed: 2 additions & 1 deletion
diff --git a/python/infinicore/ops/scal.py b/python/infinicore/ops/scal.py
Lines changed: 2 additions & 1 deletion
diff --git a/src/infinicore/ops/axpy/axpy.cc b/src/infinicore/ops/axpy/axpy.cc
Lines changed: 8 additions & 8 deletions
diff --git a/src/infinicore/ops/axpy/axpy_infiniop.cc b/src/infinicore/ops/axpy/axpy_infiniop.cc
Lines changed: 36 additions & 40 deletions
@@ -1,17 +1,13 @@
 #pragma once
 
 #include "../device.hpp"
+#include "../graph/graph.hpp"
 #include "common/op.hpp"
 
 namespace infinicore::op {
 
-class Axpy {
-public:
-    using schema = void (*)(Tensor, Tensor, Tensor);
-    static void execute(Tensor alpha, Tensor x, Tensor y);
-    static common::OpDispatcher<schema> &dispatcher();
-};
+INFINICORE_GRAPH_OP_CLASS(Axpy, const Tensor &, const Tensor &, Tensor);
 
-void axpy_(Tensor alpha, Tensor x, Tensor y);
+void axpy_(const Tensor &alpha, const Tensor &x, Tensor y);
 
 } // namespace infinicore::op
@@ -1,18 +1,14 @@
 #pragma once
 
 #include "../device.hpp"
+#include "../graph/graph.hpp"
 #include "common/op.hpp"
 
 namespace infinicore::op {
 
-class BlasDot {
-public:
-    using schema = void (*)(Tensor, Tensor, Tensor);
-    static void execute(Tensor result, Tensor x, Tensor y);
-    static common::OpDispatcher<schema> &dispatcher();
-};
+INFINICORE_GRAPH_OP_CLASS(BlasDot, const Tensor &, const Tensor &, Tensor);
 
-Tensor blas_dot(Tensor x, Tensor y);
-void blas_dot_(Tensor result, Tensor x, Tensor y);
+Tensor blas_dot(const Tensor &x, const Tensor &y);
+void blas_dot_(const Tensor &x, const Tensor &y, Tensor result);
 
 } // namespace infinicore::op
@@ -1,18 +1,14 @@
 #pragma once
 
 #include "../device.hpp"
+#include "../graph/graph.hpp"
 #include "common/op.hpp"
 
 namespace infinicore::op {
 
-class Nrm2 {
-public:
-    using schema = void (*)(Tensor, Tensor);
-    static void execute(Tensor result, Tensor x);
-    static common::OpDispatcher<schema> &dispatcher();
-};
+INFINICORE_GRAPH_OP_CLASS(Nrm2, const Tensor &, Tensor);
 
-Tensor nrm2(Tensor x);
-void nrm2_(Tensor result, Tensor x);
+Tensor nrm2(const Tensor &x);
+void nrm2_(const Tensor &x, Tensor result);
 
 } // namespace infinicore::op
@@ -1,17 +1,13 @@
 #pragma once
 
 #include "../device.hpp"
+#include "../graph/graph.hpp"
 #include "common/op.hpp"
 
 namespace infinicore::op {
 
-class Rot {
-public:
-    using schema = void (*)(Tensor, Tensor, Tensor, Tensor);
-    static void execute(Tensor x, Tensor y, Tensor c, Tensor s);
-    static common::OpDispatcher<schema> &dispatcher();
-};
+INFINICORE_GRAPH_OP_CLASS(Rot, Tensor, Tensor, const Tensor &, const Tensor &);
 
-void rot_(Tensor x, Tensor y, Tensor c, Tensor s);
+void rot_(Tensor x, Tensor y, const Tensor &c, const Tensor &s);
 
 } // namespace infinicore::op
@@ -1,17 +1,13 @@
 #pragma once
 
 #include "../device.hpp"
+#include "../graph/graph.hpp"
 #include "common/op.hpp"
 
 namespace infinicore::op {
 
-class Scal {
-public:
-    using schema = void (*)(Tensor, Tensor);
-    static void execute(Tensor alpha, Tensor x);
-    static common::OpDispatcher<schema> &dispatcher();
-};
+INFINICORE_GRAPH_OP_CLASS(Scal, const Tensor &, Tensor);
 
-void scal_(Tensor x, Tensor alpha);
+void scal_(const Tensor &alpha, Tensor x);
 
 } // namespace infinicore::op
@@ -6,5 +6,6 @@ def blas_dot(x: Tensor, y: Tensor, *, out=None):
     if out is None:
         return Tensor(_infinicore.blas_dot(x._underlying, y._underlying))
 
-    _infinicore.blas_dot_(out._underlying, x._underlying, y._underlying)
+    _infinicore.blas_dot_(x._underlying, y._underlying, out._underlying)
+
     return out
@@ -6,5 +6,6 @@ def nrm2(x: Tensor, *, out=None):
     if out is None:
         return Tensor(_infinicore.nrm2(x._underlying))
 
-    _infinicore.nrm2_(out._underlying, x._underlying)
+    _infinicore.nrm2_(x._underlying, out._underlying)
+
     return out
@@ -3,5 +3,6 @@
 
 
 def scal(x: Tensor, alpha: Tensor):
-    _infinicore.scal_(x._underlying, alpha._underlying)
+    _infinicore.scal_(alpha._underlying, x._underlying)
+
     return x
@@ -4,18 +4,18 @@
 
 namespace infinicore::op {
 
-common::OpDispatcher<Axpy::schema> &Axpy::dispatcher() {
-    static common::OpDispatcher<Axpy::schema> dispatcher_;
-    return dispatcher_;
-};
+INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Axpy);
 
-void Axpy::execute(Tensor alpha, Tensor x, Tensor y) {
+Axpy::Axpy(const Tensor &alpha, const Tensor &x, Tensor y) {
     INFINICORE_ASSERT_TENSORS_SAME_DEVICE(alpha, x, y);
-    infinicore::context::setDevice(y->device());
-    dispatcher().lookup(y->device().getType())(alpha, x, y);
+    INFINICORE_GRAPH_OP_DISPATCH(y->device().getType(), alpha, x, y);
 }
 
-void axpy_(Tensor alpha, Tensor x, Tensor y) {
+void Axpy::execute(const Tensor &alpha, const Tensor &x, Tensor y) {
+    INFINICORE_GRAPH_OP_RECORD_OR_RUN(Axpy, alpha, x, y);
+}
+
+void axpy_(const Tensor &alpha, const Tensor &x, Tensor y) {
     Axpy::execute(alpha, x, y);
 }
 
 
@@ -1,56 +1,52 @@
-#include "../../utils.hpp"
-#include "infinicore/common/hash.hpp"
 #include "infinicore/ops/axpy.hpp"
-#include "infinicore/ops/common/cache.hpp"
-#include <infiniop.h>
+
+#include "../infiniop_impl.hpp"
 
 namespace infinicore::op::axpy_impl::infiniop {
 
-thread_local common::OpCache<size_t, infiniopAxpyDescriptor_t> caches(
-    100, // capacity
-    [](infiniopAxpyDescriptor_t &desc) {
-        if (desc != nullptr) {
-            INFINICORE_CHECK_ERROR(infiniopDestroyAxpyDescriptor(desc));
-            desc = nullptr;
-        }
-    });
+INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Axpy, 100);
 
-void calculate(Tensor alpha, Tensor x, Tensor y) {
-    size_t seed = hash_combine(alpha, x, y);
+struct PlannedMeta {
+    std::shared_ptr<Descriptor> descriptor;
+    graph::GraphTensor workspace, alpha, x, y;
+};
 
-    auto device_type = context::getDevice().getType();
-    auto device_index = context::getDevice().getIndex();
+void *plan(const Tensor &alpha, const Tensor &x, Tensor y) {
+    size_t seed = hash_combine(y, alpha, x);
 
-    auto &cache = caches.getCache(device_type, device_index);
+    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
+        Descriptor, descriptor, Axpy,
+        seed,
+        alpha->desc(), x->desc(), y->desc());
 
-    auto desc_opt = cache.get(seed);
-    infiniopAxpyDescriptor_t desc = nullptr;
+    INFINIOP_WORKSPACE_TENSOR(workspace, Axpy, descriptor);
 
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateAxpyDescriptor(
-            context::getInfiniopHandle(y->device()), &desc,
-            alpha->desc(), x->desc(), y->desc()));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
+    return new PlannedMeta{
+        descriptor,
+        graph::GraphTensor(workspace),
+        graph::GraphTensor(alpha),
+        graph::GraphTensor(x),
+        graph::GraphTensor(y)};
+}
 
-    size_t workspace_size = 0;
-    INFINICORE_CHECK_ERROR(infiniopGetAxpyWorkspaceSize(desc, &workspace_size));
-    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+void run(void *planned_meta) {
+    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
 
     INFINICORE_CHECK_ERROR(infiniopAxpy(
-        desc, workspace->data(), workspace_size,
-        alpha->data(), x->data(), y->data(), context::getStream()));
+        planned->descriptor->desc,
+        planned->workspace->data(),
+        planned->workspace->numel(),
+        planned->alpha->data(),
+        planned->x->data(),
+        planned->y->data(),
+        context::getStream()));
+}
+
+void cleanup(void **planned_meta_ptr) {
+    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
+    *planned_meta_ptr = nullptr;
 }
 
-static bool registered = []() {
-    Axpy::dispatcher().registerDevice({Device::Type::CPU,
-                                       Device::Type::CAMBRICON,
-                                       Device::Type::METAX},
-                                      &calculate,
-                                      false);
-    return true;
-}();
+INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Axpy, &plan, &run, &cleanup);
 
 } // namespace infinicore::op::axpy_impl::infiniop