InfiniTensor
diff --git a/include/infinicore/ops/embedding.hpp b/include/infinicore/ops/embedding.hpp
Lines changed: 5 additions & 8 deletions
diff --git a/src/infinicore/ops/embedding/embedding.cc b/src/infinicore/ops/embedding/embedding.cc
Lines changed: 9 additions & 17 deletions
diff --git a/src/infinicore/ops/embedding/embedding_infiniop.cc b/src/infinicore/ops/embedding/embedding_infiniop.cc
Lines changed: 29 additions & 34 deletions
@@ -1,16 +1,13 @@
 #pragma once
 
+#include "../device.hpp"
+#include "../graph/graph.hpp"
 #include "common/op.hpp"
 
 namespace infinicore::op {
 
-class Embedding {
-public:
-    using schema = void (*)(Tensor, Tensor, Tensor);
-    static void execute(Tensor out, Tensor input, Tensor weight);
-    static common::OpDispatcher<schema> &dispatcher();
-};
+INFINICORE_GRAPH_OP_CLASS(Embedding, Tensor, const Tensor &, const Tensor &);
 
-Tensor embedding(Tensor input, Tensor weight);
-void embedding_(Tensor out, Tensor input, Tensor weight);
+Tensor embedding(const Tensor &input, const Tensor &weight);
+void embedding_(Tensor out, const Tensor &input, const Tensor &weight);
 } // namespace infinicore::op
@@ -5,27 +5,19 @@
 #include <stdexcept>
 
 namespace infinicore::op {
+INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Embedding);
 
-common::OpDispatcher<Embedding::schema> &Embedding::dispatcher() {
-    static common::OpDispatcher<Embedding::schema> dispatcher_;
-    return dispatcher_;
-}
-
-void Embedding::execute(Tensor out, Tensor input, Tensor weight) {
-    // Check that all tensors are on the same device
-    // This is critical: if input is on CPU while out/weight are on GPU,
-    // passing CPU pointer to CUDA kernel will cause memory access errors
+Embedding::Embedding(Tensor out, const Tensor &input, const Tensor &weight) {
     INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, input, weight);
+    INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, input, weight);
+}
 
-    // Set device context
-    infinicore::context::setDevice(out->device());
-
-    // Use dispatcher to lookup kernel (infiniop implementation)
-    dispatcher().lookup(out->device().getType())(out, input, weight);
+void Embedding::execute(Tensor out, const Tensor &input, const Tensor &weight) {
+    INFINICORE_GRAPH_OP_RECORD_OR_RUN(Embedding, out, input, weight);
 }
 
-Tensor embedding(Tensor input, // LongTensor of arbitrary shape containing the indices to extract
-                 Tensor weight // Weight: Embedding matrix of floating point type with shape (V, embedding_dim), where V = maximum index + 1
+Tensor embedding(const Tensor &input, // LongTensor of arbitrary shape containing the indices to extract
+                 const Tensor &weight // Weight: Embedding matrix of floating point type with shape (V, embedding_dim), where V = maximum index + 1
 ) {
     auto input_shape = input->shape();
     auto weight_shape = weight->shape();
@@ -40,7 +32,7 @@ Tensor embedding(Tensor input, // LongTensor of arbitrary shape containing the i
     return inputs_embeds;
 }
 
-void embedding_(Tensor out, Tensor input, Tensor weight) {
+void embedding_(Tensor out, const Tensor &input, const Tensor &weight) {
     Embedding::execute(out, input, weight);
 }
 
 
@@ -1,49 +1,44 @@
-#include "../../utils.hpp"
-#include "infinicore/common/hash.hpp"
-#include "infinicore/ops/common/cache.hpp"
+#include "../infiniop_impl.hpp"
 #include "infinicore/ops/embedding.hpp"
-#include <infiniop.h>
 
 namespace infinicore::op::embedding_impl::infiniop {
 
-thread_local common::OpCache<size_t, infiniopEmbeddingDescriptor_t> caches(
-    100, // capacity
-    [](infiniopEmbeddingDescriptor_t &desc) {
-        if (desc != nullptr) {
-            INFINICORE_CHECK_ERROR(infiniopDestroyEmbeddingDescriptor(desc));
-            desc = nullptr;
-        }
-    });
+INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Embedding, 100);
 
-void calculate(Tensor out, Tensor input, Tensor weight) {
+struct PlannedMeta {
+    std::shared_ptr<Descriptor> descriptor;
+    graph::GraphTensor out, input, weight;
+};
+
+void *plan(Tensor out, const Tensor &input, const Tensor &weight) {
     size_t seed = hash_combine(out, input, weight);
 
-    auto device = context::getDevice();
-    auto &cache = caches.getCache(device);
+    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
+        Descriptor, descriptor, Embedding,
+        seed, out->desc(), input->desc(), weight->desc());
+
+    auto planned = new PlannedMeta{
+        descriptor,
+        graph::GraphTensor(out),
+        graph::GraphTensor(input),
+        graph::GraphTensor(weight)};
 
-    auto desc_opt = cache.get(seed);
-    infiniopEmbeddingDescriptor_t desc = nullptr;
+    return planned;
+}
 
-    if (!desc_opt) {
-        INFINICORE_CHECK_ERROR(infiniopCreateEmbeddingDescriptor(
-            context::getInfiniopHandle(device), &desc,
-            out->desc(), input->desc(), weight->desc()));
-        cache.put(seed, desc);
-    } else {
-        desc = *desc_opt;
-    }
+void run(void *planned_meta) {
+    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
 
     INFINICORE_CHECK_ERROR(infiniopEmbedding(
-        desc,
-        out->data(),
-        input->data(),
-        weight->data(),
-        context::getStream()));
+        planned->descriptor->desc,
+        planned->out->data(), planned->input->data(), planned->weight->data(), context::getStream()));
+}
+
+void cleanup(void **planned_meta_ptr) {
+    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
+    *planned_meta_ptr = nullptr;
 }
 
-static bool registered = []() {
-    Embedding::dispatcher().registerAll(&calculate, false);
-    return true;
-}();
+INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Embedding, &plan, &run, cleanup);
 
 } // namespace infinicore::op::embedding_impl::infiniop