feat(ascend): add embedding operator

zhangyue · zhangyue · commit eea8579648c0 · 2026-06-09T11:15:55.000+08:00
diff --git a/.github/ci_config.yml b/.github/ci_config.yml
@@ -15,7 +15,7 @@ platforms:
         BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3
         SKIP_APT: "1"
         PIP_INDEX_URL: https://pypi.tuna.tsinghua.edu.cn/simple
-    setup: pip install .[dev] --no-build-isolation
+    setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.AUTO_DETECT_DEVICES=OFF --config-settings=cmake.define.WITH_CPU=ON --config-settings=cmake.define.WITH_NVIDIA=ON
     jobs:
       gpu:
         type: unittest
@@ -50,22 +50,21 @@ platforms:
       - /lib/firmware:/lib/firmware
       - /usr/src:/usr/src
       - /lib/modules:/lib/modules
-    setup: python -m pip install packaging exceptiongroup typing-extensions pygments pybind11 libclang && python -m pip install . --no-build-isolation --no-deps
+    setup: python -m pip install packaging exceptiongroup typing-extensions pygments pybind11 libclang && python -m pip install . --no-build-isolation --no-deps --config-settings=cmake.define.AUTO_DETECT_DEVICES=OFF --config-settings=cmake.define.WITH_CPU=ON --config-settings=cmake.define.WITH_ILUVATAR=ON
     jobs:
       gpu:
         type: unittest
         resources:
-          gpu_ids: "0"
           ngpus: 1
           gpu_style: none
           memory: 32GB
           shm_size: 16g
-          timeout: 7200
-          queue_timeout: 7200
+          timeout: 14400
+          queue_timeout: 14400
           junit_path: test-results.xml
         stages:
           - name: test
-            run: pytest tests/ --devices iluvatar -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml
+            run: pytest tests/ --devices iluvatar -n 2 -v --tb=short --junitxml=/workspace/results/test-results.xml
 
   metax:
     runner_label: Metax
@@ -80,7 +79,7 @@ platforms:
       - "--privileged"
       - "--ulimit=memlock=-1"
       - "--ulimit=stack=67108864"
-    setup: pip install .[dev] --no-build-isolation
+    setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.AUTO_DETECT_DEVICES=OFF --config-settings=cmake.define.WITH_CPU=ON --config-settings=cmake.define.WITH_METAX=ON
     jobs:
       gpu:
         type: unittest
@@ -107,7 +106,7 @@ platforms:
         PIP_INDEX_URL: https://pypi.org/simple
     docker_args:
       - "--privileged"
-    setup: pip install .[dev] --no-build-isolation
+    setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.AUTO_DETECT_DEVICES=OFF --config-settings=cmake.define.WITH_CPU=ON --config-settings=cmake.define.WITH_MOORE=ON
     jobs:
       gpu:
         type: unittest
@@ -133,7 +132,7 @@ platforms:
         PIP_INDEX_URL: https://pypi.org/simple
     docker_args:
       - "--privileged"
-    setup: pip install .[dev] --no-build-isolation
+    setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.AUTO_DETECT_DEVICES=OFF --config-settings=cmake.define.WITH_CPU=ON --config-settings=cmake.define.WITH_CAMBRICON=ON
     jobs:
       gpu:
         type: unittest
@@ -168,7 +167,7 @@ platforms:
       - "--group-add=video"
     volumes:
       - /opt/hyhal:/opt/hyhal:ro
-    setup: pip install .[dev] --no-build-isolation
+    setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.AUTO_DETECT_DEVICES=OFF --config-settings=cmake.define.WITH_CPU=ON --config-settings=cmake.define.WITH_HYGON=ON
     jobs:
       gpu:
         type: unittest
@@ -205,7 +204,7 @@ platforms:
       - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi:ro
     env:
       ASCEND_HOME_PATH: /usr/local/Ascend/ascend-toolkit/latest
-    setup: pip install .[dev] --no-build-isolation
+    setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.AUTO_DETECT_DEVICES=OFF --config-settings=cmake.define.WITH_CPU=ON --config-settings=cmake.define.WITH_ASCEND=ON
     jobs:
       npu:
         type: unittest
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -542,10 +542,59 @@ if(GENERATE_OPERATOR_CALL_INSTANTIATIONS)
     file(GLOB_RECURSE OPERATOR_CALL_INSTANTIATION_SOURCES CONFIGURE_DEPENDS
         "${PROJECT_SOURCE_DIR}/generated/src/operator_call_instantiations_*.cc")
 
+    # Cap how many generated operator call instantiations compile concurrently
+    # by routing them through a dedicated job pool. This is only set up for
+    # `WITH_TORCH` builds using a Ninja generator.
+    set(_operator_call_instantiation_job_pool_arg)
+    set(_operator_call_instantiation_use_job_pool OFF)
+    if(WITH_TORCH AND CMAKE_GENERATOR MATCHES "Ninja")
+        set(_operator_call_instantiation_use_job_pool ON)
+        set(INFINIOPS_OPERATOR_CALL_INSTANTIATION_COMPILE_JOBS "2" CACHE STRING
+            "Maximum concurrent generated operator call instantiation compilations")
+        # Reject malformed values at configure time with a message that names
+        # the offending cache variable.
+        if(NOT "${INFINIOPS_OPERATOR_CALL_INSTANTIATION_COMPILE_JOBS}" MATCHES "^[0-9]+$")
+            message(FATAL_ERROR
+                "INFINIOPS_OPERATOR_CALL_INSTANTIATION_COMPILE_JOBS must be a "
+                "non-negative integer, got "
+                "'${INFINIOPS_OPERATOR_CALL_INSTANTIATION_COMPILE_JOBS}'.")
+        endif()
+        set_property(GLOBAL APPEND PROPERTY JOB_POOLS
+            operator_call_instantiation_compile=${INFINIOPS_OPERATOR_CALL_INSTANTIATION_COMPILE_JOBS})
+        # Spliced into `add_custom_command()` calls that compile the
+        # instantiations outside of a regular CMake target.
+        set(_operator_call_instantiation_job_pool_arg
+            JOB_POOL operator_call_instantiation_compile)
+    endif()
+
     if(WITH_NVIDIA OR WITH_HYGON)
         set_source_files_properties(${OPERATOR_CALL_INSTANTIATION_SOURCES}
             PROPERTIES LANGUAGE CUDA)
-        target_sources(infiniops PRIVATE ${OPERATOR_CALL_INSTANTIATION_SOURCES})
+        if(_operator_call_instantiation_use_job_pool)
+            # `JOB_POOL_COMPILE` is a target property, so the generated sources
+            # are compiled in their own object library and only that target is
+            # assigned to the pool.
+            add_library(infiniops_operator_call_instantiation_objs OBJECT
+                ${OPERATOR_CALL_INSTANTIATION_SOURCES})
+            set_target_properties(infiniops_operator_call_instantiation_objs
+                PROPERTIES
+                    CUDA_STANDARD 17
+                    CUDA_STANDARD_REQUIRED ON
+                    JOB_POOL_COMPILE operator_call_instantiation_compile
+                    POSITION_INDEPENDENT_CODE ON)
+            # The object library does not link `infiniops`, so its include
+            # directories, definitions, and options are mirrored explicitly.
+            target_include_directories(infiniops_operator_call_instantiation_objs PRIVATE
+                $<TARGET_PROPERTY:infiniops,INCLUDE_DIRECTORIES>)
+            target_compile_definitions(infiniops_operator_call_instantiation_objs PRIVATE
+                $<TARGET_PROPERTY:infiniops,COMPILE_DEFINITIONS>)
+            target_compile_options(infiniops_operator_call_instantiation_objs PRIVATE
+                $<TARGET_PROPERTY:infiniops,COMPILE_OPTIONS>)
+            target_sources(infiniops PRIVATE
+                $<TARGET_OBJECTS:infiniops_operator_call_instantiation_objs>)
+        else()
+            target_sources(infiniops PRIVATE ${OPERATOR_CALL_INSTANTIATION_SOURCES})
+        endif()
     elseif(WITH_ILUVATAR)
         set(_iluvatar_call_instantiation_include_flags
             "-I${CMAKE_CURRENT_SOURCE_DIR}"
@@ -591,6 +640,7 @@ if(GENERATE_OPERATOR_CALL_INSTANTIATIONS)
                     -c "${_src}" -o "${_obj}"
                 DEPENDS "${_src}"
                 ${_depfile_arg}
+                ${_operator_call_instantiation_job_pool_arg}
                 COMMENT "Compiling ${_name}.cc with CoreX clang++"
                 VERBATIM
             )
diff --git a/src/base/embedding.h b/src/base/embedding.h
@@ -0,0 +1,85 @@
+#ifndef INFINI_OPS_BASE_EMBEDDING_H_
+#define INFINI_OPS_BASE_EMBEDDING_H_
+
+#include <cassert>
+#include <cstddef>
+
+#include "data_type.h"
+#include "operator.h"
+
+namespace infini::ops {
+
+// Embedding performs a token embedding lookup.
+//
+// Interface follows the inference-time vLLM/PyTorch convention:
+//   `out = weight[input_ids]`.
+//
+// The input layout is:
+//   `input_ids`: Any shape, `int32` or `int64`.
+//   `weight`: `[vocab_size, hidden_size]`.
+//   `out`: `input_ids.shape + [hidden_size]`.
+//
+// This is the inference subset of `torch.nn.functional.embedding`; options
+// such as `padding_idx`, `max_norm`, `scale_grad_by_freq`, and `sparse` are
+// intentionally not part of this operator.
+//
+// Argument validation uses `assert`, so the checks are compiled out when
+// `NDEBUG` is defined.
+class Embedding : public Operator<Embedding> {
+ public:
+  // Validates the dtypes and shapes of the arguments and records the sizes
+  // and dtypes that backend implementations read.
+  Embedding(const Tensor input_ids, const Tensor weight, Tensor out)
+      : num_tokens_{input_ids.numel()},
+        input_dtype_{input_ids.dtype()},
+        weight_dtype_{weight.dtype()} {
+    assert((input_dtype_ == DataType::kInt32 ||
+            input_dtype_ == DataType::kInt64) &&
+           "`Embedding` requires `input_ids` to be `int32` or `int64`");
+    assert(
+        weight.ndim() == 2 &&
+        "`Embedding` requires `weight` to be 2D `[vocab_size, hidden_size]`");
+
+    // Query the `weight` extents only after its rank has been checked, so a
+    // malformed `weight` trips the assertion above rather than an
+    // out-of-range dimension lookup.
+    vocab_size_ = weight.size(0);
+    hidden_size_ = weight.size(1);
+
+    assert(out.dtype() == weight.dtype() &&
+           "`Embedding` requires `out` and `weight` to have the same dtype");
+    assert(out.ndim() == input_ids.ndim() + 1 &&
+           "`Embedding` requires `out.ndim == input_ids.ndim + 1`");
+    assert(out.size(-1) == hidden_size_ &&
+           "`Embedding` requires `out.shape[-1] == weight.shape[-1]`");
+
+    for (std::size_t i = 0; i < input_ids.ndim(); ++i) {
+      assert(out.size(i) == input_ids.size(i) &&
+             "`Embedding` requires `out` prefix shape to match `input_ids`");
+    }
+  }
+
+  // Writes `weight[input_ids]` into `out`; implemented per backend.
+  virtual void operator()(const Tensor input_ids, const Tensor weight,
+                          Tensor out) const = 0;
+
+ protected:
+  // `input_ids.numel()`.
+  Tensor::Size num_tokens_{0};
+
+  // `weight.size(0)`, read once `weight` is known to be 2D.
+  Tensor::Size vocab_size_{0};
+
+  // `weight.size(1)`, read once `weight` is known to be 2D.
+  Tensor::Size hidden_size_{0};
+
+  // `input_ids.dtype()`.
+  const DataType input_dtype_;
+
+  // `weight.dtype()`.
+  const DataType weight_dtype_;
+};
+
+}  // namespace infini::ops
+
+#endif  // INFINI_OPS_BASE_EMBEDDING_H_
diff --git a/src/native/ascend/ops/embedding/kernel.h b/src/native/ascend/ops/embedding/kernel.h
@@ -0,0 +1,107 @@
+#ifndef INFINI_OPS_ASCEND_EMBEDDING_KERNEL_H_
+#define INFINI_OPS_ASCEND_EMBEDDING_KERNEL_H_
+
+#include <cassert>
+#include <cstdint>
+
+#include "acl/acl.h"
+#include "aclnn/aclnn_base.h"
+#include "aclnnop/aclnn_embedding.h"
+#include "base/embedding.h"
+#include "native/ascend/common.h"
+#include "native/ascend/workspace_pool_.h"
+#include "operator.h"
+
+namespace infini::ops {
+
+// Ascend implementation of `Embedding`, backed by `aclnnEmbedding`.
+//
+// The executor is created on the first call, marked repeatable, and reused
+// by later calls after rebinding the tensor addresses. ACLNN status codes
+// are checked with `assert`, so the checks are compiled out when `NDEBUG`
+// is defined.
+template <>
+class Operator<Embedding, Device::Type::kAscend> : public Embedding {
+ public:
+  Operator(const Tensor input_ids, const Tensor weight, Tensor out)
+      : Embedding(input_ids, weight, out),
+        input_ids_cache_(input_ids),
+        weight_cache_(weight),
+        out_cache_(out) {
+    assert((weight_dtype_ == DataType::kFloat16 ||
+            weight_dtype_ == DataType::kBFloat16 ||
+            weight_dtype_ == DataType::kFloat32) &&
+           "`Embedding`: Ascend path supports `float16`, `bfloat16`, and "
+           "`float32` weights");
+  }
+
+  ~Operator() {
+    // Nothing is released once the ACL runtime is no longer alive.
+    if (!ascend::IsAclRuntimeAlive()) return;
+
+    // NOTE(review): `executor_` is marked repeatable in `operator()` but is
+    // never destroyed by this destructor — confirm that this is intentional.
+    input_ids_cache_.release();
+    weight_cache_.release();
+    out_cache_.release();
+  }
+
+  void operator()(const Tensor input_ids, const Tensor weight,
+                  Tensor out) const override {
+    auto stream = static_cast<aclrtStream>(stream_);
+
+    // `const` is cast away because the cache and ACL calls below take
+    // `void*`.
+    void* weight_data = const_cast<void*>(weight.data());
+    void* input_ids_data = const_cast<void*>(input_ids.data());
+    void* out_data = out.data();
+
+    auto t_weight = weight_cache_.get(weight_data);
+    auto t_input_ids = input_ids_cache_.get(input_ids_data);
+    auto t_out = out_cache_.get(out_data);
+
+    if (!executor_) {
+      // First call: build the executor and keep it for later calls.
+      [[maybe_unused]] auto ret = aclnnEmbeddingGetWorkspaceSize(
+          t_weight, t_input_ids, t_out, &ws_size_, &executor_);
+      assert(ret == ACL_SUCCESS && "`aclnnEmbeddingGetWorkspaceSize` failed");
+      ret = aclSetAclOpExecutorRepeatable(executor_);
+      assert(ret == ACL_SUCCESS && "`aclSetAclOpExecutorRepeatable` failed");
+    } else {
+      // Later calls: rebind the cached executor to this call's buffers.
+      [[maybe_unused]] auto ret =
+          aclSetInputTensorAddr(executor_, 0, t_weight, weight_data);
+      assert(ret == ACL_SUCCESS &&
+             "`aclSetInputTensorAddr` failed for `weight`");
+      ret = aclSetInputTensorAddr(executor_, 1, t_input_ids, input_ids_data);
+      assert(ret == ACL_SUCCESS &&
+             "`aclSetInputTensorAddr` failed for `input_ids`");
+      ret = aclSetOutputTensorAddr(executor_, 0, t_out, out_data);
+      assert(ret == ACL_SUCCESS && "`aclSetOutputTensorAddr` failed");
+    }
+
+    auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_);
+    [[maybe_unused]] auto ret =
+        aclnnEmbedding(arena.buf, ws_size_, executor_, stream);
+    assert(ret == ACL_SUCCESS && "`aclnnEmbedding` failed");
+  }
+
+ private:
+  // One `AclTensorCache` per operand; `get()` receives the data pointer of
+  // the current call.
+  mutable ascend::AclTensorCache input_ids_cache_;
+
+  mutable ascend::AclTensorCache weight_cache_;
+
+  mutable ascend::AclTensorCache out_cache_;
+
+  // Executor created on the first call and reused afterwards.
+  mutable aclOpExecutor* executor_ = nullptr;
+
+  // Workspace size reported by `aclnnEmbeddingGetWorkspaceSize`.
+  mutable uint64_t ws_size_ = 0;
+};
+
+}  // namespace infini::ops
+
+#endif  // INFINI_OPS_ASCEND_EMBEDDING_KERNEL_H_
diff --git a/tests/test_embedding.py b/tests/test_embedding.py
@@ -0,0 +1,78 @@
+import infini.ops
+import pytest
+import torch
+
+from tests.utils import Payload, get_stream
+
+
+@pytest.mark.auto_act_and_assert
+@pytest.mark.parametrize(
+    "input_shape, vocab_size, hidden_size",
+    (
+        ((5,), 17, 8),
+        ((2, 3), 23, 16),
+    ),
+)
+@pytest.mark.parametrize("index_dtype", (torch.int32, torch.int64))
+@pytest.mark.parametrize(
+    ("dtype", "rtol", "atol"),
+    (
+        (torch.float32, 0.0, 0.0),
+        (torch.float16, 0.0, 0.0),
+        (torch.bfloat16, 0.0, 0.0),
+    ),
+)
+def test_embedding(
+    input_shape,
+    vocab_size,
+    hidden_size,
+    index_dtype,
+    implementation_index,
+    dtype,
+    device,
+    rtol,
+    atol,
+):
+    """Build the payload comparing `infini.ops.embedding` with PyTorch.
+
+    Every parametrized `rtol`/`atol` is zero, so the comparison is presumably
+    exact.
+    """
+    indices = torch.randint(
+        0, vocab_size, input_shape, dtype=index_dtype, device=device
+    )
+    table = torch.randn((vocab_size, hidden_size), dtype=dtype, device=device)
+    result = torch.empty((*input_shape, hidden_size), dtype=dtype, device=device)
+
+    def run_operator(*args, **kwargs):
+        return _embedding(
+            *args, **kwargs, implementation_index=implementation_index
+        )
+
+    return Payload(
+        run_operator,
+        _ref_embedding,
+        (indices, table, result),
+        {},
+        rtol=rtol,
+        atol=atol,
+    )
+
+
+def _embedding(input_ids, weight, out, *, implementation_index=0):
+    """Call `infini.ops.embedding` with `get_stream(input_ids.device)`; return `out`."""
+    stream = get_stream(input_ids.device)
+    infini.ops.embedding(
+        input_ids,
+        weight,
+        out,
+        implementation_index=implementation_index,
+        stream=stream,
+    )
+
+    return out
+
+
+def _ref_embedding(input_ids, weight, out):
+    """Reference lookup via PyTorch; `out` is accepted but not used."""
+    return torch.nn.functional.embedding(input_ids.to(torch.int64), weight)