Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .ci/images/ascend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,12 @@ RUN pip install --no-cache-dir --progress off \
pytest-xdist \
ruff

ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit

WORKDIR /workspace
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ option(WITH_CAMBRICON "Enable Cambricon backend" OFF)
option(WITH_MOORE "Enable Moore backend" OFF)
option(WITH_ASCEND "Enable Ascend backend" OFF)

option(BUILD_CUSTOM_KERNEL "Build custom AscendC kernel PyTorch extension (requires torch_npu)" OFF)

option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF)
option(GENERATE_PYTHON_BINDINGS "Generate Python bindings" OFF)

Expand Down
36 changes: 26 additions & 10 deletions scripts/generate_wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def __init__(self, name, constructors, calls):

def _find_optional_tensor_params(op_name):
"""Return a set of parameter names declared as `std::optional<Tensor>` in
the base header. `libclang` resolves the type to `int` when the STL
the base header. libclang resolves the type to ``int`` when the STL
headers are not fully available, so we fall back to a regex scan of the
source text.
"""
Expand All @@ -103,24 +103,39 @@ def _find_optional_tensor_params(op_name):
return set(re.findall(r"std::optional<Tensor>\s+(\w+)", source))


def _find_vector_tensor_params(op_name):
    """Collect the names of every parameter declared as
    ``std::vector<Tensor>`` in the operator's base header.
    """
    header_text = (_BASE_DIR / f"{op_name}.h").read_text()
    vector_tensor_re = re.compile(r"std::vector<Tensor>\s+(\w+)")

    return {name for name in vector_tensor_re.findall(header_text)}


def _generate_pybind11(operator):
optional_tensor_params = _find_optional_tensor_params(operator.name)
vector_tensor_params = _find_vector_tensor_params(operator.name)

def _is_optional_tensor(arg):
if arg.spelling in optional_tensor_params:
return True

return "std::optional" in arg.type.spelling and "Tensor" in arg.type.spelling

def _is_vector_tensor(arg):
if arg.spelling in vector_tensor_params:
return True
return "std::vector" in arg.type.spelling and "Tensor" in arg.type.spelling

def _generate_params(node):
parts = []

for arg in node.get_arguments():
if arg.spelling == "stream":
continue

if _is_optional_tensor(arg):
parts.append(f"std::optional<py::object> {arg.spelling}")
elif _is_vector_tensor(arg):
parts.append(f"std::vector<py::object> {arg.spelling}")
else:
param = arg.type.spelling.replace("const Tensor", "py::object").replace(
"Tensor", "py::object"
Expand All @@ -135,9 +150,10 @@ def _generate_arguments(node):
for arg in node.get_arguments():
if arg.spelling == "stream":
continue

if _is_optional_tensor(arg):
args.append(f"OptionalTensorFromPybind11Handle({arg.spelling})")
elif _is_vector_tensor(arg):
args.append(f"VectorTensorFromPybind11Handle({arg.spelling})")
elif "Tensor" in arg.type.spelling:
args.append(f"TensorFromPybind11Handle({arg.spelling})")
else:
Expand Down Expand Up @@ -167,23 +183,23 @@ def _generate_call(op_name, call, method=True):

if not method:
params = (
f"{call_params}, std::uintptr_t stream, std::size_t implementation_index"
f"{call_params}, std::size_t implementation_index, std::uintptr_t stream"
if call_params
else "std::uintptr_t stream, std::size_t implementation_index"
else "std::size_t implementation_index, std::uintptr_t stream"
)
py_args = _generate_py_args(call)
py_args_str = f"{py_args}, " if py_args else ""

return (
f' m.def("{op_name}", []({params}) {{\n'
f" Config config;\n"
f" config.set_implementation_index(implementation_index);\n"
f" Handle handle;\n"
f" if (stream) {{\n"
f" handle.set_stream(reinterpret_cast<void*>(stream));\n"
f" }}\n"
f" Config config;\n"
f" config.set_implementation_index(implementation_index);\n"
f" return Self::call(handle, config, {call_args});\n"
f' }}, {py_args_str}py::kw_only(), py::arg("stream") = 0, py::arg("implementation_index") = 0);'
f' }}, {py_args_str}py::kw_only(), py::arg("implementation_index") = 0, py::arg("stream") = 0);'
)

return f""" .def("__call__", [](const Self& self, {call_params}) {{
Expand Down Expand Up @@ -442,7 +458,7 @@ def _get_all_ops(devices):
nargs="+",
default="cpu",
type=str,
help="Devices to use. Please pick from `cpu`, `nvidia`, `cambricon`, `ascend`, `metax`, `moore`, `iluvatar`, `kunlun`, `hygon`, and `qy`. (default: `cpu`)",
help="Devices to use. Please pick from cpu, nvidia, cambricon, ascend, metax, moore, iluvatar, kunlun, hygon, and qy. (default: cpu)",
)

args = parser.parse_args()
Expand Down
46 changes: 45 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,10 @@ if(WITH_ASCEND)
"ascend/*.cc"
"ascend/*.cpp"
)
# Exclude `kernel_impl.cpp` — AscendC device code, not compiled by the host C++ compiler.
# Exclude kernel_impl.cpp — AscendC device code, not compiled by the host C++ compiler.
list(FILTER ASCEND_SOURCES EXCLUDE REGEX ".*kernel_impl\\.cpp$")
# Exclude custom_kernel/ — standalone PyTorch extension, built separately.
list(FILTER ASCEND_SOURCES EXCLUDE REGEX ".*/custom_kernel/.*")

target_compile_definitions(infiniops PUBLIC WITH_ASCEND=1)
target_sources(infiniops PRIVATE ${ASCEND_SOURCES})
Expand Down Expand Up @@ -215,7 +217,38 @@ if(WITH_ASCEND)
"${ASCEND_HOME}/lib64/libopapi.so"
"${ASCEND_HAL_LIB}")

# ATB (Ascend Transformer Boost) — provides fused operators like
# PagedAttention and ReshapeAndCache that are graph-capture safe.
set(ATB_HOME_DIR "$ENV{ATB_HOME_PATH}")
if(NOT ATB_HOME_DIR)
# Default search path under CANN nnal directory.
file(GLOB ATB_SEARCH_DIRS "/usr/local/Ascend/nnal/atb/*/atb/cxx_abi_1")
if(ATB_SEARCH_DIRS)
list(SORT ATB_SEARCH_DIRS ORDER DESCENDING)
list(GET ATB_SEARCH_DIRS 0 ATB_HOME_DIR)
endif()
endif()

if(ATB_HOME_DIR AND EXISTS "${ATB_HOME_DIR}/include/atb/operation.h")
message(STATUS "ATB found: ${ATB_HOME_DIR}")
target_compile_definitions(infiniops PUBLIC INFINI_HAS_ATB=1)
target_include_directories(infiniops PUBLIC "${ATB_HOME_DIR}/include")
target_link_libraries(infiniops PUBLIC "${ATB_HOME_DIR}/lib/libatb.so")
else()
message(STATUS "ATB not found — ATB-based operators disabled")
endif()

list(APPEND DEVICE_LIST "ascend")

# Custom AscendC kernels (PyTorch extension, requires torch_npu).
if(BUILD_CUSTOM_KERNEL)
add_subdirectory(ascend/custom_kernel)

# Link the compiled AscendC kernel objects into infiniops so that
# custom kernel implementations (e.g. RmsNorm index 1) can call
# them via the generated launch functions.
target_compile_definitions(infiniops PUBLIC INFINI_HAS_CUSTOM_RMS_NORM=1)
endif()
endif()

target_include_directories(infiniops PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
Expand Down Expand Up @@ -257,6 +290,17 @@ if(GENERATE_PYTHON_BINDINGS)
target_include_directories(ops PRIVATE ${PROJECT_SOURCE_DIR})
target_link_libraries(ops PRIVATE infiniops)

# Custom AscendC kernel objects must be linked directly into ops
# because the AscendC toolchain compiles host stubs with hidden
# visibility — `libinfiniops.so` cannot re-export those symbols.
# The `Operator<..., 1>` template instantiations that call
# `aclrtlaunch_*` live in `ops.cc`, so link here with
# `--whole-archive` to ensure all launch functions are available.
if(BUILD_CUSTOM_KERNEL)
target_link_libraries(ops PRIVATE
-Wl,--whole-archive no_workspace_kernel -Wl,--no-whole-archive)
endif()

set_target_properties(infiniops PROPERTIES INSTALL_RPATH "$ORIGIN")
set_target_properties(ops PROPERTIES INSTALL_RPATH "$ORIGIN")

Expand Down
82 changes: 82 additions & 0 deletions src/ascend/add/kernel.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#ifndef INFINI_OPS_ASCEND_ADD_KERNEL_H_
#define INFINI_OPS_ASCEND_ADD_KERNEL_H_

#include "acl/acl.h"
#include "aclnn/aclnn_base.h"
#include "aclnn_add.h"
#include "ascend/common.h"
#include "ascend/workspace_pool_.h"
#include "base/add.h"
#include "data_type.h"
#include "operator.h"

namespace infini::ops {

template <>
class Operator<Add, Device::Type::kAscend> : public Add {
 public:
  // Element-wise add backed by aclnnAdd (which computes
  // `input + alpha * other`; `alpha` is fixed to 1 here).
  //
  // `aclCreateScalar` stores the pointer rather than copying the value, so
  // `alpha_float_storage_` / `alpha_int_storage_` must remain alive — at a
  // stable address — for the lifetime of `alpha_`. The alpha scalar type
  // must match the tensor dtype: int64 for integer dtypes and float for
  // floating-point dtypes.
  Operator(const Tensor input, const Tensor other, Tensor out)
      : Add(input, other, out),
        in_cache_(input),
        oth_cache_(other),
        out_cache_(out) {
    if (ascend::isIntegerDtype(input.dtype())) {
      alpha_ = aclCreateScalar(&alpha_int_storage_, ACL_INT64);
    } else {
      alpha_ = aclCreateScalar(&alpha_float_storage_, ACL_FLOAT);
    }
  }

  // Non-copyable / non-movable: `executor_` and `alpha_` are raw ACL
  // handles (copying would double-destroy them), and `alpha_` points into
  // this object's member storage, so a copied or moved-from instance would
  // leave the scalar dangling.
  Operator(const Operator&) = delete;
  Operator& operator=(const Operator&) = delete;
  Operator(Operator&&) = delete;
  Operator& operator=(Operator&&) = delete;

  ~Operator() {
    if (executor_) aclDestroyAclOpExecutor(executor_);
    // Guard against a failed `aclCreateScalar` leaving `alpha_` null.
    if (alpha_) aclDestroyScalar(alpha_);
  }

  // Launches the add on `stream_`. The first call sizes the workspace and
  // builds the executor, then marks it repeatable; subsequent calls only
  // patch the tensor device addresses, skipping the GetWorkspaceSize path.
  void operator()(const Tensor input, const Tensor other,
                  Tensor out) const override {
    auto stream = static_cast<aclrtStream>(stream_);
    auto t_in = in_cache_.get(const_cast<void*>(input.data()));
    auto t_oth = oth_cache_.get(const_cast<void*>(other.data()));
    auto t_out = out_cache_.get(out.data());

    if (!executor_) {
      aclnnAddGetWorkspaceSize(t_in, t_oth, alpha_, t_out, &ws_size_,
                               &executor_);
      aclSetAclOpExecutorRepeatable(executor_);
    } else {
      aclSetInputTensorAddr(executor_, 0, t_in,
                            const_cast<void*>(input.data()));
      aclSetInputTensorAddr(executor_, 1, t_oth,
                            const_cast<void*>(other.data()));
      aclSetOutputTensorAddr(executor_, 0, t_out, out.data());
    }

    // Workspace is drawn from a per-stream pool so repeated launches on the
    // same stream reuse one allocation.
    auto& arena = ascend::workspacePool().ensure(stream, ws_size_);
    aclnnAdd(arena.buf, ws_size_, executor_, stream);
  }

 private:
  // aclTensor descriptors are cached; only device addresses change per call.
  mutable ascend::AclTensorCache in_cache_;

  mutable ascend::AclTensorCache oth_cache_;

  mutable ascend::AclTensorCache out_cache_;

  // Lazily created on the first launch, then reused as a repeatable executor.
  mutable aclOpExecutor* executor_ = nullptr;

  // Workspace size reported by the first GetWorkspaceSize call.
  mutable uint64_t ws_size_ = 0;

  float alpha_float_storage_ =
      1.0f;  // Stable address for `aclCreateScalar` (float).
  int64_t alpha_int_storage_ =
      1;  // Stable address for `aclCreateScalar` (int).
  aclScalar* alpha_ = nullptr;
};

} // namespace infini::ops

#endif
Loading
Loading