/*
 * Copyright (c) 2025, InfiniTensor.
 * All rights reserved.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include <algorithm>
#include <vector>

#include "torch_kernel_helper.h"
#include "tiling/platform/platform_ascendc.h"
#include "aclrtlaunch_add_rms_norm.h"

namespace ascend_kernel {

std::vector<at::Tensor> add_rms_norm(const at::Tensor &x1,
                                     const at::Tensor &x2,
                                     const at::Tensor &weight, double eps) {
  // Input validation.
  TORCH_CHECK(x1.dim() > 0,
              "add_rms_norm: x1 must have at least 1 dimension");
  TORCH_CHECK(x1.sizes() == x2.sizes(),
              "add_rms_norm: x1 and x2 must have the same shape");
  TORCH_CHECK(x1.scalar_type() == x2.scalar_type(),
              "add_rms_norm: x1 and x2 must have the same dtype");
  TORCH_CHECK(x1.scalar_type() == at::kHalf ||
                  x1.scalar_type() == at::kFloat,
              "add_rms_norm: only float16 and float32 are supported, got ",
              x1.scalar_type());
  TORCH_CHECK(weight.dim() == 1,
              "add_rms_norm: weight must be 1-dimensional");
  TORCH_CHECK(weight.size(0) == x1.size(-1),
              "add_rms_norm: weight size (", weight.size(0),
              ") must match input last dim (", x1.size(-1), ")");

  int64_t dimLength = x1.size(-1);
  int64_t totalRows = x1.numel() / dimLength;
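  // Example: an input of shape [B, S, H] is treated as B*S independent rows
  // of length H; the norm is applied per row over the last dimension.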

  if (totalRows == 0 || dimLength == 0) {
    return {at::empty_like(x1), at::empty_like(x1)};
  }

  at::Tensor inp1 = x1.contiguous();
  at::Tensor inp2 = x2.contiguous();
  int64_t dtypeSize = inp1.element_size();

  // Hardware parameters.
  auto ascendc_platform =
      platform_ascendc::PlatformAscendCManager::GetInstance();
  int64_t coreNum =
      static_cast<int64_t>(ascendc_platform->GetCoreNumAiv());
  uint64_t ubSize;
  ascendc_platform->GetCoreMemSize(platform_ascendc::CoreMemType::UB,
                                   ubSize);
  int64_t ubSizeLimit = static_cast<int64_t>(ubSize);

  // Alignment (32-byte boundary).
  int64_t alignElements = 32 / dtypeSize;
  int64_t dimLengthAlign =
      ((dimLength + alignElements - 1) / alignElements) * alignElements;
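  // Example: for fp16 (dtypeSize = 2), alignElements = 16, so dimLength = 100
  // pads to dimLengthAlign = 112; for fp32, alignElements = 8 and 100 pads
  // to 104.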

  // UB capacity check: bytes of UB required per element of dimLengthAlign.
  // fp16: inQ_x1(2 bufs × 2 B) + inQ_x2(2×2) + outQ_y(2×2) + outQ_xout(2×2)
  //       + fp32Buf1(4) + fp32Buf2(4) + weight(4) = 16 + 12 = 28
  // fp32: inQ_x1(2 bufs × 4 B) + inQ_x2(2×4) + outQ_y(2×4) + outQ_xout(2×4)
  //       + weight(4) = 32 + 4 = 36
  int64_t bufferCoefficient = (dtypeSize == 2) ? 28 : 36;
  // Reserve 1 KB of UB headroom, then round down to fp32 alignment.
  int64_t maxDimLength =
      (ubSizeLimit - 1024) / bufferCoefficient;
  int64_t fpAlignElements = 32 / 4;
  maxDimLength =
      (maxDimLength / fpAlignElements) * fpAlignElements;
  TORCH_CHECK(dimLengthAlign <= maxDimLength,
              "add_rms_norm: dimLength ", dimLength,
              " (aligned ", dimLengthAlign,
              ") exceeds UB capacity (max ", maxDimLength, ")");
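  // Worked example (assuming a 192 KB UB, i.e. ubSize = 196608): for fp16,
  // maxDimLength = (196608 - 1024) / 28 = 6985, rounded down to a multiple
  // of 8 gives 6984.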

  // Padding.
  at::Tensor kernelInput1;
  at::Tensor kernelInput2;

  if (dimLength != dimLengthAlign) {
    kernelInput1 = inp1.reshape({totalRows, dimLength});
    kernelInput1 = at::constant_pad_nd(
        kernelInput1, {0, dimLengthAlign - dimLength}, 0.0);
    kernelInput1 = kernelInput1.contiguous();

    kernelInput2 = inp2.reshape({totalRows, dimLength});
    kernelInput2 = at::constant_pad_nd(
        kernelInput2, {0, dimLengthAlign - dimLength}, 0.0);
    kernelInput2 = kernelInput2.contiguous();
  } else {
    kernelInput1 =
        inp1.reshape({totalRows, dimLengthAlign}).contiguous();
    kernelInput2 =
        inp2.reshape({totalRows, dimLengthAlign}).contiguous();
  }

  at::Tensor kernelOutputY = at::empty_like(kernelInput1);
  at::Tensor kernelOutputXOut = at::empty_like(kernelInput1);

  // Weight: always pass as fp32, padded to `dimLengthAlign`.
  at::Tensor weightFloat = weight.contiguous().to(at::kFloat);

  if (dimLength != dimLengthAlign) {
    weightFloat = at::constant_pad_nd(
        weightFloat, {0, dimLengthAlign - dimLength}, 0.0);
  }

  weightFloat = weightFloat.contiguous();

  // Block-level tiling (distribute rows across cores).
  int64_t usedCoreNum = std::min(totalRows, coreNum);
  int64_t formerLength =
      (totalRows + usedCoreNum - 1) / usedCoreNum;
  int64_t tailLength = formerLength - 1;
  int64_t formerNum = totalRows - tailLength * usedCoreNum;
  uint32_t blockDim = static_cast<uint32_t>(usedCoreNum);
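  // formerNum cores take formerLength rows each; the remaining
  // (usedCoreNum - formerNum) cores take tailLength rows each. Example:
  // totalRows = 10 on coreNum = 4 gives usedCoreNum = 4, formerLength = 3,
  // tailLength = 2, formerNum = 2, i.e. 2 cores × 3 rows + 2 cores × 2
  // rows = 10.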

  // All EXEC_KERNEL_CMD args must be lvalues.
  float epsFloat = static_cast<float>(eps);
  int64_t dtypeSizeVal = dtypeSize;

  EXEC_KERNEL_CMD(add_rms_norm, blockDim,
                  kernelInput1, kernelInput2, weightFloat,
                  kernelOutputY, kernelOutputXOut,
                  totalRows, dimLength, dimLengthAlign,
                  formerNum, formerLength, tailLength,
                  epsFloat, dtypeSizeVal);

  // Remove padding and reshape back to original shape.
  at::Tensor outputY = kernelOutputY;
  at::Tensor outputXOut = kernelOutputXOut;

  if (dimLength != dimLengthAlign) {
    outputY = outputY.narrow(-1, 0, dimLength).contiguous();
    outputXOut = outputXOut.narrow(-1, 0, dimLength).contiguous();
  }

  outputY = outputY.reshape(x1.sizes());
  outputXOut = outputXOut.reshape(x1.sizes());

  return {outputY, outputXOut};
}

}  // namespace ascend_kernel
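
// Usage sketch: one plausible way to expose the op to Python, assuming the
// project registers custom ops via TORCH_LIBRARY and dispatches Ascend
// tensors on the PrivateUse1 key. The "ascend_ops" namespace and this
// registration snippet are hypothetical, not taken from this repo.
//
//   #include <torch/library.h>
//
//   TORCH_LIBRARY(ascend_ops, m) {
//     m.def("add_rms_norm(Tensor x1, Tensor x2, Tensor weight, float eps)"
//           " -> Tensor[]");
//   }
//
//   TORCH_LIBRARY_IMPL(ascend_ops, PrivateUse1, m) {
//     m.impl("add_rms_norm", ascend_kernel::add_rms_norm);
//   }
//
// Then from Python:
//   y, x_out = torch.ops.ascend_ops.add_rms_norm(x1, x2, w, 1e-6)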