diff --git a/csrc/CMakeLists.txt b/csrc/CMakeLists.txt
index 6b3236f8b..e92836172 100644
--- a/csrc/CMakeLists.txt
+++ b/csrc/CMakeLists.txt
@@ -22,6 +22,7 @@ FILE(GLOB OP_SRCS
     ${PROJECT_OP_SRC_BASE}/lightning_indexer/op_host/lightning_indexer.cpp
     ${PROJECT_OP_SRC_BASE}/lightning_indexer/op_host/tiling/lightning_indexer_tiling.cpp
     ${PROJECT_OP_SRC_BASE}/tri_inv/op_host/tri_inv.cpp
+    ${PROJECT_OP_SRC_BASE}/tri_inv/op_host/tri_inv_cube.cpp
 )
 if(BUILD_CATLASS_MODULE)
     list(APPEND OP_SRCS
@@ -53,6 +54,7 @@ set(WORKSPACE_KERNEL_SRCS
     ${PROJECT_OP_SRC_BASE}/alloc_extend/op_kernel/alloc_extend_kernel.cpp
     ${PROJECT_OP_SRC_BASE}/build_tree/op_kernel/build_tree_kernel.cpp
     ${PROJECT_OP_SRC_BASE}/lightning_indexer/op_kernel/lightning_indexer_kernel.cpp
+    ${PROJECT_OP_SRC_BASE}/tri_inv/op_kernel/tri_inv_cube_kernel.cpp
 )
 if(BUILD_CATLASS_MODULE)
     list(APPEND WORKSPACE_KERNEL_SRCS
diff --git a/csrc/pytorch_extensions.cpp b/csrc/pytorch_extensions.cpp
index a7d093730..9cbced6ee 100644
--- a/csrc/pytorch_extensions.cpp
+++ b/csrc/pytorch_extensions.cpp
@@ -98,6 +98,7 @@ TORCH_LIBRARY_FRAGMENT(npu, m)
         "int? sparse_count=None, int? sparse_mode=None) -> Tensor");
 
     m.def("triangular_inverse(Tensor x) -> Tensor");
+    m.def("cube_triangular_inverse(Tensor x) -> Tensor");
 }
 } // namespace
@@ -141,5 +142,7 @@ TORCH_LIBRARY_IMPL(npu, PrivateUse1, m)
     m.impl("lightning_indexer", TORCH_FN(sglang::npu_kernel::lightning_indexer));
 
     m.impl("triangular_inverse", TORCH_FN(sglang::npu_kernel::tri_inv_col_sweep));
+
+    m.impl("cube_triangular_inverse", TORCH_FN(sglang::npu_kernel::tri_inv_cube_col_sweep));
 }
 } // namespace
diff --git a/csrc/tri_inv/README.md b/csrc/tri_inv/README.md
index 67a4dc2fc..d05b56a91 100644
--- a/csrc/tri_inv/README.md
+++ b/csrc/tri_inv/README.md
@@ -1,5 +1,7 @@
 ##### Description of tri_inv
 
-This is a vector-only AscendC triangular inversion kernel on Ascend NPU.
+This implements several AscendC triangular inversion kernels on Ascend NPU.
 
-The kernel supports matrix sizes `16, 32, 64, 128` and data types `fp16` and `fp32`.
+The AIV-based kernels support matrix sizes `16, 32, 64, 128` and data types `fp16` and `fp32`.
+
+The AIC-based kernel supports matrix sizes `16, 32, 64, 128` and data type `fp16`.
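For reviewers, a minimal usage sketch of the op registered above (not part of the change). It assumes an Ascend environment with `torch_npu` and this extension loaded; per the README, the cube path accepts `fp16` only and sizes 16/32/64/128, and the tests feed unit-triangular inputs (ones on the diagonal):

```python
import torch
import torch_npu  # noqa: F401 -- registers the `npu` device and op namespace

# Batch of 4 unit lower-triangular fp16 matrices of size 64.
a = torch.rand(4, 64, 64, dtype=torch.float16).tril()
a.diagonal(dim1=-2, dim2=-1).fill_(1.0)

a_inv = torch.ops.npu.cube_triangular_inverse(a.npu())
print(a_inv.shape, a_inv.dtype)  # same shape, fp32 output (see host wrapper below)
```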
diff --git a/csrc/tri_inv/op_host/tiling_tri_inv_cube.h b/csrc/tri_inv/op_host/tiling_tri_inv_cube.h
new file mode 100644
index 000000000..687819d10
--- /dev/null
+++ b/csrc/tri_inv/op_host/tiling_tri_inv_cube.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <cstdint>
+
+namespace sglang {
+
+namespace npu_kernel {
+
+/**
+ * @brief `tri_inv_cube_col_sweep` kernel tiling parameter structure.
+ */
+struct TriInvColumnSweepCubeTiling {
+    /// @brief Number of blocks.
+    uint32_t num_blocks;
+    /// @brief Total number of input elements.
+    uint32_t num_elems;
+    /// @brief Input matrix size.
+    uint32_t matrix_size;
+    /// @brief Workspace circular buffer length.
+    uint32_t ws_circular_buffer_len;
+};
+
+} // namespace npu_kernel
+} // namespace sglang
diff --git a/csrc/tri_inv/op_host/tri_inv_cube.cpp b/csrc/tri_inv/op_host/tri_inv_cube.cpp
new file mode 100644
index 000000000..e767090ba
--- /dev/null
+++ b/csrc/tri_inv/op_host/tri_inv_cube.cpp
@@ -0,0 +1,81 @@
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "defines.h"
+#include "torch_helper.h"
+
+#include "tiling_tri_inv_cube.h"
+#include "aclrtlaunch_tri_inv_cube_col_sweep_fp16.h"
+#include "tiling/platform/platform_ascendc.h"
+
+namespace sglang {
+
+namespace npu_kernel {
+
+at::Tensor calc_tiling(const TriInvColumnSweepCubeTiling &tiling)
+{
+    constexpr uint32_t PADDING_BYTE = 32U;
+
+    // align to 32 bytes
+    int32_t tiling_size = (sizeof(TriInvColumnSweepCubeTiling) + PADDING_BYTE - 1) / PADDING_BYTE * PADDING_BYTE;
+    auto tiling_buffer = at::empty({tiling_size}, at::TensorOptions().dtype(at::kByte).device(at::kCPU));
+
+    TriInvColumnSweepCubeTiling *tiling_data =
+        reinterpret_cast<TriInvColumnSweepCubeTiling *>(tiling_buffer.data_ptr());
+    tiling_data->num_blocks = tiling.num_blocks;
+    tiling_data->num_elems = tiling.num_elems;
+    tiling_data->matrix_size = tiling.matrix_size;
+    tiling_data->ws_circular_buffer_len = tiling.ws_circular_buffer_len;
+
+    auto tiling_tensor = TorchNpuHelper::CopyTensorHostToDevice(tiling_buffer);
+    return tiling_tensor;
+}
+
+HOST_API at::Tensor tri_inv_cube_col_sweep(const at::Tensor &tensor)
+{
+    platform_ascendc::PlatformAscendC *platformAscendC = platform_ascendc::PlatformAscendCManager::GetInstance();
+    auto acl_stream = c10_npu::getCurrentNPUStream().stream(false);
+
+    const auto dtype = tensor.options().dtype();
+    if (tensor.dim() < 2) {
+        throw std::runtime_error("Input tensor must have at least 2 dimensions.\n");
+    }
+
+    const uint32_t matrix_size = static_cast<uint32_t>(tensor.size(-1));
+    if (matrix_size != tensor.size(-2)) {
+        throw std::runtime_error("Only square matrices are supported.\n");
+    }
+
+    const uint32_t num_elems = static_cast<uint32_t>(tensor.numel());
+    const uint32_t block_dim = static_cast<uint32_t>(num_elems / (matrix_size * matrix_size));
+
+    auto tensor_out = at::empty_like(tensor, at::kFloat);
+
+    const uint32_t WS_CIRCULAR_BUFFER_LEN = 4;
+    const TriInvColumnSweepCubeTiling tiling{block_dim, num_elems, matrix_size, WS_CIRCULAR_BUFFER_LEN};
+    const at::Tensor tiling_device = calc_tiling(tiling);
+
+    // workspace
+    const uint64_t system_workspace_size = static_cast<uint64_t>(platformAscendC->GetLibApiWorkSpaceSize());
+    const uint64_t workspace_size = system_workspace_size + num_elems * WS_CIRCULAR_BUFFER_LEN * tensor.itemsize();
+    const auto options = at::TensorOptions().dtype(at::kByte).device(tensor.options().device());
+    auto workspace = at::empty({static_cast<int64_t>(workspace_size)}, options);
+
+    if (dtype == at::kHalf) {
+        EXEC_KERNEL_CMD(tri_inv_cube_col_sweep_fp16, block_dim, tensor, tensor_out, workspace, tiling_device);
+    } else {
+        throw std::runtime_error("Unsupported data type for tri_inv_cube_col_sweep. Only fp16 is currently supported.");
+    }
+    aclrtSynchronizeStream(acl_stream);
+    return tensor_out;
+}
+
+} // namespace npu_kernel
+} // namespace sglang
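The size arithmetic in `calc_tiling` and the workspace allocation above can be checked off-device; a plain-Python restatement of the same formulas (the names here are illustrative, not part of the change):

```python
PADDING_BYTE = 32
TILING_FIELDS = 4  # num_blocks, num_elems, matrix_size, ws_circular_buffer_len

# calc_tiling: round the 16-byte struct up to a whole 32-byte block.
tiling_size = (TILING_FIELDS * 4 + PADDING_BYTE - 1) // PADDING_BYTE * PADDING_BYTE
assert tiling_size == 32

def workspace_size(system_ws: int, num_elems: int, circ_len: int = 4, itemsize: int = 2) -> int:
    """Host-side workspace: system workspace + `circ_len` fp16 ring-buffer slots per input stream."""
    return system_ws + num_elems * circ_len * itemsize
```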
diff --git a/csrc/tri_inv/op_kernel/kernel_mat_gen.h b/csrc/tri_inv/op_kernel/kernel_mat_gen.h
new file mode 100644
index 000000000..9345c6fae
--- /dev/null
+++ b/csrc/tri_inv/op_kernel/kernel_mat_gen.h
@@ -0,0 +1,221 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * @file kernel_mat_gen.h
+ * @brief Kernel implementing an AIV matrix generator that produces the matrix
+ * formulation of the column sweep.
+ */
+#pragma once
+
+namespace sglang {
+
+namespace npu_kernel {
+/**
+ * @brief Produces the sequence of matrices that encode the column-sweep steps
+ * in matrix notation. On the first iteration, it produces the identity matrix.
+ *
+ * @tparam T Input data type. Supports fp16/half.
+ *
+ * See the discussion in Section 3.2.1 of [1], in particular Equations 3.8 and
+ * 3.9 on page 54.
+ *
+ * \code{.python}
+ def aiv_matrix_gen(A: npt.ArrayLike):
+     n = A.shape[0]
+     I_n = np.eye(n, dtype=A.dtype)
+
+     # Transform A into 2 * I_n - A.
+     A = 2 * I_n - A
+
+     for k in reversed(range(n)):
+         M = I_n.copy()
+         M[:, k] = A[:, k]
+         yield M
+ * \endcode
+ *
+ * [1] E. Gallopoulos, B. Philippe and A. H. Sameh, "Parallelism in Matrix
+ * Computations".
+ * Hardcover (ISBN 978-94-017-7187-0),
+ * Softcover (ISBN 978-94-024-0317-6),
+ * Electronic (ISBN 978-94-017-7188-7).
+ **/
+template <typename T>
+class KernelMatGen
+{
+public:
+    /**
+     * @brief Class constructor.
+     *
+     * @param [in] matrix_size Input square matrix size.
+     * @param [in] circular_buffer_len Length of the workspace circular buffer used to
+     * overcome GM memory consistency issues.
+     */
+    __aicore__ inline KernelMatGen(uint32_t matrix_size, uint32_t circular_buffer_len)
+        : matrix_size_(matrix_size),
+          tile_len_(matrix_size * matrix_size),
+          aic_id_(AscendC::GetBlockIdx() / AscendC::GetTaskRation()),
+          global_in_offset_(aic_id_ * tile_len_),
+          ws_circular_buffer_len_(circular_buffer_len),
+          global_out_offset_(aic_id_ * tile_len_ * ws_circular_buffer_len_)
+    {}
+
+    /**
+     * @brief Initialize global and local memory structures.
+     *
+     * @param [in] vec_in Pointer to the input vector in global memory.
+     * @param [in] vec_out Pointer to the output vector in global memory.
+     */
+    __aicore__ inline void Init(GM_ADDR vec_in, GM_ADDR vec_out)
+    {
+        const uint32_t vec_len = AscendC::GetBlockNum() * tile_len_;
+        global_in_.SetGlobalBuffer((__gm__ T *)vec_in, vec_len);
+        global_out_.SetGlobalBuffer((__gm__ T *)vec_out, vec_len * ws_circular_buffer_len_);
+
+        pipe_.InitBuffer(in_q_, 1, tile_len_ * sizeof(T));
+        pipe_.InitBuffer(out_q_, 1, tile_len_ * sizeof(T));
+        pipe_.InitBuffer(work_buf_, tile_len_ * sizeof(T));
+    }
+
+    /**
+     * @brief Run the kernel.
+     */
+    __aicore__ inline void Process()
+    {
+        // Read input matrix into work_buf_.
+        const AscendC::LocalTensor<T> in_lt = in_q_.template AllocTensor<T>();
+        DataCopy(in_lt, global_in_[global_in_offset_], in_lt.GetSize());
+        in_q_.EnQue(in_lt);
+
+        ReadInputMatrixInUB();
+
+        // AIV-0 writes the identity matrix for the AIC.
+        if (AscendC::GetSubBlockIdx() == 0) {
+            EnQueueIdentityMatrix();
+            AscendC::LocalTensor<T> out_lt = out_q_.template DeQue<T>();
+            DataCopy(global_out_[global_out_offset_], out_lt, out_lt.GetSize());
+            out_q_.FreeTensor(out_lt);
+        }
+
+        // Sync with all AIVs in the group, to write the matrix.
+        SyncGroup();
+
+        // First matrix is the identity (just wait one more round).
+        SyncGroup();
+
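+        // NOTE: the workspace ring buffer exists because AIC reads and AIV
+        // writes race on global memory: a slot is only rewritten
+        // `ws_circular_buffer_len_` handshakes after it was produced, by which
+        // point the paired AIC has already consumed it. Slot 0 holds the
+        // identity; generated matrices start at slot 1 and wrap around.
+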
+        const AscendC::LocalTensor<T> work_lt = work_buf_.Get<T>();
+        uint32_t circular_buf_idx = 1;
+
+        // The column sweep needs `matrix_size_` matrices in total; the
+        // identity has already been written, so `matrix_size_ - 1` remain.
+        for (int32_t col_index = matrix_size_ - 2; col_index >= 0; col_index--) {
+            // AIV-0 assembles M: an identity matrix whose `col_index`-th column
+            // is replaced by the corresponding column of the transformed input.
+            if (AscendC::GetSubBlockIdx() == 0) {
+                const AscendC::LocalTensor<T> vec_out_lt = out_q_.AllocTensor<T>();
+                AscendC::Duplicate(vec_out_lt, static_cast<T>(0), matrix_size_ * matrix_size_);
+                AscendC::PipeBarrier<PIPE_V>();
+
+                // Set ones on the main diagonal.
+                for (uint32_t i = 0; i < matrix_size_; i++) {
+                    vec_out_lt.SetValue(i * matrix_size_ + i, static_cast<T>(1));
+                }
+
+                AscendC::PipeBarrier<PIPE_V>();
+                // Write the (col_index)-th column of matrix M.
+                const uint32_t col_offset = col_index * matrix_size_;
+                DataCopy(vec_out_lt[col_offset], work_lt[col_offset], matrix_size_);
+                AscendC::PipeBarrier<PIPE_V>();
+                out_q_.EnQue(vec_out_lt);
+
+                AscendC::LocalTensor<T> out_lt = out_q_.template DeQue<T>();
+                DataCopy(global_out_[global_out_offset_ + circular_buf_idx * tile_len_], out_lt, out_lt.GetSize());
+                out_q_.FreeTensor(out_lt);
+                circular_buf_idx = (circular_buf_idx + 1) % ws_circular_buffer_len_;
+            }
+
+            // Sync with all AIVs in the group, to write the matrix.
+            SyncGroup();
+        }
+    }
+
+private:
+    /**
+     * @brief Read (and transform) the input triangular matrix A into
+     * `work_buf_`. The transformation is `2*I_n - A`.
+     */
+    __aicore__ inline void ReadInputMatrixInUB()
+    {
+        AscendC::LocalTensor<T> vec_in_lt = in_q_.DeQue<T>();
+        AscendC::LocalTensor<T> work_lt = work_buf_.Get<T>();
+        Muls(work_lt, vec_in_lt, static_cast<T>(-1), vec_in_lt.GetSize());
+        for (uint32_t i = 0; i < matrix_size_; i++) {
+            work_lt.SetValue(i * matrix_size_ + i, static_cast<T>(1));
+        }
+        in_q_.FreeTensor(vec_in_lt);
+    }
+
+    /**
+     * @brief EnQue the identity matrix on the output queue.
+     */
+    __aicore__ inline void EnQueueIdentityMatrix()
+    {
+        const AscendC::LocalTensor<T> vec_out_lt = out_q_.AllocTensor<T>();
+        AscendC::Duplicate(vec_out_lt, static_cast<T>(0), matrix_size_ * matrix_size_);
+        AscendC::PipeBarrier<PIPE_V>();
+
+        // Set ones on the main diagonal.
+        for (uint32_t i = 0; i < matrix_size_; i++) {
+            vec_out_lt.SetValue(i * matrix_size_ + i, static_cast<T>(1));
+        }
+
+        out_q_.EnQue(vec_out_lt);
+    }
+
+    /**
+     * @brief Returns a synchronization config.
+     *
+     * @param [in] mode Synchronization mode.
+     * @param [in] flag_id Flag to use for synchronization.
+     * @return Synchronization config.
+     */
+    __aicore__ inline int GetSyncConf(int mode, int flag_id)
+    {
+        return 1 | (mode << 4) | (flag_id << 8);
+    }
+
+    /**
+     * @brief Synchronize cube and vector cores within a single group.
+     */
+    __aicore__ inline void SyncGroup()
+    {
+        const int mode = 2;
+
+        const int AIV_SET_FLAG_ID = 11;
+        const int AIC_SET_FLAG_ID = 12;
+        ffts_cross_core_sync(PIPE_MTE3, GetSyncConf(mode, AIV_SET_FLAG_ID));
+        wait_flag_dev(AIC_SET_FLAG_ID);
+        return;
+    }
+
+    AscendC::TPipe pipe_;
+
+    AscendC::TQue<AscendC::TPosition::VECIN, 1> in_q_;
+    AscendC::TQue<AscendC::TPosition::VECOUT, 1> out_q_;
+
+    AscendC::TBuf<AscendC::TPosition::VECCALC> work_buf_;
+
+    AscendC::GlobalTensor<T> global_in_;
+    AscendC::GlobalTensor<T> global_out_;
+
+    const uint32_t matrix_size_;
+    const uint32_t tile_len_;
+    const uint32_t aic_id_;
+    const uint32_t global_in_offset_;
+    const uint32_t ws_circular_buffer_len_;
+    const uint32_t global_out_offset_;
+};
+} // namespace npu_kernel
+} // namespace sglang
\ No newline at end of file
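For intuition, the AIC consumer works through exactly the stream the generator above produces: starting from C = I, it left-multiplies each generated matrix, and the final product is the inverse. A self-contained numpy sketch of that contract in exact arithmetic (no fp16 rounding; `column_sweep_inverse` is illustrative only):

```python
import numpy as np

def column_sweep_inverse(A: np.ndarray) -> np.ndarray:
    """Inverse of a unit upper-triangular A via the matrix-form column sweep."""
    n = A.shape[0]
    I_n = np.eye(n, dtype=np.float64)
    B = 2 * I_n - A              # the transform applied by ReadInputMatrixInUB
    C = I_n.copy()               # seeded by the identity round
    for k in reversed(range(n)):
        M = I_n.copy()
        M[:, k] = B[:, k]        # the single swept column
        C = M @ C                # the Mmad step: C = M @ C
    return C

A = np.triu(np.random.rand(16, 16))
np.fill_diagonal(A, 1.0)
assert np.allclose(column_sweep_inverse(A) @ A, np.eye(16))
```

The same recurrence, with an fp16 round after every product, is what the test reference `np_tril_inv_cube_cs` further below implements.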
diff --git a/csrc/tri_inv/op_kernel/kernel_tri_inv_cube.h b/csrc/tri_inv/op_kernel/kernel_tri_inv_cube.h
new file mode 100644
index 000000000..5389a525f
--- /dev/null
+++ b/csrc/tri_inv/op_kernel/kernel_tri_inv_cube.h
@@ -0,0 +1,491 @@
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2026. All rights reserved.
+ *
+ * @file kernel_tri_inv_cube.h
+ * @brief Kernel implementing a Cube (AIC) matrix inverse operation.
+ */
+
+#pragma once
+
+#include "kernel_operator.h"
+#include "kernel_mat_gen.h"
+
+namespace sglang {
+
+namespace npu_kernel {
+/**
+ * @brief Returns the matrix inverse of an upper triangular square matrix of
+ * size `matrix_size`. The matrix has ones on the main diagonal.
+ *
+ * The column sweep algorithm is used for the linear system Ax = e_j, where
+ * e_j is the j-th standard basis vector.
+ *
+ * The kernel assumes `matrix_size_ + 1` synchronizations with the AIVs.
+ *
+ * @tparam InputT Input data type. Default value fp16/half.
+ */
+template <typename InputT = half>
+class KernelTriInvCubeColSweep
+{
+    using OutputT = float;
+
+public:
+    /**
+     * @brief Class constructor.
+     *
+     * @param [in] vec_len Total length of the input tensor.
+     * @param [in] matrix_size Input square matrix size.
+     * @param [in] circular_buffer_len Length of the workspace circular buffer used to
+     * overcome GM memory consistency issues.
+     */
+    __aicore__ inline KernelTriInvCubeColSweep(uint32_t vec_len, uint32_t matrix_size, uint32_t circular_buffer_len)
+        : vec_len_(vec_len),
+          matrix_size_(matrix_size),
+          ws_circular_buffer_len_(circular_buffer_len),
+          tile_len_(matrix_size * matrix_size),
+          global_in_offset_(AscendC::GetBlockIdx() * tile_len_ * ws_circular_buffer_len_),
+          global_out_offset_(AscendC::GetBlockIdx() * tile_len_)
+    {}
+
+    /**
+     * @brief Initialize global and local memory structures.
+     *
+     * @param [in] matrix_stream_in Pointer where the input matrices will be
+     * written by AIVs in global memory.
+     * @param [in] inv_matrix_out Pointer where the output matrix inverses
+     * will be written.
+     */
+    __aicore__ inline void Init(GM_ADDR matrix_stream_in, GM_ADDR inv_matrix_out)
+    {
+        global_A_.SetGlobalBuffer((__gm__ InputT *)matrix_stream_in, vec_len_ * ws_circular_buffer_len_);
+        global_C_.SetGlobalBuffer((__gm__ OutputT *)inv_matrix_out, vec_len_);
+
+        pipe_.InitBuffer(a1_q_, 1, tile_len_ * sizeof(InputT));
+        pipe_.InitBuffer(a2_q_, 1, tile_len_ * sizeof(InputT));
+        pipe_.InitBuffer(b1_q_, 1, tile_len_ * sizeof(InputT));
+        pipe_.InitBuffer(b2_q_, 1, tile_len_ * sizeof(InputT));
+        pipe_.InitBuffer(co1_q_, 1, tile_len_ * sizeof(OutputT));
+    }
+
+    /**
+     * @brief Run the kernel.
+     */
+    __aicore__ inline void Process()
+    {
+        // On the first iteration, the AIVs will send the identity matrix to the AIC.
+        SyncGroup();
+        LoadIdentityMatrixinL0C();
+
+        // Read the identity matrix again from the AIV cores (slot 0).
+        uint32_t circular_buf_idx = 0;
+
+        // The matrix column sweep algorithm requires `matrix_size_` iterations.
+        for (uint32_t iter = 0; iter < matrix_size_; iter++) {
+            // Sync with all AIVs in the group, to read the next matrix.
+            SyncGroup();
+
+            // Load the next matrix A and perform C = A @ C.
+            LoadMatrixAintoL0A(circular_buf_idx);
+            MultiplyAWithC();
+            circular_buf_idx = (circular_buf_idx + 1) % ws_circular_buffer_len_;
+        }
+
+        // Write the L0C matrix to global memory.
+        CopyCL0ToGlobal();
+    }
+
+private:
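+    // Per-iteration data path:
+    //   next matrix M:    GM --(ND->NZ)--> A1 (L1) --(NZ->ZZ)--> L0A
+    //   running product:  L0C --Fixpipe (fp32->fp16)--> B1 (L1) --(NZ->ZN)--> L0B
+    //   Mmad then writes M @ C into a fresh L0C tile.
+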
+ */ + __aicore__ inline void LoadMatrixAintoL0A(uint32_t iter) + { + // Load matrix from global_A_ into L1 + CopyGmToL1A(a1_q_, global_A_[global_in_offset_ + iter * tile_len_], m_blocks_, k_blocks_); + CopyL1ToL0A(a2_q_, a1_q_, m_blocks_, k_blocks_); + } + + /** + * @brief Copy a tensor from global memory to the queue in A L1 memory. + * + * The data layout is transformed from ND to NZ. + * + * @param [in] q Destination queue. The position must be A1. + * @param [in] global Source global tensor. + * @param [in] fractals_h Number of fractal patterns in the height dimension of + * the input matrix. + * @param [in] fractals_w Number of fractal patterns in the width dimension of + * the input matrix. + */ + __aicore__ inline void CopyGmToL1A(AscendC::TQue &q, + const AscendC::GlobalTensor &global, uint16_t fractals_h, + uint16_t fractals_w) + { + const AscendC::LocalTensor lt = q.template AllocTensor(); + AscendC::Nd2NzParams params; + params.ndNum = 1; + params.nValue = m_blocks_ * 16; + params.dValue = k_blocks_ * 16; + params.srcDValue = params.dValue; + params.dstNzC0Stride = params.nValue; + params.dstNzNStride = 1; + // params.dstNzMatrixStride = 1; // This should have no effect when ndNum=1; + DataCopy(lt, global, params); + q.EnQue(lt); + } + + /** + * @brief Copy a tensor from global memory to the queue in B L1 memory. + * + * The data layout is transformed from ND to NZ. + * + * @param [in] q Destination queue. The position must be B1. + * @param [in] global Source global tensor. + * @param [in] fractals_h Number of fractal patterns in the height dimension of + * the input matrix. + * @param [in] fractals_w Number of fractal patterns in the width dimension of + * the input matrix. + */ + __aicore__ inline void CopyGmToL1B(AscendC::TQue &q, + const AscendC::GlobalTensor &global, uint16_t fractals_h, + uint16_t fractals_w) + { + const AscendC::LocalTensor lt = q.template AllocTensor(); + AscendC::Nd2NzParams params; + params.ndNum = 1; + params.nValue = fractals_h * 16; + params.dValue = fractals_w * 16; + params.srcDValue = params.dValue; + params.dstNzC0Stride = params.nValue; + params.dstNzNStride = 1; + // params.dstNzMatrixStride = 1; // This should have no effect when ndNum=1; + DataCopy(lt, global, params); + q.EnQue(lt); + } + + /** + * @brief Copy a tensor from a given A L1 to A L0 queue. + * + * The function performs an NZ to ZZ layout transformation. + * + * @param [in] l0_q Destination queue. The position must be A2. + * @param [in] l1_q Source queue. The position must be A1. + * @param [in] fractals_h Number of fractal patterns in the height dimension of + * the input matrix. + * @param [in] fractals_w Number of fractal patterns in the width dimension of + * the input matrix. + */ + __aicore__ inline void CopyL1ToL0A(AscendC::TQue &l0_q, + AscendC::TQue &l1_q, uint16_t fractals_h, + uint16_t fractals_w) + { + const AscendC::LocalTensor l0_lt = l0_q.template AllocTensor(); + AscendC::LocalTensor l1_lt = l1_q.template DeQue(); + + int src_offset = 0; + int dst_offset = 0; + + for (uint16_t i = 0; i < fractals_h; ++i) { + AscendC::LoadData2dParams params; + params.repeatTimes = fractals_w; + params.srcStride = fractals_h; + params.ifTranspose = false; + + LoadData(l0_lt[dst_offset], l1_lt[src_offset], params); + + src_offset += 16 * 16; + dst_offset += fractals_w * 16 * 16; + } + + l1_q.FreeTensor(l1_lt); + l0_q.EnQue(l0_lt); + } + + /** + * @brief Copy a tensor from a given B L1 to B L0 queue. + * + * The function performs an NZ to ZN layout transformation. 
+    /**
+     * @brief Copy a tensor from a given B L1 queue to a B L0 queue.
+     *
+     * The function performs an NZ to ZN layout transformation.
+     *
+     * @param [in] l0_q Destination queue. The position must be B2.
+     * @param [in] l1_q Source queue. The position must be B1.
+     * @param [in] fractals_h Number of fractal patterns in the height dimension of
+     * the input matrix.
+     * @param [in] fractals_w Number of fractal patterns in the width dimension of
+     * the input matrix.
+     */
+    __aicore__ inline void CopyL1ToL0B(AscendC::TQue<AscendC::TPosition::B2, 1> &l0_q,
+                                       AscendC::TQue<AscendC::TPosition::B1, 1> &l1_q, uint16_t fractals_h,
+                                       uint16_t fractals_w)
+    {
+        AscendC::LocalTensor<InputT> l1_lt = l1_q.template DeQue<InputT>();
+        const AscendC::LocalTensor<InputT> l0_lt = l0_q.template AllocTensor<InputT>();
+        constexpr bool transpose = true;
+        int src_offset = 0;
+        int dst_offset = 0;
+
+        for (uint16_t i = 0; i < fractals_h; ++i) {
+            AscendC::LoadData2dParams params;
+            params.repeatTimes = fractals_w;
+            params.srcStride = fractals_h;
+            params.ifTranspose = transpose;
+
+            LoadData(l0_lt[dst_offset], l1_lt[src_offset], params);
+
+            src_offset += 16 * 16;
+            dst_offset += fractals_w * 16 * 16;
+        }
+
+        l1_q.FreeTensor(l1_lt);
+        l0_q.EnQue(l0_lt);
+    }
+
+    /**
+     * @brief Copy a tensor from the CO1 queue to the B1 queue.
+     *
+     * @param [in] dst_q Destination queue. The position must be B1.
+     * @param [in] src_q Source queue. The position must be CO1.
+     * @param [in] height Height of the matrix.
+     * @param [in] width Width of the matrix.
+     */
+    __aicore__ inline void CopyCO1ToB1(AscendC::TQue<AscendC::TPosition::B1, 1> &dst_q,
+                                       AscendC::TQue<AscendC::TPosition::CO1, 1> &src_q, uint32_t height,
+                                       uint32_t width)
+    {
+        AscendC::LocalTensor<OutputT> src_lt = src_q.template DeQue<OutputT>();
+        const AscendC::LocalTensor<InputT> dst_lt = dst_q.template AllocTensor<InputT>();
+
+        AscendC::FixpipeParamsV220 params;
+        params.nSize = width;
+        params.mSize = height;
+        params.srcStride = height;
+        params.dstStride = width;
+        params.ndNum = 1;
+        params.quantPre = QuantMode_t::F322F16;
+
+        AscendC::Fixpipe(dst_lt, src_lt, params);
+        src_q.FreeTensor(src_lt);
+        dst_q.EnQue(dst_lt);
+    }
+
+    /**
+     * @brief Returns a synchronization config.
+     *
+     * @param [in] mode Synchronization mode.
+     * @param [in] flag_id Flag to use for synchronization.
+     * @return Synchronization config.
+     */
+    __aicore__ inline int GetSyncConf(int mode, int flag_id)
+    {
+        return 1 | (mode << 4) | (flag_id << 8);
+    }
+
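+    // NOTE: the flag IDs below pair up with KernelMatGen::SyncGroup on the
+    // vector side: the AIC signals AIC_SET_FLAG_ID (12) once its FIX pipe is
+    // done and blocks on AIV_SET_FLAG_ID (11), while each AIV signals 11 after
+    // its MTE3 writes and blocks on 12. One handshake therefore hands exactly
+    // one freshly written workspace matrix to the cube core.
+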
+ */ + __aicore__ inline void LoadIdentityMatrixinL0C() + { + LoadIdentityMatrixinL0A(); + LoadIdentityMatrixinL0B(); + AscendC::LocalTensor a2_lt = a2_q_.DeQue(); + AscendC::LocalTensor b2_lt = b2_q_.DeQue(); + AscendC::LocalTensor c1_lt = co1_q_.AllocTensor(); + + Mmad(c1_lt, a2_lt, b2_lt, {M_, N_, K_, false, 0, false, false, false}); + + co1_q_.EnQue(c1_lt); + a2_q_.FreeTensor(a2_lt); + b2_q_.FreeTensor(b2_lt); + } + + /** + * @brief Loads the identity matrix from global memory to L0A (`a1_q_` + * queue). + */ + __aicore__ inline void LoadIdentityMatrixinL0A() + { + CopyGmToL1A(a1_q_, global_A_[global_in_offset_], m_blocks_, k_blocks_); + CopyL1ToL0A(a2_q_, a1_q_, m_blocks_, k_blocks_); + } + + /** + * @brief Loads the identity matrix from global memory to L0B (`b1_q_` + * queue). + */ + __aicore__ inline void LoadIdentityMatrixinL0B() + { + // Here, we "abuse" the 'global_A_' pointer + CopyGmToL1B(b1_q_, global_A_[global_in_offset_], k_blocks_, n_blocks_); + + // Plain copy from L1 to L0B, because the layout is already correct. + AscendC::LocalTensor src = b1_q_.template DeQue(); + const AscendC::LocalTensor dst = b2_q_.template AllocTensor(); + + AscendC::LoadData2dParams params; + params.repeatTimes = n_blocks_ * k_blocks_; + params.srcStride = 1; + params.ifTranspose = false; + + LoadData(dst, src, params); + + b1_q_.FreeTensor(src); + b2_q_.EnQue(dst); + } + + /** + * @brief Copy a tensor from CO1 queue to global memory. + * + */ + __aicore__ inline void CopyCL0ToGlobal() + { + constexpr uint16_t fractal_size = 16; + + AscendC::LocalTensor lt = co1_q_.template DeQue(); + + AscendC::FixpipeParams params; + params.cburstNum = M_; + params.burstLen = N_ * fractal_size * sizeof(OutputT) / 32; + params.dstStride = M_; + + AscendC::Nz2NdParams nz2nd_params; + nz2nd_params.nz2ndEn = true; + nz2nd_params.originalNSize = M_; + params.nz2ndParams = nz2nd_params; + + AscendC::Fixpipe(global_C_[global_out_offset_], lt, params); + + co1_q_.FreeTensor(lt); + } + + AscendC::TPipe pipe_; + + AscendC::TQue a1_q_; + AscendC::TQue a2_q_; + AscendC::TQue b1_q_; + AscendC::TQue b2_q_; + + AscendC::TQue co1_q_; + + AscendC::GlobalTensor global_A_; + AscendC::GlobalTensor global_C_; + + const uint32_t vec_len_; + const uint32_t matrix_size_; + const uint32_t ws_circular_buffer_len_; + const uint32_t tile_len_; + const uint32_t global_in_offset_; + const uint32_t global_out_offset_; + + constexpr static uint32_t M_CUBE_BLOCK_SIZE = 16; + constexpr static uint32_t N_CUBE_BLOCK_SIZE = 16; + constexpr static uint32_t K_CUBE_BLOCK_SIZE = 16; + + const uint16_t M_ = matrix_size_; + const uint16_t K_ = matrix_size_; + const uint16_t N_ = matrix_size_; + + const uint32_t n_blocks_ = N_ / N_CUBE_BLOCK_SIZE; + const uint32_t k_blocks_ = K_ / K_CUBE_BLOCK_SIZE; + const uint32_t m_blocks_ = M_ / M_CUBE_BLOCK_SIZE; +}; + +/** + * @brief Run the `tri_inv_cube_col_sweep` kernel. + * + * @tparam InputT Input data type. Supports fp16/half. + * + * @param [in] matrix_stream_in Pointer where the input matrices will be + * written by AIVs in global memory. + * @param [in] inv_matrix_out Pointer where the output matrix inverses + * will be written. + * @param [in] matrix_size Input square matrix size to invert. 
+ */ +template +__aicore__ inline void run_tri_inv_cube_col_sweep(GM_ADDR matrix_stream_in, GM_ADDR inv_matrix_out, GM_ADDR workspace, + uint32_t vec_len, uint32_t matrix_size, + uint32_t ws_circular_buffer_len) +{ + if ASCEND_IS_AIV { + KernelMatGen op(matrix_size, ws_circular_buffer_len); + op.Init(matrix_stream_in, workspace); + op.Process(); + } + + if ASCEND_IS_AIC { + KernelTriInvCubeColSweep op(vec_len, matrix_size, ws_circular_buffer_len); + op.Init(workspace, inv_matrix_out); + op.Process(); + } +} + +/** + * @brief Copies tiling structure from global memory to registers. + * + * @tparam TilingT Structure representing kernel tiling parameters. + * @param [in] tiling Pointer to the structure allocated in registers. + * @param [in] tiling_global Pointer to the structure in global memory. + */ +template +__aicore__ inline void GetTilingData(TilingT *const tiling, GM_ADDR tiling_global) +{ + uint32_t *const tiling_32b = reinterpret_cast(tiling); + const __gm__ uint32_t *const tiling_global_32b = reinterpret_cast<__gm__ uint32_t *>(tiling_global); + + for (uint32_t i = 0; i < sizeof(TilingT) / sizeof(uint32_t); i++) { + tiling_32b[i] = tiling_global_32b[i]; + } +} + +} // namespace npu_kernel +} // namespace sglang diff --git a/csrc/tri_inv/op_kernel/tri_inv_cube_kernel.cpp b/csrc/tri_inv/op_kernel/tri_inv_cube_kernel.cpp new file mode 100644 index 000000000..24b51f81b --- /dev/null +++ b/csrc/tri_inv/op_kernel/tri_inv_cube_kernel.cpp @@ -0,0 +1,30 @@ +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "kernel_tri_inv_cube.h" + +#include "../op_host/tiling_tri_inv_cube.h" + +/** + * @brief Run the `tri_inv_cube_col_sweep` kernel on dtype fp16/half. + * + * @param [in] vec_in Pointer to input vector. + * @param [in] vec_out Pointer to output vector. + * @param [in] workspace Pointer to workspace. + * @param [in] tiling_gm Pointer to tiling vector. + */ +extern "C" __global__ __aicore__ void tri_inv_cube_col_sweep_fp16(GM_ADDR vec_in, GM_ADDR vec_out, GM_ADDR workspace, + GM_ADDR tiling_gm) +{ + sglang::npu_kernel::TriInvColumnSweepCubeTiling tiling; + sglang::npu_kernel::GetTilingData(&tiling, tiling_gm); + sglang::npu_kernel::run_tri_inv_cube_col_sweep(vec_in, vec_out, workspace, tiling.num_elems, + tiling.matrix_size, tiling.ws_circular_buffer_len); +} diff --git a/include/sgl_kenel_npu_ops.h b/include/sgl_kenel_npu_ops.h index e2f5a6804..dbd72d177 100644 --- a/include/sgl_kenel_npu_ops.h +++ b/include/sgl_kenel_npu_ops.h @@ -114,7 +114,7 @@ at::Tensor lightning_indexer( c10::optional sparse_count, c10::optional sparse_mode); /** - * @brief Triangular inverse of input tensor where last two dimensions represent + * @brief AIV-based triangular inverse of input tensor where last two dimensions represent * a matrix. * * @param [in] tensor_in Tensor of dimensions (..., n, n) where `n` is @@ -123,6 +123,17 @@ at::Tensor lightning_indexer( * is inversed. */ at::Tensor tri_inv_col_sweep(const at::Tensor &tensor_in); + +/** + * @brief AIC-based triangular inverse of input tensor where last two dimensions represent + * a matrix. 
diff --git a/csrc/tri_inv/op_kernel/tri_inv_cube_kernel.cpp b/csrc/tri_inv/op_kernel/tri_inv_cube_kernel.cpp
new file mode 100644
index 000000000..24b51f81b
--- /dev/null
+++ b/csrc/tri_inv/op_kernel/tri_inv_cube_kernel.cpp
@@ -0,0 +1,30 @@
+// Licensed under the BSD 3-Clause License (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "kernel_tri_inv_cube.h"
+
+#include "../op_host/tiling_tri_inv_cube.h"
+
+/**
+ * @brief Run the `tri_inv_cube_col_sweep` kernel on dtype fp16/half.
+ *
+ * @param [in] vec_in Pointer to the input vector.
+ * @param [in] vec_out Pointer to the output vector.
+ * @param [in] workspace Pointer to the workspace.
+ * @param [in] tiling_gm Pointer to the tiling vector.
+ */
+extern "C" __global__ __aicore__ void tri_inv_cube_col_sweep_fp16(GM_ADDR vec_in, GM_ADDR vec_out, GM_ADDR workspace,
+                                                                  GM_ADDR tiling_gm)
+{
+    sglang::npu_kernel::TriInvColumnSweepCubeTiling tiling;
+    sglang::npu_kernel::GetTilingData(&tiling, tiling_gm);
+    sglang::npu_kernel::run_tri_inv_cube_col_sweep<half>(vec_in, vec_out, workspace, tiling.num_elems,
+                                                         tiling.matrix_size, tiling.ws_circular_buffer_len);
+}
diff --git a/include/sgl_kenel_npu_ops.h b/include/sgl_kenel_npu_ops.h
index e2f5a6804..dbd72d177 100644
--- a/include/sgl_kenel_npu_ops.h
+++ b/include/sgl_kenel_npu_ops.h
@@ -114,7 +114,7 @@ at::Tensor lightning_indexer(
     c10::optional sparse_count, c10::optional sparse_mode);
 
 /**
- * @brief Triangular inverse of input tensor where last two dimensions represent
+ * @brief AIV-based triangular inverse of input tensor where the last two dimensions represent
  * a matrix.
  *
  * @param [in] tensor_in Tensor of dimensions (..., n, n) where `n` is
  * the matrix size.
  * @return at::Tensor Returns tensor of same shape where each matrix of size n
  * is inversed.
  */
 at::Tensor tri_inv_col_sweep(const at::Tensor &tensor_in);
+
+/**
+ * @brief AIC-based triangular inverse of input tensor where the last two dimensions represent
+ * a matrix.
+ *
+ * @param [in] tensor_in Tensor of dimensions (..., n, n) where `n` is
+ * the matrix size.
+ * @return at::Tensor Returns a tensor of the same shape where each matrix of size n
+ * is inverted.
+ */
+at::Tensor tri_inv_cube_col_sweep(const at::Tensor &tensor_in);
 } // namespace npu_kernel
 } // namespace sglang
diff --git a/python/sgl_kernel_npu/sgl_kernel_npu/fla/chunk.py b/python/sgl_kernel_npu/sgl_kernel_npu/fla/chunk.py
index a545b6d6a..d6caf5634 100644
--- a/python/sgl_kernel_npu/sgl_kernel_npu/fla/chunk.py
+++ b/python/sgl_kernel_npu/sgl_kernel_npu/fla/chunk.py
@@ -30,6 +30,15 @@ def fast_inv_tril(A: torch.Tensor):
     return A_inv.to(dtype)
 
 
+def fast_cube_inv_tril(A: torch.Tensor):
+    dtype = A.dtype
+    assert A.shape[-2] == A.shape[-1]
+    chunk_size = A.shape[-1]
+    identity = torch.eye(chunk_size, dtype=torch.float16, device=A.device)
+    A_inv = torch.ops.npu.cube_triangular_inverse(identity - A.to(torch.float16))
+    return A_inv.to(dtype)
+
+
 def inv_tril_inplace(A: torch.Tensor):
     """
     Returns the "matrix inverse" of the last two dimensions of A that must be a strict lower-triangular matrix.
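`fast_cube_inv_tril` hands the kernel `identity - A`; in the gated delta rule path `A` is strictly lower triangular, so the operand is unit lower triangular, which is the form the cube kernel accepts. A quick off-device check of that contract (sizes are illustrative):

```python
import torch

chunk_size = 64  # hypothetical chunk size
A = torch.rand(2, chunk_size, chunk_size, dtype=torch.float16).tril(-1)  # strictly lower
T = torch.eye(chunk_size, dtype=torch.float16) - A  # unit lower triangular
assert torch.all(T.diagonal(dim1=-2, dim2=-1) == 1)
```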
diff --git a/tests/python/sgl_kernel_npu/test_gated_delta_ascendc_tri_inv.py b/tests/python/sgl_kernel_npu/test_gated_delta_ascendc_tri_inv.py
index 624c49f00..c7fb1fe8c 100644
--- a/tests/python/sgl_kernel_npu/test_gated_delta_ascendc_tri_inv.py
+++ b/tests/python/sgl_kernel_npu/test_gated_delta_ascendc_tri_inv.py
@@ -3,7 +3,12 @@
 import pytest
 import torch
 import torch.nn.functional as F
-from sgl_kernel_npu.fla.chunk import chunk_gated_delta_rule_native, fast_inv_tril
+import typing
+from sgl_kernel_npu.fla.chunk import (
+    chunk_gated_delta_rule_native,
+    fast_inv_tril,
+    fast_cube_inv_tril,
+)
 
 device = "npu"
 
@@ -36,6 +41,7 @@ def print_diff(name, ref, tri, atol=0.005):
         print(f"Exceeds tolerance ({atol})!")
 
 
+@pytest.mark.parametrize("tri_inv_fn", [fast_inv_tril, fast_cube_inv_tril])
 @pytest.mark.parametrize(
     ("H", "D", "mask_p", "cu_seqlens", "dtype"),
     [
@@ -84,6 +90,7 @@ def test_chunk_varlen(
     mask_p: float,
     cu_seqlens: list[int],
     dtype: torch.dtype,
+    tri_inv_fn: typing.Callable,
 ):
     if D != 128:
         pytest.skip(
@@ -142,7 +149,7 @@ def test_chunk_varlen(
             initial_state=h0[i],
             output_final_state=True,
             use_qk_l2norm_in_kernel=False,
-            tri_inv_fn=fast_inv_tril,
+            tri_inv_fn=tri_inv_fn,
         )
         actual.append(actual_i)
         actual_ht.append(actual_ht_i)
@@ -151,8 +158,8 @@ def test_chunk_varlen(
 
     torch.npu.synchronize()
 
-    print_diff("o", ref, actual, 0.01)
-    print_diff("ht", ref_ht, actual_ht, 0.01)
+    print_diff("o", ref, actual, 0.05)
+    print_diff("ht", ref_ht, actual_ht, 0.05)
 
-    assert_close("o", ref, actual, 0.01)
-    assert_close("ht", ref_ht, actual_ht, 0.01)
+    assert_close("o", ref, actual, 0.05)
+    assert_close("ht", ref_ht, actual_ht, 0.05)
diff --git a/tests/python/sgl_kernel_npu/test_triangular_inverse.py b/tests/python/sgl_kernel_npu/test_triangular_inverse.py
index 22aed8daa..28e0c5c66 100644
--- a/tests/python/sgl_kernel_npu/test_triangular_inverse.py
+++ b/tests/python/sgl_kernel_npu/test_triangular_inverse.py
@@ -43,19 +43,46 @@ def np_triu_inv_cs(input_x, dtype: np.dtype = np.float16):
     return output.astype(dtype)
 
 
-def rand_np_tril(batch_size: int, n: int, dtype: np.dtype):
-    "Returns a random unit lower triangular matrix of size n."
-    A = np.random.rand(batch_size, n, n).astype(dtype)
-    A = np.tril(A)
+def np_tril_inv_cube_cs(input_x, dtype: np.dtype = np.float16):
+    """Return the matrix inverse of a 3d tensor whose first dimension is the batch dimension.
+
+    Each batch entry returns U_inv_n for input U_n, i.e., U_n U_inv_n = I_n.
+
+    The algorithm is a matrix formulation of the column-sweep algorithm.
+    """
+    output = np.zeros_like(input_x)
+    batch_dim, n, _ = input_x.shape
+    I_n = np.eye(n, dtype=np.float32)
+
+    for idx in range(batch_dim):
+        U_n = input_x[idx, :, :]
+        U_n = 2 * I_n - U_n
+        U_inv_n = I_n.copy()
+
+        for k in reversed(range(n)):
+            M = I_n.copy()
+            M[:, k] = U_n[:, k]
+            U_inv_n = M.astype(np.float32) @ U_inv_n.astype(np.float32)
+            # FIXPIPE fp32 -> fp16
+            U_inv_n = U_inv_n.astype(dtype)
+
+        output[idx, :, :] = U_inv_n
+
+    return output.astype(dtype)
+
+
+def rand_np_triu(batch_size: int, n: int, dtype: np.dtype):
+    "Returns a random unit upper triangular matrix of size n."
+    A = 0.1 * np.random.rand(batch_size, n, n).astype(dtype)
+    A = np.triu(A)
     for k in range(batch_size):
         np.fill_diagonal(A[k, :, :], 1.0)
 
     return A.astype(dtype)
 
 
-def ones_np_tril(batch_size: int, n: int, dtype: np.dtype):
-    "Returns an all-ones lower triangular matrix of size n."
+def ones_np_triu(batch_size: int, n: int, dtype: np.dtype):
+    "Returns an all-ones upper triangular matrix of size n."
     A = np.ones((batch_size, n, n)).astype(dtype)
-    A = np.tril(A)
+    A = np.triu(A)
     return A.astype(dtype)
 
 
@@ -64,7 +91,7 @@ def ones_np_tril(batch_size: int, n: int, dtype: np.dtype):
 @pytest.mark.parametrize("data_type", [np.float16, np.float32], ids=str)
 @pytest.mark.parametrize(
     "mat_gen",
-    (rand_np_tril, ones_np_tril),
+    (rand_np_triu, ones_np_triu),
 )
 def test_tri_inv_col_sweep(
     batch_size: int,
@@ -92,12 +119,56 @@ def test_tri_inv_col_sweep(
     assert torch.equal(actual, expected)
 
 
+@pytest.mark.parametrize("batch_size", [2, 4, 40, 256])
+@pytest.mark.parametrize("matrix_size", [16, 32, 64, 128])
+@pytest.mark.parametrize("data_type", [np.float16], ids=str)
+@pytest.mark.parametrize(
+    "mat_gen",
+    (rand_np_triu, ones_np_triu),
+)
+def test_tri_inv_cube_col_sweep(
+    batch_size: int,
+    matrix_size: int,
+    data_type: np.dtype,
+    mat_gen: callable,
+):
+
+    input_x_cpu = mat_gen(batch_size, matrix_size, data_type)
+    expected_cpu = np_tril_inv_cube_cs(input_x_cpu, data_type).transpose(0, 2, 1)
+
+    input_x_cpu = input_x_cpu.transpose(0, 2, 1)
+    torch.npu.synchronize()
+    input_x = torch.from_numpy(input_x_cpu).half().npu()
+    torch.npu.synchronize()
+    expected = torch.from_numpy(expected_cpu).half().npu()
+    torch.npu.synchronize()
+    actual = torch.ops.npu.cube_triangular_inverse(input_x).transpose(1, 2)
+    torch.npu.synchronize()
+
+    torch.set_printoptions(
+        threshold=10_000,  # bigger than 16 * 16
+        linewidth=200,  # avoid line wrapping
+        precision=4,  # optional
+        sci_mode=False,  # optional
+    )
+
+    print("Input")
+    print(input_x.cpu())
+    print("Expected")
+    print(expected_cpu)
+    print("Actual")
+    print(actual.cpu())
+
+    assert actual.shape == expected.shape, "Output shape does not match expected shape."
+    assert torch.allclose(actual.float(), expected.float(), atol=0.1, rtol=0.5)
+
+
 @pytest.mark.parametrize("batch_size", [1, 2, 4, 40, 256])
 @pytest.mark.parametrize("matrix_size", [16, 32, 64, 128])
 @pytest.mark.parametrize("data_type", [np.float32], ids=str)
 @pytest.mark.parametrize(
     "mat_gen,atol,rtol",
-    [(rand_np_tril, 1e-5, 1e-5), (ones_np_tril, 0, 0)],
+    [(rand_np_triu, 1e-5, 1e-5), (ones_np_triu, 0, 0)],
 )
 def test_tri_inv_col_sweep_np_linalg_inv(
     batch_size: int,
@@ -107,8 +178,8 @@ def test_tri_inv_col_sweep_np_linalg_inv(
     atol: float,
     rtol: float,
 ):
-
-    input_x_cpu = mat_gen(batch_size, matrix_size, data_type)
+    # the copy forces the input to be contiguous
+    input_x_cpu = mat_gen(batch_size, matrix_size, data_type).transpose(0, 2, 1).copy()
     golden_numpy_cpu = np.linalg.inv(input_x_cpu)
 
     # Convert input matrices from row-major order to column-major order
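As a standalone sanity check of the reference implementation, `np_tril_inv_cube_cs` can be compared against `np.linalg.inv` directly, mirroring the tolerances used above (assuming the helpers are importable from this test module):

```python
import numpy as np

# Assumes this test module is importable as `test_triangular_inverse`.
from test_triangular_inverse import np_tril_inv_cube_cs, rand_np_triu

x = rand_np_triu(2, 32, np.float16)
approx = np_tril_inv_cube_cs(x, np.float16)
exact = np.linalg.inv(x.astype(np.float32))
# The fp16 round after every column-sweep step keeps this comparison loose.
assert np.allclose(approx.astype(np.float32), exact, atol=0.1, rtol=0.5)
```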