diff --git a/backends/metax_gpu/kernels/cuda_kernels/cholesky_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/cholesky_solve_grad_kernel_register.cu index 9b754efe16..c77121724b 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/cholesky_solve_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/cholesky_solve_grad_kernel_register.cu @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/impl/cholesky_solve_grad_kernel_impl.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h" PD_CUSTOM_KERNEL_REGISTER(cholesky_solve_grad, metax_gpu, diff --git a/backends/metax_gpu/kernels/cuda_kernels/dirichlet_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/dirichlet_kernel_register.cu index 6bfa234d11..df11c758db 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/dirichlet_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/dirichlet_kernel_register.cu @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/impl/dirichlet_kernel_impl.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/dirichlet_kernel_impl.h" PD_CUSTOM_KERNEL_REGISTER(dirichlet, metax_gpu, ALL_LAYOUT, - phi::Dirichletkernel, + phi::DirichletKernel, float, double, phi::dtype::float16, diff --git a/backends/metax_gpu/kernels/cuda_kernels/gru_unit_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/gru_unit_grad_kernel_register.cu index 20f82652d8..d6077cdb96 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/gru_unit_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/gru_unit_grad_kernel_register.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/impl/gru_unit_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/gru_unit_kernel_impl.h" PD_CUSTOM_KERNEL_REGISTER(gru_unit_grad, metax_gpu, diff --git a/backends/metax_gpu/kernels/cuda_kernels/gru_unit_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/gru_unit_kernel_register.cu index 87b0526ecb..e4a7482bdf 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/gru_unit_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/gru_unit_kernel_register.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/impl/gru_unit_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/gru_unit_kernel_impl.h" PD_CUSTOM_KERNEL_REGISTER( gru_unit, metax_gpu, ALL_LAYOUT, phi::GRUUnitKernel, float, double) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/inverse_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/inverse_grad_kernel_register.cu index 5e992969e5..e87733a7ed 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/inverse_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/inverse_grad_kernel_register.cu @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/impl/inverse_grad_kernel_impl.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/inverse_grad_kernel_impl.h" #include "paddle/phi/kernels/inverse_grad_kernel.h" PD_CUSTOM_KERNEL_REGISTER(inverse_grad, diff --git a/backends/metax_gpu/kernels/cuda_kernels/lstm_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/lstm_grad_kernel_register.cu index 6bd0844ee3..9c00c70a33 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/lstm_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/lstm_grad_kernel_register.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/impl/lstm_kernel_impl.h" +#include "paddle/phi/kernels/impl/lstm_kernel_impl.h" #include "paddle/phi/kernels/lstm_kernel.h" PD_CUSTOM_KERNEL_REGISTER( diff --git a/backends/metax_gpu/kernels/custom_kernel/addmm_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/addmm_kernel_register.cu index ead21b1eb7..f855de37dd 100644 --- a/backends/metax_gpu/kernels/custom_kernel/addmm_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/addmm_kernel_register.cu @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "../impl/addmm_kernel_impl.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/addmm_kernel.h" +#include "paddle/phi/kernels/impl/addmm_kernel_impl.h" PD_REGISTER_PLUGIN_KERNEL(addmm, metax_gpu, diff --git a/backends/metax_gpu/kernels/custom_kernel/bilinear_grad_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/bilinear_grad_kernel_register.cu index 07980dc2ff..6a3176db06 100644 --- a/backends/metax_gpu/kernels/custom_kernel/bilinear_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/bilinear_grad_kernel_register.cu @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/impl/bilinear_grad_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/bilinear_grad_kernel.h" +#include "paddle/phi/kernels/impl/bilinear_grad_kernel_impl.h" PD_REGISTER_PLUGIN_KERNEL(bilinear_grad, metax_gpu, diff --git a/backends/metax_gpu/kernels/custom_kernel/bilinear_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/bilinear_kernel_register.cu index 8ce3daaa12..90f1e200d6 100644 --- a/backends/metax_gpu/kernels/custom_kernel/bilinear_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/bilinear_kernel_register.cu @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/impl/bilinear_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/bilinear_kernel.h" +#include "paddle/phi/kernels/impl/bilinear_kernel_impl.h" PD_REGISTER_PLUGIN_KERNEL( bilinear, metax_gpu, ALL_LAYOUT, phi::BilinearKernel, float, double) {} diff --git a/backends/metax_gpu/kernels/custom_kernel/bmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/bmm_grad_kernel_register.cu index 4e5ef5e3a0..e796222054 100644 --- a/backends/metax_gpu/kernels/custom_kernel/bmm_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/bmm_grad_kernel_register.cu @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/impl/bmm_grad_kernel_impl.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/bmm_grad_kernel.h" +#include "paddle/phi/kernels/impl/bmm_grad_kernel_impl.h" PD_REGISTER_PLUGIN_KERNEL(bmm_grad, metax_gpu, diff --git a/backends/metax_gpu/kernels/custom_kernel/bmm_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/bmm_kernel_register.cu index b7e6e7312c..22eac8cef7 100644 --- a/backends/metax_gpu/kernels/custom_kernel/bmm_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/bmm_kernel_register.cu @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/impl/bmm_kernel_impl.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/bmm_kernel.h" +#include "paddle/phi/kernels/impl/bmm_kernel_impl.h" PD_REGISTER_PLUGIN_KERNEL(bmm, metax_gpu, diff --git a/backends/metax_gpu/kernels/custom_kernel/cholesky_grad_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/cholesky_grad_kernel_register.cu index 2a29045e08..2f5fee6aa7 100644 --- a/backends/metax_gpu/kernels/custom_kernel/cholesky_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/cholesky_grad_kernel_register.cu @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/impl/cholesky_grad_kernel_impl.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cholesky_grad_kernel.h" +#include "paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h" PD_REGISTER_PLUGIN_KERNEL(cholesky_grad, metax_gpu, diff --git a/backends/metax_gpu/kernels/custom_kernel/conv_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/conv_kernel_register.cu index 112275470a..4f340997fb 100644 --- a/backends/metax_gpu/kernels/custom_kernel/conv_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/conv_kernel_register.cu @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. #if 0 -#include "kernels/impl/conv_kernel_impl.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/conv_kernel.h" +#include "paddle/phi/kernels/impl/conv_kernel_impl.h" namespace phi { diff --git a/backends/metax_gpu/kernels/custom_kernel/flatten2_grad_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/flatten2_grad_kernel_register.cu index 8fe0d25fae..76541d8290 100644 --- a/backends/metax_gpu/kernels/custom_kernel/flatten2_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/flatten2_grad_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. // clang-format off #include "paddle/phi/core/tensor_utils.h" //NOLINT -#include "kernels/impl/flatten2_kernel_impl.h" +#include "paddle/phi/kernels/impl/flatten2_kernel_impl.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" // clang-format on diff --git a/backends/metax_gpu/kernels/custom_kernel/flatten2_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/flatten2_kernel_register.cu index e42e12796a..cc94c28722 100644 --- a/backends/metax_gpu/kernels/custom_kernel/flatten2_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/flatten2_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. // clang-format off #include "paddle/phi/core/tensor_utils.h" //NOLINT -#include "kernels/impl/flatten2_kernel_impl.h" +#include "paddle/phi/kernels/impl/flatten2_kernel_impl.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" // clang-format on diff --git a/backends/metax_gpu/kernels/custom_kernel/lstm_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/lstm_kernel_register.cu index 15cd9b9238..768cb056b6 100644 --- a/backends/metax_gpu/kernels/custom_kernel/lstm_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/lstm_kernel_register.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/impl/lstm_kernel_impl.h" +#include "paddle/phi/kernels/impl/lstm_kernel_impl.h" #include "paddle/phi/kernels/lstm_kernel.h" PD_REGISTER_PLUGIN_KERNEL( diff --git a/backends/metax_gpu/kernels/custom_kernel/lu_grad_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/lu_grad_kernel_register.cu index b3952b9cf9..1db0ab5324 100644 --- a/backends/metax_gpu/kernels/custom_kernel/lu_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/lu_grad_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. // clang-format off #include "paddle/phi/core/tensor_utils.h" //NOLINT -#include "kernels/impl/lu_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/lu_grad_kernel_impl.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/lu_grad_kernel.h" diff --git a/backends/metax_gpu/kernels/custom_kernel/multi_dot_grad_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/multi_dot_grad_kernel_register.cu index b0560d65e6..727172fec3 100644 --- a/backends/metax_gpu/kernels/custom_kernel/multi_dot_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/multi_dot_grad_kernel_register.cu @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "kernels/impl/multi_dot_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" #include "paddle/phi/kernels/multi_dot_grad_kernel.h" PD_REGISTER_PLUGIN_KERNEL(multi_dot_grad, metax_gpu, diff --git a/backends/metax_gpu/kernels/custom_kernel/multi_dot_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/multi_dot_kernel_register.cu index aa3201ad51..f0c5afbd58 100644 --- a/backends/metax_gpu/kernels/custom_kernel/multi_dot_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/multi_dot_kernel_register.cu @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "kernels/impl/multi_dot_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" #include "paddle/phi/kernels/multi_dot_kernel.h" PD_REGISTER_PLUGIN_KERNEL(multi_dot, metax_gpu, diff --git a/backends/metax_gpu/kernels/custom_kernel/mv_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/mv_kernel_register.cu index b70204fd03..41d9e15532 100644 --- a/backends/metax_gpu/kernels/custom_kernel/mv_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/mv_kernel_register.cu @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/impl/mv_kernel_impl.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/mv_kernel_impl.h" #include "paddle/phi/kernels/mv_kernel.h" PD_REGISTER_PLUGIN_KERNEL( diff --git a/backends/metax_gpu/kernels/custom_kernel/standard_gamma_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/standard_gamma_kernel_register.cu index 0a10e2a628..82fb2e4adc 100644 --- a/backends/metax_gpu/kernels/custom_kernel/standard_gamma_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/standard_gamma_kernel_register.cu @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "kernels/impl/standard_gamma_kernel_impl.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/standard_gamma_kernel_impl.h" PD_REGISTER_PLUGIN_KERNEL(standard_gamma, metax_gpu, diff --git a/backends/metax_gpu/kernels/custom_kernel/stft_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/stft_kernel_register.cu index 5376f5f4e9..63c68cf295 100644 --- a/backends/metax_gpu/kernels/custom_kernel/stft_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/stft_kernel_register.cu @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/impl/stft_kernel_impl.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/stft_kernel_impl.h" #include "paddle/phi/kernels/stft_kernel.h" PD_REGISTER_PLUGIN_KERNEL( diff --git a/backends/metax_gpu/kernels/custom_kernel/triangular_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/triangular_solve_grad_kernel_register.cu index 116a453382..b7bb40193a 100644 --- a/backends/metax_gpu/kernels/custom_kernel/triangular_solve_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/custom_kernel/triangular_solve_grad_kernel_register.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/impl/triangular_solve_grad_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h" #ifdef PADDLE_WITH_CUDA PD_REGISTER_PLUGIN_KERNEL(triangular_solve_grad, diff --git a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h deleted file mode 100644 index 9de0b3aadd..0000000000 --- a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h +++ /dev/null @@ -1,131 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -// clang-format off -#include -#include "glog/logging.h" - -#include "paddle/phi/kernels/addmm_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" -// clang-format on -namespace phi { - -template -using PhiEigenTensor = EigenTensor; - -using Array1 = Eigen::DSizes; -using Array2 = Eigen::DSizes; - -template -void AddmmKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& x, - const DenseTensor& y, - float beta, - float alpha, - DenseTensor* out) { - auto input_dims = input.dims(); - auto x_dims = x.dims(); - auto y_dims = y.dims(); - - DenseTensor input_2d(input); - if (input.dims().size() == 1) { - input_dims = {1, input.dims()[0]}; - input_2d.Resize(input_dims); - } - - // broadcast mode check - if (x_dims[0] != input_dims[0]) { - PADDLE_ENFORCE_EQ(input_dims[0], - 1, - errors::InvalidArgument( - "When x_dims[0] is not equal with input_dims[0], " - "input_dims[0] must be 1 but got %s", - input_dims[0])); - PADDLE_ENFORCE_EQ(y_dims[1] == input_dims[1] || input_dims[1] == 1, - true, - errors::InvalidArgument( - "The input tensor shape mismatch, input shape=[%s], " - "x shape=[%s], y shape=[%s]", - input_dims, - x_dims, - y_dims)); - } - // broadcast mode check - if (y_dims[1] != input_dims[1]) { - PADDLE_ENFORCE_EQ(input_dims[1], - 1, - errors::InvalidArgument( - "When y_dims[1] is not equal with input_dims[0], " - "input_dims[0] must be 1 but got %s", - input_dims[1])); - PADDLE_ENFORCE_EQ(x_dims[0] == input_dims[0] || input_dims[0] == 1, - true, - errors::InvalidArgument( - "The input tensor shape mismatch, input shape=[%s], " - "x shape=[%s], y shape=[%s]", - input_dims, - x_dims, - y_dims)); - } - // broadcast mode check - PADDLE_ENFORCE_EQ( - x_dims[1], - y_dims[0], - errors::InvalidArgument( - "The input tensor X's width must be equal with matrix Y' height. " - "But received X's shape = [%s], Y's shape = [%s].", - x_dims[1], - y_dims[0])); - - dev_ctx.template Alloc(out); - if (out->numel() == 0) return; - auto blas = funcs::GetBlas(dev_ctx); - - // calc broadcast dim - Array2 bcast_dims; - bcast_dims[0] = x_dims[0] / input_dims[0]; - bcast_dims[1] = y_dims[1] / input_dims[1]; - VLOG(3) << "bcast_dims=[" << bcast_dims[0] << "," << bcast_dims[1] << "]"; - // broadcast using eigen - const DenseTensor& const_ref_input = input_2d; - auto eigen_input = PhiEigenTensor::From(const_ref_input); - auto eigen_out = PhiEigenTensor::From(*out); - auto& place = *dev_ctx.eigen_device(); - funcs::EigenBroadcast, T, 2>::Eval( - place, eigen_out, eigen_input, bcast_dims); - - T t_alpha = static_cast(alpha); - T t_beta = static_cast(beta); - blas.GEMM(false, - false, - x_dims[0], - y_dims[1], - x_dims[1], - t_alpha, - x.data(), - x_dims[1], - y.data(), - y_dims[1], - t_beta, - out->data(), - y_dims[1]); -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/baddbmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/baddbmm_kernel_impl.h deleted file mode 100644 index 1c52ea22e4..0000000000 --- a/backends/metax_gpu/kernels/impl/baddbmm_kernel_impl.h +++ /dev/null @@ -1,220 +0,0 @@ -/* Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "glog/logging.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/kernels/baddbmm_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - -namespace phi { - -template -using PhiEigenTensor = EigenTensor; - -using Array1 = Eigen::DSizes; -using Array2 = Eigen::DSizes; -using Array3 = Eigen::DSizes; - -template -void BaddbmmKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& x, - const DenseTensor& y, - float beta, - float alpha, - DenseTensor* out) { - auto input_dims = input.dims(); - auto x_dims = x.dims(); - auto y_dims = y.dims(); - - DenseTensor input_3d(input); - if (input.dims().size() == 2) { - input_dims = {input.dims()[0], 1, input.dims()[1]}; - input_3d.Resize(input_dims); - } - - // broadcast mode check - if (x_dims[0] != input_dims[0]) { - PADDLE_ENFORCE_EQ(input_dims[0], - 1, - errors::InvalidArgument( - "When x_dims[0] is not equal with input_dims[0], " - "input_dims[0] must be 1 but got %s", - input_dims[0])); - PADDLE_ENFORCE_EQ( - (x_dims[1] == input_dims[1] || input_dims[1] == 1) && - (y_dims[2] == input_dims[2] || input_dims[2] == 1), - true, - errors::InvalidArgument( - "When x_dims[0] is not equal with input_dims[0], " - "x_dims[1] and y_dims[2] must be equal with input_dims[1] and " - "input_dims[2] respectively, or input_dims[1] and input_dims[2] " - "must be 1. But got x_dims[1] = %s, input_dims[1] = %s, y_dims[2] " - "= %s, input_dims[2] = %s", - x_dims[1], - input_dims[1], - y_dims[2], - input_dims[2])); - } - - if (x_dims[1] != input_dims[1]) { - PADDLE_ENFORCE_EQ(input_dims[1], - 1, - errors::InvalidArgument( - "When x_dims[1] is not equal with input_dims[1], " - "input_dims[1] must be 1 but got %s", - input_dims[1])); - PADDLE_ENFORCE_EQ( - (x_dims[0] == input_dims[0] || input_dims[0] == 1) && - (y_dims[2] == input_dims[2] || input_dims[2] == 1), - true, - errors::InvalidArgument( - "When x_dims[1] is not equal with input_dims[1], " - "x_dims[0] and y_dims[2] must be equal with input_dims[0] and " - "input_dims[2] respectively, or input_dims[0] and input_dims[2] " - "must be 1. But got x_dims[0] = %s, input_dims[0] = %s, y_dims[2] " - "= %s, input_dims[2] = %s", - x_dims[0], - input_dims[0], - y_dims[2], - input_dims[2])); - } - - if (y_dims[2] != input_dims[2]) { - PADDLE_ENFORCE_EQ(input_dims[2], - 1, - errors::InvalidArgument( - "When y_dims[2] is not equal with input_dims[2], " - "input_dims[2] must be 1 but got %s", - input_dims[2])); - PADDLE_ENFORCE_EQ( - (x_dims[0] == input_dims[0] || input_dims[0] == 1) && - (x_dims[1] == input_dims[1] || input_dims[1] == 1), - true, - errors::InvalidArgument( - "When y_dims[2] is not equal with input_dims[2], " - "x_dims[0] and x_dims[1] must be equal with input_dims[0] and " - "input_dims[1] respectively, or input_dims[0] and input_dims[1] " - "must be 1. But got x_dims[0] = %s, input_dims[0] = %s, x_dims[1] " - "= %s, input_dims[1] = %s", - x_dims[0], - input_dims[0], - x_dims[1], - input_dims[1])); - } - PADDLE_ENFORCE_EQ( - x_dims[2], - y_dims[1], - errors::InvalidArgument( - "The input tensor X's width must be equal with matrix Y' height. " - "But received X's shape = [%s], Y's shape = [%s].", - x_dims[2], - y_dims[1])); - - dev_ctx.template Alloc(out); - auto blas = funcs::GetBlas(dev_ctx); - - // calc broadcast dim - Array3 bcast_dims; - bcast_dims[0] = x_dims[0] / input_dims[0]; - bcast_dims[1] = x_dims[1] / input_dims[1]; - bcast_dims[2] = y_dims[2] / input_dims[2]; - VLOG(3) << "bcast_dims=[" << bcast_dims[0] << "," << bcast_dims[1] << "," - << bcast_dims[2] << "]"; - - // broadcast using eigen - const DenseTensor& const_ref_input = input_3d; - auto eigen_input = PhiEigenTensor::From(const_ref_input); - auto eigen_out = PhiEigenTensor::From(*out); - auto& place = *dev_ctx.eigen_device(); - funcs::EigenBroadcast, T, 3>::Eval( - place, eigen_out, eigen_input, bcast_dims); - - using MPType = typename phi::dtype::MPTypeTrait::Type; - - // special case for MPType - if constexpr (std::is_same_v) { - VLOG(4) << "Function: baddbmm, Type of T: " << typeid(T).name(); - VLOG(4) << "Function: baddbmm, Type of MPType: " << typeid(MPType).name(); - float t_alpha = alpha; - float t_beta = beta; - if (x_dims[0] == 1) { - blas.GEMM(CblasNoTrans, - CblasNoTrans, - x_dims[1], - y_dims[2], - x_dims[2], - t_alpha, - x.data(), - y.data(), - t_beta, - out->data()); - } else { - blas.BatchedGEMM(CblasNoTrans, - CblasNoTrans, - x_dims[1], - y_dims[2], - x_dims[2], - t_alpha, - x.data(), - y.data(), - t_beta, - out->data(), - x_dims[0], - x_dims[1] * x_dims[2], - x_dims[2] * y_dims[2]); - } - } else { - T t_alpha = static_cast(alpha); - T t_beta = static_cast(beta); - if (x_dims[0] == 1) { - blas.GEMM(CblasNoTrans, - CblasNoTrans, - x_dims[1], - y_dims[2], - x_dims[2], - t_alpha, - x.data(), - y.data(), - t_beta, - out->data()); - } else { - blas.BatchedGEMM(CblasNoTrans, - CblasNoTrans, - x_dims[1], - y_dims[2], - x_dims[2], - t_alpha, - x.data(), - y.data(), - t_beta, - out->data(), - x_dims[0], - x_dims[1] * x_dims[2], - x_dims[2] * y_dims[2]); - // x_dims[2] == y_dims[1] - } - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/bilinear_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/bilinear_grad_kernel_impl.h deleted file mode 100644 index b64f94bc7e..0000000000 --- a/backends/metax_gpu/kernels/impl/bilinear_grad_kernel_impl.h +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" - -namespace phi { - -template -void BilinearGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& weight, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - DenseTensor* dweight, - DenseTensor* dbias) { - auto batch_size = x.dims()[0]; - auto weight_dims = weight.dims(); - int out_dim = weight_dims[0]; - auto x_dim = weight_dims[1]; - auto y_dim = weight_dims[2]; - - auto x_mat = EigenMatrix::From(x); - auto y_mat = EigenMatrix::From(y); - auto dout_mat = EigenMatrix::From(dout); - auto& place = *dev_ctx.eigen_device(); - // Create the intermediate variable to calculate the Output(Y@Grad). - DenseTensor x_scale; - x_scale.Resize(common::make_ddim({batch_size, x_dim})); - dev_ctx.template Alloc(&x_scale); - auto x_scale_mat = EigenMatrix::From(x_scale); - - // Create the intermediate variable to calculate the Output(X@Grad). - DenseTensor y_scale; - y_scale.Resize(common::make_ddim({batch_size, y_dim})); - dev_ctx.template Alloc(&y_scale); - auto y_scale_mat = EigenMatrix::From(y_scale); - - funcs::SetConstant set_zero; - - if (dx) { - dev_ctx.template Alloc(dx); - set_zero(dev_ctx, dx, static_cast(0)); - } - - if (dy) { - dev_ctx.template Alloc(dy); - set_zero(dev_ctx, dy, static_cast(0)); - } - - if (dweight) { - dev_ctx.template Alloc(dweight); - } - - auto blas = funcs::GetBlas(dev_ctx); - - // Calculate the Output(X@Grad) and Output(Y@Grad). - if (dx || dy || dweight) { - Eigen::DSizes bcast_for_x(1, y_dim); - Eigen::DSizes bcast_for_y(1, x_dim); - Eigen::DSizes bcast_for_weight(1, x_dim); - - for (int i = 0; i < out_dim; ++i) { - DenseTensor weight_i = - weight.Slice(i, i + 1).Resize(common::make_ddim({x_dim, y_dim})); - auto output_vec = dout_mat.chip(i, 1); - - if (dx) { - y_scale_mat.device(place) = - output_vec.reshape(Eigen::DSizes(batch_size, 1)) - .broadcast(bcast_for_x) * - y_mat; - blas.GEMM(CblasNoTrans, - CblasTrans, - batch_size, - x_dim, - y_dim, - 1, - y_scale.data(), - weight_i.data(), - 1, - dx->data()); - } - - if (dy || dweight) { - auto output_vec_y = - output_vec.reshape(Eigen::DSizes(batch_size, 1)) - .broadcast(bcast_for_y); - x_scale_mat.device(place) = output_vec_y * x_mat; - if (dy) { - blas.GEMM(CblasNoTrans, - CblasNoTrans, - batch_size, - y_dim, - x_dim, - 1, - x_scale.data(), - weight_i.data(), - 1, - dy->data()); - } - if (dweight) { - DenseTensor dweight_i = dweight->Slice(i, i + 1).Resize( - common::make_ddim({x_dim, y_dim})); - blas.GEMM(CblasTrans, - CblasNoTrans, - x_dim, - y_dim, - batch_size, - 1, - x_scale.data(), - y.data(), - 0, - dweight_i.data()); - } - } - } - } - - // calculate the gradient of Input(Bias). - if (dbias) { - dev_ctx.template Alloc(dbias); - auto dbias_mat = EigenVector::Flatten(*dbias); - dbias_mat.device(place) = dout_mat.sum(Eigen::DSizes(0)); - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/bilinear_kernel_impl.h b/backends/metax_gpu/kernels/impl/bilinear_kernel_impl.h deleted file mode 100644 index 48861d4893..0000000000 --- a/backends/metax_gpu/kernels/impl/bilinear_kernel_impl.h +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/utils/optional.h" - -namespace phi { - -template -void BilinearKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& weight, - const paddle::optional& bias, - DenseTensor* out) { - dev_ctx.template Alloc(out); - - auto y_mat = EigenMatrix::From(y); - auto output_mat = EigenMatrix::From(*out); - - auto batch_size = x.dims()[0]; - auto weight_dims = weight.dims(); - int out_dim = weight_dims[0]; - auto x_dim = weight_dims[1]; - auto y_dim = weight_dims[2]; - auto& place = *dev_ctx.eigen_device(); - - // Create the intermediate variable to calculate the result of - // Input(X) multiplied by Input(Weight_i), the formula is: - // left_mul = X Weight_i. - DenseTensor left_mul; - left_mul.Resize(common::make_ddim({batch_size, y_dim})); - dev_ctx.template Alloc(&left_mul); - auto left_mul_mat = EigenMatrix::From(left_mul); - - for (int i = 0; i < out_dim; ++i) { - auto output_col_vec = output_mat.chip(i, 1); - DenseTensor weight_mat = - weight.Slice(i, i + 1).Resize(common::make_ddim({x_dim, y_dim})); - phi::funcs::GetBlas(dev_ctx).GEMM(CblasNoTrans, - CblasNoTrans, - batch_size, - y_dim, - x_dim, - 1, - x.data(), - weight_mat.data(), - 0, - left_mul.data()); - output_col_vec.device(place) = - (left_mul_mat * y_mat).sum(Eigen::DSizes(1)); - } - if (bias.get_ptr()) { - auto bias_vec = EigenMatrix::From(*(bias.get_ptr())); - Eigen::DSizes bcast(batch_size, 1); - output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat; - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/bmm_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/bmm_grad_kernel_impl.h deleted file mode 100644 index cd5978ae59..0000000000 --- a/backends/metax_gpu/kernels/impl/bmm_grad_kernel_impl.h +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/kernels/bmm_grad_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" - -namespace phi { - -template -void MatMul(const Context& dev_ctx, - const DenseTensor& a, - bool trans_a, - const DenseTensor& b, - bool trans_b, - DenseTensor* out) { - dev_ctx.template Alloc(out); - auto blas = phi::funcs::GetBlas(dev_ctx); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b.dims(), 0, trans_b); - - blas.MatMul(a, mat_dim_a, b, mat_dim_b, T(1), out, T(0)); -} - -template -void CalcInputGrad(const Context& dev_ctx, - const DenseTensor& a, - bool trans_a, - const DenseTensor& b, - bool trans_b, - DenseTensor* out) { - if (out == nullptr) return; - MatMul(dev_ctx, a, trans_a, b, trans_b, out); -} - -template -void BmmGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out_grad, - DenseTensor* x_grad, - DenseTensor* y_grad) { - if (x_grad && x_grad->numel() == 0) { - dev_ctx.template Alloc(x_grad); - phi::Full( - dev_ctx, phi::IntArray(common::vectorize(y.dims())), 0, y_grad); - return; - } - if (y_grad && y_grad->numel() == 0) { - dev_ctx.template Alloc(y_grad); - phi::Full( - dev_ctx, phi::IntArray(common::vectorize(x.dims())), 0, x_grad); - return; - } - DenseTensor x_help = x; - DenseTensor y_help = y; - DenseTensor out_grad_help = out_grad; - ReshapeXYOutIntoMatrixSequence( - &x_help, &y_help, &out_grad_help, false, false); - - phi::DDim dx_dims; - if (x_grad) { - dx_dims = x_grad->dims(); - if (dx_dims != x_help.dims()) { - x_grad->Resize(x_help.dims()); - } - } - - phi::DDim dy_dims; - if (y_grad) { - dy_dims = y_grad->dims(); - if (dy_dims != y_help.dims()) { - y_grad->Resize(y_help.dims()); - } - } - - CalcInputGrad( - dev_ctx, out_grad_help, false, y_help, true, x_grad); - CalcInputGrad( - dev_ctx, x_help, true, out_grad_help, false, y_grad); - - if (x_grad) { - if (dx_dims != x_help.dims()) { - x_grad->Resize(dx_dims); - } - } - if (y_grad) { - if (dy_dims != y_help.dims()) { - y_grad->Resize(dy_dims); - } - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/bmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/bmm_kernel_impl.h deleted file mode 100644 index ce493b4908..0000000000 --- a/backends/metax_gpu/kernels/impl/bmm_kernel_impl.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/kernels/bmm_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace phi { - -template -void BmmKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - dev_ctx.template Alloc(out); - - if (x.numel() == 0 || y.numel() == 0) { - return; - } - - auto blas = phi::funcs::GetBlas(dev_ctx); - - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(x.dims(), 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(y.dims(), 0, false); - - blas.MatMul(x, mat_dim_a, y, mat_dim_b, T(1), out, T(0)); -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/cholesky_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/cholesky_grad_kernel_impl.h deleted file mode 100644 index 5d146dae8d..0000000000 --- a/backends/metax_gpu/kernels/impl/cholesky_grad_kernel_impl.h +++ /dev/null @@ -1,339 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/kernels/cholesky_grad_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/for_range.h" - -namespace phi { - -template -inline void TransCompute(const int dim, - const Context& dev_ctx, - const DenseTensor& in, - DenseTensor* out, - const std::vector& axis) { - switch (dim) { - case 1: - funcs::Transpose trans1; - trans1(dev_ctx, in, out, axis); - break; - case 2: - funcs::Transpose trans2; - trans2(dev_ctx, in, out, axis); - break; - case 3: - funcs::Transpose trans3; - trans3(dev_ctx, in, out, axis); - break; - case 4: - funcs::Transpose trans4; - trans4(dev_ctx, in, out, axis); - break; - case 5: - funcs::Transpose trans5; - trans5(dev_ctx, in, out, axis); - break; - case 6: - funcs::Transpose trans6; - trans6(dev_ctx, in, out, axis); - break; - default: - // for dim >= 7 situation - funcs::TransposeNormal trans_normal; - trans_normal(dev_ctx, in, out, axis); - } -} - -/*! Use these functors to implement tril, triu, diagonal and other operators */ -template -struct EyeFunctor { - EyeFunctor(const int m, const int n, T* output) - : m_(m), n_(n), output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int global_row = index / n_; - const int col = index - global_row * n_; - const int batch = global_row / m_; - const int row = global_row - batch * m_; - output_[index] = col == row ? static_cast(1) : static_cast(0); - } - - const int m_, n_; - T* output_; -}; - -template -struct MatrixSetDiagFunctor { - /*! Overwrite specified diagonals of output by the values in diagonal. - * diagonals can be a central band specified by num_diags and - * upper_diag_index, where upper_diag_index=0 refers to the main diagonal, - * positive value means superdiagonal and negative value means subdiagonal. - * When it is a band, `diag` has a shape [i, j, ..., num_diags, max_diag_len] - * and the num_diags diagonals has a up to down layout. Otherwise it has a - * shape [i, j, ..., max_diag_len]. - */ - MatrixSetDiagFunctor(const int m, - const int n, - const int num_diags, - const int max_diag_len, - const int upper_diag_index, - const T* diag, - T* output) - : m_(m), - n_(n), - num_diags_(num_diags), - max_diag_len_(max_diag_len), - upper_diag_index_(upper_diag_index), - diag_(diag), - output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int batch_and_diag_index = index / max_diag_len_; - const int index_in_the_diagonal = - index - batch_and_diag_index * max_diag_len_; - const int batch = batch_and_diag_index / num_diags_; - const int diag_index_in_input = batch_and_diag_index - batch * num_diags_; - // diag_index=0 refers to the main diagonal - const int diag_index = upper_diag_index_ - diag_index_in_input; - // shift down for subdiagonal if diag_index < 0 - const int y_index = - index_in_the_diagonal + (0 > -diag_index ? 0 : -diag_index); - // shift right for superdiagonal if diag_index > 0 - const int x_index = - index_in_the_diagonal + (0 > diag_index ? 0 : diag_index); - - // Upper-bound checks for diagonals shorter than max_diag_len. - // y_index and x_index are nonnegative by construction. - if (y_index < m_ && x_index < n_) { - const int out_index = batch * m_ * n_ + y_index * n_ + x_index; - output_[out_index] = diag_[index]; - } - } - - const int m_, n_, num_diags_, max_diag_len_, upper_diag_index_; - const T* diag_; - T* output_; -}; - -template -struct MatrixDiagPartFunctor { - /*! Similar to MatrixSetDiagFunctor but return the diagonals. diag_index=0 - * refers to the main diagonal, positive value means superdiagonal and - * negative value means subdiagonal */ - MatrixDiagPartFunctor(const int m, - const int n, - const int num_diags, - const int max_diag_len, - const int upper_diag_index, - const T padding, - const T* input, - T* output) - : m_(m), - n_(n), - num_diags_(num_diags), - max_diag_len_(max_diag_len), - upper_diag_index_(upper_diag_index), - input_(input), - output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int batch_and_mapped_diag_index = index / max_diag_len_; - const int index_in_the_diagonal = - index - batch_and_mapped_diag_index * max_diag_len_; - const int batch = batch_and_mapped_diag_index / num_diags_; - const int mapped_diag_index = - batch_and_mapped_diag_index - batch * num_diags_; - // diag_index=0 refers to the main diagonal - const int diag_index = upper_diag_index_ - mapped_diag_index; - // shift down for subdiagonal if diag_index < 0 - const int y_index = - index_in_the_diagonal + (0 > -diag_index ? 0 : -diag_index); - // shift right for superdiagonal if diag_index > 0 - const int x_index = - index_in_the_diagonal + (0 > diag_index ? 0 : diag_index); - if (y_index < m_ && x_index < n_) { - output_[index] = input_[batch * m_ * n_ + y_index * m_ + x_index]; - } else { - output_[index] = padding_; - } - } - - const int m_, n_, num_diags_, max_diag_len_, upper_diag_index_; - const T padding_; - const T* input_; - T* output_; -}; - -template -struct MatrixBandPartScaleEndFunctor { - /*! Compared with MatrixBandPartFunctor, it scale up values at the end of - * band. It can be used to fuse the following operations, which actually - * output triangular with diagonal scaled up: - * 1. dig = matrix_diag_part(middle) - * 2. middle = matrix_set_diag(middle, diag * scalar) - * 3. middle = matrix_band_part(middle, -1, 0) - */ - MatrixBandPartScaleEndFunctor(const int m, - const int n, - const int num_lower_diags, - const int num_upper_diags, - const T scale, - const T* input, - T* output) - : m_(m), - n_(n), - num_lower_diags_(num_lower_diags), - num_upper_diags_(num_upper_diags), - scale_(scale), - input_(input), - output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int col = index % n_; - const int row = (index / n_) % m_; - const int band_start = (num_lower_diags_ < 0 ? 0 : row - num_lower_diags_); - const int band_end = - (num_upper_diags_ < 0 ? n_ : row + num_upper_diags_ + 1); - if (col < band_start || col >= band_end) { - output_[index] = 0; - } else if (col == band_end - 1) { - output_[index] = scale_ * input_[index]; - } else { - output_[index] = input_[index]; - } - } - - const int m_, n_, num_lower_diags_, num_upper_diags_; - const T scale_; - const T* input_; - T* output_; -}; - -template -struct AddtoScaleFunctor { - AddtoScaleFunctor(const T scale, const T* input, T* output) - : scale_(scale), input_(input), output_(output) {} - HOSTDEVICE void operator()(size_t index) const { - output_[index] += input_[index]; - output_[index] *= scale_; - } - const T scale_; - const T* input_; - T* output_; -}; - -template -void CholeskyGradKernel(const Context& dev_ctx, - const DenseTensor& out, - const DenseTensor& out_grad, - bool upper, - DenseTensor* x_grad) { - if (x_grad->numel() == 0) { - dev_ctx.template Alloc(x_grad); - return; - } - - auto* x_grad_data = dev_ctx.template Alloc(x_grad); - auto& dims = out.dims(); - int batch_count = 1; - for (int i = 0; i < dims.size() - 2; i++) { - batch_count *= dims[i]; - } - auto m = dims[dims.size() - 1]; - int tensor_size = batch_count * m * m; - - std::vector axis(dims.size() - 2); - std::iota(axis.begin(), axis.end(), 0); - axis.insert(axis.end(), {dims.size() - 1, dims.size() - 2}); - DenseTensor l, l_grad; - if (upper) { - l.Resize(dims); - dev_ctx.template Alloc(&l); - l_grad.Resize(dims); - dev_ctx.template Alloc(&l_grad); - TransCompute(dims.size(), dev_ctx, out, &l, axis); - TransCompute(dims.size(), dev_ctx, out_grad, &l_grad, axis); - } else { - l = out; - l_grad = out_grad; - } - auto* l_data = l.data(); - - /*! refer to Iain Murray (2016); arXiv 1602.07527 */ - /*! phi = matmul(L.transpose(-1, -2), grad) */ - DenseTensor middle; - middle.Resize(dims); - auto* middle_data = dev_ctx.template Alloc(&middle); - auto trans_desc = funcs::CreateMatrixDescriptor(dims, 0, true); - auto no_trans_desc = funcs::CreateMatrixDescriptor(dims, 0, false); - auto blas = funcs::GetBlas(dev_ctx); - blas.MatMul(l, trans_desc, l_grad, no_trans_desc, T(1), &middle, T(0)); - - /*! phi.tril_().diagonal(0, -2, -1).mul_(0.5) */ - phi::funcs::ForRange for_range(dev_ctx, tensor_size); - MatrixBandPartScaleEndFunctor matrix_band_part_scale_end_functor( - m, - m, - /* num_lower_diags */ m, - /* num_upper_diags */ 0, - /* scale */ 0.5, - middle_data, - middle_data); - for_range(matrix_band_part_scale_end_functor); - - // Compute inverse by solving the triangular linear system AX = B, where B - // is the identity matrix. The matrix X would be overwritten on B - DenseTensor identity; - identity.Resize(dims); - auto* identity_data = dev_ctx.template Alloc(&identity); - EyeFunctor eye_functor(m, m, identity_data); - for_range(eye_functor); - // TODO(guosheng): use trsmBatched for GPU - for (int i = 0; i < batch_count; i++) { - blas.TRSM(/*side*/ CblasLeft, - /*uplo*/ CblasLower, - /*trans*/ CblasNoTrans, - /*diag*/ CblasNonUnit, - /*m*/ m, - /*n*/ m, - /*alpha*/ T(1), - l_data + i * m * m, - /*lda*/ m, - identity_data + i * m * m, - /*ldb*/ m); - } - DenseTensor& l_inverse = identity; - - /*! x_grad = matmul(matmul(L_inverse.transpose(-1, -2), phi), L_inverse) */ - DenseTensor middle1; - middle1.Resize(dims); - dev_ctx.template Alloc(&middle1); - blas.MatMul( - l_inverse, trans_desc, middle, no_trans_desc, T(1), &middle1, T(0)); - blas.MatMul( - middle1, no_trans_desc, l_inverse, no_trans_desc, T(1), x_grad, T(0)); - - /*! x_grad.add(x_grad.transpose(-1, -2)).mul_(0.5) */ - DenseTensor x_grad_trans; - x_grad_trans.Resize(dims); - auto* x_grad_trans_data = dev_ctx.template Alloc(&x_grad_trans); - TransCompute(dims.size(), dev_ctx, *x_grad, &x_grad_trans, axis); - AddtoScaleFunctor addto_scale_functor(0.5, x_grad_trans_data, x_grad_data); - for_range(addto_scale_functor); -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/cholesky_solve_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/cholesky_solve_grad_kernel_impl.h deleted file mode 100644 index 098092767c..0000000000 --- a/backends/metax_gpu/kernels/impl/cholesky_solve_grad_kernel_impl.h +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/kernels/cholesky_solve_grad_kernel.h" -#include "paddle/phi/kernels/cholesky_solve_kernel.h" -#include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/elementwise_add_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/expand_kernel.h" -#include "paddle/phi/kernels/full_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/common_shape.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/funcs/for_range.h" -#include "paddle/phi/kernels/funcs/matrix_reduce.h" -#include "paddle/phi/kernels/funcs/tril_triu_compute.h" -#include "paddle/phi/kernels/transpose_kernel.h" -namespace phi { - -template -void CholeskySolveGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - bool upper, - DenseTensor* dx, - DenseTensor* dy) { - if (dout.numel() == 0) { - if (dx) { - dev_ctx.template Alloc(dx); - if (dx->numel() != 0) { - phi::Full( - dev_ctx, phi::IntArray(common::vectorize(dx->dims())), 0, dx); - } - } - if (dy) { - dev_ctx.template Alloc(dy); - if (dy->numel() != 0) { - phi::Full( - dev_ctx, phi::IntArray(common::vectorize(dy->dims())), 0, dy); - } - } - return; - } - // get broadcast dim - std::vector x_bst_dims_vec; - std::vector y_bst_dims_vec; - std::tie(x_bst_dims_vec, y_bst_dims_vec) = - funcs::MatrixGetBroadcastDims(x, y); - IntArray x_bst_dims(x_bst_dims_vec); - IntArray y_bst_dims(y_bst_dims_vec); - - // Tensor broadcast to temp 'y_bst' - DenseTensor y_bst = phi::Empty(dev_ctx, y_bst_dims); - ExpandKernel(dev_ctx, y, y_bst_dims, &y_bst); - - // reuse forward to calculate dx_bst, which is broad_cast of dx - DenseTensor dx_bst = phi::Empty(dev_ctx, x_bst_dims); - CholeskySolveKernel(dev_ctx, dout, y_bst, upper, &dx_bst); - - // get 'dx' according to 'dx_bst' - dx->Resize(x.dims()); - dev_ctx.template Alloc(dx); - if (dx_bst.dims() == x.dims()) { - Copy(dev_ctx, dx_bst, dev_ctx.GetPlace(), false, dx); - } else { - funcs::MatrixReduceSumFunctor functor; - functor(dev_ctx, dx_bst, dx); - dx->Resize(x.dims()); - } - - // calculate out's conjugate for complex - DenseTensor out_conj = Conj(dev_ctx, out); - out_conj = phi::TransposeLast2Dim(dev_ctx, out_conj); - - DenseTensor commonterm = phi::Empty(dev_ctx, y_bst_dims); - auto blas = phi::funcs::GetBlas(dev_ctx); - blas.MatMul(dx_bst, - phi::funcs::CreateMatrixDescriptor(dx_bst.dims(), 0, false), - out_conj, - phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, false), - static_cast(1), - &commonterm, - static_cast(0)); - - // calculate commonterm's conjugate for complex - DenseTensor commonterm_conj = Conj(dev_ctx, commonterm); - commonterm_conj = phi::TransposeLast2Dim(dev_ctx, commonterm_conj); - - phi::AddKernel(dev_ctx, commonterm, commonterm_conj, &commonterm); - - DenseTensor dy_bst = phi::Empty(dev_ctx, y_bst_dims); - if (upper) { - blas.MatMul(y_bst, - phi::funcs::CreateMatrixDescriptor(y_bst.dims(), 0, false), - commonterm, - phi::funcs::CreateMatrixDescriptor(commonterm.dims(), 0, false), - static_cast(-1), - &dy_bst, - static_cast(0)); - } else { - blas.MatMul(commonterm, - phi::funcs::CreateMatrixDescriptor(commonterm.dims(), 0, false), - y_bst, - phi::funcs::CreateMatrixDescriptor(y_bst.dims(), 0, false), - static_cast(-1), - &dy_bst, - static_cast(0)); - } - - // get upper or lower of 'dy_bst' - DenseTensor dy_bst_upper = phi::Empty(dev_ctx, y_bst_dims); - - int y_bst_ndim = y_bst_dims_vec.size(); - const auto H = y_bst_dims_vec[y_bst_ndim - 2]; - const auto W = y_bst_dims_vec[y_bst_ndim - 1]; - phi::funcs::ForRange y_for_range(dev_ctx, dy_bst.numel()); - phi::funcs::TrilTriuCompute tril_triu_functor( - dy_bst.data(), 0, !upper, H, W, dy_bst_upper.data()); - y_for_range(tril_triu_functor); - - // get 'dy' according to 'dy_bst' - dy->Resize(y.dims()); - dev_ctx.template Alloc(dy); - if (dy_bst_upper.dims() == y.dims()) { - Copy(dev_ctx, dy_bst_upper, dev_ctx.GetPlace(), false, dy); - } else { - funcs::MatrixReduceSumFunctor functor; - functor(dev_ctx, dy_bst_upper, dy); - dy->Resize(y.dims()); - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/conv_cudnn_impl.h b/backends/metax_gpu/kernels/impl/conv_cudnn_impl.h deleted file mode 100644 index 920881af72..0000000000 --- a/backends/metax_gpu/kernels/impl/conv_cudnn_impl.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/kernel_registry.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h" -#else -#include "gpudnn/conv_cudnn_v7.h" -#endif - -#include "paddle/phi/backends/dynload/cudnn.h" -#include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/kernels/cpu/conv_util.h" -#include "paddle/phi/kernels/funcs/batch_norm_utils.h" -#include "paddle/phi/kernels/funcs/padding.h" - -COMMON_DECLARE_bool(cudnn_deterministic); -PD_DECLARE_int64(conv_workspace_size_limit); -COMMON_DECLARE_bool(cudnn_exhaustive_search); - -namespace phi { - -static inline bool IsVoltaOrLater(const phi::GPUContext& dev_ctx) { - return dev_ctx.GetComputeCapability() >= 70; -} - -// inline cudnnTensorFormat_t GetCudnnTensorFormat( -// const phi::DataLayout& order) { // Not use -// switch (order) { -// case phi::DataLayout::kNHWC: -// return CUDNN_TENSOR_NHWC; -// case phi::DataLayout::kNCHW: -// return CUDNN_TENSOR_NCHW; -// case phi::DataLayout::NCDHW: -// return CUDNN_TENSOR_NCHW; // NOTE: cudnn treat NdTensor as the same -// case phi::DataLayout::NDHWC: -// return CUDNN_TENSOR_NHWC; // add, liyamei -// default: -// PADDLE_THROW(common::errors::Unimplemented( -// "CUDNN has no equivalent dataLayout for input order.")); -// } -// return CUDNN_TENSOR_NCHW; -// } - -// static inline void GetNCDHW(const DDim& dims, -// const phi::DataLayout& layout, -// int* N, -// int* C, -// int* D, -// int* H, -// int* W) { -// *N = dims[0]; -// *C = layout == phi::DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; -// int i = layout == phi::DataLayout::kNCHW ? 0 : 1; -// if (dims.size() == 5) { -// *D = dims[2 - i]; -// *H = dims[3 - i]; -// *W = dims[4 - i]; -// } else { -// *D = 1; -// *H = dims[2 - i]; -// *W = dims[3 - i]; -// } -// } - -} // namespace phi - -// PD_REGISTER_KERNEL(convdnn, GPU, ALL_LAYOUT, phi::ConvKernel, float, double -// ) {} diff --git a/backends/metax_gpu/kernels/impl/conv_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/conv_grad_kernel_impl.h deleted file mode 100644 index 6066720ab0..0000000000 --- a/backends/metax_gpu/kernels/impl/conv_grad_kernel_impl.h +++ /dev/null @@ -1,557 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/kernels/cpu/conv_util.h" -#include "paddle/phi/kernels/full_kernel.h" -#include "paddle/phi/kernels/funcs/batch_norm_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/im2col.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/vol2col.h" -namespace phi { - -template -void ConvGradKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& filter_t, - const DenseTensor& output_grad, - const std::vector& strides, - const std::vector& paddings_t, - const std::string& padding_algorithm, - const std::vector& dilations_t, - int groups, - const std::string& data_format, - DenseTensor* input_grad, - DenseTensor* filter_grad) { - // The filter and filter_grad will be reshaped in the calculations, - // so here use an assignment operation, - // that avoids modifying the variable in the Scope. - - if (!input_grad && !filter_grad) return; - std::vector paddings = paddings_t; - std::vector dilations = dilations_t; - - DenseTensor filter = filter_t; - // 0-size - if (input.numel() == 0) { - if (input_grad) dev_ctx.template Alloc(input_grad); - if (filter_grad) { - phi::Full( - dev_ctx, - phi::IntArray(common::vectorize(filter_grad->dims())), - 0, - filter_grad); - } - return; - } - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - DenseTensor transformed_input(input.type()); - DenseTensor transformed_output_grad(output_grad.type()); - - if (channel_last) { - ResizeToChannelFirst(dev_ctx, &input, &transformed_input); - TransToChannelFirst(dev_ctx, &input, &transformed_input); - - ResizeToChannelFirst( - dev_ctx, &output_grad, &transformed_output_grad); - TransToChannelFirst( - dev_ctx, &output_grad, &transformed_output_grad); - } else { - transformed_input = input; - transformed_output_grad = output_grad; - } - - // update padding and dilation - auto in_dims = transformed_input.dims(); - auto filter_dims = filter.dims(); - DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); - DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = common::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - const int batch_size = static_cast(transformed_input.dims()[0]); - - // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec(common::vectorize(filter.dims())); - // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} - std::vector output_shape_vec( - common::vectorize(transformed_output_grad.dims())); - - // use col_shape in the im2col calculation - // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, - // o_h, o_w} - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = transformed_input.dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - DDim col_shape(common::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: (i_c/g * k_h * k_w, o_h * o_w) - // or - // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w) - DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1); - - DDim input_shape = - slice_ddim(transformed_input.dims(), 1, transformed_input.dims().size()); - - DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - - DDim output_matrix_shape = { - transformed_output_grad.dims()[1], - transformed_output_grad.numel() / (transformed_output_grad.dims()[0] * - transformed_output_grad.dims()[1])}; - - // convolution backward input operator: gemm + col2im(or col2vol) - // convolution backward weight operator: im2col(or vol2col) + gemm - int in_step = static_cast(transformed_input.dims()[1]) / groups; - int out_step = static_cast(transformed_output_grad.dims()[1]) / groups; - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - - DenseTensor col; - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call the matrix multiplication interface. - DenseTensor col_matrix; - if (is_expand) { - col.Resize(col_shape); - dev_ctx.template Alloc(&col); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - phi::funcs::SetConstant set_zero; - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (input_grad) { - dev_ctx.template Alloc(input_grad); - DenseTensor transformed_input_grad(input_grad->type()); - if (channel_last) { - ResizeToChannelFirst( - dev_ctx, input_grad, &transformed_input_grad); - - } else { - transformed_input_grad = *input_grad; - } - // if is_expand is false, the operation of set_zero is unnecessary, - // because math::matmul will reset input_grad. - if (is_expand) { - set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); - } - phi::funcs::Col2ImFunctor col2im; - phi::funcs::Col2VolFunctor col2vol; - - for (int i = 0; i < batch_size; i++) { - DenseTensor out_grad_batch = - transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); - DenseTensor in_grad_batch = - transformed_input_grad.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // gemm - DenseTensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - DenseTensor filter_slice = - filter.Slice(g * out_step, (g + 1) * out_step); - - DenseTensor in_grad_slice = - in_grad_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col_matrix.ShareDataWith(in_grad_slice); - col_matrix.Resize(col_matrix_shape); - } - blas.MatMul(filter_slice, - true, - out_grad_slice, - false, - T(1.0), - &col_matrix, - T(0.0)); - - if (is_expand && data_dim == 2U) { - col2im(dev_ctx, - col, - dilations, - strides, - std::vector{ - paddings[0], paddings[2], paddings[1], paddings[3]}, - &in_grad_slice); - } else if (is_expand && data_dim == 3U) { - col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); - } - } - } - if (channel_last) { - TransToChannelLast( - dev_ctx, &transformed_input_grad, input_grad); - } - } - - if (filter_grad) { - dev_ctx.template Alloc(filter_grad); - Tensor filter_grad_ = *filter_grad; - filter_grad_.Resize(filter_matrix_shape); - set_zero(dev_ctx, filter_grad, static_cast(0)); - phi::funcs::Im2ColFunctor im2col; - phi::funcs::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; i++) { - DenseTensor out_grad_batch = - transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); - DenseTensor in_batch = - transformed_input.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // im2col - DenseTensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - DenseTensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, - in_slice, - dilations, - strides, - std::vector{ - paddings[0], paddings[2], paddings[1], paddings[3]}, - &col); - - } else if (data_dim == 3U) { - vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); - } - - // gemm - DenseTensor filter_grad_slice = - filter_grad_.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(out_grad_slice, - false, - col_matrix, - true, - T(1.0), - &filter_grad_slice, - T(1.0)); - } - } - } -} - -template -void ConvGradGradKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& filter, - const DenseTensor& out_grad, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const std::vector& strides_t, - const std::vector& paddings_t, - const std::string& padding_algorithm, - const std::vector& dilations_t, - int groups, - const std::string& data_format, - DenseTensor* input_grad, - DenseTensor* filter_grad, - DenseTensor* out_grad_grad) { - const DenseTensor* X = &input; - const DenseTensor* dY = &out_grad; - const DenseTensor* ddX = input_grad_grad.get_ptr(); - const DenseTensor* ddW_in = filter_grad_grad.get_ptr(); - - DenseTensor* ddY = out_grad_grad; - DenseTensor* dW = filter_grad; - DenseTensor* dX = input_grad; - DenseTensor W = filter; - - if (!ddY && !dW && !dX) return; - - const std::vector strides = strides_t; - std::vector paddings = paddings_t; - std::vector dilations = dilations_t; - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensor - DenseTensor transformed_X(X->type()); - DenseTensor transformed_dY(dY->type()); - DenseTensor transformed_ddX(X->type()); - - if (channel_last) { - ResizeToChannelFirst(dev_ctx, X, &transformed_X); - TransToChannelFirst(dev_ctx, X, &transformed_X); - - ResizeToChannelFirst(dev_ctx, dY, &transformed_dY); - TransToChannelFirst(dev_ctx, dY, &transformed_dY); - - if (ddX) { - ResizeToChannelFirst(dev_ctx, ddX, &transformed_ddX); - TransToChannelFirst(dev_ctx, ddX, &transformed_ddX); - } - } else { - transformed_X = *X; - transformed_dY = *dY; - if (ddX) { - transformed_ddX = *ddX; - } - } - - // update padding and dilation - auto in_dims = transformed_X.dims(); - auto filter_dims = W.dims(); - - DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size()); - DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = common::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - const int batch_size = static_cast(transformed_X.dims()[0]); - std::vector filter_shape_vec(common::vectorize(W.dims())); - std::vector output_shape_vec( - common::vectorize(transformed_dY.dims())); - - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - // col_shape [in_channel/group, kh, kw, oh, ow] - col_shape_vec[0] = transformed_X.dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2]; - } - DDim col_shape(common::make_ddim(col_shape_vec)); - // col_matrix_shape [in_channel/group * kh * kw, oh * ow] - DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1); - // input_shape [Cin, H, W] - DDim input_shape = - slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size()); - // filter_matrix_shape [Cout, Cin * kh * kw] - DDim filter_matrix_shape = {W.dims()[0], W.numel() / W.dims()[0]}; - - W.Resize(filter_matrix_shape); - DDim output_matrix_shape = { - transformed_dY.dims()[1], - transformed_dY.numel() / - (transformed_dY.dims()[0] * transformed_dY.dims()[1])}; - int in_step = static_cast(transformed_X.dims()[1]) / groups; - int out_step = static_cast(transformed_dY.dims()[1]) / groups; - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - DenseTensor col; - DenseTensor col_matrix; - if (is_expand) { - col.Resize(col_shape); - dev_ctx.template Alloc(&col); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - phi::funcs::SetConstant set_zero; - auto blas = phi::funcs::GetBlas(dev_ctx); - - // dx convolution double grad: gemm + col2im(col2vol) - // dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout, - // oH, oW) - if (dX && ddW_in) { - Tensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - dev_ctx.template Alloc(dX); - - DenseTensor transformed_dX(dX->type()); - - if (channel_last) { - ResizeToChannelFirst(dev_ctx, dX, &transformed_dX); - - } else { - transformed_dX = *dX; - } - // if is_expand is false, the operation of set_zero is unnecessary - // because math::matmul will reset dx - if (is_expand) { - set_zero(dev_ctx, &transformed_dX, static_cast(0)); - } - phi::funcs::Col2ImFunctor col2im; - phi::funcs::Col2VolFunctor col2vol; - - for (int i = 0; i < batch_size; i++) { - DenseTensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - DenseTensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // gemm - DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - DenseTensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col_matrix.ShareDataWith(dx_slice); - col_matrix.Resize(col_matrix_shape); - } - blas.MatMul( - ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, T(0.0)); - - if (is_expand && data_dim == 2U) { - col2im(dev_ctx, - col, - dilations, - strides, - std::vector{ - paddings[0], paddings[2], paddings[1], paddings[3]}, - &dx_slice); - } else if (is_expand && data_dim == 3U) { - col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice); - } - } - } - if (channel_last) { - TransToChannelLast(dev_ctx, &transformed_dX, dX); - } - } - - // dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout, - // oH, oW) - // dw convolution double grad: im2col(vol2col) + gemm - if (dW && ddX) { - dev_ctx.template Alloc(dW); - set_zero(dev_ctx, dW, static_cast(0)); - DenseTensor dW_arr = *dW; - dW_arr.Resize(filter_matrix_shape); - phi::funcs::Im2ColFunctor im2col; - phi::funcs::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - DenseTensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; ++g) { - // im2col - DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - DenseTensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, - ddx_slice, - dilations, - strides, - std::vector{ - paddings[0], paddings[2], paddings[1], paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - - DenseTensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul( - dy_slice, false, col_matrix, true, T(1.0), &dw_slice, T(1.0)); - } - } - } - - // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W), - // w/ddw(Cout, Cin, kh, kw) - // ddy convolution double grad: im2col(vol2col) + gemm - if (ddY) { - dev_ctx.template Alloc(ddY); - - DenseTensor transformed_ddY(ddY->type()); - if (channel_last) { - ResizeToChannelFirst(dev_ctx, ddY, &transformed_ddY); - } else { - transformed_ddY = *ddY; - } - - set_zero(dev_ctx, &transformed_ddY, static_cast(0)); - phi::funcs::Im2ColFunctor im2col; - phi::funcs::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - DenseTensor ddy_batch = - transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape); - for (int g = 0; g < groups; ++g) { - // gemm - DenseTensor ddy_slice = - ddy_batch.Slice(g * out_step, (g + 1) * out_step); - - if (ddX) { - DenseTensor ddx_batch = - transformed_ddX.Slice(i, i + 1).Resize(input_shape); - DenseTensor ddx_slice = - ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, - ddx_slice, - dilations, - strides, - std::vector{ - paddings[0], paddings[2], paddings[1], paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - DenseTensor w_slice = W.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul( - w_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(0.0)); - } - - if (ddW_in) { - DenseTensor x_batch = - transformed_X.Slice(i, i + 1).Resize(input_shape); - DenseTensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step); - - DenseTensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - if (!is_expand) { - col.ShareDataWith(x_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, - x_slice, - dilations, - strides, - std::vector{ - paddings[0], paddings[2], paddings[1], paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col); - } - - // gemm - DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul( - ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(1.0)); - } - } - } - if (channel_last) { - TransToChannelLast(dev_ctx, &transformed_ddY, ddY); - } - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/conv_kernel_impl.h b/backends/metax_gpu/kernels/impl/conv_kernel_impl.h deleted file mode 100644 index 4395e5d578..0000000000 --- a/backends/metax_gpu/kernels/impl/conv_kernel_impl.h +++ /dev/null @@ -1,185 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/kernels/conv_kernel.h" -#include "paddle/phi/kernels/cpu/conv_util.h" -#include "paddle/phi/kernels/full_kernel.h" -#include "paddle/phi/kernels/funcs/batch_norm_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/im2col.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/vol2col.h" - -namespace phi { - -template -void ConvKernelImpl(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& filter_t, - const std::vector& strides, - const std::vector& paddings_t, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations_t, - const std::string& data_format, - DenseTensor* output) { - std::vector paddings = paddings_t; - std::vector dilations = dilations_t; - DenseTensor filter = filter_t; - if (input.numel() == 0) { - phi::Full( - dev_ctx, phi::IntArray(common::vectorize(output->dims())), 0, output); - return; - } - // The filter will be reshaped in the calculations, - // so here use an assignment operation, - // that avoids modifying the variable in the Scope. - dev_ctx.template Alloc(output); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - DenseTensor transformed_input(input.type()); - DenseTensor transformed_output(output->type()); - - if (channel_last) { - ResizeToChannelFirst(dev_ctx, &input, &transformed_input); - TransToChannelFirst(dev_ctx, &input, &transformed_input); - - ResizeToChannelFirst(dev_ctx, output, &transformed_output); - - } else { - transformed_input = input; - transformed_output = *output; - } - - // update padding and dilation - auto trans_in_dims = transformed_input.dims(); - auto filter_dims = filter.dims(); - - DDim in_data_dims = slice_ddim(trans_in_dims, 2, trans_in_dims.size()); - DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - - std::vector ksize = common::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - const int batch_size = static_cast(transformed_input.dims()[0]); - - // filter_shape_vec: - // {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec(common::vectorize(filter.dims())); - - // output_shape_vec: - // {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} - std::vector output_shape_vec( - common::vectorize(transformed_output.dims())); - - // use col_shape in the im2col calculation - // col_shape_vec: - // {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, - // o_d,o_h, o_w} - size_t data_dim = filter_shape_vec.size() - 2; - - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = trans_in_dims[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - - DDim col_shape(common::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: - // (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h * - // o_w) - - DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim); - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - - DenseTensor col; - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call the matrix multiplication interface. - DenseTensor col_matrix; - if (is_expand) { - // col = dev_ctx.AllocateTmpTensor(col_shape, dev_ctx); - col.Resize(col_shape); - dev_ctx.template Alloc(&col); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - DDim in_matrix_shape = - slice_ddim(transformed_input.dims(), 1, transformed_input.dims().size()); - - DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - - DDim output_matrix_shape = { - transformed_output.dims()[1], - transformed_output.numel() / - (transformed_output.dims()[0] * transformed_output.dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(transformed_input.dims()[1]) / groups; - int out_step = static_cast(transformed_output.dims()[1]) / groups; - - phi::funcs::Im2ColFunctor im2col; - phi::funcs::Vol2ColFunctor vol2col; - - auto blas = phi::funcs::GetBlas(dev_ctx); - for (int i = 0; i < batch_size; i++) { - DenseTensor in_batch = - transformed_input.Slice(i, i + 1).Resize(in_matrix_shape); - DenseTensor out_batch = - transformed_output.Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - DenseTensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, - in_slice, - dilations, - strides, - std::vector{ - paddings[0], paddings[2], paddings[1], paddings[3]}, - &col); - - } else if (data_dim == 3U) { - vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); - } - - // gemm - DenseTensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - DenseTensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul( - filter_slice, false, col_matrix, false, T(1.0), &out_slice, T(0.0)); - } - } - if (channel_last) { - TransToChannelLast(dev_ctx, &transformed_output, output); - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/conv_transpose_kernel_impl.h b/backends/metax_gpu/kernels/impl/conv_transpose_kernel_impl.h deleted file mode 100644 index aadc5d2b8a..0000000000 --- a/backends/metax_gpu/kernels/impl/conv_transpose_kernel_impl.h +++ /dev/null @@ -1,287 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/common/ddim.h" -#include "paddle/common/layout.h" -#include "paddle/phi/kernels/conv_transpose_kernel.h" -#include "paddle/phi/kernels/cpu/conv_util.h" -#include "paddle/phi/kernels/full_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" -#include "paddle/phi/kernels/funcs/im2col.h" -#include "paddle/phi/kernels/funcs/slice.h" -#include "paddle/phi/kernels/funcs/vol2col.h" - -namespace phi { - -template -void ConvTransposeRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& filter, - const std::vector& strides, - const std::vector& paddings, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - DenseTensor* out) { - if (x.numel() == 0 || filter.numel() == 0) { - phi::Full( - dev_ctx, phi::IntArray(common::vectorize(out->dims())), 0, out); - return; - } - const DataLayout data_layout = common::StringToDataLayout(data_format); - // The filter will be reshaped, so it should not be constant - DenseTensor filter_ = filter; - std::vector paddings_ = paddings; - std::vector dilations_ = dilations; - - auto x_dims = x.dims(); - auto filter_dims = filter_.dims(); - auto out_dims = out->dims(); - const int batch_size = static_cast(x.dims()[0]); - - DDim in_data_dims; - if (data_layout != DataLayout::kNHWC) { - in_data_dims = slice_ddim(x_dims, 2, x_dims.size()); - } else { - in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); - } - DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = common::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize); - - // x_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first - // x_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last - std::vector x_shape_vec = common::vectorize(x.dims()); - // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec = common::vectorize(filter_.dims()); - - // use col_shape in the im2col and col2im (or vol2col and col2vol) - // calculation - // col_shape_vec: {o_c/g, k_h, k_w, h, w} or {o_c/g, k_d, k_h, k_w, d, h, w} - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - if (data_layout != DataLayout::kNHWC) { - col_shape_vec[0] = out_dims[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 2]; - } - } else { - col_shape_vec[0] = out_dims[out_dims.size() - 1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 1]; - } - } - DDim col_shape(common::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d * k_h * k_w, d * h * w) - DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1); - - DenseTensor col; - col.Resize(col_shape); - dev_ctx.template Alloc(&col); - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call the matrix multiplication interface. - DenseTensor col_matrix; - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - - // out size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first - // out size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last - DDim out_shape = slice_ddim(out->dims(), 1, out->dims().size()); - - // x matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first - // x matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last - DDim x_matrix_shape; - if (data_layout != DataLayout::kNHWC) { - x_matrix_shape = {x_dims[1], col_matrix_shape[1]}; - } else { - x_matrix_shape = {col_matrix_shape[1], x_dims[x_dims.size() - 1]}; - } - - // filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w) - DDim filter_matrix_shape; - if (data_layout != DataLayout::kNHWC) { - filter_matrix_shape = {x_dims[1], col_matrix_shape[0]}; - } else { - filter_matrix_shape = {x_dims[x_dims.size() - 1], col_matrix_shape[0]}; - } - filter_.Resize(filter_matrix_shape); - - dev_ctx.template Alloc(out); - - funcs::SetConstant set_zero; - - auto blas = funcs::GetBlas(dev_ctx); - set_zero(dev_ctx, out, static_cast(0)); - - int in_step = (data_layout != DataLayout::kNHWC - ? static_cast(x_dims[1]) / groups - : static_cast(x_dims[x_dims.size() - 1]) / groups); - - int out_step = - (data_layout != DataLayout::kNHWC - ? static_cast(out_dims[1]) / groups - : static_cast(out_dims[out_dims.size() - 1]) / groups); - phi::funcs::Col2ImFunctor col2im; - phi::funcs::Col2VolFunctor col2vol; - funcs::ConcatFunctor concat_functor; - - // convolution transpose: gemm + col2im or col2vol (similar to conv-backward - // on x) - size_t D = x.dims().size(); - for (int i = 0; i < batch_size; i++) { - // batch with size (i_c, h * w) or (i_c, d * h * w) for channel_first - // batch with size (h * w, i_c) or (d * h * w, i_c) for channel_last - DenseTensor x_batch = x.Slice(i, i + 1).Resize(x_matrix_shape); - - // out size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first - // out size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last - DenseTensor out_batch = out->Slice(i, i + 1).Resize(out_shape); - - std::vector out_batch_vec; - for (int g = 0; g < groups; g++) { - int64_t start = g * in_step; - int64_t end = (g + 1) * in_step; - int axes = (data_layout != DataLayout::kNHWC ? 0 : 1); - DenseTensor filter_slice = filter_.Slice(g * in_step, (g + 1) * in_step); - DenseTensor in_slice, out_slice; - - // col_matrix = filter_slice * x_slice - // of shape (o_c/g * k_h * k_w, h * w) - // or (o_c/g * k_d * k_h * k_w, d * h * w) - if (data_layout != DataLayout::kNHWC) { - in_slice = x_batch.Slice(g * in_step, (g + 1) * in_step); - out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(filter_slice, - true, - in_slice, - false, - static_cast(1.0), - &col_matrix, - static_cast(0.0)); - } else { - funcs::Slice( - dev_ctx, &x_batch, &in_slice, start, end, axes); - start = g * out_step; - end = (g + 1) * out_step; - axes = D - 2; - if (D == 4U) { - funcs::Slice( - dev_ctx, &out_batch, &out_slice, start, end, axes); - } else if (D == 5U) { - funcs::Slice( - dev_ctx, &out_batch, &out_slice, start, end, axes); - } - blas.MatMul(filter_slice, - true, - in_slice, - true, - static_cast(1.0), - &col_matrix, - static_cast(0.0)); - } - - if (data_dim == 2U) { - // col2im: col_matrix -> dy from (o_c/g * k_h * k_w, h * w) to (o_c/g, - // o_h, o_w) or (o_h, o_w, o_c/g) - col2im(dev_ctx, - col, - dilations_, - strides, - std::vector{ - paddings_[0], paddings_[2], paddings_[1], paddings_[3]}, - &out_slice, - data_layout); - } else if (data_dim == 3U) { - // col2vol: col_matrix -> dy from (o_c/g * k_d * k_h * k_w, d * h * w) - // to (o_c/g, o_d, o_h, o_w) or (o_d, o_h, o_w, o_c/g) - col2vol(dev_ctx, - col, - dilations_, - strides, - paddings_, - &out_slice, - data_layout); - } - if (data_layout == DataLayout::kNHWC) { - out_batch_vec.push_back(out_slice); - } - } - if (data_layout == DataLayout::kNHWC) { - concat_functor( - dev_ctx, out_batch_vec, static_cast(D - 2), &out_batch); - } - } -} - -template -void Conv2dTransposeKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& filter, - const std::vector& strides, - const std::vector& paddings, - const std::vector& output_padding UNUSED, - const IntArray& output_size UNUSED, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - DenseTensor* out) { - ConvTransposeRawKernel(dev_ctx, - x, - filter, - strides, - paddings, - padding_algorithm, - groups, - dilations, - data_format, - out); -} - -template -void Conv3dTransposeKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& filter, - const std::vector& strides, - const std::vector& paddings, - const std::vector& output_padding UNUSED, - const std::vector& output_size UNUSED, - const std::string& padding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - DenseTensor* out) { - ConvTransposeRawKernel(dev_ctx, - x, - filter, - strides, - paddings, - padding_algorithm, - groups, - dilations, - data_format, - out); -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/deformable_conv_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/deformable_conv_grad_kernel_impl.h deleted file mode 100644 index b9931a8997..0000000000 --- a/backends/metax_gpu/kernels/impl/deformable_conv_grad_kernel_impl.h +++ /dev/null @@ -1,365 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/common/hostdevice.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/full_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - -namespace phi { - -template -HOSTDEVICE T DmcnGetGradientWeight(T argmax_h, - T argmax_w, - const int h, - const int w, - const int height, - const int width) { - if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || - argmax_w >= width) { - return 0; - } - - int argmax_h_low = floor(argmax_h); - int argmax_w_low = floor(argmax_w); - int argmax_h_high = argmax_h_low + 1; - int argmax_w_high = argmax_w_low + 1; - - T weight = 0; - - weight = (h == argmax_h_low && w == argmax_w_low) - ? (h + 1 - argmax_h) * (w + 1 - argmax_w) - : weight; - weight = (h == argmax_h_low && w == argmax_w_high) - ? (h + 1 - argmax_h) * (argmax_w + 1 - w) - : weight; - weight = (h == argmax_h_high && w == argmax_w_low) - ? (argmax_h + 1 - h) * (w + 1 - argmax_w) - : weight; - weight = (h == argmax_h_high && w == argmax_w_high) - ? (argmax_h + 1 - h) * (argmax_w + 1 - w) - : weight; - - return weight; -} - -template -HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h, - T argmax_w, - const int height, - const int width, - const T* im_data, - const int data_width, - const int bp_dir) { - if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || - argmax_w >= width) { - return 0; - } - - int argmax_h_low = floor(argmax_h); - int argmax_w_low = floor(argmax_w); - int argmax_h_high = argmax_h_low + 1; - int argmax_w_high = argmax_w_low + 1; - - T weight = 0; - - if (bp_dir == 0) { - weight += (argmax_h_low >= 0 && argmax_w_low >= 0) - ? -1 * (argmax_w_low + 1 - argmax_w) * - im_data[argmax_h_low * data_width + argmax_w_low] - : 0; - - weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1) - ? -1 * (argmax_w - argmax_w_low) * - im_data[argmax_h_low * data_width + argmax_w_high] - : 0; - - weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0) - ? (argmax_w_low + 1 - argmax_w) * - im_data[argmax_h_high * data_width + argmax_w_low] - : 0; - weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) - ? (argmax_w - argmax_w_low) * - im_data[argmax_h_high * data_width + argmax_w_high] - : 0; - } else if (bp_dir == 1) { - weight += (argmax_h_low >= 0 && argmax_w_low >= 0) - ? -1 * (argmax_h_low + 1 - argmax_h) * - im_data[argmax_h_low * data_width + argmax_w_low] - : 0; - weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1) - ? (argmax_h_low + 1 - argmax_h) * - im_data[argmax_h_low * data_width + argmax_w_high] - : 0; - weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0) - ? -1 * (argmax_h - argmax_h_low) * - im_data[argmax_h_high * data_width + argmax_w_low] - : 0; - weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) - ? (argmax_h - argmax_h_low) * - im_data[argmax_h_high * data_width + argmax_w_high] - : 0; - } - - return weight; -} - -template -void ModulatedDeformableCol2imCoord(const Context& dev_ctx, - const T* data_col, - const T* data_im, - const T* data_offset, - const T* data_mask, - const std::vector& im_shape, - const std::vector& col_shape, - const std::vector& kernel_shape, - const std::vector& paddings, - const std::vector& strides, - const std::vector& dilations, - const int deformable_groups, - T* grad_offset, - T* grad_mask); - -template -void ModulatedDeformableCol2im(const Context& dev_ctx, - const T* data_col, - const T* data_offset, - const T* data_mask, - const std::vector& im_shape, - const std::vector& col_shape, - const std::vector& kernel_shape, - const std::vector& pad, - const std::vector& stride, - const std::vector& dilation, - const int deformable_group, - T* grad_im); - -template -void FilterGradAddup(const Context& dev_ctx, - const int nthreads, - const int n, - const int height, - const int width, - const T* dweight_3d, - T* filter_grad); - -template -void DeformableConvGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& offset, - const DenseTensor& filter, - const paddle::optional& mask, - const DenseTensor& out_grad, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - int deformable_groups, - int groups, - int im2col_step, - DenseTensor* dx, - DenseTensor* offset_grad, - DenseTensor* filter_grad, - DenseTensor* mask_grad) { - const int batch_size = static_cast(x.dims()[0]); - - DDim input_shape = common::slice_ddim(x.dims(), 1, x.dims().size()); - std::vector input_shape_vec = common::vectorize(input_shape); - std::vector filter_shape_vec(common::vectorize(filter.dims())); - std::vector output_shape_vec(common::vectorize(out_grad.dims())); - - std::vector col_buffer_shape_vec(filter_shape_vec.size()); - col_buffer_shape_vec[0] = x.dims()[1] * filter.dims()[2] * filter.dims()[3]; - col_buffer_shape_vec[1] = im2col_step; - for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { - col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; - } - std::vector output_buffer_shape_vec(1); - output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * - output_shape_vec[2] * output_shape_vec[3]; - - DenseTensor col_buffer = Empty(dev_ctx, col_buffer_shape_vec); - DenseTensor output_buffer; - output_buffer.ShareDataWith(out_grad).Resize( - common::make_ddim(output_buffer_shape_vec)); - - int64_t M = - input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3]; - int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; - int64_t K = output_shape_vec[1] / groups; - - DDim weight_3d_shape = {groups, K, M}; - DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K, N}; - DDim col_buffer_3d_shape = {groups, M, N}; - DDim filter_grad_shape = {groups, K, M}; - - DenseTensor weight_3d; - weight_3d.ShareDataWith(filter).Resize(weight_3d_shape); - DenseTensor out_grad_4d; - out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape); - DenseTensor col_buffer_3d; - col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape); - - phi::funcs::SetConstant set_zero; - auto blas = phi::funcs::GetBlas(dev_ctx); - - int input_dim = x.numel() / x.dims()[0]; - int input_offset_dim = offset.numel() / offset.dims()[0]; - int input_mask_dim = mask ? mask->numel() / mask->dims()[0] : 0; - - if (filter_grad) { - Full(dev_ctx, - {filter_grad_shape.Get(), filter_grad_shape.size()}, - 0, - filter_grad); - } - - if (dx) { - dev_ctx.template Alloc(dx); - set_zero(dev_ctx, dx, static_cast(0)); - } - - if (offset_grad) { - dev_ctx.template Alloc(offset_grad); - set_zero(dev_ctx, offset_grad, static_cast(0)); - - if (mask_grad) { - dev_ctx.template Alloc(mask_grad); - set_zero(dev_ctx, mask_grad, static_cast(0)); - } - } - - for (int i = 0; i < batch_size / im2col_step; ++i) { - DenseTensor out_grad_3d = out_grad_4d.Slice(i, i + 1).Resize( - common::slice_ddim(out_grad_4d.dims(), 1, out_grad_4d.dims().size())); - for (int g = 0; g < groups; ++g) { - DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( - common::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); - DenseTensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize( - common::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size())); - DenseTensor col_buffer_3d_slice = - col_buffer_3d.Slice(g, g + 1).Resize(common::slice_ddim( - col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); - blas.MatMul(weight_3d_slice, - true, - out_grad_3d_slice, - false, - T(1.0), - &col_buffer_3d_slice, - T(0.0)); - } - col_buffer.Resize(common::make_ddim(col_buffer_shape_vec)); - - T* col_buffer_ptr = col_buffer.data(); - const T* input_ptr = x.data(); - const T* offset_ptr = offset.data(); - const T* mask_data_ptr = - mask ? mask->data() + i * im2col_step * input_mask_dim : nullptr; - if (offset_grad) { - T* offset_grad_ptr = offset_grad->data(); - T* mask_grad_data_ptr = - mask_grad ? mask_grad->data() + i * im2col_step * input_mask_dim - : nullptr; - // get grad of offset and mask - ModulatedDeformableCol2imCoord( - dev_ctx, - col_buffer_ptr, - input_ptr + i * im2col_step * input_dim, - offset_ptr + i * im2col_step * input_offset_dim, - mask_data_ptr, - input_shape_vec, - col_buffer_shape_vec, - filter_shape_vec, - paddings, - strides, - dilations, - deformable_groups, - offset_grad_ptr + i * im2col_step * input_offset_dim, - mask_grad_data_ptr); - } - if (dx) { - T* dx_ptr = dx->data(); - // get grad of input - ModulatedDeformableCol2im(dev_ctx, - col_buffer_ptr, - offset_ptr + i * im2col_step * input_offset_dim, - mask_data_ptr, - input_shape_vec, - col_buffer_shape_vec, - filter_shape_vec, - paddings, - strides, - dilations, - deformable_groups, - dx_ptr + i * im2col_step * input_dim); - dx->Resize(x.dims()); - } - - funcs::ModulatedDeformableIm2col( - dev_ctx, - input_ptr + i * im2col_step * input_dim, - offset_ptr + i * im2col_step * input_offset_dim, - mask_data_ptr, - input_shape_vec, - col_buffer_shape_vec, - filter_shape_vec, - paddings, - strides, - dilations, - deformable_groups, - col_buffer_ptr); - - col_buffer_3d.Resize(col_buffer_3d_shape); - - if (filter_grad) { - DenseTensor dweight_3d = Empty( - dev_ctx, {filter_grad_shape.Get(), filter_grad_shape.size()}); - for (int g = 0; g < groups; ++g) { - DenseTensor out_grad_3d_slice = - out_grad_3d.Slice(g, g + 1).Resize(common::slice_ddim( - out_grad_3d.dims(), 1, out_grad_3d.dims().size())); - DenseTensor col_buffer_3d_slice = - col_buffer_3d.Slice(g, g + 1).Resize(common::slice_ddim( - col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); - DenseTensor dweight_3d_slice = dweight_3d.Slice(g, g + 1).Resize( - common::slice_ddim(dweight_3d.dims(), 1, dweight_3d.dims().size())); - - blas.MatMul(out_grad_3d_slice, - false, - col_buffer_3d_slice, - true, - T(1.0), - &dweight_3d_slice, - T(0.0)); - } - - // update grad of weights - FilterGradAddup(dev_ctx, - dweight_3d.numel(), - groups, - K, - M, - dweight_3d.data(), - filter_grad->data()); - } - } - if (filter_grad) { - filter_grad->Resize(filter.dims()); - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h deleted file mode 100644 index a627bbf702..0000000000 --- a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h +++ /dev/null @@ -1,339 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "kernels/custom_kernel/elementwise.h" -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/dirichlet_kernel.h" -#include "paddle/phi/kernels/elementwise_divide_kernel.h" -#include "paddle/phi/kernels/funcs/broadcast_function.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/funcs/for_range.h" -#include "paddle/phi/kernels/funcs/reduce_function.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" -#include "paddle/phi/kernels/reduce_sum_kernel.h" - -// ROCM hcc doesn't work well with using std:: in kernel functions -#if defined(PADDLE_WITH_CUDA) -#define COMPAT_EXP exp -#define COMPAT_CEIL ceil -#define COMPAT_FLOOR floor -#define COMPAT_LOG log -#define COMPAT_POW pow -#define COMPAT_SQRT sqrt -#define COMPAT_TAN tan -#define COMPAT_ABS abs -#define COMPAT_LOG1P log1p -#else -#define COMPAT_EXP std::exp -#define COMPAT_CEIL std::ceil -#define COMPAT_FLOOR std::floor -#define COMPAT_LOG std::log -#define COMPAT_POW std::pow -#define COMPAT_SQRT std::sqrt -#define COMPAT_TAN std::tan -#define COMPAT_ABS std::abs -#define COMPAT_LOG1P std::log1p -#endif - -#ifdef PADDLE_WITH_CUDA -#include -#endif -#ifdef PADDLE_WITH_HIP -#include -#endif - -#if defined(PADDLE_WITH_CUDA) -using COMPAT_RANDSTATEPHILOX4_32_10_T = curandStatePhilox4_32_10_t; -#define COMPAT_RAND_INIT curand_init -#define COMPAT_RAND_UNIFORM curand_uniform -#define COMPAT_RAND_NORMAL curand_normal -#elif defined(PADDLE_WITH_HIP) -using COMPAT_RANDSTATEPHILOX4_32_10_T = hiprandStatePhilox4_32_10_t; -#define COMPAT_RAND_INIT hiprand_init -#define COMPAT_RAND_UNIFORM hiprand_uniform -#define COMPAT_RAND_NORMAL hiprand_normal -#endif - -namespace phi { - -template -struct BaseSampler { - SamplerT sampler_; - HOSTDEVICE BaseSampler(const SamplerT& sampler) : sampler_(sampler) {} - HOSTDEVICE ScalarT sample() { - // Sometimes convert float to float16/bfloat16 - return static_cast(sampler_()); - } -}; - -template -struct GammaSampler { - void operator()(const Context& dev_ctx, - const DenseTensor& alpha, - DenseTensor* out); -}; - -template -struct DirichletSampler { - void operator()(const Context& dev_ctx, - const DenseTensor& alpha, - DenseTensor* out); -}; - -// `sample_gamma` is d from Numpy's distributions.c, and add support for -// paddle data type and code style. -// Source MIT licensed: -/* Copyright 2005 Robert Kern (robert.kern@gmail.com) - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -template -HOSTDEVICE ScalarT -sample_gamma(ScalarT alpha, - BaseSampler standard_uniform, - BaseSampler standard_normal) { - using MPTypeScalar = typename phi::dtype::MPTypeTrait::Type; - using MPTypeAccscalar = typename phi::dtype::MPTypeTrait::Type; - - MPTypeAccscalar mp_scale = static_cast(1.0f); - MPTypeScalar mp_alpha = static_cast(alpha); - - // Boost alpha for higher acceptance probability. - if (mp_alpha < 1.0f) { - if (mp_alpha == 0.f) return static_cast(0.f); - MPTypeAccscalar mp_sample = - static_cast(standard_uniform.sample()); - mp_scale *= COMPAT_POW(1 - mp_sample, 1.0f / mp_alpha); - mp_alpha += 1.0f; - } - - // This implements the acceptance-rejection method of Marsaglia and Tsang - // (2000) - // doi:10.1145/358407.358414 - const MPTypeAccscalar d = mp_alpha - 1.0f / 3.0f; - const MPTypeAccscalar c = 1.0f / COMPAT_SQRT(9.0f * d); - for (;;) { - MPTypeAccscalar x, y; - do { - x = static_cast(standard_normal.sample()); - y = 1.0f + c * x; - } while (y <= 0); - const MPTypeAccscalar v = y * y * y; - const MPTypeAccscalar u = - 1 - static_cast(standard_uniform.sample()); - const MPTypeAccscalar xx = x * x; - if (u < 1.0f - 0.0331f * xx * xx) - return static_cast(mp_scale * d * v); - if (COMPAT_LOG(u) < 0.5f * xx + d * (1.0f - v + COMPAT_LOG(v))) - return static_cast(mp_scale * d * v); - } -} - -template -struct GammaCPUFunctor { - GammaCPUFunctor(const T* alpha, - T* gamma, - BaseSampler uniform, - BaseSampler normal) - : alpha_(alpha), gamma_(gamma), uniform_(uniform), normal_(normal) {} - - HOST void operator()(int64_t index) { - auto sample = sample_gamma( - alpha_[index], uniform_, normal_); - gamma_[index] = std::max(std::numeric_limits::min(), sample); - } - - const T* alpha_; - T* gamma_; - BaseSampler uniform_; - BaseSampler normal_; -}; - -template -struct GammaSampler { - void operator()(const CPUContext& dev_ctx, - const DenseTensor& alpha, - DenseTensor* out) { - auto generator = dev_ctx.GetGenerator()->GetCPUEngine(); - - auto uniform = [&generator]() -> T { - std::uniform_real_distribution u(0.0, 1.0); - return u(*generator); - }; - BaseSampler standard_uniform(uniform); - - auto normal = [&generator]() { - std::normal_distribution n(0.0, 1.0); - return n(*generator); - }; - BaseSampler standard_normal(normal); - - GammaCPUFunctor gamma_functor( - alpha.data(), out->data(), standard_uniform, standard_normal); - funcs::ForRange for_range(dev_ctx, out->numel()); - for_range(gamma_functor); - } -}; - -template -struct DirichletSampler { - void operator()(const CPUContext& dev_ctx, - const DenseTensor& alpha, - DenseTensor* out) { - // sample from K gamma distributions, where K=alpha.numel() - DenseTensor gamma_samples; - gamma_samples.Resize(alpha.dims()); - dev_ctx.template Alloc(&gamma_samples); - - GammaSampler gamma_sampler; - gamma_sampler(dev_ctx, alpha, &gamma_samples); - - // normalize them into a simplex, along the last axis - DenseTensor gamma_sum; - auto new_shape = gamma_samples.dims(); - new_shape[new_shape.size() - 1] = 1; - gamma_sum.Resize(new_shape); - dev_ctx.template Alloc(&gamma_sum); - - funcs::ReduceKernelImpl( - dev_ctx, - gamma_samples, - &gamma_sum, - {new_shape.size() - 1}, - true, - false); - - funcs::ElementwiseCompute, T>( - dev_ctx, gamma_samples, gamma_sum, funcs::DivideFunctor(), out); - } -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -struct GammaCUDAFunctor { - GammaCUDAFunctor(const T* alpha, T* gamma, uint64_t seed, uint64_t offset) - : alpha_(alpha), gamma_(gamma), seed_(seed), offset_(offset) {} - - DEVICE void operator()(int64_t index) { - // curand initialization - COMPAT_RANDSTATEPHILOX4_32_10_T state; - COMPAT_RAND_INIT( - /*seed=*/seed_, /*subsequence=*/index, /*offset=*/offset_, &state); - - // sample - auto uniform_lambda = [&state]() { return COMPAT_RAND_UNIFORM(&state); }; - BaseSampler standard_uniform(uniform_lambda); - auto normal_lambda = [&state]() { return COMPAT_RAND_NORMAL(&state); }; - BaseSampler standard_normal(normal_lambda); - - auto sample = - sample_gamma( - alpha_[index], standard_uniform, standard_normal); - gamma_[index] = std::max(std::numeric_limits::min(), sample); - } - - const T* alpha_; - T* gamma_; - const uint64_t seed_; - const uint64_t offset_; -}; - -template -struct GammaSampler { - void operator()(const GPUContext& dev_ctx, - const DenseTensor& alpha, - DenseTensor* out) { - auto p_gen = dev_ctx.GetGenerator(); - auto seed_and_offset = p_gen->IncrementOffset(10); // hard-coded offset - auto seed = seed_and_offset.first; - auto offset = seed_and_offset.second; - - GammaCUDAFunctor gamma_functor( - alpha.data(), out->data(), seed, offset); - funcs::ForRange for_range(dev_ctx, out->numel()); - for_range(gamma_functor); - } -}; - -template -struct DirichletSampler { - void operator()(const GPUContext& dev_ctx, - const DenseTensor& alpha, - DenseTensor* out) { - // sample from K gamma distributions, where K=alpha.numel() - DenseTensor gamma_samples; - gamma_samples.Resize(alpha.dims()); - dev_ctx.template Alloc(&gamma_samples); - - GammaSampler gamma_sampler; - gamma_sampler(dev_ctx, alpha, &gamma_samples); - - // normalize them into a simplex, along the last axis - DenseTensor gamma_sum; - auto new_shape = gamma_samples.dims(); - new_shape[new_shape.size() - 1] = 1; - gamma_sum.Resize(new_shape); - dev_ctx.template Alloc(&gamma_sum); - - phi::SumRawKernel(dev_ctx, - gamma_samples, - {new_shape.size() - 1}, - true, - false, - gamma_sum.dtype(), - &gamma_sum); - phi::DivideKernel(dev_ctx, gamma_samples, gamma_sum, out); - } -}; -#endif - -template -void Dirichletkernel(const Context& dev_ctx, - const DenseTensor& alpha, - DenseTensor* out) { - dev_ctx.template Alloc(out); - DirichletSampler sampler; - sampler(dev_ctx, alpha, out); -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/elementwise.h b/backends/metax_gpu/kernels/impl/elementwise.h deleted file mode 100644 index b9f3d8af1c..0000000000 --- a/backends/metax_gpu/kernels/impl/elementwise.h +++ /dev/null @@ -1,201 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/broadcast_function.h" -#include "paddle/phi/kernels/funcs/common_shape.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" - -namespace phi { - -// FORWARD CODE - -// Add -template -struct SameDimsAddFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsAddFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - auto blas = phi::funcs::GetBlas(dev_ctx); - blas.VADD( - x.numel(), x.data(), y.data(), dev_ctx.template Alloc(z)); - } -}; - -template -struct SameDimsAddFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - dev_ctx.template Alloc(z); - auto eigen_x = phi::EigenVector::Flatten(x); - auto eigen_y = phi::EigenVector::Flatten(y); - auto eigen_z = phi::EigenVector::Flatten(*z); - auto& place = *dev_ctx.eigen_device(); - eigen_z.device(place) = eigen_x + eigen_y; - } -}; - -// Subtract -template -struct SameDimsSubtractFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsSubtractFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - auto blas = phi::funcs::GetBlas(dev_ctx); - blas.VSUB( - x.numel(), x.data(), y.data(), dev_ctx.template Alloc(z)); - } -}; - -template -struct SameDimsSubtractFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - auto eigen_x = phi::EigenVector::Flatten(x); - auto eigen_y = phi::EigenVector::Flatten(y); - auto eigen_z = phi::EigenVector::Flatten(*z); - auto& place = *dev_ctx.eigen_device(); - eigen_z.device(place) = eigen_x - eigen_y; - } -}; - -// Divide -template -struct SameDimsDivideFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsDivideFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx UNUSED, - const DenseTensor& x UNUSED, - const DenseTensor& y UNUSED, - DenseTensor* z UNUSED) { - common::errors::InvalidArgument( - "If use SameDimsDivideFunctor, template args(T) must be floating " - "point. "); - } -}; - -template -struct SameDimsDivideFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - auto blas = phi::funcs::GetBlas(dev_ctx); - blas.VDIV( - x.numel(), x.data(), y.data(), dev_ctx.template Alloc(z)); - } -}; - -// Multiply -template -struct SameDimsMultiplyFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsMultiplyFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - auto blas = phi::funcs::GetBlas(dev_ctx); - blas.VMUL( - x.numel(), x.data(), y.data(), dev_ctx.template Alloc(z)); - } -}; - -template -struct SameDimsMultiplyFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - auto eigen_x = phi::EigenVector::Flatten(x); - auto eigen_y = phi::EigenVector::Flatten(y); - auto eigen_z = phi::EigenVector::Flatten(*z); - auto& place = *dev_ctx.eigen_device(); - eigen_z.device(place) = eigen_x * eigen_y; - } -}; - -template -struct SameDimsElementwiseCompute { - void operator()(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - Functor()(dev_ctx, x, y, z); - } -}; - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/flatten2_kernel_impl.h b/backends/metax_gpu/kernels/impl/flatten2_kernel_impl.h deleted file mode 100644 index dc4059a722..0000000000 --- a/backends/metax_gpu/kernels/impl/flatten2_kernel_impl.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include - -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/flatten_grad_kernel.h" -#include "paddle/phi/kernels/flatten_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/flatten2_utils.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace phi { - -template -void Flatten2Kernel(const Context &dev_ctx, - const DenseTensor &x, - int axis, - DenseTensor *out, - DenseTensor *x_shape) { - auto &axes = axis; - - auto *in = &x; - auto x_dims = in->dims(); - - auto out_dims = common::make_ddim(phi::funcs::GetOutputShape(axes, x_dims)); - - dev_ctx.Alloc(out, x.dtype()); - phi::Copy(dev_ctx, *in, dev_ctx.GetPlace(), false, out); - out->Resize(out_dims); -} - -template -void Flatten2GradKernel(const Context &dev_ctx, - const DenseTensor &x, - const DenseTensor &x_shape, - const DenseTensor &out_grad, - int axis, - DenseTensor *x_grad) { - auto *d_x = x_grad; - auto *d_out = &out_grad; - - auto xshape_dims = x_shape.dims(); - auto x_dims = common::slice_ddim(xshape_dims, 1, xshape_dims.size()); - - dev_ctx.Alloc(x_grad, out_grad.dtype()); - phi::Copy(dev_ctx, *d_out, dev_ctx.GetPlace(), false, d_x); - d_x->Resize(x_dims); -} -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/gru_unit_kernel_impl.h b/backends/metax_gpu/kernels/impl/gru_unit_kernel_impl.h deleted file mode 100644 index ef12141f91..0000000000 --- a/backends/metax_gpu/kernels/impl/gru_unit_kernel_impl.h +++ /dev/null @@ -1,340 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/activation_functor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/utils/optional.h" -namespace phi { - -enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 }; - -template -void ActCompute( - const int act_type, const Device& d, X x, Y y, phi::Place place) { - if (act_type == identity) { - y.device(d) = x; - } else if (act_type == sigmoid) { - phi::funcs::SigmoidFunctor()(d, x, y); - } else if (act_type == tanh) { - phi::funcs::TanhFunctor()(d, x, y); - } else if (act_type == relu) { - if (place == phi::CPUPlace()) - phi::funcs::ReluCPUFunctor()(d, x, y); - else - phi::funcs::ReluCUDAFunctor()(d, x, y); - } else { - PADDLE_THROW(common::errors::Unimplemented( - "Unsupported activation type, only supports identity, sigmoid, tanh " - "and relu.")); - } -} - -#define ACT_COMPUTE ActCompute - -template -void GRUUnitKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& hidden_prev, - const DenseTensor& weight, - const paddle::optional& bias, - int activation, - int gate_activation, - bool origin_mode, - DenseTensor* gate, - DenseTensor* reset_hidden_prev, - DenseTensor* hidden) { - auto* input_p = &input; - auto* hidden_prev_p = &hidden_prev; - - dev_ctx.template Alloc(gate); - dev_ctx.template Alloc(reset_hidden_prev); - dev_ctx.template Alloc(hidden); - - int batch_size = input_p->dims()[0]; - int frame_size = hidden_prev_p->dims()[1]; - - auto x = phi::EigenMatrix::From(input); - auto h_p = phi::EigenMatrix::From(hidden_prev); - auto g = phi::EigenMatrix::From(*gate); - auto r_h_p = phi::EigenMatrix::From(*reset_hidden_prev); - auto h = phi::EigenMatrix::From(*hidden); - auto& place = *dev_ctx.eigen_device(); - - // calculate unactivated gate outputs - if (bias) { - auto b = phi::EigenMatrix::From(bias.get()); - g.device(place) = - x + b.reshape(Eigen::array({{1, frame_size * 3}})) - .broadcast(Eigen::array({{batch_size, 1}})); - } else { - g.device(place) = x; - } - const T* hidden_prev_data = hidden_prev.data(); - const T* weight_data = weight.data(); - T* gate_data = gate->data(); - T* reset_hidden_prev_data = reset_hidden_prev->data(); - auto blas = phi::funcs::GetBlas(dev_ctx); - blas.GEMM(false, - false, - batch_size, - 2 * frame_size, - frame_size, - 1, - hidden_prev_data, - frame_size, - weight_data, - frame_size * 2, - 1, - gate_data, - frame_size * 3); - - // calculate activated gate - Eigen::array extents{{batch_size, frame_size}}; - Eigen::array u_offsets{{0, 0}}; - ACT_COMPUTE(gate_activation, - place, - g.slice(u_offsets, extents), - g.slice(u_offsets, extents), - dev_ctx.GetPlace()); - auto u = g.slice(u_offsets, extents); // update gate - Eigen::array r_offsets{{0, frame_size}}; - ACT_COMPUTE(gate_activation, - place, - g.slice(r_offsets, extents), - g.slice(r_offsets, extents), - dev_ctx.GetPlace()); - auto r = g.slice(r_offsets, extents); // reset gate - r_h_p.device(place) = r * h_p; // reset previous hidden state - blas.GEMM(false, - false, - batch_size, - frame_size, - frame_size, - 1, - reset_hidden_prev_data, - frame_size, - weight_data + frame_size * frame_size * 2, - frame_size, - 1, - gate_data + frame_size * 2, - frame_size * 3); - - Eigen::array c_offsets{{0, frame_size * 2}}; - ACT_COMPUTE(activation, - place, - g.slice(c_offsets, extents), - g.slice(c_offsets, extents), - dev_ctx.GetPlace()); - auto c = g.slice(c_offsets, extents); // output candidate - - // calculate final output - if (origin_mode) { - h.device(place) = c + u * (h_p - c); // (1 - u) * c + u * h_p - } else { - h.device(place) = u * (c - h_p) + h_p; // u * c + (1 - u) * h_p - } -} - -template -void ActGradCompute( - const int act_type, const Device& d, X x, Y y, DX dx, DY dy) { - // x is dummy and won't be used even in Relu(use y instead) - if (act_type == identity) - dx.device(d) = dy; - else if (act_type == sigmoid) - phi::funcs::SigmoidGradFunctor()(d, x, y, dy, dx); - else if (act_type == tanh) - phi::funcs::TanhGradFunctor()(d, x, y, dy, dx); - else if (act_type == relu) - phi::funcs::ReluGradFunctor()(d, x, y, dy, dx); - else - PADDLE_THROW(common::errors::Unimplemented( - "Unsupported activation type, only supports identity, sigmoid, tanh " - "and relu.")); -} - -#define ACT_GRAD_COMPUTE ActGradCompute - -template -void GRUUnitGradKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& hidden_prev, - const DenseTensor& weight, - const paddle::optional& bias, - const DenseTensor& gate, - const DenseTensor& reset_hidden_prev, - const DenseTensor& hidden_grad, - int activation, - int gate_activation, - bool origin_mode, - DenseTensor* input_grad, - DenseTensor* hidden_prev_grad, - DenseTensor* weight_grad, - DenseTensor* bias_grad) { - phi::DenseTensor gate_grad; - phi::DenseTensor reset_hidden_prev_grad; - - const T* hidden_prev_data = hidden_prev.data(); - const T* weight_data = weight.data(); - gate_grad.Resize(input.dims()); - T* gate_grad_data = dev_ctx.template Alloc(&gate_grad); - const T* reset_hidden_prev_data = reset_hidden_prev.data(); - reset_hidden_prev_grad.Resize(reset_hidden_prev.dims()); - T* reset_hidden_prev_grad_data = - dev_ctx.template Alloc(&reset_hidden_prev_grad); - - auto h_p = phi::EigenMatrix::From(hidden_prev); - auto g = phi::EigenMatrix::From(gate); - auto d_h = phi::EigenMatrix::From(hidden_grad); - auto d_g = phi::EigenMatrix::From(gate_grad); - auto d_r_h_p = phi::EigenMatrix::From(reset_hidden_prev_grad); - auto& place = *dev_ctx.eigen_device(); - - int batch_size = input.dims()[0]; - int frame_size = hidden_prev.dims()[1]; - - Eigen::array extents{{batch_size, frame_size}}; - Eigen::array u_offsets{{0, 0}}; - auto u = g.slice(u_offsets, extents); // update gate - Eigen::array r_offsets{{0, frame_size}}; - auto r = g.slice(r_offsets, extents); // reset gate - Eigen::array c_offsets{{0, frame_size * 2}}; - auto c = g.slice(c_offsets, extents); // output candidate - - // backward for unactivated update gate - if (origin_mode) { - ACT_GRAD_COMPUTE(gate_activation, - place, - u, - u, - d_g.slice(u_offsets, extents), - d_h * (h_p - c)); - // backward for unactivated output candidate - ACT_GRAD_COMPUTE( - activation, place, c, c, d_g.slice(c_offsets, extents), d_h * (1 - u)); - } else { - ACT_GRAD_COMPUTE(gate_activation, - place, - u, - u, - d_g.slice(u_offsets, extents), - d_h * (c - h_p)); - // backward for unactivated output candidate - ACT_GRAD_COMPUTE( - activation, place, c, c, d_g.slice(c_offsets, extents), d_h * u); - } - // backward for reset_hidden_prev - auto blas = phi::funcs::GetBlas(dev_ctx); - blas.GEMM(false, - true, - batch_size, - frame_size, - frame_size, - 1, - gate_grad_data + frame_size * 2, - frame_size * 3, - weight_data + frame_size * frame_size * 2, - frame_size, - 0, - reset_hidden_prev_grad_data, - frame_size); - // backward for unactivated reset gate - ACT_GRAD_COMPUTE(gate_activation, - place, - r, - r, - d_g.slice(r_offsets, extents), - d_r_h_p * h_p); - // backward for weight - if (weight_grad) { - T* weight_grad_data = dev_ctx.template Alloc(weight_grad); - // backward for state_weight - blas.GEMM(true, - false, - frame_size, - frame_size, - batch_size, - 1, - reset_hidden_prev_data, - frame_size, - gate_grad_data + frame_size * 2, - frame_size * 3, - 0, - weight_grad_data + frame_size * frame_size * 2, - frame_size); - - // backward for update_gate_weight and reset_gate_weight - blas.GEMM(true, - false, - frame_size, - frame_size * 2, - batch_size, - 1, - hidden_prev_data, - frame_size, - gate_grad_data, - frame_size * 3, - 0, - weight_grad_data, - frame_size * 2); - } - // backward for hidden_prev - if (hidden_prev_grad) { - T* hidden_prev_grad_data = dev_ctx.template Alloc(hidden_prev_grad); - auto d_h_p = phi::EigenMatrix::From(*hidden_prev_grad); - if (origin_mode) { - d_h_p.device(place) = d_r_h_p * r + d_h * u; - } else { - d_h_p.device(place) = d_r_h_p * r + d_h * (1 - u); - } - blas.GEMM(false, - true, - batch_size, - frame_size, - frame_size * 2, - 1, - gate_grad_data, - frame_size * 3, - weight_data, - frame_size * 2, - 1, - hidden_prev_grad_data, - frame_size); - } - // backward for input - if (input_grad) { - dev_ctx.template Alloc(input_grad); - auto d_x = phi::EigenMatrix::From(*input_grad); - d_x.device(place) = d_g; - } - // backward for bias - if (bias_grad) { - dev_ctx.template Alloc(bias_grad); - auto d_b = phi::EigenVector::Flatten(*bias_grad); - d_b.device(place) = d_g.sum(Eigen::array({{0}})); - } -} -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/index_select_impl.h b/backends/metax_gpu/kernels/impl/index_select_impl.h deleted file mode 100644 index ac39cab270..0000000000 --- a/backends/metax_gpu/kernels/impl/index_select_impl.h +++ /dev/null @@ -1,187 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "glog/logging.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace phi { - -template -struct IndexSelectAdd { - void operator()(const Context& dev_ctx UNUSED, - int slice_size, - const T* src_pointer, - const T* p_pointer, - T* dist_pointer) { - for (int i = 0; i < slice_size; i++) { - dist_pointer[i] = src_pointer[i] + p_pointer[i]; - } - } -}; - -template -struct IndexSelectAdd< - Context, - T, - typename std::enable_if::value>::type> { - void operator()(const Context& dev_ctx, - int slice_size, - const T* src_pointer, - const T* p_pointer, - T* dist_pointer) { - auto blas = phi::funcs::GetBlas(dev_ctx); - blas.VADD(slice_size, src_pointer, p_pointer, dist_pointer); - } -}; - -template -void IndexSelectInner(const Context& dev_ctx, - DenseTensor* input, - const DenseTensor& index, - DenseTensor* output, - int dim) { - auto input_dim = input->dims(); - auto input_dim_size = input_dim.size(); - auto output_dim = output->dims(); - auto index_size = index.dims()[0]; - - DenseTensor index_cpu_copy; - if (index.place().GetType() != phi::AllocationType::CPU) { - phi::Copy(dev_ctx, index, phi::CPUPlace(), true, &index_cpu_copy); - } - const IndexT* index_data = index.place().GetType() == phi::AllocationType::CPU - ? index.data() - : index_cpu_copy.data(); - dev_ctx.template Alloc(output); - - auto slice_size = 1; - for (auto i = dim + 1; i < input_dim_size; i++) { - slice_size *= input_dim[i]; - } - - auto outer_nums = 1; - for (auto i = 0; i < dim; i++) { - outer_nums *= input_dim[i]; - } - - for (int i = 0; i < index_size; i++) { - PADDLE_ENFORCE_GE( - index_data[i], - -input_dim[dim], - common::errors::InvalidArgument( - "Variable value (index) of OP(index_select) " - "expected >= %ld and < %ld, but got %ld. Please check input " - "value.", - -input_dim[dim], - input_dim[dim], - index_data[i])); - PADDLE_ENFORCE_LT( - index_data[i], - input_dim[dim], - common::errors::InvalidArgument( - "Variable value (index) of OP(index_select) " - "expected >= %ld and < %ld, but got %ld. Please check input " - "value.", - -input_dim[dim], - input_dim[dim], - index_data[i])); - } - - VLOG(3) << "Index_Select_Debug; outer_nums: " << outer_nums - << "; slice_size: " << slice_size << "; index_size: " << index_size; - - input->Resize(common::make_ddim({outer_nums, input_dim[dim], slice_size})); - output->Resize(common::make_ddim({outer_nums, index_size, slice_size})); - - auto input_tensor = EigenTensor::From(*input); - auto output_tensor = EigenTensor::From(*output); - - auto& place = *dev_ctx.eigen_device(); - - for (auto j = 0; j < index_size; j++) { - IndexT index_value = index_data[j]; - if (index_value < 0) { - index_value += input_dim[dim]; - } - auto output_t = output_tensor.chip(j, 1); - output_t.device(place) = input_tensor.chip(index_value, 1); - } - input->Resize(input_dim); - output->Resize(output_dim); -} - -template -void IndexSelectGradInner(const Context& dev_ctx, - const DenseTensor& out_grad, - const DenseTensor& index, - DenseTensor* x_grad, - int dim) { - const T* input_data = out_grad.data(); - const IndexT* index_data = index.data(); - - const T* p_output = dev_ctx.template Alloc(x_grad); - T* out_data = dev_ctx.template Alloc(x_grad); - - auto input_dim = out_grad.dims(); - auto input_dim_size = input_dim.size(); - auto output_dim = x_grad->dims(); - - phi::funcs::SetConstant set_constant; - set_constant(dev_ctx, x_grad, static_cast(0.0)); - - auto slice_size = 1; - for (auto i = dim + 1; i < input_dim_size; i++) { - slice_size *= input_dim[i]; - } - - auto input_width = slice_size * input_dim[dim]; - auto output_width = slice_size * output_dim[dim]; - - auto outer_nums = 1; - for (auto i = 0; i < dim; i++) { - outer_nums *= input_dim[i]; - } - - auto index_size = index.dims()[0]; - VLOG(3) << "Index_Select_Grad_Debug; outer_nums: " << outer_nums - << "; slice_size: " << slice_size << "; input_width: " << input_width - << "; output_width: " << output_width - << "; index_size: " << index_size; - - for (auto i = 0; i < outer_nums; i++) { - auto input_start_offset = i * input_width; - auto output_start_offset = i * output_width; - - for (auto j = 0; j < index_size; j++) { - IndexT index_value = index_data[j]; - if (index_value < 0) { - index_value += input_dim[dim]; - } - auto src = input_data + input_start_offset + j * slice_size; - auto p_out = p_output + output_start_offset + index_value * slice_size; - auto dst = out_data + output_start_offset + index_value * slice_size; - IndexSelectAdd index_select_add; - index_select_add(dev_ctx, slice_size, src, p_out, dst); - } - } - x_grad->Resize(output_dim); -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/inverse_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/inverse_grad_kernel_impl.h deleted file mode 100644 index 64b56f2cd1..0000000000 --- a/backends/metax_gpu/kernels/impl/inverse_grad_kernel_impl.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/matrix_inverse.h" -#include "paddle/phi/kernels/inverse_grad_kernel.h" - -namespace phi { - -template -void InverseGradKernel(const Context& dev_ctx, - const DenseTensor& out, - const DenseTensor& out_grad, - DenseTensor* in_grad) { - if (in_grad) { - dev_ctx.template Alloc(in_grad); - if (out_grad.numel() == 0) { - return; - } - auto blas = phi::funcs::GetBlas(dev_ctx); - - DenseTensor tmp_out; - tmp_out.Resize(out.dims()); - dev_ctx.template Alloc(&tmp_out); - - if (IsComplexType(out.dtype())) { - DenseTensor out_conj; - out_conj.Resize(out.dims()); - dev_ctx.template Alloc(&out_conj); - - phi::ConjKernel(dev_ctx, out, &out_conj); - - auto mat_dim_a0 = - phi::funcs::CreateMatrixDescriptor(out_grad.dims(), 0, false); - auto mat_dim_b0 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); - blas.MatMul( - out_grad, mat_dim_a0, out_conj, mat_dim_b0, T(1), &tmp_out, T(0)); - - auto mat_dim_a1 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); - auto mat_dim_b1 = - phi::funcs::CreateMatrixDescriptor(tmp_out.dims(), 0, false); - blas.MatMul( - out_conj, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), in_grad, T(0)); - } else { - auto mat_dim_a0 = - phi::funcs::CreateMatrixDescriptor(out_grad.dims(), 0, false); - auto mat_dim_b0 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); - blas.MatMul(out_grad, mat_dim_a0, out, mat_dim_b0, T(1), &tmp_out, T(0)); - - auto mat_dim_a1 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); - auto mat_dim_b1 = - phi::funcs::CreateMatrixDescriptor(tmp_out.dims(), 0, false); - blas.MatMul(out, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), in_grad, T(0)); - } - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/load_kernel_impl.h b/backends/metax_gpu/kernels/impl/load_kernel_impl.h deleted file mode 100644 index a3358e669b..0000000000 --- a/backends/metax_gpu/kernels/impl/load_kernel_impl.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "kernels/funcs/dense_tensor_serialize.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/cast_kernel.h" - -namespace phi { - -template -void LoadKernel(const Context& dev_ctx, - const std::string& file_path, - int64_t seek, - const std::vector& shape, - bool load_as_fp16, - phi::DenseTensor* out) { - // FIXME(yuyang18): We save variable to local file now, but we should change - // it to save an output stream. - std::ifstream fin(file_path, std::ios::binary); - PADDLE_ENFORCE_EQ( - static_cast(fin), - true, - errors::Unavailable("Load operator fail to open file %s, please check " - "whether the model file is complete or damaged.", - file_path)); - PADDLE_ENFORCE_NOT_NULL( - out, - errors::InvalidArgument("The variable to be loaded cannot be found.")); - - if (seek != -1) { - PADDLE_ENFORCE_GE(seek, - 0, - errors::InvalidArgument( - "seek with tensor must great than or equal to 0")); - phi::DeserializeFromStream(fin, out, dev_ctx, seek, shape); - } else { - phi::DeserializeFromStream(fin, out, dev_ctx); - } - - auto in_dtype = out->dtype(); - auto out_dtype = load_as_fp16 ? phi::DataType::FLOAT16 : in_dtype; - if (in_dtype != out_dtype) { - phi::CastKernel(dev_ctx, *out, out_dtype, out); - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/lstm_kernel_impl.h b/backends/metax_gpu/kernels/impl/lstm_kernel_impl.h deleted file mode 100644 index 4a061fe471..0000000000 --- a/backends/metax_gpu/kernels/impl/lstm_kernel_impl.h +++ /dev/null @@ -1,443 +0,0 @@ -// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/detail/activation_functions.h" -#include "paddle/phi/kernels/funcs/lstm_compute.h" -#include "paddle/phi/kernels/funcs/lstm_utils.h" - -namespace phi { - -template -void LSTMKernel(const Context& dev_ctx, - const DenseTensor& input, - const paddle::optional& h0, - const paddle::optional& c0, - const DenseTensor& weight, - const DenseTensor& bias, - bool use_peepholes, - bool is_reverse, - bool is_test, - const std::string& gate_activation, - const std::string& cell_activation, - const std::string& candidate_activation, - DenseTensor* hidden, - DenseTensor* cell, - DenseTensor* batch_gate, - DenseTensor* batch_cell_pre_act) { - auto* hidden_t0 = h0.get_ptr(); - auto* cell_t0 = c0.get_ptr(); - - phi::DenseTensor* batch_gate_new = nullptr; - phi::DenseTensor batch_gate_temp; - if (is_test) { - batch_gate_new = &batch_gate_temp; - batch_gate_new->Resize(input.dims()); - } else { - batch_gate_new = batch_gate; - } - - dev_ctx.template Alloc(batch_gate_new); - dev_ctx.template Alloc(hidden); - dev_ctx.template Alloc(cell); - - phi::funcs::DenseTensor2BatchFunctor to_batch; - to_batch(dev_ctx, input, batch_gate_new, true, is_reverse); - - auto in_dims = input.dims(); - int frame_size = static_cast(in_dims[1] / 4); - phi::DDim dims({in_dims[0], frame_size}); - - if (bias.initialized()) { - phi::DenseTensor b = bias; - b.Resize({bias.numel(), 1}); - phi::DenseTensor gate_bias = b.Slice(0, 4 * frame_size); - phi::funcs::RowwiseAdd add_bias; - add_bias(dev_ctx, *batch_gate_new, gate_bias, batch_gate_new); - } - - phi::funcs::LstmMetaValue lstm_value; - if (bias.initialized() && use_peepholes) { - T* bias_data = const_cast(bias.data()); - // the code style in LstmMetaValue will be updated later. - - lstm_value.check_ig = bias_data + 4 * frame_size; - lstm_value.check_fg = lstm_value.check_ig + frame_size; - lstm_value.check_og = lstm_value.check_fg + frame_size; - } else { - lstm_value.check_ig = nullptr; - lstm_value.check_fg = nullptr; - lstm_value.check_og = nullptr; - } - lstm_value.prev_state_value = nullptr; - phi::DenseTensor ordered_c0; - - phi::Vector order(batch_gate_new->lod()[2]); - - if (cell_t0) { - // Since the batch computing for LSTM reorders the input sequence - // according to their length. The initialized cell state also needs - // to reorder. - ReorderInitState(dev_ctx, *cell_t0, order, &ordered_c0, true); - lstm_value.prev_state_value = ordered_c0.data(); - } - - // Use the local variable as here. - phi::DenseTensor batch_hidden, batch_cell, batch_cell_pre_act_temp; - phi::DenseTensor* batch_cell_pre_act_p; - if (is_test) { - batch_cell_pre_act_p = &batch_cell_pre_act_temp; - } else { - batch_cell_pre_act_p = batch_cell_pre_act; - } - batch_hidden.Resize(dims); - batch_cell.Resize(dims); - dev_ctx.template Alloc(&batch_hidden); - dev_ctx.template Alloc(&batch_cell); - batch_cell_pre_act_p->Resize(dims); - dev_ctx.template Alloc(batch_cell_pre_act_p); - - auto batch_starts = batch_gate_new->lod()[0]; - size_t num_batch = batch_starts.size() - 1; - auto gate_act = phi::funcs::detail::GetActivationType(gate_activation); - auto cell_act = phi::funcs::detail::GetActivationType(cell_activation); - auto cand_act = phi::funcs::detail::GetActivationType(candidate_activation); - - auto blas = phi::funcs::GetBlas(dev_ctx); - for (size_t n = 0; n < num_batch; n++) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - - phi::DenseTensor gate_t = batch_gate_new->Slice(bstart, bend); - phi::DenseTensor out_t = batch_hidden.Slice(bstart, bend); - phi::DenseTensor cell_t = batch_cell.Slice(bstart, bend); - phi::DenseTensor cell_pre_act_t = batch_cell_pre_act_p->Slice(bstart, bend); - - int cur_batch_size = bend - bstart; - - if (n > 0) { - int pre_h_start = static_cast(batch_starts[n - 1]); - int pre_h_end = pre_h_start + cur_batch_size; - auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); - blas.MatMul(pre_hidden_t, - false, - weight, - false, - static_cast(1.0), - &gate_t, - static_cast(1.0)); - } else if (hidden_t0 != nullptr) { - // If n == 0 and there is no initialized hidden state, that is to say - // the H0 is zeros, the calculation W_h * H0 will be skipped. - // If n == 0 and there is initialized hidden state, calculate W_h * H0. - - // Since the batch computing for LSTM reorders the input sequence - // according to their length. The initialized hidden state also needs - // to reorder. - phi::DenseTensor ordered_h0; - ReorderInitState( - dev_ctx, *hidden_t0, order, &ordered_h0, true); - blas.MatMul(ordered_h0, - false, - weight, - false, - static_cast(1.0), - &gate_t, - static_cast(1.0)); - } - - lstm_value.gate_value = gate_t.data(); - lstm_value.output_value = out_t.data(); - lstm_value.state_value = cell_t.data(); - lstm_value.state_active_value = cell_pre_act_t.data(); - T cell_clip = 0.0; - phi::funcs::LstmUnitFunctor::compute(dev_ctx, - lstm_value, - frame_size, - cur_batch_size, - cell_clip, - gate_act, - cell_act, - cand_act); - lstm_value.prev_state_value = lstm_value.state_value; - } - - phi::funcs::Batch2DenseTensorFunctor to_seq; - batch_hidden.set_lod(batch_gate_new->lod()); - // restore the output hidden in phi::DenseTensor from the batch hidden - to_seq(dev_ctx, batch_hidden, hidden); - - batch_cell.set_lod(batch_gate_new->lod()); - // restore the output cell state in phi::DenseTensor from the batch cell - to_seq(dev_ctx, batch_cell, cell); -} - -template -void LSTMGradKernel(const Context& dev_ctx, - const DenseTensor& input_in, - const paddle::optional& h0_in, - const paddle::optional& c0_in, - const DenseTensor& weight_in, - const DenseTensor& bias_in, - const DenseTensor& hidden_in, - const DenseTensor& cell_in, - const DenseTensor& batch_gate_in, - const DenseTensor& batch_cell_pre_act_in, - const DenseTensor& hidden_grad, - bool use_peepholes, - bool is_reverse, - bool is_test, - const std::string& gate_activation, - const std::string& cell_activation, - const std::string& candidate_activation, - DenseTensor* input_grad, - DenseTensor* h0_grad, - DenseTensor* c0_grad, - DenseTensor* weight_grad, - DenseTensor* bias_grad) { - auto* input = &input_in; - auto* weight = &weight_in; - auto* bias = &bias_in; - - auto* hidden_out = &hidden_in; - auto* cell_out = &cell_in; - - auto* batch_gate = &batch_gate_in; - auto* batch_cell_pre_act = &batch_cell_pre_act_in; - - auto* hidden_g = &hidden_grad; - - auto* in_g = input_grad; - auto* weight_g = weight_grad; - auto* bias_g = bias_grad; - - auto* h0 = h0_in.get_ptr(); - auto* c0 = c0_in.get_ptr(); - - auto* h0_g = h0_grad; - auto* c0_g = c0_grad; - - phi::funcs::SetConstant zero; - if (weight_g) { - dev_ctx.template Alloc(weight_g); - zero(dev_ctx, weight_g, static_cast(0.0)); - } - - // ordered_h0/c0 is the reordered hidden/cell initialization. - // ordered_h0_g/c0_g is the reordered gradient of hidden/cell - // initialization. - phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; - phi::Vector order(batch_gate->lod()[2]); - - if (c0) { - ReorderInitState(dev_ctx, *c0, order, &ordered_c0, true); - } - if (c0 && c0_g) { - ordered_c0_g.Resize(c0_g->dims()); - dev_ctx.template Alloc(&ordered_c0_g); - } - - auto in_dims = input->dims(); - auto out_dims = hidden_g->dims(); - int frame_size = static_cast(in_dims[1] / 4); - PADDLE_ENFORCE_EQ(frame_size, - out_dims[1], - common::errors::InvalidArgument( - "The second dimension of Input(hidden_grad) should be " - "%d, but received %d in LSTM@Grad operator.", - frame_size, - out_dims[1])); - - phi::funcs::LstmMetaValue lstm_value; - if (bias && use_peepholes) { - T* bias_data = const_cast(bias->data()); - lstm_value.check_ig = bias_data + 4 * frame_size; - lstm_value.check_fg = lstm_value.check_ig + frame_size; - lstm_value.check_og = lstm_value.check_fg + frame_size; - } else { - lstm_value.check_ig = nullptr; - lstm_value.check_fg = nullptr; - lstm_value.check_og = nullptr; - } - - phi::funcs::LstmMetaGrad lstm_grad; - - if (bias && bias_g) { - dev_ctx.template Alloc(bias_g); - zero(dev_ctx, bias_g, static_cast(0.0)); - } - if (bias && bias_g && use_peepholes) { - T* bias_g_data = bias_g->data(); - lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size; - lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size; - lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size; - } else { - lstm_grad.check_ig_grad = nullptr; - lstm_grad.check_fg_grad = nullptr; - lstm_grad.check_og_grad = nullptr; - } - - phi::funcs::DenseTensor2BatchFunctor to_batch; - - auto ToBatch = [&batch_gate, &to_batch](const Context& ctx, - const phi::DenseTensor& src, - const phi::DDim& dims, - phi::DenseTensor& dst) { - dst.Resize(dims); - ctx.template Alloc(&dst); - dst.set_lod(batch_gate->lod()); - to_batch(ctx, src, &dst, false); - }; - - phi::DenseTensor batch_hidden, batch_hidden_g, batch_cell; - ToBatch(dev_ctx, *hidden_out, out_dims, batch_hidden); - ToBatch(dev_ctx, *hidden_g, out_dims, batch_hidden_g); - ToBatch(dev_ctx, *cell_out, out_dims, batch_cell); - - phi::DenseTensor batch_cell_g, batch_gate_g; - batch_cell_g.Resize(out_dims); - dev_ctx.template Alloc(&batch_cell_g); - // TODO(qingqing) support the case output cell has gradient. - // to_batch(dev_ctx, *cell_g, batch_cell_g, false); - zero(dev_ctx, &batch_cell_g, static_cast(0.0)); - batch_gate_g.Resize(batch_gate->dims()); - dev_ctx.template Alloc(&batch_gate_g); - batch_gate_g.set_lod(batch_gate->lod()); - - auto gate_act = phi::funcs::detail::GetActivationType(gate_activation); - auto cell_act = phi::funcs::detail::GetActivationType(cell_activation); - auto cand_act = phi::funcs::detail::GetActivationType(candidate_activation); - - auto batch_starts = batch_gate->lod()[0]; - size_t num_batch = batch_starts.size() - 1; - auto blas = phi::funcs::GetBlas(dev_ctx); - for (int n = static_cast(num_batch) - 1; n >= 0; n--) { - int bstart = static_cast(batch_starts[n]); - int bend = static_cast(batch_starts[n + 1]); - - phi::DenseTensor gate = batch_gate->Slice(bstart, bend); - phi::DenseTensor cell = batch_cell.Slice(bstart, bend); - phi::DenseTensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); - lstm_value.gate_value = gate.data(); - lstm_value.state_value = cell.data(); - lstm_value.state_active_value = cell_pre_act.data(); - - phi::DenseTensor out_g = batch_hidden_g.Slice(bstart, bend); - phi::DenseTensor gate_g = batch_gate_g.Slice(bstart, bend); - phi::DenseTensor cell_g = batch_cell_g.Slice(bstart, bend); - lstm_grad.state_grad = cell_g.data(); - lstm_grad.gate_grad = gate_g.data(); - lstm_grad.output_grad = out_g.data(); - - if (n > 0) { - int bstart_pre = static_cast(batch_starts[n - 1]); - phi::DenseTensor cell_pre = batch_cell.Slice(bstart_pre, bstart); - phi::DenseTensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); - lstm_value.prev_state_value = cell_pre.data(); - lstm_grad.prev_state_grad = cell_pre_g.data(); - } else { - lstm_value.prev_state_value = c0 ? ordered_c0.data() : nullptr; - lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data() : nullptr; - } - - // lstm_value.output_value not used in bp, set to nullptr - // lstm_grad.state_active_grad not used in bp, set to nullptr - lstm_value.output_value = nullptr; - lstm_grad.state_active_grad = nullptr; - int cur_batch_size = bend - bstart; - T cell_clip = 0.0; - phi::funcs::LstmUnitGradFunctor::compute(dev_ctx, - lstm_value, - lstm_grad, - frame_size, - cur_batch_size, - cell_clip, - gate_act, - cell_act, - cand_act); - - if (n > 0) { - int pre_h_start = static_cast(batch_starts[n - 1]); - int pre_h_end = pre_h_start + cur_batch_size; - auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end); - blas.MatMul(gate_g, - false, - *weight, - true, - static_cast(1.0), - &pre_hidden_g, - static_cast(1.0)); - if (weight_g) { - /* backward weight */ - auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end); - blas.MatMul(pre_hidden, - true, - gate_g, - false, - static_cast(1.0), - weight_g, - static_cast(1.0)); - } - } else { - if (h0 && weight_g) { - ReorderInitState(dev_ctx, *h0, order, &ordered_h0, true); - blas.MatMul(ordered_h0, - true, - gate_g, - false, - static_cast(1.0), - weight_g, - static_cast(1.0)); - } - if (h0 && h0_g) { - ordered_h0_g.Resize(h0_g->dims()); - dev_ctx.template Alloc(&ordered_h0_g); - blas.MatMul(gate_g, - false, - *weight, - true, - static_cast(1.0), - &ordered_h0_g, - static_cast(0.0)); - } - } - } - - phi::funcs::Batch2DenseTensorFunctor to_seq; - if (in_g) { - /* backward data */ - dev_ctx.template Alloc(in_g); - to_seq(dev_ctx, batch_gate_g, in_g); - } - if (bias && bias_g) { - /* backward bias */ - phi::DenseTensor b_g = *bias_g; - b_g.Resize({bias_g->numel(), 1}); - phi::DenseTensor gate_bias_g = b_g.Slice(0, 4 * frame_size); - phi::funcs::ColwiseSum col_sum; - col_sum(dev_ctx, batch_gate_g, &gate_bias_g); - } - - if (h0 && h0_g) { - ReorderInitState(dev_ctx, ordered_h0_g, order, h0_g, false); - } - if (c0 && c0_g) { - ReorderInitState(dev_ctx, ordered_c0_g, order, c0_g, false); - } -} -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/lu_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/lu_grad_kernel_impl.h deleted file mode 100644 index 5a2e5d48a1..0000000000 --- a/backends/metax_gpu/kernels/impl/lu_grad_kernel_impl.h +++ /dev/null @@ -1,308 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/impl/lu_kernel_impl.h" -#include "paddle/phi/kernels/triangular_solve_kernel.h" - -namespace phi { - -template -void LUGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out, - const DenseTensor& pivots, - const DenseTensor& out_grad, - bool pivot UNUSED, - DenseTensor* x_grad) { - dev_ctx.template Alloc(x_grad); - - auto blas = phi::funcs::GetBlas(dev_ctx); - - auto xdims = x.dims(); - int xrank = xdims.size(); - int64_t m = xdims[xrank - 2]; - int64_t n = xdims[xrank - 1]; - int64_t k = std::min(m, n); - - DenseTensor L, U, L_narrow, U_narrow, L_narrow_mH, U_narrow_mH, grad_narrow; - LU_Unpack(dev_ctx, &out, &L, &U); - - Tensor_narrow(dev_ctx, &L, &L_narrow, 0, k, 0, k); - Tensor_narrow(dev_ctx, &U, &U_narrow, 0, k, 0, k); - Tensor_narrow(dev_ctx, &out_grad, &grad_narrow, 0, k, 0, k); - auto graddims = grad_narrow.dims(); - - Tensor_Conj(dev_ctx, L_narrow, &L_narrow_mH); - Tensor_Conj(dev_ctx, U_narrow, &U_narrow_mH); - L_narrow_mH = Transpose2DTo6D(dev_ctx, L_narrow_mH); - U_narrow_mH = Transpose2DTo6D(dev_ctx, U_narrow_mH); - - auto LmHdims = L_narrow_mH.dims(); - auto UmHdims = U_narrow_mH.dims(); - - DenseTensor phi_L, phi_U, phi, psi; - phi_L.Resize(LmHdims); - dev_ctx.template Alloc(&phi_L); - phi_U.Resize(UmHdims); - dev_ctx.template Alloc(&phi_U); - auto mat_dim_l = phi::funcs::CreateMatrixDescriptor(LmHdims, 0, false); - auto mat_dim_u = phi::funcs::CreateMatrixDescriptor(UmHdims, 0, false); - auto mat_dim_g = phi::funcs::CreateMatrixDescriptor(graddims, 0, false); - blas.MatMul(L_narrow_mH, - mat_dim_l, - grad_narrow, - mat_dim_g, - static_cast(1), - &phi_L, - static_cast(0)); - - blas.MatMul(grad_narrow, - mat_dim_g, - U_narrow_mH, - mat_dim_u, - static_cast(1), - &phi_U, - static_cast(0)); - - auto phil_rank = LmHdims.size(); - auto phiu_rank = UmHdims.size(); - phi::funcs::ForRange l_for_range(dev_ctx, phi_L.numel()); - phi::funcs::TrilTriuCompute tril_computer(phi_L.data(), - -1, - true, - LmHdims[phil_rank - 2], - LmHdims[phil_rank - 1], - phi_L.data()); - l_for_range(tril_computer); - - phi::funcs::ForRange u_for_range(dev_ctx, phi_U.numel()); - phi::funcs::TrilTriuCompute triu_computer(phi_U.data(), - 0, - false, - UmHdims[phiu_rank - 2], - UmHdims[phiu_rank - 1], - phi_U.data()); - u_for_range(triu_computer); - - Tensor_Add(dev_ctx, phi_L, phi_U, &phi); - psi.Resize(xdims); - dev_ctx.template Alloc(&psi); - phi::funcs::SetConstant setter; - setter(dev_ctx, &psi, static_cast(0)); - - std::vector axes = {xrank - 2, xrank - 1}; - std::vector slice_starts(2, 0); - std::vector slice_ends(2, 0); - auto valuedims = common::vectorize(xdims); - - DenseTensor Pmat; - Unpack_Pivot(dev_ctx, pivots, &Pmat, m, k); - - if (m <= n) { - if (k < n) { - DenseTensor U_complement, U_grad_complement, phi_complement, - phi_complement_l; - Tensor_narrow(dev_ctx, &U, &U_complement, 0, k, k, n); - Tensor_narrow( - dev_ctx, &out_grad, &U_grad_complement, 0, k, k, n); - DenseTensor U_complement_mH = - Transpose2DTo6D(dev_ctx, U_complement); - - Tensor_Conj(dev_ctx, U_complement_mH, &U_complement_mH); - - auto mat_dim_g = phi::funcs::CreateMatrixDescriptor( - U_grad_complement.dims(), 0, false); - auto mat_dim_u = - phi::funcs::CreateMatrixDescriptor(U_complement_mH.dims(), 0, false); - auto phidims = UmHdims; - phidims[UmHdims.size() - 2] = k; - phidims[UmHdims.size() - 1] = k; - phi_complement.Resize(phidims); - dev_ctx.template Alloc(&phi_complement); - blas.MatMul(U_grad_complement, - mat_dim_g, - U_complement_mH, - mat_dim_u, - static_cast(1), - &phi_complement, - static_cast(0)); - - phi_complement_l.Resize(phidims); - dev_ctx.template Alloc(&phi_complement_l); - const auto H = phidims[phidims.size() - 2]; - const auto W = phidims[phidims.size() - 1]; - phi::funcs::ForRange x_for_range(dev_ctx, - phi_complement.numel()); - phi::funcs::TrilTriuCompute tril_computer( - phi_complement.data(), -1, true, H, W, phi_complement_l.data()); - x_for_range(tril_computer); - - Tensor_Sub(dev_ctx, phi, phi_complement_l, &phi); - - slice_starts[0] = 0; - slice_starts[1] = k; - slice_ends[0] = k; - slice_ends[1] = n; - valuedims[xrank - 2] = k; - valuedims[xrank - 1] = n - k; - SetValueCompute_dispatch(dev_ctx, - &psi, - &U_grad_complement, - &psi, - axes, - &slice_starts, - &slice_ends, - valuedims, - xrank); - } - - DenseTensor psi_principal, phi_mH, psi_tmp; - Tensor_Conj(dev_ctx, phi, &phi_mH); - phi_mH = Transpose2DTo6D(dev_ctx, phi_mH); - - phi::TriangularSolveKernel( - dev_ctx, U_narrow, phi_mH, true, false, false, &psi_principal); - - Tensor_Conj(dev_ctx, psi_principal, &psi_principal); - psi_principal = Transpose2DTo6D(dev_ctx, psi_principal); - slice_starts[0] = 0; - slice_starts[1] = 0; - slice_ends[0] = k; - slice_ends[1] = k; - valuedims[xrank - 2] = k; - valuedims[xrank - 1] = k; - - SetValueCompute_dispatch(dev_ctx, - &psi, - &psi_principal, - &psi, - axes, - &slice_starts, - &slice_ends, - valuedims, - xrank); - - phi::TriangularSolveKernel( - dev_ctx, L_narrow_mH, psi, true, false, true, &psi_tmp); - - auto mat_dim_p = phi::funcs::CreateMatrixDescriptor(Pmat.dims(), 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(psi_tmp.dims(), 0, false); - blas.MatMul(Pmat, - mat_dim_p, - psi_tmp, - mat_dim_b, - static_cast(1), - x_grad, - static_cast(0)); - } else { - DenseTensor L_complement, L_grad_complement, phi_complement, - phi_complement_u; - Tensor_narrow(dev_ctx, &L, &L_complement, k, m, 0, k); - Tensor_narrow( - dev_ctx, &out_grad, &L_grad_complement, k, m, 0, k); - DenseTensor L_complement_mH = - Transpose2DTo6D(dev_ctx, L_complement); - Tensor_Conj(dev_ctx, L_complement_mH, &L_complement_mH); - - auto mat_dim_g = - phi::funcs::CreateMatrixDescriptor(L_grad_complement.dims(), 0, false); - auto mat_dim_u = - phi::funcs::CreateMatrixDescriptor(L_complement_mH.dims(), 0, false); - auto phidims = LmHdims; - phidims[LmHdims.size() - 2] = k; - phidims[LmHdims.size() - 1] = k; - phi_complement.Resize(phidims); - dev_ctx.template Alloc(&phi_complement); - blas.MatMul(L_complement_mH, - mat_dim_u, - L_grad_complement, - mat_dim_g, - static_cast(1), - &phi_complement, - static_cast(0)); - - phi_complement_u.Resize(phidims); - dev_ctx.template Alloc(&phi_complement_u); - const auto H = phidims[phidims.size() - 2]; - const auto W = phidims[phidims.size() - 1]; - phi::funcs::ForRange x_for_range(dev_ctx, phi_complement.numel()); - phi::funcs::TrilTriuCompute triu_computer( - phi_complement.data(), 0, false, H, W, phi_complement_u.data()); - x_for_range(triu_computer); - - Tensor_Sub(dev_ctx, phi, phi_complement_u, &phi); - - slice_starts[0] = k; - slice_starts[1] = 0; - slice_ends[0] = m; - slice_ends[1] = k; - valuedims[xrank - 2] = m - k; - valuedims[xrank - 1] = k; - SetValueCompute_dispatch(dev_ctx, - &psi, - &L_grad_complement, - &psi, - axes, - &slice_starts, - &slice_ends, - valuedims, - xrank); - DenseTensor psi_principal, phi_mH, psi_tmp, U_narrow_mH; - - phi::TriangularSolveKernel( - dev_ctx, L_narrow_mH, phi, true, false, true, &psi_principal); - - slice_starts[0] = 0; - slice_starts[1] = 0; - slice_ends[0] = k; - slice_ends[1] = k; - valuedims[xrank - 2] = k; - valuedims[xrank - 1] = k; - - SetValueCompute_dispatch(dev_ctx, - &psi, - &psi_principal, - &psi, - axes, - &slice_starts, - &slice_ends, - valuedims, - xrank); - - psi_tmp.Resize(psi.dims()); - dev_ctx.template Alloc(&psi_tmp); - auto mat_dim_p = phi::funcs::CreateMatrixDescriptor(Pmat.dims(), 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(psi.dims(), 0, false); - blas.MatMul(Pmat, - mat_dim_p, - psi, - mat_dim_b, - static_cast(1), - &psi_tmp, - static_cast(0)); - psi_tmp = Transpose2DTo6D(dev_ctx, psi_tmp); - - Tensor_Conj(dev_ctx, U_narrow, &U_narrow_mH); - phi::TriangularSolveKernel( - dev_ctx, U_narrow_mH, psi_tmp, true, false, false, &psi); - *x_grad = Transpose2DTo6D(dev_ctx, psi); - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/lu_solve_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/lu_solve_grad_kernel_impl.h deleted file mode 100644 index 24dee650df..0000000000 --- a/backends/metax_gpu/kernels/impl/lu_solve_grad_kernel_impl.h +++ /dev/null @@ -1,224 +0,0 @@ -// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/infermeta/binary.h" -// #include "paddle/phi/paddle/phi/kernels/funcs/blas/blas.h" - -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/matrix_solve.h" -#include "paddle/phi/kernels/impl/lu_kernel_impl.h" -#include "paddle/phi/kernels/lu_solve_grad_kernel.h" -#include "paddle/phi/kernels/lu_solve_kernel.h" -#include "paddle/phi/kernels/lu_unpack_kernel.h" -#include "paddle/phi/kernels/triangular_solve_kernel.h" - -namespace phi { - -template -DenseTensor GetMH(const Context& dev_ctx, const DenseTensor x) { - DenseTensor x_mH; - phi::Tensor_Conj(dev_ctx, x, &x_mH); - return phi::Transpose2DTo6D(dev_ctx, x_mH); -} - -template -void LuSolveGradKernel(const Context& dev_ctx, - const DenseTensor& b, - const DenseTensor& lu, - const DenseTensor& pivots, - const DenseTensor& out, - const DenseTensor& out_grad, - const std::string& trans, - DenseTensor* b_grad, - DenseTensor* lu_grad) { - if (b_grad != nullptr) { - dev_ctx.template Alloc(b_grad); - std::string trans_t = (trans == "N") ? "T" : "N"; - phi::LuSolveKernel( - dev_ctx, out_grad, lu, pivots, trans_t, b_grad); - } - - if (lu_grad != nullptr) { - dev_ctx.template Alloc(lu_grad); - - DenseTensor p, l, u, l_mH, u_mH; - MetaTensor meta_p(&p); - MetaTensor meta_l(&l); - MetaTensor meta_u(&u); - bool unpack_pivots = (trans == "N") ? false : true; - LUUnpackInferMeta( - lu, pivots, true, unpack_pivots, &meta_p, &meta_l, &meta_u); - phi::LUUnpackKernel( - dev_ctx, lu, pivots, true, unpack_pivots, &p, &l, &u); - l_mH = GetMH(dev_ctx, l); - u_mH = GetMH(dev_ctx, u); - if (trans == "N") { - // gR = U^{-H}op_2(-gX)op_2(X)^Ha - DenseTensor gR, psi_tmp, out_mH; - out_mH = GetMH(dev_ctx, out); - - auto blas = phi::funcs::GetBlas(dev_ctx); - auto out_grad_dims = out_grad.dims(); - auto mat_dim_l = - phi::funcs::CreateMatrixDescriptor(out_grad_dims, 0, false); - auto out_mH_dims = out_mH.dims(); - auto mat_dim_g = - phi::funcs::CreateMatrixDescriptor(out_mH_dims, 0, false); - psi_tmp.Resize(lu.dims()); - dev_ctx.template Alloc(&psi_tmp); - blas.MatMul(out_grad, - mat_dim_l, - out_mH, - mat_dim_g, - static_cast(-1), - &psi_tmp, - static_cast(0)); - phi::TriangularSolveKernel( - dev_ctx, u_mH, psi_tmp, false, false, false, &gR); - - // gL = (L^{-H} gR U^H).tril(-1) - DenseTensor mul_tmp, gL; - auto gr_dims = gR.dims(); - auto mat_dim_r = phi::funcs::CreateMatrixDescriptor(gr_dims, 0, false); - auto gu_dims = u_mH.dims(); - auto mat_dim_u = phi::funcs::CreateMatrixDescriptor(gu_dims, 0, false); - mul_tmp.Resize(gr_dims); - dev_ctx.template Alloc(&mul_tmp); - blas.MatMul(gR, - mat_dim_r, - u_mH, - mat_dim_u, - static_cast(1), - &mul_tmp, - static_cast(0)); - phi::TriangularSolveKernel( - dev_ctx, l_mH, mul_tmp, true, false, true, &gL); - - auto phil_rank = gL.dims().size(); - auto phir_rank = gR.dims().size(); - phi::funcs::ForRange l_for_range(dev_ctx, gL.numel()); - phi::funcs::TrilTriuCompute tril_computer(gL.data(), - -1, - true, - gL.dims()[phil_rank - 2], - gL.dims()[phil_rank - 1], - gL.data()); - l_for_range(tril_computer); - - phi::funcs::ForRange r_for_range(dev_ctx, gR.numel()); - phi::funcs::TrilTriuCompute triu_computer(gR.data(), - 0, - false, - gR.dims()[phir_rank - 2], - gR.dims()[phir_rank - 1], - gR.data()); - r_for_range(triu_computer); - Tensor_Add(dev_ctx, gL, gR, lu_grad); - } else { - DenseTensor gR, p_mT, tem_out, out_grad_mH, tem_out1, tem_out2, tem_out3, - gU; - p_mT = Transpose2DTo6D(dev_ctx, p); - auto PmTdims = p_mT.dims(); - auto Outdims = out.dims(); - auto mat_dim_p = phi::funcs::CreateMatrixDescriptor(PmTdims, 0, false); - auto mat_dim_o = phi::funcs::CreateMatrixDescriptor(Outdims, 0, false); - tem_out.Resize(Outdims); - dev_ctx.template Alloc(&tem_out); - auto blas = phi::funcs::GetBlas(dev_ctx); - // gR = -P^T op_3(X)op_1(op_2(gX))P - blas.MatMul(p_mT, - mat_dim_p, - out, - mat_dim_o, - static_cast(-1), - &tem_out, - static_cast(0)); - out_grad_mH = GetMH(dev_ctx, out_grad); - auto TemOutdims = tem_out.dims(); - auto OutGradmHdims = out_grad_mH.dims(); - auto mat_dim_tem_out = - phi::funcs::CreateMatrixDescriptor(TemOutdims, 0, false); - auto mat_dim_out_grad_mH = - phi::funcs::CreateMatrixDescriptor(OutGradmHdims, 0, false); - tem_out1.Resize(lu.dims()); - dev_ctx.template Alloc(&tem_out1); - blas.MatMul(tem_out, - mat_dim_tem_out, - out_grad_mH, - mat_dim_out_grad_mH, - static_cast(1), - &tem_out1, - static_cast(0)); - auto TemOutdims1 = tem_out1.dims(); - auto pdims = p.dims(); - auto mat_dim_tem_out1 = - phi::funcs::CreateMatrixDescriptor(TemOutdims1, 0, false); - auto mat_dim_p1 = phi::funcs::CreateMatrixDescriptor(pdims, 0, false); - tem_out2.Resize(TemOutdims1); - dev_ctx.template Alloc(&tem_out2); - blas.MatMul(tem_out1, - mat_dim_tem_out1, - p, - mat_dim_p1, - static_cast(1), - &tem_out2, - static_cast(0)); - // gR = gR L^{-H} - phi::TriangularSolveKernel( - dev_ctx, l_mH, tem_out2, true, true, true, &gR); - // gU = (L^H gR U^{-H}).triu() - auto LmHdims = l_mH.dims(); - auto gRdims = gR.dims(); - auto mat_dim_l_mh = phi::funcs::CreateMatrixDescriptor(LmHdims, 0, false); - auto mat_dim_gr = phi::funcs::CreateMatrixDescriptor(gRdims, 0, false); - tem_out3.Resize(LmHdims); - dev_ctx.template Alloc(&tem_out3); - blas.MatMul(l_mH, - mat_dim_l_mh, - gR, - mat_dim_gr, - static_cast(1), - &tem_out3, - static_cast(0)); - phi::TriangularSolveKernel( - dev_ctx, u_mH, tem_out3, false, true, false, &gU); - - auto phiu_rank = gU.dims().size(); - auto phir_rank = gR.dims().size(); - phi::funcs::ForRange l_for_range(dev_ctx, gR.numel()); - phi::funcs::TrilTriuCompute tril_computer(gR.data(), - -1, - true, - gR.dims()[phir_rank - 2], - gR.dims()[phir_rank - 1], - gR.data()); - l_for_range(tril_computer); - - phi::funcs::ForRange r_for_range(dev_ctx, gU.numel()); - phi::funcs::TrilTriuCompute triu_computer(gU.data(), - 0, - false, - gU.dims()[phiu_rank - 2], - gU.dims()[phiu_rank - 1], - gU.data()); - r_for_range(triu_computer); - Tensor_Add(dev_ctx, gR, gU, lu_grad); - } - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/multi_dot_kernel_impl.h b/backends/metax_gpu/kernels/impl/multi_dot_kernel_impl.h deleted file mode 100644 index 7ba97234cc..0000000000 --- a/backends/metax_gpu/kernels/impl/multi_dot_kernel_impl.h +++ /dev/null @@ -1,477 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/full_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -namespace phi { - -template -inline DenseTensor MatMul(const Context& dev_ctx, - const DenseTensor& matrix_a, - const DenseTensor& matrix_b, - const phi::DDim& a_dim, - const phi::DDim& b_dim) { - auto blas = phi::funcs::GetBlas(dev_ctx); - - DenseTensor matrix_c; - phi::DDim c_dim = common::make_ddim({a_dim[0], b_dim[1]}); - matrix_c.Resize(c_dim); - dev_ctx.template Alloc(&matrix_c); - - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, false); - const T alpha = static_cast(1.0); - blas.MatMul(matrix_a.data(), - mat_dim_a, - matrix_b.data(), - mat_dim_b, - alpha, - matrix_c.data(), - T(0)); - return matrix_c; -} - -/** - * @brief Recursively calculate matrix multiplication according to the optimal - * order - * Let k = order[i,j], then ins[i...j] = ins[i...k] * ins[k+1 ...j] - * - * @param - * ins: the input tensors - * ins_dims: the shape of ins after reshape - * order: the optimal order - * i: the left of sub chain - * j: the right of sub chain - * save_result: set true by backward - * results: save the intermediate result during backward - */ -template -inline DenseTensor MatChainMul(const Context& dev_ctx, - const std::vector& ins, - const std::vector& ins_dims, - const std::vector& order, - const uint64_t i, - const uint64_t j, - const bool save_result, - std::vector* results) { - if (i == j) { - return *ins[i]; - } - - const auto A = MatChainMul(dev_ctx, - ins, - ins_dims, - order, - i, - order[i * ins.size() + j], - save_result, - results); - phi::DDim a_dim = A.dims(); - if (i == order[i * ins.size() + j]) { - a_dim = ins_dims[i]; - } - - const auto B = MatChainMul(dev_ctx, - ins, - ins_dims, - order, - order[i * ins.size() + j] + 1, - j, - save_result, - results); - phi::DDim b_dim = B.dims(); - if (j == order[i * ins.size() + j] + 1) { - b_dim = ins_dims[j]; - } - - auto result = MatMul(dev_ctx, A, B, a_dim, b_dim); - if (save_result) { - (*results)[i * ins.size() + j] = result; - } - return result; -} - -/** - * @brief get the optimal order - */ -template -std::vector GetOrder(const std::vector& ins, - const std::vector& ins_dims) { - auto n = ins.size(); - // p: save the ins shape, the ins[i] shape is (p[i], p[i+1]) - std::vector p(n + 1); - for (uint64_t i = 0; i < n; i++) { - p[i] = ins_dims[i][0]; - } - p[n] = ins_dims[n - 1][1]; - - // m[i, j]: save the lowest cost for multiplying ins[i...j] - std::vector m(n * n, 0); - // define ins[i...j] means multiplying matrices from ins[i] to ins[j] - // order[i, j] = k, this means that ins[i...k] and ins[k...j] fist and then - // multiply the resulting matrices is the optimal order for ins[i...j] - std::vector order(n * n); - for (uint64_t l = 1; l < n; l++) { - for (uint64_t i = 0; i < n - l; i++) { - auto j = i + l; - m[i * n + j] = 0xffffffff; - for (uint64_t k = i; k < j; k++) { - uint64_t q = - m[i * n + k] + m[(k + 1) * n + j] + p[i] * p[k + 1] * p[j + 1]; - if (q < m[i * n + j]) { - m[i * n + j] = q; - order[i * n + j] = k; - } - } - } - } - return order; -} - -template -static inline DenseTensor MultiDotMatChainOrder( - const Context& dev_ctx, - const std::vector& ins, - const std::vector& ins_dims, - const bool save_result, - std::vector* results) { - auto order = GetOrder(ins, ins_dims); - return MatChainMul( - dev_ctx, ins, ins_dims, order, 0, ins.size() - 1, save_result, results); -} - -template -inline void GetDims(const std::vector& ins, - std::vector* ins_dims) { - const auto n = ins.size(); - for (size_t i = 0; i < n; i++) { - (*ins_dims)[i] = ins[i]->dims(); - if (i == 0 && (*ins_dims)[i].size() == 1) { - (*ins_dims)[i] = common::make_ddim({1, (*ins_dims)[i][0]}); - } else if (i == n - 1 && (*ins_dims)[i].size() == 1) { - (*ins_dims)[i] = common::make_ddim({(*ins_dims)[i][0], 1}); - } - } -} - -template -void MultiDotKernel(const Context& dev_ctx, - const std::vector& x, - DenseTensor* out) { - auto ins = x; - dev_ctx.template Alloc(out); - - auto blas = phi::funcs::GetBlas(dev_ctx); - - auto n = ins.size(); - std::vector ins_dims(n); - GetDims(ins, &ins_dims); - - // If any numel is 0, then return. - bool size_0 = false; - for (size_t i = 0; i < n; i++) { - if (x[i]->numel() == 0) size_0 = true; - } - if (size_0) { - // For example: [2, 0], [0, 4] -> [2, 4] - if (out && out->numel() > 0) { - phi::Full( - dev_ctx, phi::IntArray(common::vectorize(out->dims())), 0, out); - } - return; - } - const T scale = static_cast(1.0); - if (n == 2) { - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, out, T(0)); - } else if (n == 3) { - const auto Ma = ins_dims[0][0]; - const auto Ka = ins_dims[0][1]; - const auto Nb = ins_dims[1][1]; - const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = Ma * Nb * (Ka + Nc); - const uint64_t cost2 = Ka * Nc * (Nb + Ma); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - auto mat_dim_c = phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false); - if (cost1 < cost2) { - DenseTensor tmp_out; - phi::DDim tmp_dim = common::make_ddim({Ma, Nb}); - tmp_out.Resize(tmp_dim); - dev_ctx.template Alloc(&tmp_out); - blas.MatMul( - *ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out, T(0)); - auto mat_dim_tmp = phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false); - blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0)); - } else { - DenseTensor tmp_out; - phi::DDim tmp_dim = common::make_ddim({Ka, Nc}); - tmp_out.Resize(tmp_dim); - dev_ctx.template Alloc(&tmp_out); - blas.MatMul( - *ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out, T(0)); - auto mat_dim_tmp = phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false); - blas.MatMul(*ins[0], mat_dim_a, tmp_out, mat_dim_tmp, scale, out, T(0)); - } - } else { - std::vector results; - const auto tmp = MultiDotMatChainOrder( - dev_ctx, ins, ins_dims, false, &results); - auto out_dim = out->dims(); - *out = tmp; - out->Resize(out_dim); - } - - // auto& theta_grad = out; - // T* th_data = ctx.template Alloc(theta_grad); -} - -/** - * @brief calculate dA and dB - * dA = dout * transpose(B) - * dB = transpose(A) * dout - */ -template -void CalcGrad(const Context& dev_ctx, - const DenseTensor& dout, - const DenseTensor& A, - const DenseTensor& B, - const phi::DDim& dout_dim, - const phi::DDim& a_dim, - const phi::DDim& b_dim, - DenseTensor* dA, - DenseTensor* dB) { - auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, true); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, true); - T alpha = static_cast(1.0); - auto blas = phi::funcs::GetBlas(dev_ctx); - blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0)); - blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0)); -} - -/** - * @brief calculate multi matrix multiplication grad by a chain order - * @param - * dout: the grad of multi matrix multiplication out - * dx: the out grad of inputs - * ins: the input tensors - * ins_dims: the shape of ins after reshape - * order: the optimal order - * i: the left of sub chain - * j: the right of sub chain - * results: the intermediate result of forward - */ -template -void MatChainMulGrad(const Context& dev_ctx, - const DenseTensor& dout, - std::vector* dx, - const std::vector& ins, - const phi::DDim& dout_dim, - const std::vector& ins_dims, - const std::vector& order, - const uint64_t i, - const uint64_t j, - const std::vector& results) { - if (i == j) { - *((*dx)[i]) = dout; - return; - } - - const auto n = ins.size(); - const auto right = order[i * n + j]; - const auto left = order[i * n + j] + 1; - // get the multi result of left sub chain - const auto* A = &results[i * n + right]; - phi::DDim a_dim = A->dims(); - if (i == right) { - A = ins[i]; - a_dim = ins_dims[i]; - } - // get the multi result of right sub chain - const auto* B = &results[left * n + j]; - phi::DDim b_dim = B->dims(); - if (left == j) { - B = ins[j]; - b_dim = ins_dims[j]; - } - DenseTensor dA, dB; - dA.Resize({dout_dim[0], b_dim[0]}); - dB.Resize({a_dim[1], dout_dim[1]}); - dev_ctx.template Alloc(&dA); - dev_ctx.template Alloc(&dB); - - CalcGrad(dev_ctx, dout, *A, *B, dout_dim, a_dim, b_dim, &dA, &dB); - MatChainMulGrad( - dev_ctx, dA, dx, ins, dA.dims(), ins_dims, order, i, right, results); - MatChainMulGrad( - dev_ctx, dB, dx, ins, dB.dims(), ins_dims, order, left, j, results); -} - -template -void MultiDotGradMatChainOrder(const Context& dev_ctx, - const DenseTensor& dout, - const std::vector& ins, - const phi::DDim& dout_dim, - const std::vector& ins_dims, - std::vector* dx) { - auto order = GetOrder(ins, ins_dims); - auto n = ins.size(); - std::vector results(n * n); - MatChainMul( - dev_ctx, ins, ins_dims, order, 0, n - 1, true, &results); - MatChainMulGrad( - dev_ctx, dout, dx, ins, dout_dim, ins_dims, order, 0, n - 1, results); -} - -template -void MultiDotGradKernel(const Context& dev_ctx, - const std::vector& x, - const DenseTensor& out_grad, - std::vector x_grad) { - auto ins = x; - auto dout = out_grad; - auto dx = x_grad; - - auto blas = phi::funcs::GetBlas(dev_ctx); - - bool size_0 = false; - const auto n = ins.size(); - for (size_t i = 0; i < n; i++) { - dev_ctx.template Alloc(dx[i]); - - if (dx[i]->numel() == 0) { - size_0 = true; - } - } - if (size_0) { - for (size_t i = 0; i < n; i++) { - if (dx[i]->numel() > 0) { - phi::Full( - dev_ctx, phi::IntArray(common::vectorize(dx[i]->dims())), 0, dx[i]); - } - } - return; - } - - std::vector ins_dims(n); - GetDims(ins, &ins_dims); - - phi::DDim dout_dim = dout.dims(); - if (ins[0]->dims().size() == 1 && ins[n - 1]->dims().size() == 1) { - dout_dim = common::make_ddim({1, 1}); - } else if (ins[0]->dims().size() == 1) { - if (dout_dim.size() == 1) { - dout_dim = common::make_ddim({1, dout_dim[0]}); - } - } else if (ins[n - 1]->dims().size() == 1) { - if (dout_dim.size() == 1) { - dout_dim = common::make_ddim({dout_dim[0], 1}); - } - } - - T alpha = static_cast(1); - auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false); - if (n == 2) { - CalcGrad(dev_ctx, - dout, - *ins[0], - *ins[1], - dout_dim, - ins_dims[0], - ins_dims[1], - dx[0], - dx[1]); - } else if (n == 3) { - const auto Ma = ins_dims[0][0]; - const auto Ka = ins_dims[0][1]; - const auto Nb = ins_dims[1][1]; - const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = Ma * Nb * (Ka + Nc); - const uint64_t cost2 = Ka * Nc * (Nb + Ma); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - auto mat_dim_c = phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false); - if (cost1 < cost2) { - DenseTensor tmp_out, tmp_dout; - tmp_out.Resize({Ma, Nb}); - dev_ctx.template Alloc(&tmp_out); - tmp_dout.Resize({mat_dim_dout.height_, Nb}); - dev_ctx.template Alloc(&tmp_dout); - blas.MatMul( - *ins[0], mat_dim_a, *ins[1], mat_dim_b, alpha, &tmp_out, T(0)); - CalcGrad(dev_ctx, - dout, - tmp_out, - *ins[2], - dout_dim, - tmp_out.dims(), - ins_dims[2], - &tmp_dout, - dx[2]); - CalcGrad(dev_ctx, - tmp_dout, - *ins[0], - *ins[1], - tmp_dout.dims(), - ins_dims[0], - ins_dims[1], - dx[0], - dx[1]); - } else { - DenseTensor tmp_out, tmp_dout; - tmp_out.Resize({Ka, Nc}); - dev_ctx.template Alloc(&tmp_out); - tmp_dout.Resize({Ka, mat_dim_dout.width_}); - dev_ctx.template Alloc(&tmp_dout); - blas.MatMul( - *ins[1], mat_dim_b, *ins[2], mat_dim_c, alpha, &tmp_out, T(0)); - CalcGrad(dev_ctx, - dout, - *ins[0], - tmp_out, - dout_dim, - ins_dims[0], - tmp_dout.dims(), - dx[0], - &tmp_dout); - CalcGrad(dev_ctx, - tmp_dout, - *ins[1], - *ins[2], - tmp_dout.dims(), - ins_dims[1], - ins_dims[2], - dx[1], - dx[2]); - } - } else { - MultiDotGradMatChainOrder( - dev_ctx, dout, ins, dout_dim, ins_dims, &dx); - // if x's shape is: [3] [3, 4] [4] - // dx's shape will be: [1, 3] [3, 4] [4, 1] - if (ins[n - 1]->dims().size() == 1) { - dx[n - 1]->Resize({dx[n - 1]->dims()[0]}); - } - if (ins[0]->dims().size() == 1) { - dx[0]->Resize({dx[0]->dims()[1]}); - } - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/mv_kernel_impl.h b/backends/metax_gpu/kernels/impl/mv_kernel_impl.h deleted file mode 100644 index 4baee25a09..0000000000 --- a/backends/metax_gpu/kernels/impl/mv_kernel_impl.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace phi { - -template -void MvKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& vec, - DenseTensor* out) { - const auto& dim_x = x.dims(); - - // get data ptr - const T* x_data = x.data(); - const T* vec_data = vec.data(); - T* out_data = dev_ctx.template Alloc(out); - - auto blas = phi::funcs::GetBlas(dev_ctx); - - blas.GEMV(false, - dim_x[0], - dim_x[1], - static_cast(1), - x_data, - vec_data, - static_cast(0), - out_data); -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/repeat_interleave_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/repeat_interleave_grad_kernel_impl.h deleted file mode 100644 index e2c5d6c242..0000000000 --- a/backends/metax_gpu/kernels/impl/repeat_interleave_grad_kernel_impl.h +++ /dev/null @@ -1,217 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "kernels/impl/index_select_impl.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/repeat_interleave_grad_kernel.h" -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/primitive/functor_primitives.h" -#ifdef __NVCC__ -#include "cub/cub.cuh" -#else -#include -namespace cub = hipcub; -#endif -#endif - -#include "paddle/phi/kernels/funcs/repeat_tensor2index_tensor.h" - -namespace phi { - -#if defined(__NVCC__) || defined(__HIPCC__) -using phi::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void index_select_grad_cuda_kernel(const T* output_grad, - T* input_grad, - const IndexT* index, - int64_t N, - int64_t stride, - int64_t size, - int64_t delta) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t pre_idx = idx / (stride * size); - int64_t dim_idx = idx % (stride * size) / stride; - IndexT src_dim_idx = index[dim_idx]; - int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - phi::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); -} - -template -__global__ void index_select_grad_init(T* input_grad, int64_t N) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - input_grad[idx] = 0.0; -} -#endif -template -void RepeatInterleaveWithTensorIndexGradKernel( - const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& repeats_tensor, - const DenseTensor& out_grad, - int dim, - DenseTensor* x_grad) { - auto input_dim = x_grad->dims(); - if (dim < 0) { - dim += static_cast(input_dim.size()); - } - - DenseTensor index; - PADDLE_ENFORCE_EQ(repeats_tensor.dims()[0] == x_grad->dims()[dim], - true, - common::errors::InvalidArgument( - "The length of Input(RepeatsTensor) must be the " - "same as length of Input(X) in axis. " - "But received: [%s], required: [%d].", - repeats_tensor.dims()[0], - x_grad->dims()[dim])); - - const auto& index_type = repeats_tensor.dtype(); - - bool index_type_match = - index_type == DataType::INT32 || index_type == DataType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, - true, - common::errors::InvalidArgument( - "Input(Repeats) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - DataTypeToString(index_type), - DataTypeToString(DataType::INT32), - DataTypeToString(DataType::INT64))); -#if defined(__NVCC__) || defined(__HIPCC__) - - auto output_dim = out_grad.dims(); - auto stride_dim = common::stride(input_dim); - int64_t stride = stride_dim[dim]; - int64_t size = output_dim[dim]; - int64_t delta = input_dim[dim] - size; - int64_t numel = x_grad->numel(); - int64_t out_nums = out_grad.numel(); - auto* out_grad_data = out_grad.data(); - dev_ctx.template Alloc(x_grad); - auto* in_grad_data = x_grad->data(); - auto stream = dev_ctx.stream(); - index_select_grad_init - <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, - 0, - stream>>>(in_grad_data, numel); - - if (index_type == DataType::INT64) { - phi::funcs::RepeatsTensor2IndexTensorFunctor()( - dev_ctx, repeats_tensor, &index); - int64_t index_nums = index.numel(); - - const int64_t* index_data = index.data(); - index_select_grad_cuda_kernel - <<<(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, - 0, - stream>>>(out_grad_data, - in_grad_data, - index_data, - out_nums, - stride, - size, - delta); - } else { - phi::funcs::RepeatsTensor2IndexTensorFunctor()( - dev_ctx, repeats_tensor, &index); - int64_t index_nums = index.numel(); - - const int* index_data = index.data(); - index_select_grad_cuda_kernel - <<<(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, - 0, - stream>>>(out_grad_data, - in_grad_data, - index_data, - out_nums, - stride, - size, - delta); - } -#endif -} - -template -void RepeatInterleaveGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out_grad, - int repeats, - int dim, - DenseTensor* x_grad) { - if (x_grad && x_grad->numel() == 0) { - dev_ctx.template Alloc(x_grad); - return; - } - auto input_dim = x_grad->dims(); - if (dim < 0) { - dim += input_dim.size(); - } - - DenseTensor index; -#if defined(__NVCC__) || defined(__HIPCC__) - auto output_dim = out_grad.dims(); - auto stride_dim = common::stride(input_dim); - int64_t stride = stride_dim[dim]; - int64_t size = output_dim[dim]; - int64_t delta = input_dim[dim] - size; - int64_t numel = x_grad->numel(); - int64_t out_nums = out_grad.numel(); - auto* out_grad_data = out_grad.data(); - dev_ctx.template Alloc(x_grad); - auto* in_grad_data = x_grad->data(); - auto stream = dev_ctx.stream(); - index_select_grad_init - <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, - 0, - stream>>>(in_grad_data, numel); - int64_t index_size = x_grad->dims()[dim] * repeats; - std::vector index_vec(index_size); - for (int i = 0; i < x_grad->dims()[dim]; i++) { - std::fill_n(index_vec.begin() + i * repeats, repeats, i); - } - index.Resize(common::make_ddim({index_size})); - phi::TensorFromVector(index_vec, dev_ctx, &index); - - const int* index_data = index.data(); - int64_t index_nums = index.numel(); - index_select_grad_cuda_kernel - <<<(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, - 0, - stream>>>(out_grad_data, - in_grad_data, - index_data, - out_nums, - stride, - size, - delta); -#endif -} -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/repeat_interleave_kernel_impl.h b/backends/metax_gpu/kernels/impl/repeat_interleave_kernel_impl.h deleted file mode 100644 index 041bc1f9e5..0000000000 --- a/backends/metax_gpu/kernels/impl/repeat_interleave_kernel_impl.h +++ /dev/null @@ -1,243 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -// #include "paddle/phi/kernels/cpu/index_select_impl.h" -#include "kernels/impl/index_select_impl.h" -#include "paddle/phi/kernels/funcs/repeat_tensor2index_tensor.h" -#include "paddle/phi/kernels/repeat_interleave_kernel.h" -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/phi/backends/gpu/gpu_decls.h" -#include "paddle/phi/backends/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" -#include "paddle/phi/backends/gpu/gpu_resources.h" -#include "paddle/phi/kernels/primitive/functor_primitives.h" -#endif - -namespace phi { - -#if defined(__NVCC__) || defined(__HIPCC__) -using phi::PADDLE_CUDA_NUM_THREADS; -template -__global__ void index_select_cuda_kernel(const T* input, - T* output, - const IndexT* index, - int64_t N, - int64_t stride, - int64_t size, - int64_t delta) { - const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - const int64_t stride_size = stride * size; - - const int64_t pre_idx = idx / stride_size; - const int64_t remainder = idx % stride_size; - const int64_t dim_idx = remainder / stride; - - const IndexT src_dim_idx = index[dim_idx]; - - const int64_t input_idx = - idx + ((delta * pre_idx) + (src_dim_idx - dim_idx)) * stride; - output[idx] = input[input_idx]; -} -#endif - -template -void RepeatInterleaveKernel(const Context& dev_ctx, - const DenseTensor& x, - int repeats, - int dim, - DenseTensor* out) { - PADDLE_ENFORCE_GT(repeats, - 0, - common::errors::InvalidArgument( - "repeats must grater than 0, but got %d", repeats)); - if (out && out->numel() == 0) { - dev_ctx.template Alloc(out); - return; - } - auto place = dev_ctx.GetPlace(); - auto cpu_place = phi::CPUPlace(); - - auto input_dim = x.dims(); - if (dim < 0) { - dim += input_dim.size(); - } - - DenseTensor index; - int64_t index_size = input_dim[dim] * repeats; - std::vector index_vec(index_size); - for (int i = 0; i < input_dim[dim]; i++) { - std::fill_n(index_vec.begin() + i * repeats, repeats, i); - } - index.Resize(common::make_ddim({index_size})); - if (place == cpu_place) { - DenseTensor x_copy = x; - phi::TensorFromVector(index_vec, dev_ctx, &index); - - auto output_dim = common::vectorize(x.dims()); - output_dim[dim] = index_size; - out->Resize(common::make_ddim(output_dim)); - phi::IndexSelectInner(dev_ctx, &x_copy, index, out, dim); -#if defined(__NVCC__) || defined(__HIPCC__) - } else { - auto stride_dim = common::stride(input_dim); - int64_t stride = stride_dim[dim]; - phi::TensorFromVector(index_vec, dev_ctx, &index); - auto stream = dev_ctx.stream(); - auto output_dim = common::vectorize(x.dims()); - output_dim[dim] = index_size; - out->Resize(common::make_ddim(output_dim)); - dev_ctx.template Alloc(out); - auto* out_data = out->data(); - int64_t numel = out->numel(); - int64_t size = output_dim[dim]; - int64_t delta = input_dim[dim] - size; - - const int* index_data = index.data(); - index_select_cuda_kernel - <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, - 0, - stream>>>( - x.data(), out_data, index_data, numel, stride, size, delta); - } -#else - } -#endif -} - -template -void RepeatInterleaveWithTensorIndexKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& repeats_tensor, - int dim, - DenseTensor* out) { - auto place = dev_ctx.GetPlace(); - auto cpu_place = phi::CPUPlace(); - - auto input_dim = x.dims(); - if (dim < 0) { - dim += input_dim.size(); - } - DenseTensor index; - PADDLE_ENFORCE_EQ(repeats_tensor.dims()[0] == x.dims()[dim], - true, - common::errors::InvalidArgument( - "The length of Input(RepeatsTensor) must be the " - "same as length of Input(X) in axis. " - "But received: [%s], required: [%d].", - repeats_tensor.dims()[0], - x.dims()[dim])); - const auto& index_type = repeats_tensor.dtype(); - bool index_type_match = - index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, - true, - common::errors::InvalidArgument( - "Input(RepeatsTensor) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - DataTypeToString(index_type), - DataTypeToString(phi::DataType::INT32), - DataTypeToString(phi::DataType::INT64))); - - if (x.numel() == 0) { - // infer out shape - if (index_type == phi::DataType::INT32) { - phi::funcs::RepeatsTensor2IndexTensorFunctor()( - dev_ctx, repeats_tensor, &index); - - } else if (index_type == phi::DataType::INT64) { - phi::funcs::RepeatsTensor2IndexTensorFunctor()( - dev_ctx, repeats_tensor, &index); - } - auto output_dim = common::vectorize(x.dims()); - output_dim[dim] = index.dims()[0]; - out->Resize(common::make_ddim(output_dim)); - dev_ctx.template Alloc(out); - return; - } - if (place == cpu_place) { - auto x_copy = x; - if (index_type == phi::DataType::INT32) { - phi::funcs::RepeatsTensor2IndexTensorFunctor()( - dev_ctx, repeats_tensor, &index); - auto output_dim = common::vectorize(x.dims()); - output_dim[dim] = index.dims()[0]; - out->Resize(common::make_ddim(output_dim)); - IndexSelectInner(dev_ctx, &x_copy, index, out, dim); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::RepeatsTensor2IndexTensorFunctor()( - dev_ctx, repeats_tensor, &index); - auto output_dim = common::vectorize(x.dims()); - output_dim[dim] = index.dims()[0]; - out->Resize(common::make_ddim(output_dim)); - IndexSelectInner(dev_ctx, &x_copy, index, out, dim); - } -#if defined(__NVCC__) || defined(__HIPCC__) - } else { - auto stride_dim = common::stride(input_dim); - int64_t stride = stride_dim[dim]; - auto stream = dev_ctx.stream(); - auto* in_data = x.data(); - if (index_type == phi::DataType::INT64) { - phi::funcs::RepeatsTensor2IndexTensorFunctor()( - dev_ctx, repeats_tensor, &index); - - const int64_t* index_data = index.data(); - auto output_dim = common::vectorize(x.dims()); - output_dim[dim] = index.dims()[0]; - out->Resize(common::make_ddim(output_dim)); - T* out_data = dev_ctx.template Alloc(out); - int64_t numel = out->numel(); - int64_t size = output_dim[dim]; - int64_t delta = input_dim[dim] - size; - - index_select_cuda_kernel - <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, - 0, - stream>>>( - in_data, out_data, index_data, numel, stride, size, delta); - } else { - phi::funcs::RepeatsTensor2IndexTensorFunctor()( - dev_ctx, repeats_tensor, &index); - - const int* index_data = index.data(); - auto output_dim = common::vectorize(x.dims()); - output_dim[dim] = index.dims()[0]; - out->Resize(common::make_ddim(output_dim)); - T* out_data = dev_ctx.template Alloc(out); - int64_t numel = out->numel(); - int64_t size = output_dim[dim]; - int64_t delta = input_dim[dim] - size; - index_select_cuda_kernel - <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, - 0, - stream>>>( - in_data, out_data, index_data, numel, stride, size, delta); - } - } -#else - } -#endif -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/standard_gamma_kernel_impl.h b/backends/metax_gpu/kernels/impl/standard_gamma_kernel_impl.h deleted file mode 100644 index 2667cf2cae..0000000000 --- a/backends/metax_gpu/kernels/impl/standard_gamma_kernel_impl.h +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "kernels/impl/dirichlet_kernel_impl.h" -#include "paddle/phi/kernels/standard_gamma_kernel.h" - -namespace phi { -template -void StandardGammaKernel(const Context& dev_ctx, - const DenseTensor& alpha, - DenseTensor* out) { - dev_ctx.template Alloc(out); - GammaSampler sampler; - sampler(dev_ctx, alpha, out); -} -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/stft_kernel_impl.h b/backends/metax_gpu/kernels/impl/stft_kernel_impl.h deleted file mode 100644 index 46fce91937..0000000000 --- a/backends/metax_gpu/kernels/impl/stft_kernel_impl.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "kernels/impl/elementwise.h" -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/type_traits.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/funcs/fft.h" -#include "paddle/phi/kernels/funcs/fft_fill_conj.h" -#include "paddle/phi/kernels/funcs/frame_functor.h" - -namespace phi { - -template -void StftKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& window, - int n_fft, - int hop_length, - bool normalized, - bool onesided, - DenseTensor* out) { - using C = phi::dtype::complex; - - dev_ctx.template Alloc(out); - - const size_t x_rank = x.dims().size(); - const size_t out_rank = out->dims().size(); - - const int n_frames = out->dims()[out_rank - 1]; - const int seq_length = x.dims()[x_rank - 1]; - - std::vector axes = {1}; - - // Frame - phi::DenseTensor frames; - phi::DDim frames_dims(out->dims()); - frames_dims.at(axes.back()) = n_fft; - frames.Resize(frames_dims); - dev_ctx.template Alloc(&frames); - phi::funcs::FrameFunctor()(dev_ctx, - &x, - &frames, - seq_length, - n_fft, - n_frames, - hop_length, - /*is_grad*/ false); - - // Window - phi::DenseTensor frames_w; - frames_w.Resize(frames_dims); - dev_ctx.template Alloc(&frames_w); - phi::funcs::ElementwiseCompute, T, T>( - dev_ctx, - frames, - window, - phi::funcs::MultiplyFunctor(), - &frames_w, - axes.back()); - - // FFTR2C - phi::funcs::FFTNormMode normalization; - if (normalized) { - normalization = phi::funcs::get_norm_from_string("ortho", true); - } else { - normalization = phi::funcs::get_norm_from_string("backward", true); - } - phi::funcs::FFTR2CFunctor fft_r2c_func; - - if (onesided) { - fft_r2c_func(dev_ctx, frames_w, out, axes, normalization, true); - } else { - phi::DDim onesided_dims(out->dims()); - const int64_t onesided_axis_size = out->dims().at(axes.back()) / 2 + 1; - onesided_dims.at(axes.back()) = onesided_axis_size; - phi::DenseTensor onesided_out; - onesided_out.Resize(onesided_dims); - dev_ctx.template Alloc(&onesided_out); - fft_r2c_func(dev_ctx, frames_w, &onesided_out, axes, normalization, true); - phi::funcs::FFTFillConj(dev_ctx, &onesided_out, out, axes); - } -} -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/triangular_solve_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/triangular_solve_grad_kernel_impl.h deleted file mode 100644 index ad656b7a6c..0000000000 --- a/backends/metax_gpu/kernels/impl/triangular_solve_grad_kernel_impl.h +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/full_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/common_shape.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/funcs/for_range.h" -#include "paddle/phi/kernels/funcs/matrix_reduce.h" -#include "paddle/phi/kernels/funcs/tril_triu_compute.h" -#include "paddle/phi/kernels/triangular_solve_grad_kernel.h" -#include "paddle/phi/kernels/triangular_solve_kernel.h" -namespace phi { - -template -void TriangularSolveGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - bool upper, - bool transpose, - bool unitriangular, - DenseTensor* dx, - DenseTensor* dy) { - if (out.numel() == 0) { - if (dx) { - phi::Full( - dev_ctx, phi::IntArray(common::vectorize(dx->dims())), 0, dx); - } - if (dy) { - phi::Full( - dev_ctx, phi::IntArray(common::vectorize(dy->dims())), 0, dy); - } - return; - } - std::vector x_bst_dims_vec; - std::vector y_bst_dims_vec; - std::tie(x_bst_dims_vec, y_bst_dims_vec) = - funcs::MatrixGetBroadcastDims(x, y); - - IntArray y_bst_dims_array(y_bst_dims_vec); - DenseTensor dy_bst = phi::Empty(dev_ctx, y_bst_dims_array); - if (dy) { - // calculate x's conjugate for complex - DenseTensor x_conj; - x_conj.Resize(x.dims()); - - phi::funcs::ForRange x_for_range(dev_ctx, x.numel()); - phi::funcs::ConjFunctor x_functor( - x.data(), x.numel(), dev_ctx.template Alloc(&x_conj)); - x_for_range(x_functor); - - // reuse forward to get dy_bst, and the result has been broadcasted already. - TriangularSolveKernel( - dev_ctx, x_conj, dout, upper, !transpose, unitriangular, &dy_bst); - - dy->Resize(y.dims()); - dev_ctx.template Alloc(dy); - if (dy_bst.dims() == y.dims()) { - Copy(dev_ctx, dy_bst, dev_ctx.GetPlace(), false, dy); - } else { - funcs::MatrixReduceSumFunctor functor; - functor(dev_ctx, dy_bst, dy); - dy->Resize(y.dims()); - } - } - - IntArray x_bst_dims_array(x_bst_dims_vec); - DenseTensor dx_bst = phi::Empty(dev_ctx, x_bst_dims_array); - if (dx) { - // calculate x's conjugate for complex - DenseTensor out_conj; - out_conj.Resize(out.dims()); - - phi::funcs::ForRange out_for_range(dev_ctx, out.numel()); - phi::funcs::ConjFunctor out_functor( - out.data(), out.numel(), dev_ctx.template Alloc(&out_conj)); - out_for_range(out_functor); - - auto blas = phi::funcs::GetBlas(dev_ctx); - if (transpose) { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, true); - blas.MatMul(out_conj, - mat_dim_a, - dy_bst, - mat_dim_b, - static_cast(-1), - &dx_bst, - static_cast(0)); - } else { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, true); - blas.MatMul(dy_bst, - mat_dim_a, - out_conj, - mat_dim_b, - static_cast(-1), - &dx_bst, - static_cast(0)); - } - - // get upper or lower triangular - DenseTensor dx_bst_upper = - phi::Empty(dev_ctx, x_bst_dims_array); - - const auto& dims = dx_bst.dims(); - const auto H = dims[dims.size() - 2]; - const auto W = dims[dims.size() - 1]; - phi::funcs::ForRange x_for_range(dev_ctx, dx_bst.numel()); - phi::funcs::TrilTriuCompute tril_triu_functor( - dx_bst.data(), unitriangular, !upper, H, W, dx_bst_upper.data()); - x_for_range(tril_triu_functor); - - dx->Resize(x.dims()); - dev_ctx.template Alloc(dx); - if (dx_bst.dims() == x.dims()) { - Copy(dev_ctx, dx_bst_upper, dev_ctx.GetPlace(), false, dx); - } else { - funcs::MatrixReduceSumFunctor functor; - functor(dev_ctx, dx_bst_upper, dx); - dx->Resize(x.dims()); - } - } -} - -} // namespace phi