diff --git a/backends/metax_gpu/kernels/cuda_kernels/cholesky_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/cholesky_solve_grad_kernel_register.cu
index 9b754efe16..c77121724b 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/cholesky_solve_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/cuda_kernels/cholesky_solve_grad_kernel_register.cu
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/impl/cholesky_solve_grad_kernel_impl.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h"
 
 PD_CUSTOM_KERNEL_REGISTER(cholesky_solve_grad,
                           metax_gpu,
diff --git a/backends/metax_gpu/kernels/cuda_kernels/dirichlet_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/dirichlet_kernel_register.cu
index 6bfa234d11..df11c758db 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/dirichlet_kernel_register.cu
+++ b/backends/metax_gpu/kernels/cuda_kernels/dirichlet_kernel_register.cu
@@ -12,14 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/impl/dirichlet_kernel_impl.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/dirichlet_kernel_impl.h"
 
 PD_CUSTOM_KERNEL_REGISTER(dirichlet,
                           metax_gpu,
                           ALL_LAYOUT,
-                          phi::Dirichletkernel,
+                          phi::DirichletKernel,
                           float,
                           double,
                           phi::dtype::float16,
diff --git a/backends/metax_gpu/kernels/cuda_kernels/gru_unit_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/gru_unit_grad_kernel_register.cu
index 20f82652d8..d6077cdb96 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/gru_unit_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/cuda_kernels/gru_unit_grad_kernel_register.cu
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/impl/gru_unit_kernel_impl.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/gru_unit_kernel_impl.h"
 
 PD_CUSTOM_KERNEL_REGISTER(gru_unit_grad,
                           metax_gpu,
diff --git a/backends/metax_gpu/kernels/cuda_kernels/gru_unit_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/gru_unit_kernel_register.cu
index 87b0526ecb..e4a7482bdf 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/gru_unit_kernel_register.cu
+++ b/backends/metax_gpu/kernels/cuda_kernels/gru_unit_kernel_register.cu
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/impl/gru_unit_kernel_impl.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/gru_unit_kernel_impl.h"
 
 PD_CUSTOM_KERNEL_REGISTER(
     gru_unit, metax_gpu, ALL_LAYOUT, phi::GRUUnitKernel, float, double) {}
diff --git a/backends/metax_gpu/kernels/cuda_kernels/inverse_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/inverse_grad_kernel_register.cu
index 5e992969e5..e87733a7ed 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/inverse_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/cuda_kernels/inverse_grad_kernel_register.cu
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/impl/inverse_grad_kernel_impl.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/inverse_grad_kernel_impl.h"
 #include "paddle/phi/kernels/inverse_grad_kernel.h"
 
 PD_CUSTOM_KERNEL_REGISTER(inverse_grad,
diff --git a/backends/metax_gpu/kernels/cuda_kernels/lstm_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/lstm_grad_kernel_register.cu
index 6bd0844ee3..9c00c70a33 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/lstm_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/cuda_kernels/lstm_grad_kernel_register.cu
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/impl/lstm_kernel_impl.h"
+#include "paddle/phi/kernels/impl/lstm_kernel_impl.h"
 #include "paddle/phi/kernels/lstm_kernel.h"
 
 PD_CUSTOM_KERNEL_REGISTER(
diff --git a/backends/metax_gpu/kernels/custom_kernel/addmm_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/addmm_kernel_register.cu
index ead21b1eb7..f855de37dd 100644
--- a/backends/metax_gpu/kernels/custom_kernel/addmm_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/addmm_kernel_register.cu
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "../impl/addmm_kernel_impl.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/addmm_kernel.h"
+#include "paddle/phi/kernels/impl/addmm_kernel_impl.h"
 
 PD_REGISTER_PLUGIN_KERNEL(addmm,
                           metax_gpu,
diff --git a/backends/metax_gpu/kernels/custom_kernel/bilinear_grad_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/bilinear_grad_kernel_register.cu
index 07980dc2ff..6a3176db06 100644
--- a/backends/metax_gpu/kernels/custom_kernel/bilinear_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/bilinear_grad_kernel_register.cu
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/impl/bilinear_grad_kernel_impl.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/bilinear_grad_kernel.h"
+#include "paddle/phi/kernels/impl/bilinear_grad_kernel_impl.h"
 
 PD_REGISTER_PLUGIN_KERNEL(bilinear_grad,
                           metax_gpu,
diff --git a/backends/metax_gpu/kernels/custom_kernel/bilinear_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/bilinear_kernel_register.cu
index 8ce3daaa12..90f1e200d6 100644
--- a/backends/metax_gpu/kernels/custom_kernel/bilinear_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/bilinear_kernel_register.cu
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/impl/bilinear_kernel_impl.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/bilinear_kernel.h"
+#include "paddle/phi/kernels/impl/bilinear_kernel_impl.h"
 
 PD_REGISTER_PLUGIN_KERNEL(
     bilinear, metax_gpu, ALL_LAYOUT, phi::BilinearKernel, float, double) {}
diff --git a/backends/metax_gpu/kernels/custom_kernel/bmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/bmm_grad_kernel_register.cu
index 4e5ef5e3a0..e796222054 100644
--- a/backends/metax_gpu/kernels/custom_kernel/bmm_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/bmm_grad_kernel_register.cu
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/impl/bmm_grad_kernel_impl.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/bmm_grad_kernel.h"
+#include "paddle/phi/kernels/impl/bmm_grad_kernel_impl.h"
 
 PD_REGISTER_PLUGIN_KERNEL(bmm_grad,
                           metax_gpu,
diff --git a/backends/metax_gpu/kernels/custom_kernel/bmm_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/bmm_kernel_register.cu
index b7e6e7312c..22eac8cef7 100644
--- a/backends/metax_gpu/kernels/custom_kernel/bmm_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/bmm_kernel_register.cu
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/impl/bmm_kernel_impl.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/bmm_kernel.h"
+#include "paddle/phi/kernels/impl/bmm_kernel_impl.h"
 
 PD_REGISTER_PLUGIN_KERNEL(bmm,
                           metax_gpu,
diff --git a/backends/metax_gpu/kernels/custom_kernel/cholesky_grad_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/cholesky_grad_kernel_register.cu
index 2a29045e08..2f5fee6aa7 100644
--- a/backends/metax_gpu/kernels/custom_kernel/cholesky_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/cholesky_grad_kernel_register.cu
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/impl/cholesky_grad_kernel_impl.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/cholesky_grad_kernel.h"
+#include "paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h"
 
 PD_REGISTER_PLUGIN_KERNEL(cholesky_grad,
                           metax_gpu,
diff --git a/backends/metax_gpu/kernels/custom_kernel/conv_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/conv_kernel_register.cu
index 112275470a..4f340997fb 100644
--- a/backends/metax_gpu/kernels/custom_kernel/conv_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/conv_kernel_register.cu
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #if 0
-#include "kernels/impl/conv_kernel_impl.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/conv_kernel.h"
+#include "paddle/phi/kernels/impl/conv_kernel_impl.h"
 
 namespace phi {
 
diff --git a/backends/metax_gpu/kernels/custom_kernel/flatten2_grad_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/flatten2_grad_kernel_register.cu
index 8fe0d25fae..76541d8290 100644
--- a/backends/metax_gpu/kernels/custom_kernel/flatten2_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/flatten2_grad_kernel_register.cu
@@ -13,7 +13,7 @@
 // limitations under the License.
 // clang-format off
 #include "paddle/phi/core/tensor_utils.h"  //NOLINT
-#include "kernels/impl/flatten2_kernel_impl.h"
+#include "paddle/phi/kernels/impl/flatten2_kernel_impl.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 // clang-format on
diff --git a/backends/metax_gpu/kernels/custom_kernel/flatten2_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/flatten2_kernel_register.cu
index e42e12796a..cc94c28722 100644
--- a/backends/metax_gpu/kernels/custom_kernel/flatten2_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/flatten2_kernel_register.cu
@@ -13,7 +13,7 @@
 // limitations under the License.
 // clang-format off
 #include "paddle/phi/core/tensor_utils.h"  //NOLINT
-#include "kernels/impl/flatten2_kernel_impl.h"
+#include "paddle/phi/kernels/impl/flatten2_kernel_impl.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 // clang-format on
diff --git a/backends/metax_gpu/kernels/custom_kernel/lstm_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/lstm_kernel_register.cu
index 15cd9b9238..768cb056b6 100644
--- a/backends/metax_gpu/kernels/custom_kernel/lstm_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/lstm_kernel_register.cu
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/impl/lstm_kernel_impl.h"
+#include "paddle/phi/kernels/impl/lstm_kernel_impl.h"
 #include "paddle/phi/kernels/lstm_kernel.h"
 
 PD_REGISTER_PLUGIN_KERNEL(
diff --git a/backends/metax_gpu/kernels/custom_kernel/lu_grad_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/lu_grad_kernel_register.cu
index b3952b9cf9..1db0ab5324 100644
--- a/backends/metax_gpu/kernels/custom_kernel/lu_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/lu_grad_kernel_register.cu
@@ -13,7 +13,7 @@
 // limitations under the License.
 // clang-format off
 #include "paddle/phi/core/tensor_utils.h"  //NOLINT
-#include "kernels/impl/lu_grad_kernel_impl.h"
+#include "paddle/phi/kernels/impl/lu_grad_kernel_impl.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/lu_grad_kernel.h"
diff --git a/backends/metax_gpu/kernels/custom_kernel/multi_dot_grad_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/multi_dot_grad_kernel_register.cu
index b0560d65e6..727172fec3 100644
--- a/backends/metax_gpu/kernels/custom_kernel/multi_dot_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/multi_dot_grad_kernel_register.cu
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "kernels/impl/multi_dot_kernel_impl.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h"
 #include "paddle/phi/kernels/multi_dot_grad_kernel.h"
 PD_REGISTER_PLUGIN_KERNEL(multi_dot_grad,
                           metax_gpu,
diff --git a/backends/metax_gpu/kernels/custom_kernel/multi_dot_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/multi_dot_kernel_register.cu
index aa3201ad51..f0c5afbd58 100644
--- a/backends/metax_gpu/kernels/custom_kernel/multi_dot_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/multi_dot_kernel_register.cu
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "kernels/impl/multi_dot_kernel_impl.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h"
 #include "paddle/phi/kernels/multi_dot_kernel.h"
 PD_REGISTER_PLUGIN_KERNEL(multi_dot,
                           metax_gpu,
diff --git a/backends/metax_gpu/kernels/custom_kernel/mv_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/mv_kernel_register.cu
index b70204fd03..41d9e15532 100644
--- a/backends/metax_gpu/kernels/custom_kernel/mv_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/mv_kernel_register.cu
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/impl/mv_kernel_impl.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/mv_kernel_impl.h"
 #include "paddle/phi/kernels/mv_kernel.h"
 
 PD_REGISTER_PLUGIN_KERNEL(
diff --git a/backends/metax_gpu/kernels/custom_kernel/standard_gamma_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/standard_gamma_kernel_register.cu
index 0a10e2a628..82fb2e4adc 100644
--- a/backends/metax_gpu/kernels/custom_kernel/standard_gamma_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/standard_gamma_kernel_register.cu
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "kernels/impl/standard_gamma_kernel_impl.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/standard_gamma_kernel_impl.h"
 
 PD_REGISTER_PLUGIN_KERNEL(standard_gamma,
                           metax_gpu,
diff --git a/backends/metax_gpu/kernels/custom_kernel/stft_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/stft_kernel_register.cu
index 5376f5f4e9..63c68cf295 100644
--- a/backends/metax_gpu/kernels/custom_kernel/stft_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/stft_kernel_register.cu
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/impl/stft_kernel_impl.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/stft_kernel_impl.h"
 #include "paddle/phi/kernels/stft_kernel.h"
 
 PD_REGISTER_PLUGIN_KERNEL(
diff --git a/backends/metax_gpu/kernels/custom_kernel/triangular_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/custom_kernel/triangular_solve_grad_kernel_register.cu
index 116a453382..b7bb40193a 100644
--- a/backends/metax_gpu/kernels/custom_kernel/triangular_solve_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/custom_kernel/triangular_solve_grad_kernel_register.cu
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/impl/triangular_solve_grad_kernel_impl.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h"
 
 #ifdef PADDLE_WITH_CUDA
 PD_REGISTER_PLUGIN_KERNEL(triangular_solve_grad,
diff --git a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h
deleted file mode 100644
index 9de0b3aadd..0000000000
--- a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h
+++ /dev/null
@@ -1,131 +0,0 @@
-// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights
-// Reserved.
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-// clang-format off
-#include <type_traits>
-#include "glog/logging.h"
-
-#include "paddle/phi/kernels/addmm_kernel.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/eigen/common.h"
-#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
-// clang-format on
-namespace phi {
-
-template <typename T, size_t D, int MajorType = Eigen::RowMajor>
-using PhiEigenTensor = EigenTensor<T, D, MajorType>;
-
-using Array1 = Eigen::DSizes<int64_t, 1>;
-using Array2 = Eigen::DSizes<int64_t, 2>;
-
-template <typename T, typename Context>
-void AddmmKernel(const Context& dev_ctx,
-                 const DenseTensor& input,
-                 const DenseTensor& x,
-                 const DenseTensor& y,
-                 float beta,
-                 float alpha,
-                 DenseTensor* out) {
-  auto input_dims = input.dims();
-  auto x_dims = x.dims();
-  auto y_dims = y.dims();
-
-  DenseTensor input_2d(input);
-  if (input.dims().size() == 1) {
-    input_dims = {1, input.dims()[0]};
-    input_2d.Resize(input_dims);
-  }
-
-  // broadcast mode check
-  if (x_dims[0] != input_dims[0]) {
-    PADDLE_ENFORCE_EQ(input_dims[0],
-                      1,
-                      errors::InvalidArgument(
-                          "When x_dims[0] is not equal with input_dims[0], "
-                          "input_dims[0] must be 1 but got %s",
-                          input_dims[0]));
-    PADDLE_ENFORCE_EQ(y_dims[1] == input_dims[1] || input_dims[1] == 1,
-                      true,
-                      errors::InvalidArgument(
-                          "The input tensor shape mismatch, input shape=[%s], "
-                          "x shape=[%s], y shape=[%s]",
-                          input_dims,
-                          x_dims,
-                          y_dims));
-  }
-  // broadcast mode check
-  if (y_dims[1] != input_dims[1]) {
-    PADDLE_ENFORCE_EQ(input_dims[1],
-                      1,
-                      errors::InvalidArgument(
-                          "When y_dims[1] is not equal with input_dims[0], "
-                          "input_dims[0] must be 1 but got %s",
-                          input_dims[1]));
-    PADDLE_ENFORCE_EQ(x_dims[0] == input_dims[0] || input_dims[0] == 1,
-                      true,
-                      errors::InvalidArgument(
-                          "The input tensor shape mismatch, input shape=[%s], "
-                          "x shape=[%s], y shape=[%s]",
-                          input_dims,
-                          x_dims,
-                          y_dims));
-  }
-  // broadcast mode check
-  PADDLE_ENFORCE_EQ(
-      x_dims[1],
-      y_dims[0],
-      errors::InvalidArgument(
-          "The input tensor X's width must be equal with matrix Y' height. "
-          "But received X's shape = [%s], Y's shape = [%s].",
-          x_dims[1],
-          y_dims[0]));
-
-  dev_ctx.template Alloc<T>(out);
-  if (out->numel() == 0) return;
-  auto blas = funcs::GetBlas<Context, T>(dev_ctx);
-
-  // calc broadcast dim
-  Array2 bcast_dims;
-  bcast_dims[0] = x_dims[0] / input_dims[0];
-  bcast_dims[1] = y_dims[1] / input_dims[1];
-  VLOG(3) << "bcast_dims=[" << bcast_dims[0] << "," << bcast_dims[1] << "]";
-  // broadcast using eigen
-  const DenseTensor& const_ref_input = input_2d;
-  auto eigen_input = PhiEigenTensor<T, 2>::From(const_ref_input);
-  auto eigen_out = PhiEigenTensor<T, 2>::From(*out);
-  auto& place = *dev_ctx.eigen_device();
-  funcs::EigenBroadcast<std::decay_t<decltype(place)>, T, 2>::Eval(
-      place, eigen_out, eigen_input, bcast_dims);
-
-  T t_alpha = static_cast<T>(alpha);
-  T t_beta = static_cast<T>(beta);
-  blas.GEMM(false,
-            false,
-            x_dims[0],
-            y_dims[1],
-            x_dims[1],
-            t_alpha,
-            x.data<T>(),
-            x_dims[1],
-            y.data<T>(),
-            y_dims[1],
-            t_beta,
-            out->data<T>(),
-            y_dims[1]);
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/baddbmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/baddbmm_kernel_impl.h
deleted file mode 100644
index 1c52ea22e4..0000000000
--- a/backends/metax_gpu/kernels/impl/baddbmm_kernel_impl.h
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <type_traits>
-
-#include "glog/logging.h"
-#include "paddle/phi/common/amp_type_traits.h"
-#include "paddle/phi/kernels/baddbmm_kernel.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/eigen/common.h"
-#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
-
-namespace phi {
-
-template <typename T,
-          size_t D,
-          int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using PhiEigenTensor = EigenTensor<T, D, MajorType, IndexType>;
-
-using Array1 = Eigen::DSizes<Eigen::DenseIndex, 1>;
-using Array2 = Eigen::DSizes<Eigen::DenseIndex, 2>;
-using Array3 = Eigen::DSizes<Eigen::DenseIndex, 3>;
-
-template <typename T, typename Context>
-void BaddbmmKernel(const Context& dev_ctx,
-                   const DenseTensor& input,
-                   const DenseTensor& x,
-                   const DenseTensor& y,
-                   float beta,
-                   float alpha,
-                   DenseTensor* out) {
-  auto input_dims = input.dims();
-  auto x_dims = x.dims();
-  auto y_dims = y.dims();
-
-  DenseTensor input_3d(input);
-  if (input.dims().size() == 2) {
-    input_dims = {input.dims()[0], 1, input.dims()[1]};
-    input_3d.Resize(input_dims);
-  }
-
-  // broadcast mode check
-  if (x_dims[0] != input_dims[0]) {
-    PADDLE_ENFORCE_EQ(input_dims[0],
-                      1,
-                      errors::InvalidArgument(
-                          "When x_dims[0] is not equal with input_dims[0], "
-                          "input_dims[0] must be 1 but got %s",
-                          input_dims[0]));
-    PADDLE_ENFORCE_EQ(
-        (x_dims[1] == input_dims[1] || input_dims[1] == 1) &&
-            (y_dims[2] == input_dims[2] || input_dims[2] == 1),
-        true,
-        errors::InvalidArgument(
-            "When x_dims[0] is not equal with input_dims[0], "
-            "x_dims[1] and y_dims[2] must be equal with input_dims[1] and "
-            "input_dims[2] respectively, or input_dims[1] and input_dims[2] "
-            "must be 1. But got x_dims[1] = %s, input_dims[1] = %s, y_dims[2] "
-            "= %s, input_dims[2] = %s",
-            x_dims[1],
-            input_dims[1],
-            y_dims[2],
-            input_dims[2]));
-  }
-
-  if (x_dims[1] != input_dims[1]) {
-    PADDLE_ENFORCE_EQ(input_dims[1],
-                      1,
-                      errors::InvalidArgument(
-                          "When x_dims[1] is not equal with input_dims[1], "
-                          "input_dims[1] must be 1 but got %s",
-                          input_dims[1]));
-    PADDLE_ENFORCE_EQ(
-        (x_dims[0] == input_dims[0] || input_dims[0] == 1) &&
-            (y_dims[2] == input_dims[2] || input_dims[2] == 1),
-        true,
-        errors::InvalidArgument(
-            "When x_dims[1] is not equal with input_dims[1], "
-            "x_dims[0] and y_dims[2] must be equal with input_dims[0] and "
-            "input_dims[2] respectively, or input_dims[0] and input_dims[2] "
-            "must be 1. But got x_dims[0] = %s, input_dims[0] = %s, y_dims[2] "
-            "= %s, input_dims[2] = %s",
-            x_dims[0],
-            input_dims[0],
-            y_dims[2],
-            input_dims[2]));
-  }
-
-  if (y_dims[2] != input_dims[2]) {
-    PADDLE_ENFORCE_EQ(input_dims[2],
-                      1,
-                      errors::InvalidArgument(
-                          "When y_dims[2] is not equal with input_dims[2], "
-                          "input_dims[2] must be 1 but got %s",
-                          input_dims[2]));
-    PADDLE_ENFORCE_EQ(
-        (x_dims[0] == input_dims[0] || input_dims[0] == 1) &&
-            (x_dims[1] == input_dims[1] || input_dims[1] == 1),
-        true,
-        errors::InvalidArgument(
-            "When y_dims[2] is not equal with input_dims[2], "
-            "x_dims[0] and x_dims[1] must be equal with input_dims[0] and "
-            "input_dims[1] respectively, or input_dims[0] and input_dims[1] "
-            "must be 1. But got x_dims[0] = %s, input_dims[0] = %s, x_dims[1] "
-            "= %s, input_dims[1] = %s",
-            x_dims[0],
-            input_dims[0],
-            x_dims[1],
-            input_dims[1]));
-  }
-  PADDLE_ENFORCE_EQ(
-      x_dims[2],
-      y_dims[1],
-      errors::InvalidArgument(
-          "The input tensor X's width must be equal with matrix Y' height. "
-          "But received X's shape = [%s], Y's shape = [%s].",
-          x_dims[2],
-          y_dims[1]));
-
-  dev_ctx.template Alloc<T>(out);
-  auto blas = funcs::GetBlas<Context, T>(dev_ctx);
-
-  // calc broadcast dim
-  Array3 bcast_dims;
-  bcast_dims[0] = x_dims[0] / input_dims[0];
-  bcast_dims[1] = x_dims[1] / input_dims[1];
-  bcast_dims[2] = y_dims[2] / input_dims[2];
-  VLOG(3) << "bcast_dims=[" << bcast_dims[0] << "," << bcast_dims[1] << ","
-          << bcast_dims[2] << "]";
-
-  // broadcast using eigen
-  const DenseTensor& const_ref_input = input_3d;
-  auto eigen_input = PhiEigenTensor<T, 3>::From(const_ref_input);
-  auto eigen_out = PhiEigenTensor<T, 3>::From(*out);
-  auto& place = *dev_ctx.eigen_device();
-  funcs::EigenBroadcast<std::decay_t<decltype(place)>, T, 3>::Eval(
-      place, eigen_out, eigen_input, bcast_dims);
-
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-
-  // special case for MPType
-  if constexpr (std::is_same_v<MPType, float>) {
-    VLOG(4) << "Function: baddbmm, Type of T: " << typeid(T).name();
-    VLOG(4) << "Function: baddbmm, Type of MPType: " << typeid(MPType).name();
-    float t_alpha = alpha;
-    float t_beta = beta;
-    if (x_dims[0] == 1) {
-      blas.GEMM(CblasNoTrans,
-                CblasNoTrans,
-                x_dims[1],
-                y_dims[2],
-                x_dims[2],
-                t_alpha,
-                x.data<T>(),
-                y.data<T>(),
-                t_beta,
-                out->data<T>());
-    } else {
-      blas.BatchedGEMM(CblasNoTrans,
-                       CblasNoTrans,
-                       x_dims[1],
-                       y_dims[2],
-                       x_dims[2],
-                       t_alpha,
-                       x.data<T>(),
-                       y.data<T>(),
-                       t_beta,
-                       out->data<T>(),
-                       x_dims[0],
-                       x_dims[1] * x_dims[2],
-                       x_dims[2] * y_dims[2]);
-    }
-  } else {
-    T t_alpha = static_cast<T>(alpha);
-    T t_beta = static_cast<T>(beta);
-    if (x_dims[0] == 1) {
-      blas.GEMM(CblasNoTrans,
-                CblasNoTrans,
-                x_dims[1],
-                y_dims[2],
-                x_dims[2],
-                t_alpha,
-                x.data<T>(),
-                y.data<T>(),
-                t_beta,
-                out->data<T>());
-    } else {
-      blas.BatchedGEMM(CblasNoTrans,
-                       CblasNoTrans,
-                       x_dims[1],
-                       y_dims[2],
-                       x_dims[2],
-                       t_alpha,
-                       x.data<T>(),
-                       y.data<T>(),
-                       t_beta,
-                       out->data<T>(),
-                       x_dims[0],
-                       x_dims[1] * x_dims[2],
-                       x_dims[2] * y_dims[2]);
-      // x_dims[2] == y_dims[1]
-    }
-  }
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/bilinear_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/bilinear_grad_kernel_impl.h
deleted file mode 100644
index b64f94bc7e..0000000000
--- a/backends/metax_gpu/kernels/impl/bilinear_grad_kernel_impl.h
+++ /dev/null
@@ -1,144 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/eigen/common.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void BilinearGradKernel(const Context& dev_ctx,
-                        const DenseTensor& x,
-                        const DenseTensor& y,
-                        const DenseTensor& weight,
-                        const DenseTensor& dout,
-                        DenseTensor* dx,
-                        DenseTensor* dy,
-                        DenseTensor* dweight,
-                        DenseTensor* dbias) {
-  auto batch_size = x.dims()[0];
-  auto weight_dims = weight.dims();
-  int out_dim = weight_dims[0];
-  auto x_dim = weight_dims[1];
-  auto y_dim = weight_dims[2];
-
-  auto x_mat = EigenMatrix<T>::From(x);
-  auto y_mat = EigenMatrix<T>::From(y);
-  auto dout_mat = EigenMatrix<T>::From(dout);
-  auto& place = *dev_ctx.eigen_device();
-  // Create the intermediate variable to calculate the Output(Y@Grad).
-  DenseTensor x_scale;
-  x_scale.Resize(common::make_ddim({batch_size, x_dim}));
-  dev_ctx.template Alloc<T>(&x_scale);
-  auto x_scale_mat = EigenMatrix<T>::From(x_scale);
-
-  // Create the intermediate variable to calculate the Output(X@Grad).
-  DenseTensor y_scale;
-  y_scale.Resize(common::make_ddim({batch_size, y_dim}));
-  dev_ctx.template Alloc<T>(&y_scale);
-  auto y_scale_mat = EigenMatrix<T>::From(y_scale);
-
-  funcs::SetConstant<Context, T> set_zero;
-
-  if (dx) {
-    dev_ctx.template Alloc<T>(dx);
-    set_zero(dev_ctx, dx, static_cast<T>(0));
-  }
-
-  if (dy) {
-    dev_ctx.template Alloc<T>(dy);
-    set_zero(dev_ctx, dy, static_cast<T>(0));
-  }
-
-  if (dweight) {
-    dev_ctx.template Alloc<T>(dweight);
-  }
-
-  auto blas = funcs::GetBlas<Context, T>(dev_ctx);
-
-  // Calculate the Output(X@Grad) and Output(Y@Grad).
-  if (dx || dy || dweight) {
-    Eigen::DSizes<int, 2> bcast_for_x(1, y_dim);
-    Eigen::DSizes<int, 2> bcast_for_y(1, x_dim);
-    Eigen::DSizes<int, 2> bcast_for_weight(1, x_dim);
-
-    for (int i = 0; i < out_dim; ++i) {
-      DenseTensor weight_i =
-          weight.Slice(i, i + 1).Resize(common::make_ddim({x_dim, y_dim}));
-      auto output_vec = dout_mat.chip(i, 1);
-
-      if (dx) {
-        y_scale_mat.device(place) =
-            output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
-                .broadcast(bcast_for_x) *
-            y_mat;
-        blas.GEMM(CblasNoTrans,
-                  CblasTrans,
-                  batch_size,
-                  x_dim,
-                  y_dim,
-                  1,
-                  y_scale.data<T>(),
-                  weight_i.data<T>(),
-                  1,
-                  dx->data<T>());
-      }
-
-      if (dy || dweight) {
-        auto output_vec_y =
-            output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
-                .broadcast(bcast_for_y);
-        x_scale_mat.device(place) = output_vec_y * x_mat;
-        if (dy) {
-          blas.GEMM(CblasNoTrans,
-                    CblasNoTrans,
-                    batch_size,
-                    y_dim,
-                    x_dim,
-                    1,
-                    x_scale.data<T>(),
-                    weight_i.data<T>(),
-                    1,
-                    dy->data<T>());
-        }
-        if (dweight) {
-          DenseTensor dweight_i = dweight->Slice(i, i + 1).Resize(
-              common::make_ddim({x_dim, y_dim}));
-          blas.GEMM(CblasTrans,
-                    CblasNoTrans,
-                    x_dim,
-                    y_dim,
-                    batch_size,
-                    1,
-                    x_scale.data<T>(),
-                    y.data<T>(),
-                    0,
-                    dweight_i.data<T>());
-        }
-      }
-    }
-  }
-
-  // calculate the gradient of Input(Bias).
-  if (dbias) {
-    dev_ctx.template Alloc<T>(dbias);
-    auto dbias_mat = EigenVector<T>::Flatten(*dbias);
-    dbias_mat.device(place) = dout_mat.sum(Eigen::DSizes<int, 1>(0));
-  }
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/bilinear_kernel_impl.h b/backends/metax_gpu/kernels/impl/bilinear_kernel_impl.h
deleted file mode 100644
index 48861d4893..0000000000
--- a/backends/metax_gpu/kernels/impl/bilinear_kernel_impl.h
+++ /dev/null
@@ -1,75 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/eigen/common.h"
-#include "paddle/utils/optional.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void BilinearKernel(const Context& dev_ctx,
-                    const DenseTensor& x,
-                    const DenseTensor& y,
-                    const DenseTensor& weight,
-                    const paddle::optional<DenseTensor>& bias,
-                    DenseTensor* out) {
-  dev_ctx.template Alloc<T>(out);
-
-  auto y_mat = EigenMatrix<T>::From(y);
-  auto output_mat = EigenMatrix<T>::From(*out);
-
-  auto batch_size = x.dims()[0];
-  auto weight_dims = weight.dims();
-  int out_dim = weight_dims[0];
-  auto x_dim = weight_dims[1];
-  auto y_dim = weight_dims[2];
-  auto& place = *dev_ctx.eigen_device();
-
-  // Create the intermediate variable to calculate the result of
-  // Input(X) multiplied by Input(Weight_i), the formula is:
-  // left_mul = X Weight_i.
-  DenseTensor left_mul;
-  left_mul.Resize(common::make_ddim({batch_size, y_dim}));
-  dev_ctx.template Alloc<T>(&left_mul);
-  auto left_mul_mat = EigenMatrix<T>::From(left_mul);
-
-  for (int i = 0; i < out_dim; ++i) {
-    auto output_col_vec = output_mat.chip(i, 1);
-    DenseTensor weight_mat =
-        weight.Slice(i, i + 1).Resize(common::make_ddim({x_dim, y_dim}));
-    phi::funcs::GetBlas<Context, T>(dev_ctx).GEMM(CblasNoTrans,
-                                                  CblasNoTrans,
-                                                  batch_size,
-                                                  y_dim,
-                                                  x_dim,
-                                                  1,
-                                                  x.data<T>(),
-                                                  weight_mat.data<T>(),
-                                                  0,
-                                                  left_mul.data<T>());
-    output_col_vec.device(place) =
-        (left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
-  }
-  if (bias.get_ptr()) {
-    auto bias_vec = EigenMatrix<T>::From(*(bias.get_ptr()));
-    Eigen::DSizes<int, 2> bcast(batch_size, 1);
-    output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat;
-  }
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/bmm_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/bmm_grad_kernel_impl.h
deleted file mode 100644
index cd5978ae59..0000000000
--- a/backends/metax_gpu/kernels/impl/bmm_grad_kernel_impl.h
+++ /dev/null
@@ -1,107 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/kernels/bmm_grad_kernel.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void MatMul(const Context& dev_ctx,
-            const DenseTensor& a,
-            bool trans_a,
-            const DenseTensor& b,
-            bool trans_b,
-            DenseTensor* out) {
-  dev_ctx.template Alloc<T>(out);
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-  auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a);
-  auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b.dims(), 0, trans_b);
-
-  blas.MatMul(a, mat_dim_a, b, mat_dim_b, T(1), out, T(0));
-}
-
-template <typename T, typename Context>
-void CalcInputGrad(const Context& dev_ctx,
-                   const DenseTensor& a,
-                   bool trans_a,
-                   const DenseTensor& b,
-                   bool trans_b,
-                   DenseTensor* out) {
-  if (out == nullptr) return;
-  MatMul<T, Context>(dev_ctx, a, trans_a, b, trans_b, out);
-}
-
-template <typename T, typename Context>
-void BmmGradKernel(const Context& dev_ctx,
-                   const DenseTensor& x,
-                   const DenseTensor& y,
-                   const DenseTensor& out_grad,
-                   DenseTensor* x_grad,
-                   DenseTensor* y_grad) {
-  if (x_grad && x_grad->numel() == 0) {
-    dev_ctx.template Alloc<T>(x_grad);
-    phi::Full<T, Context>(
-        dev_ctx, phi::IntArray(common::vectorize(y.dims())), 0, y_grad);
-    return;
-  }
-  if (y_grad && y_grad->numel() == 0) {
-    dev_ctx.template Alloc<T>(y_grad);
-    phi::Full<T, Context>(
-        dev_ctx, phi::IntArray(common::vectorize(x.dims())), 0, x_grad);
-    return;
-  }
-  DenseTensor x_help = x;
-  DenseTensor y_help = y;
-  DenseTensor out_grad_help = out_grad;
-  ReshapeXYOutIntoMatrixSequence(
-      &x_help, &y_help, &out_grad_help, false, false);
-
-  phi::DDim dx_dims;
-  if (x_grad) {
-    dx_dims = x_grad->dims();
-    if (dx_dims != x_help.dims()) {
-      x_grad->Resize(x_help.dims());
-    }
-  }
-
-  phi::DDim dy_dims;
-  if (y_grad) {
-    dy_dims = y_grad->dims();
-    if (dy_dims != y_help.dims()) {
-      y_grad->Resize(y_help.dims());
-    }
-  }
-
-  CalcInputGrad<T, Context>(
-      dev_ctx, out_grad_help, false, y_help, true, x_grad);
-  CalcInputGrad<T, Context>(
-      dev_ctx, x_help, true, out_grad_help, false, y_grad);
-
-  if (x_grad) {
-    if (dx_dims != x_help.dims()) {
-      x_grad->Resize(dx_dims);
-    }
-  }
-  if (y_grad) {
-    if (dy_dims != y_help.dims()) {
-      y_grad->Resize(dy_dims);
-    }
-  }
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/bmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/bmm_kernel_impl.h
deleted file mode 100644
index ce493b4908..0000000000
--- a/backends/metax_gpu/kernels/impl/bmm_kernel_impl.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/kernels/bmm_kernel.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void BmmKernel(const Context& dev_ctx,
-               const DenseTensor& x,
-               const DenseTensor& y,
-               DenseTensor* out) {
-  dev_ctx.template Alloc<T>(out);
-
-  if (x.numel() == 0 || y.numel() == 0) {
-    return;
-  }
-
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-
-  auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(x.dims(), 0, false);
-  auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(y.dims(), 0, false);
-
-  blas.MatMul(x, mat_dim_a, y, mat_dim_b, T(1), out, T(0));
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/cholesky_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/cholesky_grad_kernel_impl.h
deleted file mode 100644
index 5d146dae8d..0000000000
--- a/backends/metax_gpu/kernels/impl/cholesky_grad_kernel_impl.h
+++ /dev/null
@@ -1,339 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/phi/kernels/cholesky_grad_kernel.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/for_range.h"
-
-namespace phi {
-
-template <typename Context, typename T>
-inline void TransCompute(const int dim,
-                         const Context& dev_ctx,
-                         const DenseTensor& in,
-                         DenseTensor* out,
-                         const std::vector<int>& axis) {
-  switch (dim) {
-    case 1:
-      funcs::Transpose<Context, T, 1> trans1;
-      trans1(dev_ctx, in, out, axis);
-      break;
-    case 2:
-      funcs::Transpose<Context, T, 2> trans2;
-      trans2(dev_ctx, in, out, axis);
-      break;
-    case 3:
-      funcs::Transpose<Context, T, 3> trans3;
-      trans3(dev_ctx, in, out, axis);
-      break;
-    case 4:
-      funcs::Transpose<Context, T, 4> trans4;
-      trans4(dev_ctx, in, out, axis);
-      break;
-    case 5:
-      funcs::Transpose<Context, T, 5> trans5;
-      trans5(dev_ctx, in, out, axis);
-      break;
-    case 6:
-      funcs::Transpose<Context, T, 6> trans6;
-      trans6(dev_ctx, in, out, axis);
-      break;
-    default:
-      // for dim >= 7 situation
-      funcs::TransposeNormal<Context, T> trans_normal;
-      trans_normal(dev_ctx, in, out, axis);
-  }
-}
-
-/*! Use these functors to implement tril, triu, diagonal and other operators */
-template <typename T>
-struct EyeFunctor {
-  EyeFunctor(const int m, const int n, T* output)
-      : m_(m), n_(n), output_(output) {}
-
-  HOSTDEVICE void operator()(size_t index) const {
-    const int global_row = index / n_;
-    const int col = index - global_row * n_;
-    const int batch = global_row / m_;
-    const int row = global_row - batch * m_;
-    output_[index] = col == row ? static_cast<T>(1) : static_cast<T>(0);
-  }
-
-  const int m_, n_;
-  T* output_;
-};
-
-template <typename T>
-struct MatrixSetDiagFunctor {
-  /*! Overwrite specified diagonals of output by the values in diagonal.
-   * diagonals can be a central band specified by num_diags and
-   * upper_diag_index, where upper_diag_index=0 refers to the main diagonal,
-   * positive value means superdiagonal and negative value means subdiagonal.
-   * When it is a band, `diag` has a shape [i, j, ..., num_diags, max_diag_len]
-   * and the num_diags diagonals has a up to down layout. Otherwise it has a
-   * shape [i, j, ..., max_diag_len].
-   */
-  MatrixSetDiagFunctor(const int m,
-                       const int n,
-                       const int num_diags,
-                       const int max_diag_len,
-                       const int upper_diag_index,
-                       const T* diag,
-                       T* output)
-      : m_(m),
-        n_(n),
-        num_diags_(num_diags),
-        max_diag_len_(max_diag_len),
-        upper_diag_index_(upper_diag_index),
-        diag_(diag),
-        output_(output) {}
-
-  HOSTDEVICE void operator()(size_t index) const {
-    const int batch_and_diag_index = index / max_diag_len_;
-    const int index_in_the_diagonal =
-        index - batch_and_diag_index * max_diag_len_;
-    const int batch = batch_and_diag_index / num_diags_;
-    const int diag_index_in_input = batch_and_diag_index - batch * num_diags_;
-    // diag_index=0 refers to the main diagonal
-    const int diag_index = upper_diag_index_ - diag_index_in_input;
-    // shift down for subdiagonal if diag_index < 0
-    const int y_index =
-        index_in_the_diagonal + (0 > -diag_index ? 0 : -diag_index);
-    // shift right for superdiagonal if diag_index > 0
-    const int x_index =
-        index_in_the_diagonal + (0 > diag_index ? 0 : diag_index);
-
-    // Upper-bound checks for diagonals shorter than max_diag_len.
-    // y_index and x_index are nonnegative by construction.
-    if (y_index < m_ && x_index < n_) {
-      const int out_index = batch * m_ * n_ + y_index * n_ + x_index;
-      output_[out_index] = diag_[index];
-    }
-  }
-
-  const int m_, n_, num_diags_, max_diag_len_, upper_diag_index_;
-  const T* diag_;
-  T* output_;
-};
-
-template <typename T>
-struct MatrixDiagPartFunctor {
-  /*! Similar to MatrixSetDiagFunctor but return the diagonals. diag_index=0
-   * refers to the main diagonal, positive value means superdiagonal and
-   * negative value means subdiagonal */
-  MatrixDiagPartFunctor(const int m,
-                        const int n,
-                        const int num_diags,
-                        const int max_diag_len,
-                        const int upper_diag_index,
-                        const T padding,
-                        const T* input,
-                        T* output)
-      : m_(m),
-        n_(n),
-        num_diags_(num_diags),
-        max_diag_len_(max_diag_len),
-        upper_diag_index_(upper_diag_index),
-        input_(input),
-        output_(output) {}
-
-  HOSTDEVICE void operator()(size_t index) const {
-    const int batch_and_mapped_diag_index = index / max_diag_len_;
-    const int index_in_the_diagonal =
-        index - batch_and_mapped_diag_index * max_diag_len_;
-    const int batch = batch_and_mapped_diag_index / num_diags_;
-    const int mapped_diag_index =
-        batch_and_mapped_diag_index - batch * num_diags_;
-    // diag_index=0 refers to the main diagonal
-    const int diag_index = upper_diag_index_ - mapped_diag_index;
-    // shift down for subdiagonal if diag_index < 0
-    const int y_index =
-        index_in_the_diagonal + (0 > -diag_index ? 0 : -diag_index);
-    // shift right for superdiagonal if diag_index > 0
-    const int x_index =
-        index_in_the_diagonal + (0 > diag_index ? 0 : diag_index);
-    if (y_index < m_ && x_index < n_) {
-      output_[index] = input_[batch * m_ * n_ + y_index * m_ + x_index];
-    } else {
-      output_[index] = padding_;
-    }
-  }
-
-  const int m_, n_, num_diags_, max_diag_len_, upper_diag_index_;
-  const T padding_;
-  const T* input_;
-  T* output_;
-};
-
-template <typename T>
-struct MatrixBandPartScaleEndFunctor {
-  /*! Compared with MatrixBandPartFunctor, it scale up values at the end of
-   * band. It can be used to fuse the following operations, which actually
-   * output triangular with diagonal scaled up:
-   * 1. dig = matrix_diag_part(middle)
-   * 2. middle = matrix_set_diag(middle, diag * scalar)
-   * 3. middle = matrix_band_part(middle, -1, 0)
-   */
-  MatrixBandPartScaleEndFunctor(const int m,
-                                const int n,
-                                const int num_lower_diags,
-                                const int num_upper_diags,
-                                const T scale,
-                                const T* input,
-                                T* output)
-      : m_(m),
-        n_(n),
-        num_lower_diags_(num_lower_diags),
-        num_upper_diags_(num_upper_diags),
-        scale_(scale),
-        input_(input),
-        output_(output) {}
-
-  HOSTDEVICE void operator()(size_t index) const {
-    const int col = index % n_;
-    const int row = (index / n_) % m_;
-    const int band_start = (num_lower_diags_ < 0 ? 0 : row - num_lower_diags_);
-    const int band_end =
-        (num_upper_diags_ < 0 ? n_ : row + num_upper_diags_ + 1);
-    if (col < band_start || col >= band_end) {
-      output_[index] = 0;
-    } else if (col == band_end - 1) {
-      output_[index] = scale_ * input_[index];
-    } else {
-      output_[index] = input_[index];
-    }
-  }
-
-  const int m_, n_, num_lower_diags_, num_upper_diags_;
-  const T scale_;
-  const T* input_;
-  T* output_;
-};
-
-template <typename T>
-struct AddtoScaleFunctor {
-  AddtoScaleFunctor(const T scale, const T* input, T* output)
-      : scale_(scale), input_(input), output_(output) {}
-  HOSTDEVICE void operator()(size_t index) const {
-    output_[index] += input_[index];
-    output_[index] *= scale_;
-  }
-  const T scale_;
-  const T* input_;
-  T* output_;
-};
-
-template <typename T, typename Context>
-void CholeskyGradKernel(const Context& dev_ctx,
-                        const DenseTensor& out,
-                        const DenseTensor& out_grad,
-                        bool upper,
-                        DenseTensor* x_grad) {
-  if (x_grad->numel() == 0) {
-    dev_ctx.template Alloc<T>(x_grad);
-    return;
-  }
-
-  auto* x_grad_data = dev_ctx.template Alloc<T>(x_grad);
-  auto& dims = out.dims();
-  int batch_count = 1;
-  for (int i = 0; i < dims.size() - 2; i++) {
-    batch_count *= dims[i];
-  }
-  auto m = dims[dims.size() - 1];
-  int tensor_size = batch_count * m * m;
-
-  std::vector<int> axis(dims.size() - 2);
-  std::iota(axis.begin(), axis.end(), 0);
-  axis.insert(axis.end(), {dims.size() - 1, dims.size() - 2});
-  DenseTensor l, l_grad;
-  if (upper) {
-    l.Resize(dims);
-    dev_ctx.template Alloc<T>(&l);
-    l_grad.Resize(dims);
-    dev_ctx.template Alloc<T>(&l_grad);
-    TransCompute<Context, T>(dims.size(), dev_ctx, out, &l, axis);
-    TransCompute<Context, T>(dims.size(), dev_ctx, out_grad, &l_grad, axis);
-  } else {
-    l = out;
-    l_grad = out_grad;
-  }
-  auto* l_data = l.data<T>();
-
-  /*！ refer to Iain Murray (2016); arXiv 1602.07527 */
-  /*! phi = matmul(L.transpose(-1, -2), grad) */
-  DenseTensor middle;
-  middle.Resize(dims);
-  auto* middle_data = dev_ctx.template Alloc<T>(&middle);
-  auto trans_desc = funcs::CreateMatrixDescriptor(dims, 0, true);
-  auto no_trans_desc = funcs::CreateMatrixDescriptor(dims, 0, false);
-  auto blas = funcs::GetBlas<Context, T>(dev_ctx);
-  blas.MatMul(l, trans_desc, l_grad, no_trans_desc, T(1), &middle, T(0));
-
-  /*! phi.tril_().diagonal(0, -2, -1).mul_(0.5) */
-  phi::funcs::ForRange<Context> for_range(dev_ctx, tensor_size);
-  MatrixBandPartScaleEndFunctor<T> matrix_band_part_scale_end_functor(
-      m,
-      m,
-      /* num_lower_diags */ m,
-      /* num_upper_diags */ 0,
-      /* scale */ 0.5,
-      middle_data,
-      middle_data);
-  for_range(matrix_band_part_scale_end_functor);
-
-  // Compute inverse by solving the triangular linear system AX = B, where B
-  // is the identity matrix. The matrix X would be overwritten on B
-  DenseTensor identity;
-  identity.Resize(dims);
-  auto* identity_data = dev_ctx.template Alloc<T>(&identity);
-  EyeFunctor<T> eye_functor(m, m, identity_data);
-  for_range(eye_functor);
-  // TODO(guosheng): use trsmBatched for GPU
-  for (int i = 0; i < batch_count; i++) {
-    blas.TRSM(/*side*/ CblasLeft,
-              /*uplo*/ CblasLower,
-              /*trans*/ CblasNoTrans,
-              /*diag*/ CblasNonUnit,
-              /*m*/ m,
-              /*n*/ m,
-              /*alpha*/ T(1),
-              l_data + i * m * m,
-              /*lda*/ m,
-              identity_data + i * m * m,
-              /*ldb*/ m);
-  }
-  DenseTensor& l_inverse = identity;
-
-  /*! x_grad = matmul(matmul(L_inverse.transpose(-1, -2), phi), L_inverse) */
-  DenseTensor middle1;
-  middle1.Resize(dims);
-  dev_ctx.template Alloc<T>(&middle1);
-  blas.MatMul(
-      l_inverse, trans_desc, middle, no_trans_desc, T(1), &middle1, T(0));
-  blas.MatMul(
-      middle1, no_trans_desc, l_inverse, no_trans_desc, T(1), x_grad, T(0));
-
-  /*! x_grad.add(x_grad.transpose(-1, -2)).mul_(0.5) */
-  DenseTensor x_grad_trans;
-  x_grad_trans.Resize(dims);
-  auto* x_grad_trans_data = dev_ctx.template Alloc<T>(&x_grad_trans);
-  TransCompute<Context, T>(dims.size(), dev_ctx, *x_grad, &x_grad_trans, axis);
-  AddtoScaleFunctor<T> addto_scale_functor(0.5, x_grad_trans_data, x_grad_data);
-  for_range(addto_scale_functor);
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/cholesky_solve_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/cholesky_solve_grad_kernel_impl.h
deleted file mode 100644
index 098092767c..0000000000
--- a/backends/metax_gpu/kernels/impl/cholesky_solve_grad_kernel_impl.h
+++ /dev/null
@@ -1,148 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/kernels/cholesky_solve_grad_kernel.h"
-#include "paddle/phi/kernels/cholesky_solve_kernel.h"
-#include "paddle/phi/kernels/complex_kernel.h"
-#include "paddle/phi/kernels/elementwise_add_kernel.h"
-#include "paddle/phi/kernels/empty_kernel.h"
-#include "paddle/phi/kernels/expand_kernel.h"
-#include "paddle/phi/kernels/full_kernel.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/common_shape.h"
-#include "paddle/phi/kernels/funcs/complex_functors.h"
-#include "paddle/phi/kernels/funcs/for_range.h"
-#include "paddle/phi/kernels/funcs/matrix_reduce.h"
-#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
-#include "paddle/phi/kernels/transpose_kernel.h"
-namespace phi {
-
-template <typename T, typename Context>
-void CholeskySolveGradKernel(const Context& dev_ctx,
-                             const DenseTensor& x,
-                             const DenseTensor& y,
-                             const DenseTensor& out,
-                             const DenseTensor& dout,
-                             bool upper,
-                             DenseTensor* dx,
-                             DenseTensor* dy) {
-  if (dout.numel() == 0) {
-    if (dx) {
-      dev_ctx.template Alloc<T>(dx);
-      if (dx->numel() != 0) {
-        phi::Full<T, Context>(
-            dev_ctx, phi::IntArray(common::vectorize(dx->dims())), 0, dx);
-      }
-    }
-    if (dy) {
-      dev_ctx.template Alloc<T>(dy);
-      if (dy->numel() != 0) {
-        phi::Full<T, Context>(
-            dev_ctx, phi::IntArray(common::vectorize(dy->dims())), 0, dy);
-      }
-    }
-    return;
-  }
-  // get broadcast dim
-  std::vector<int64_t> x_bst_dims_vec;
-  std::vector<int64_t> y_bst_dims_vec;
-  std::tie(x_bst_dims_vec, y_bst_dims_vec) =
-      funcs::MatrixGetBroadcastDims(x, y);
-  IntArray x_bst_dims(x_bst_dims_vec);
-  IntArray y_bst_dims(y_bst_dims_vec);
-
-  // Tensor broadcast to temp 'y_bst'
-  DenseTensor y_bst = phi::Empty<T, Context>(dev_ctx, y_bst_dims);
-  ExpandKernel<T, Context>(dev_ctx, y, y_bst_dims, &y_bst);
-
-  // reuse forward to calculate dx_bst, which is broad_cast of dx
-  DenseTensor dx_bst = phi::Empty<T, Context>(dev_ctx, x_bst_dims);
-  CholeskySolveKernel<T, Context>(dev_ctx, dout, y_bst, upper, &dx_bst);
-
-  // get 'dx' according to 'dx_bst'
-  dx->Resize(x.dims());
-  dev_ctx.template Alloc<T>(dx);
-  if (dx_bst.dims() == x.dims()) {
-    Copy<Context>(dev_ctx, dx_bst, dev_ctx.GetPlace(), false, dx);
-  } else {
-    funcs::MatrixReduceSumFunctor<T, Context> functor;
-    functor(dev_ctx, dx_bst, dx);
-    dx->Resize(x.dims());
-  }
-
-  // calculate out's conjugate for complex
-  DenseTensor out_conj = Conj<T, Context>(dev_ctx, out);
-  out_conj = phi::TransposeLast2Dim<T>(dev_ctx, out_conj);
-
-  DenseTensor commonterm = phi::Empty<T, Context>(dev_ctx, y_bst_dims);
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-  blas.MatMul(dx_bst,
-              phi::funcs::CreateMatrixDescriptor(dx_bst.dims(), 0, false),
-              out_conj,
-              phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, false),
-              static_cast<T>(1),
-              &commonterm,
-              static_cast<T>(0));
-
-  // calculate commonterm's conjugate for complex
-  DenseTensor commonterm_conj = Conj<T, Context>(dev_ctx, commonterm);
-  commonterm_conj = phi::TransposeLast2Dim<T>(dev_ctx, commonterm_conj);
-
-  phi::AddKernel<T>(dev_ctx, commonterm, commonterm_conj, &commonterm);
-
-  DenseTensor dy_bst = phi::Empty<T, Context>(dev_ctx, y_bst_dims);
-  if (upper) {
-    blas.MatMul(y_bst,
-                phi::funcs::CreateMatrixDescriptor(y_bst.dims(), 0, false),
-                commonterm,
-                phi::funcs::CreateMatrixDescriptor(commonterm.dims(), 0, false),
-                static_cast<T>(-1),
-                &dy_bst,
-                static_cast<T>(0));
-  } else {
-    blas.MatMul(commonterm,
-                phi::funcs::CreateMatrixDescriptor(commonterm.dims(), 0, false),
-                y_bst,
-                phi::funcs::CreateMatrixDescriptor(y_bst.dims(), 0, false),
-                static_cast<T>(-1),
-                &dy_bst,
-                static_cast<T>(0));
-  }
-
-  // get upper or lower of 'dy_bst'
-  DenseTensor dy_bst_upper = phi::Empty<T, Context>(dev_ctx, y_bst_dims);
-
-  int y_bst_ndim = y_bst_dims_vec.size();
-  const auto H = y_bst_dims_vec[y_bst_ndim - 2];
-  const auto W = y_bst_dims_vec[y_bst_ndim - 1];
-  phi::funcs::ForRange<Context> y_for_range(dev_ctx, dy_bst.numel());
-  phi::funcs::TrilTriuCompute<T> tril_triu_functor(
-      dy_bst.data<T>(), 0, !upper, H, W, dy_bst_upper.data<T>());
-  y_for_range(tril_triu_functor);
-
-  // get 'dy' according to 'dy_bst'
-  dy->Resize(y.dims());
-  dev_ctx.template Alloc<T>(dy);
-  if (dy_bst_upper.dims() == y.dims()) {
-    Copy<Context>(dev_ctx, dy_bst_upper, dev_ctx.GetPlace(), false, dy);
-  } else {
-    funcs::MatrixReduceSumFunctor<T, Context> functor;
-    functor(dev_ctx, dy_bst_upper, dy);
-    dy->Resize(y.dims());
-  }
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/conv_cudnn_impl.h b/backends/metax_gpu/kernels/impl/conv_cudnn_impl.h
deleted file mode 100644
index 920881af72..0000000000
--- a/backends/metax_gpu/kernels/impl/conv_cudnn_impl.h
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/kernel_registry.h"
-#ifdef PADDLE_WITH_HIP
-#include "paddle/phi/kernels/gpudnn/conv_miopen_helper.h"
-#else
-#include "gpudnn/conv_cudnn_v7.h"
-#endif
-
-#include "paddle/phi/backends/dynload/cudnn.h"
-#include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h"
-#include "paddle/phi/common/float16.h"
-#include "paddle/phi/kernels/cpu/conv_util.h"
-#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
-#include "paddle/phi/kernels/funcs/padding.h"
-
-COMMON_DECLARE_bool(cudnn_deterministic);
-PD_DECLARE_int64(conv_workspace_size_limit);
-COMMON_DECLARE_bool(cudnn_exhaustive_search);
-
-namespace phi {
-
-static inline bool IsVoltaOrLater(const phi::GPUContext& dev_ctx) {
-  return dev_ctx.GetComputeCapability() >= 70;
-}
-
-// inline cudnnTensorFormat_t GetCudnnTensorFormat(
-//     const phi::DataLayout& order) {  // Not use
-//   switch (order) {
-//     case phi::DataLayout::kNHWC:
-//       return CUDNN_TENSOR_NHWC;
-//     case phi::DataLayout::kNCHW:
-//       return CUDNN_TENSOR_NCHW;
-//     case phi::DataLayout::NCDHW:
-//       return CUDNN_TENSOR_NCHW;  // NOTE: cudnn treat NdTensor as the same
-//     case phi::DataLayout::NDHWC:
-//       return CUDNN_TENSOR_NHWC;  // add, liyamei
-//     default:
-//       PADDLE_THROW(common::errors::Unimplemented(
-//           "CUDNN has no equivalent dataLayout for input order."));
-//   }
-//   return CUDNN_TENSOR_NCHW;
-// }
-
-// static inline void GetNCDHW(const DDim& dims,
-//                             const phi::DataLayout& layout,
-//                             int* N,
-//                             int* C,
-//                             int* D,
-//                             int* H,
-//                             int* W) {
-//   *N = dims[0];
-//   *C = layout == phi::DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
-//   int i = layout == phi::DataLayout::kNCHW ? 0 : 1;
-//   if (dims.size() == 5) {
-//     *D = dims[2 - i];
-//     *H = dims[3 - i];
-//     *W = dims[4 - i];
-//   } else {
-//     *D = 1;
-//     *H = dims[2 - i];
-//     *W = dims[3 - i];
-//   }
-// }
-
-}  // namespace phi
-
-// PD_REGISTER_KERNEL(convdnn, GPU, ALL_LAYOUT, phi::ConvKernel, float, double
-// ) {}
diff --git a/backends/metax_gpu/kernels/impl/conv_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/conv_grad_kernel_impl.h
deleted file mode 100644
index 6066720ab0..0000000000
--- a/backends/metax_gpu/kernels/impl/conv_grad_kernel_impl.h
+++ /dev/null
@@ -1,557 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/kernels/cpu/conv_util.h"
-#include "paddle/phi/kernels/full_kernel.h"
-#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/im2col.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-#include "paddle/phi/kernels/funcs/vol2col.h"
-namespace phi {
-
-template <typename T, typename Context>
-void ConvGradKernel(const Context& dev_ctx,
-                    const DenseTensor& input,
-                    const DenseTensor& filter_t,
-                    const DenseTensor& output_grad,
-                    const std::vector<int>& strides,
-                    const std::vector<int>& paddings_t,
-                    const std::string& padding_algorithm,
-                    const std::vector<int>& dilations_t,
-                    int groups,
-                    const std::string& data_format,
-                    DenseTensor* input_grad,
-                    DenseTensor* filter_grad) {
-  // The filter and filter_grad will be reshaped in the calculations,
-  // so here use an assignment operation,
-  // that avoids modifying the variable in the Scope.
-
-  if (!input_grad && !filter_grad) return;
-  std::vector<int> paddings = paddings_t;
-  std::vector<int> dilations = dilations_t;
-
-  DenseTensor filter = filter_t;
-  // 0-size
-  if (input.numel() == 0) {
-    if (input_grad) dev_ctx.template Alloc<T>(input_grad);
-    if (filter_grad) {
-      phi::Full<T, Context>(
-          dev_ctx,
-          phi::IntArray(common::vectorize(filter_grad->dims())),
-          0,
-          filter_grad);
-    }
-    return;
-  }
-
-  const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
-
-  DenseTensor transformed_input(input.type());
-  DenseTensor transformed_output_grad(output_grad.type());
-
-  if (channel_last) {
-    ResizeToChannelFirst<Context, T>(dev_ctx, &input, &transformed_input);
-    TransToChannelFirst<Context, T>(dev_ctx, &input, &transformed_input);
-
-    ResizeToChannelFirst<Context, T>(
-        dev_ctx, &output_grad, &transformed_output_grad);
-    TransToChannelFirst<Context, T>(
-        dev_ctx, &output_grad, &transformed_output_grad);
-  } else {
-    transformed_input = input;
-    transformed_output_grad = output_grad;
-  }
-
-  // update padding and dilation
-  auto in_dims = transformed_input.dims();
-  auto filter_dims = filter.dims();
-  DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
-  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
-  std::vector<int> ksize = common::vectorize<int>(filter_data_dims);
-  UpdatePaddingAndDilation<int>(
-      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
-
-  const int batch_size = static_cast<int>(transformed_input.dims()[0]);
-
-  // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
-  std::vector<int64_t> filter_shape_vec(common::vectorize(filter.dims()));
-  // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
-  std::vector<int64_t> output_shape_vec(
-      common::vectorize(transformed_output_grad.dims()));
-
-  // use col_shape in the im2col calculation
-  // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
-  // o_h, o_w}
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = transformed_input.dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  DDim col_shape(common::make_ddim(col_shape_vec));
-
-  // use col_matrix_shape in the gemm calculation
-  // size: (i_c/g * k_h * k_w, o_h * o_w)
-  // or
-  // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
-  DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
-
-  DDim input_shape =
-      slice_ddim(transformed_input.dims(), 1, transformed_input.dims().size());
-
-  DDim filter_matrix_shape = {filter.dims()[0],
-                              filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-
-  DDim output_matrix_shape = {
-      transformed_output_grad.dims()[1],
-      transformed_output_grad.numel() / (transformed_output_grad.dims()[0] *
-                                         transformed_output_grad.dims()[1])};
-
-  // convolution backward input operator:  gemm + col2im(or col2vol)
-  // convolution backward weight operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(transformed_input.dims()[1]) / groups;
-  int out_step = static_cast<int>(transformed_output_grad.dims()[1]) / groups;
-
-  bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
-
-  DenseTensor col;
-  // col_matrix shares the same piece of data with col,
-  // but will be reshaped into a two-dimensional matrix shape
-  // to call the matrix multiplication interface.
-  DenseTensor col_matrix;
-  if (is_expand) {
-    col.Resize(col_shape);
-    dev_ctx.template Alloc<T>(&col);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-
-  phi::funcs::SetConstant<Context, T> set_zero;
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-
-  if (input_grad) {
-    dev_ctx.template Alloc<T>(input_grad);
-    DenseTensor transformed_input_grad(input_grad->type());
-    if (channel_last) {
-      ResizeToChannelFirst<Context, T>(
-          dev_ctx, input_grad, &transformed_input_grad);
-
-    } else {
-      transformed_input_grad = *input_grad;
-    }
-    // if is_expand is false, the operation of set_zero is unnecessary,
-    // because math::matmul will reset input_grad.
-    if (is_expand) {
-      set_zero(dev_ctx, &transformed_input_grad, static_cast<T>(0));
-    }
-    phi::funcs::Col2ImFunctor<phi::funcs::ColFormat::kCFO, Context, T> col2im;
-    phi::funcs::Col2VolFunctor<Context, T> col2vol;
-
-    for (int i = 0; i < batch_size; i++) {
-      DenseTensor out_grad_batch =
-          transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape);
-      DenseTensor in_grad_batch =
-          transformed_input_grad.Slice(i, i + 1).Resize(input_shape);
-      for (int g = 0; g < groups; g++) {
-        // gemm
-        DenseTensor out_grad_slice =
-            out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
-        DenseTensor filter_slice =
-            filter.Slice(g * out_step, (g + 1) * out_step);
-
-        DenseTensor in_grad_slice =
-            in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
-
-        if (!is_expand) {
-          col_matrix.ShareDataWith(in_grad_slice);
-          col_matrix.Resize(col_matrix_shape);
-        }
-        blas.MatMul(filter_slice,
-                    true,
-                    out_grad_slice,
-                    false,
-                    T(1.0),
-                    &col_matrix,
-                    T(0.0));
-
-        if (is_expand && data_dim == 2U) {
-          col2im(dev_ctx,
-                 col,
-                 dilations,
-                 strides,
-                 std::vector<int>{
-                     paddings[0], paddings[2], paddings[1], paddings[3]},
-                 &in_grad_slice);
-        } else if (is_expand && data_dim == 3U) {
-          col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice);
-        }
-      }
-    }
-    if (channel_last) {
-      TransToChannelLast<Context, T>(
-          dev_ctx, &transformed_input_grad, input_grad);
-    }
-  }
-
-  if (filter_grad) {
-    dev_ctx.template Alloc<T>(filter_grad);
-    Tensor filter_grad_ = *filter_grad;
-    filter_grad_.Resize(filter_matrix_shape);
-    set_zero(dev_ctx, filter_grad, static_cast<T>(0));
-    phi::funcs::Im2ColFunctor<phi::funcs::ColFormat::kCFO, Context, T> im2col;
-    phi::funcs::Vol2ColFunctor<Context, T> vol2col;
-    for (int i = 0; i < batch_size; i++) {
-      DenseTensor out_grad_batch =
-          transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape);
-      DenseTensor in_batch =
-          transformed_input.Slice(i, i + 1).Resize(input_shape);
-      for (int g = 0; g < groups; g++) {
-        // im2col
-        DenseTensor out_grad_slice =
-            out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
-        DenseTensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-        if (!is_expand) {
-          col.ShareDataWith(in_slice);
-          col_matrix.ShareDataWith(col);
-          col_matrix.Resize(col_matrix_shape);
-        } else if (data_dim == 2U) {
-          im2col(dev_ctx,
-                 in_slice,
-                 dilations,
-                 strides,
-                 std::vector<int>{
-                     paddings[0], paddings[2], paddings[1], paddings[3]},
-                 &col);
-
-        } else if (data_dim == 3U) {
-          vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
-        }
-
-        // gemm
-        DenseTensor filter_grad_slice =
-            filter_grad_.Slice(g * out_step, (g + 1) * out_step);
-        blas.MatMul(out_grad_slice,
-                    false,
-                    col_matrix,
-                    true,
-                    T(1.0),
-                    &filter_grad_slice,
-                    T(1.0));
-      }
-    }
-  }
-}
-
-template <typename T, typename Context>
-void ConvGradGradKernel(const Context& dev_ctx,
-                        const DenseTensor& input,
-                        const DenseTensor& filter,
-                        const DenseTensor& out_grad,
-                        const paddle::optional<DenseTensor>& input_grad_grad,
-                        const paddle::optional<DenseTensor>& filter_grad_grad,
-                        const std::vector<int>& strides_t,
-                        const std::vector<int>& paddings_t,
-                        const std::string& padding_algorithm,
-                        const std::vector<int>& dilations_t,
-                        int groups,
-                        const std::string& data_format,
-                        DenseTensor* input_grad,
-                        DenseTensor* filter_grad,
-                        DenseTensor* out_grad_grad) {
-  const DenseTensor* X = &input;
-  const DenseTensor* dY = &out_grad;
-  const DenseTensor* ddX = input_grad_grad.get_ptr();
-  const DenseTensor* ddW_in = filter_grad_grad.get_ptr();
-
-  DenseTensor* ddY = out_grad_grad;
-  DenseTensor* dW = filter_grad;
-  DenseTensor* dX = input_grad;
-  DenseTensor W = filter;
-
-  if (!ddY && !dW && !dX) return;
-
-  const std::vector<int> strides = strides_t;
-  std::vector<int> paddings = paddings_t;
-  std::vector<int> dilations = dilations_t;
-
-  const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
-
-  // transform Tensor
-  DenseTensor transformed_X(X->type());
-  DenseTensor transformed_dY(dY->type());
-  DenseTensor transformed_ddX(X->type());
-
-  if (channel_last) {
-    ResizeToChannelFirst<Context, T>(dev_ctx, X, &transformed_X);
-    TransToChannelFirst<Context, T>(dev_ctx, X, &transformed_X);
-
-    ResizeToChannelFirst<Context, T>(dev_ctx, dY, &transformed_dY);
-    TransToChannelFirst<Context, T>(dev_ctx, dY, &transformed_dY);
-
-    if (ddX) {
-      ResizeToChannelFirst<Context, T>(dev_ctx, ddX, &transformed_ddX);
-      TransToChannelFirst<Context, T>(dev_ctx, ddX, &transformed_ddX);
-    }
-  } else {
-    transformed_X = *X;
-    transformed_dY = *dY;
-    if (ddX) {
-      transformed_ddX = *ddX;
-    }
-  }
-
-  // update padding and dilation
-  auto in_dims = transformed_X.dims();
-  auto filter_dims = W.dims();
-
-  DDim in_data_dims = slice_ddim(in_dims, 2, in_dims.size());
-  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
-  std::vector<int> ksize = common::vectorize<int>(filter_data_dims);
-  UpdatePaddingAndDilation(
-      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
-
-  const int batch_size = static_cast<int>(transformed_X.dims()[0]);
-  std::vector<int64_t> filter_shape_vec(common::vectorize(W.dims()));
-  std::vector<int64_t> output_shape_vec(
-      common::vectorize(transformed_dY.dims()));
-
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  // col_shape [in_channel/group, kh, kw, oh, ow]
-  col_shape_vec[0] = transformed_X.dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2];
-  }
-  DDim col_shape(common::make_ddim(col_shape_vec));
-  // col_matrix_shape [in_channel/group * kh * kw, oh * ow]
-  DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
-  // input_shape [Cin, H, W]
-  DDim input_shape =
-      slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size());
-  // filter_matrix_shape [Cout, Cin * kh * kw]
-  DDim filter_matrix_shape = {W.dims()[0], W.numel() / W.dims()[0]};
-
-  W.Resize(filter_matrix_shape);
-  DDim output_matrix_shape = {
-      transformed_dY.dims()[1],
-      transformed_dY.numel() /
-          (transformed_dY.dims()[0] * transformed_dY.dims()[1])};
-  int in_step = static_cast<int>(transformed_X.dims()[1]) / groups;
-  int out_step = static_cast<int>(transformed_dY.dims()[1]) / groups;
-
-  bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
-  DenseTensor col;
-  DenseTensor col_matrix;
-  if (is_expand) {
-    col.Resize(col_shape);
-    dev_ctx.template Alloc<T>(&col);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-
-  phi::funcs::SetConstant<Context, T> set_zero;
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-
-  // dx convolution double grad:  gemm + col2im(col2vol)
-  // dx = ddw * dy  ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout,
-  // oH, oW)
-  if (dX && ddW_in) {
-    Tensor ddW;
-    ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
-    dev_ctx.template Alloc<T>(dX);
-
-    DenseTensor transformed_dX(dX->type());
-
-    if (channel_last) {
-      ResizeToChannelFirst<Context, T>(dev_ctx, dX, &transformed_dX);
-
-    } else {
-      transformed_dX = *dX;
-    }
-    // if is_expand is false, the operation of set_zero is unnecessary
-    // because math::matmul will reset dx
-    if (is_expand) {
-      set_zero(dev_ctx, &transformed_dX, static_cast<T>(0));
-    }
-    phi::funcs::Col2ImFunctor<phi::funcs::ColFormat::kCFO, Context, T> col2im;
-    phi::funcs::Col2VolFunctor<Context, T> col2vol;
-
-    for (int i = 0; i < batch_size; i++) {
-      DenseTensor dy_batch =
-          transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
-      DenseTensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape);
-      for (int g = 0; g < groups; g++) {
-        // gemm
-        DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
-        DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
-        DenseTensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step);
-        if (!is_expand) {
-          col_matrix.ShareDataWith(dx_slice);
-          col_matrix.Resize(col_matrix_shape);
-        }
-        blas.MatMul(
-            ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, T(0.0));
-
-        if (is_expand && data_dim == 2U) {
-          col2im(dev_ctx,
-                 col,
-                 dilations,
-                 strides,
-                 std::vector<int>{
-                     paddings[0], paddings[2], paddings[1], paddings[3]},
-                 &dx_slice);
-        } else if (is_expand && data_dim == 3U) {
-          col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice);
-        }
-      }
-    }
-    if (channel_last) {
-      TransToChannelLast<Context, T>(dev_ctx, &transformed_dX, dX);
-    }
-  }
-
-  // dw = ddx * dy  ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout,
-  // oH, oW)
-  // dw convolution double grad:  im2col(vol2col) + gemm
-  if (dW && ddX) {
-    dev_ctx.template Alloc<T>(dW);
-    set_zero(dev_ctx, dW, static_cast<T>(0));
-    DenseTensor dW_arr = *dW;
-    dW_arr.Resize(filter_matrix_shape);
-    phi::funcs::Im2ColFunctor<phi::funcs::ColFormat::kCFO, Context, T> im2col;
-    phi::funcs::Vol2ColFunctor<Context, T> vol2col;
-    for (int i = 0; i < batch_size; ++i) {
-      DenseTensor dy_batch =
-          transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
-      Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape);
-      for (int g = 0; g < groups; ++g) {
-        // im2col
-        DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);
-        DenseTensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step);
-        if (!is_expand) {
-          col.ShareDataWith(ddx_slice);
-          col_matrix.ShareDataWith(col);
-          col_matrix.Resize(col_matrix_shape);
-        } else if (data_dim == 2U) {
-          im2col(dev_ctx,
-                 ddx_slice,
-                 dilations,
-                 strides,
-                 std::vector<int>{
-                     paddings[0], paddings[2], paddings[1], paddings[3]},
-                 &col);
-        } else if (data_dim == 3U) {
-          vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
-        }
-
-        DenseTensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step);
-        blas.MatMul(
-            dy_slice, false, col_matrix, true, T(1.0), &dw_slice, T(1.0));
-      }
-    }
-  }
-
-  // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W),
-  // w/ddw(Cout, Cin, kh, kw)
-  // ddy convolution double grad: im2col(vol2col) + gemm
-  if (ddY) {
-    dev_ctx.template Alloc<T>(ddY);
-
-    DenseTensor transformed_ddY(ddY->type());
-    if (channel_last) {
-      ResizeToChannelFirst<Context, T>(dev_ctx, ddY, &transformed_ddY);
-    } else {
-      transformed_ddY = *ddY;
-    }
-
-    set_zero(dev_ctx, &transformed_ddY, static_cast<T>(0));
-    phi::funcs::Im2ColFunctor<phi::funcs::ColFormat::kCFO, Context, T> im2col;
-    phi::funcs::Vol2ColFunctor<Context, T> vol2col;
-    for (int i = 0; i < batch_size; ++i) {
-      DenseTensor ddy_batch =
-          transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape);
-      for (int g = 0; g < groups; ++g) {
-        // gemm
-        DenseTensor ddy_slice =
-            ddy_batch.Slice(g * out_step, (g + 1) * out_step);
-
-        if (ddX) {
-          DenseTensor ddx_batch =
-              transformed_ddX.Slice(i, i + 1).Resize(input_shape);
-          DenseTensor ddx_slice =
-              ddx_batch.Slice(g * in_step, (g + 1) * in_step);
-          if (!is_expand) {
-            col.ShareDataWith(ddx_slice);
-            col_matrix.ShareDataWith(col);
-            col_matrix.Resize(col_matrix_shape);
-          } else if (data_dim == 2U) {
-            im2col(dev_ctx,
-                   ddx_slice,
-                   dilations,
-                   strides,
-                   std::vector<int>{
-                       paddings[0], paddings[2], paddings[1], paddings[3]},
-                   &col);
-          } else if (data_dim == 3U) {
-            vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col);
-          }
-          DenseTensor w_slice = W.Slice(g * out_step, (g + 1) * out_step);
-          blas.MatMul(
-              w_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(0.0));
-        }
-
-        if (ddW_in) {
-          DenseTensor x_batch =
-              transformed_X.Slice(i, i + 1).Resize(input_shape);
-          DenseTensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step);
-
-          DenseTensor ddW;
-          ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
-          if (!is_expand) {
-            col.ShareDataWith(x_slice);
-            col_matrix.ShareDataWith(col);
-            col_matrix.Resize(col_matrix_shape);
-          } else if (data_dim == 2U) {
-            im2col(dev_ctx,
-                   x_slice,
-                   dilations,
-                   strides,
-                   std::vector<int>{
-                       paddings[0], paddings[2], paddings[1], paddings[3]},
-                   &col);
-          } else if (data_dim == 3U) {
-            vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col);
-          }
-
-          // gemm
-          DenseTensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step);
-          blas.MatMul(
-              ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, T(1.0));
-        }
-      }
-    }
-    if (channel_last) {
-      TransToChannelLast<Context, T>(dev_ctx, &transformed_ddY, ddY);
-    }
-  }
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/conv_kernel_impl.h b/backends/metax_gpu/kernels/impl/conv_kernel_impl.h
deleted file mode 100644
index 4395e5d578..0000000000
--- a/backends/metax_gpu/kernels/impl/conv_kernel_impl.h
+++ /dev/null
@@ -1,185 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/kernels/conv_kernel.h"
-#include "paddle/phi/kernels/cpu/conv_util.h"
-#include "paddle/phi/kernels/full_kernel.h"
-#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/im2col.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-#include "paddle/phi/kernels/funcs/vol2col.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void ConvKernelImpl(const Context& dev_ctx,
-                    const DenseTensor& input,
-                    const DenseTensor& filter_t,
-                    const std::vector<int>& strides,
-                    const std::vector<int>& paddings_t,
-                    const std::string& padding_algorithm,
-                    int groups,
-                    const std::vector<int>& dilations_t,
-                    const std::string& data_format,
-                    DenseTensor* output) {
-  std::vector<int> paddings = paddings_t;
-  std::vector<int> dilations = dilations_t;
-  DenseTensor filter = filter_t;
-  if (input.numel() == 0) {
-    phi::Full<T, Context>(
-        dev_ctx, phi::IntArray(common::vectorize(output->dims())), 0, output);
-    return;
-  }
-  // The filter will be reshaped in the calculations,
-  // so here use an assignment operation,
-  // that avoids modifying the variable in the Scope.
-  dev_ctx.template Alloc<T>(output);
-
-  const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
-
-  DenseTensor transformed_input(input.type());
-  DenseTensor transformed_output(output->type());
-
-  if (channel_last) {
-    ResizeToChannelFirst<Context, T>(dev_ctx, &input, &transformed_input);
-    TransToChannelFirst<Context, T>(dev_ctx, &input, &transformed_input);
-
-    ResizeToChannelFirst<Context, T>(dev_ctx, output, &transformed_output);
-
-  } else {
-    transformed_input = input;
-    transformed_output = *output;
-  }
-
-  // update padding and dilation
-  auto trans_in_dims = transformed_input.dims();
-  auto filter_dims = filter.dims();
-
-  DDim in_data_dims = slice_ddim(trans_in_dims, 2, trans_in_dims.size());
-  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
-
-  std::vector<int> ksize = common::vectorize<int>(filter_data_dims);
-  UpdatePaddingAndDilation(
-      &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
-
-  const int batch_size = static_cast<int>(transformed_input.dims()[0]);
-
-  // filter_shape_vec:
-  // {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
-  std::vector<int64_t> filter_shape_vec(common::vectorize(filter.dims()));
-
-  // output_shape_vec:
-  // {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
-  std::vector<int64_t> output_shape_vec(
-      common::vectorize(transformed_output.dims()));
-
-  // use col_shape in the im2col calculation
-  // col_shape_vec:
-  // {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w,
-  // o_d,o_h, o_w}
-  size_t data_dim = filter_shape_vec.size() - 2;
-
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = trans_in_dims[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-
-  DDim col_shape(common::make_ddim(col_shape_vec));
-
-  // use col_matrix_shape in the gemm calculation
-  // size:
-  // (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h *
-  // o_w)
-
-  DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim);
-
-  bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
-
-  DenseTensor col;
-  // col_matrix shares the same piece of data with col,
-  // but will be reshaped into a two-dimensional matrix shape
-  // to call the matrix multiplication interface.
-  DenseTensor col_matrix;
-  if (is_expand) {
-    // col = dev_ctx.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
-    col.Resize(col_shape);
-    dev_ctx.template Alloc<T>(&col);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-
-  DDim in_matrix_shape =
-      slice_ddim(transformed_input.dims(), 1, transformed_input.dims().size());
-
-  DDim filter_matrix_shape = {filter.dims()[0],
-                              filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-
-  DDim output_matrix_shape = {
-      transformed_output.dims()[1],
-      transformed_output.numel() /
-          (transformed_output.dims()[0] * transformed_output.dims()[1])};
-
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(transformed_input.dims()[1]) / groups;
-  int out_step = static_cast<int>(transformed_output.dims()[1]) / groups;
-
-  phi::funcs::Im2ColFunctor<phi::funcs::ColFormat::kCFO, Context, T> im2col;
-  phi::funcs::Vol2ColFunctor<Context, T> vol2col;
-
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-  for (int i = 0; i < batch_size; i++) {
-    DenseTensor in_batch =
-        transformed_input.Slice(i, i + 1).Resize(in_matrix_shape);
-    DenseTensor out_batch =
-        transformed_output.Slice(i, i + 1).Resize(output_matrix_shape);
-
-    for (int g = 0; g < groups; g++) {
-      DenseTensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        im2col(dev_ctx,
-               in_slice,
-               dilations,
-               strides,
-               std::vector<int>{
-                   paddings[0], paddings[2], paddings[1], paddings[3]},
-               &col);
-
-      } else if (data_dim == 3U) {
-        vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
-      }
-
-      // gemm
-      DenseTensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      DenseTensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      blas.MatMul(
-          filter_slice, false, col_matrix, false, T(1.0), &out_slice, T(0.0));
-    }
-  }
-  if (channel_last) {
-    TransToChannelLast<Context, T>(dev_ctx, &transformed_output, output);
-  }
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/conv_transpose_kernel_impl.h b/backends/metax_gpu/kernels/impl/conv_transpose_kernel_impl.h
deleted file mode 100644
index aadc5d2b8a..0000000000
--- a/backends/metax_gpu/kernels/impl/conv_transpose_kernel_impl.h
+++ /dev/null
@@ -1,287 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/common/ddim.h"
-#include "paddle/common/layout.h"
-#include "paddle/phi/kernels/conv_transpose_kernel.h"
-#include "paddle/phi/kernels/cpu/conv_util.h"
-#include "paddle/phi/kernels/full_kernel.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
-#include "paddle/phi/kernels/funcs/im2col.h"
-#include "paddle/phi/kernels/funcs/slice.h"
-#include "paddle/phi/kernels/funcs/vol2col.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void ConvTransposeRawKernel(const Context& dev_ctx,
-                            const DenseTensor& x,
-                            const DenseTensor& filter,
-                            const std::vector<int>& strides,
-                            const std::vector<int>& paddings,
-                            const std::string& padding_algorithm,
-                            int groups,
-                            const std::vector<int>& dilations,
-                            const std::string& data_format,
-                            DenseTensor* out) {
-  if (x.numel() == 0 || filter.numel() == 0) {
-    phi::Full<T, Context>(
-        dev_ctx, phi::IntArray(common::vectorize(out->dims())), 0, out);
-    return;
-  }
-  const DataLayout data_layout = common::StringToDataLayout(data_format);
-  // The filter will be reshaped, so it should not be constant
-  DenseTensor filter_ = filter;
-  std::vector<int> paddings_ = paddings;
-  std::vector<int> dilations_ = dilations;
-
-  auto x_dims = x.dims();
-  auto filter_dims = filter_.dims();
-  auto out_dims = out->dims();
-  const int batch_size = static_cast<int>(x.dims()[0]);
-
-  DDim in_data_dims;
-  if (data_layout != DataLayout::kNHWC) {
-    in_data_dims = slice_ddim(x_dims, 2, x_dims.size());
-  } else {
-    in_data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1);
-  }
-  DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size());
-  std::vector<int> ksize = common::vectorize<int>(filter_data_dims);
-  UpdatePaddingAndDilation(
-      &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize);
-
-  // x_shape_vec: {n, c, h, w} or {n, c, d, h, w} for channel_first
-  // x_shape_vec: {n, h, w, c} or {n, d, h, w, c} for channel_last
-  std::vector<int64_t> x_shape_vec = common::vectorize(x.dims());
-  // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
-  std::vector<int64_t> filter_shape_vec = common::vectorize(filter_.dims());
-
-  // use col_shape in the im2col and col2im (or vol2col and col2vol)
-  // calculation
-  // col_shape_vec: {o_c/g, k_h, k_w, h, w} or {o_c/g, k_d, k_h, k_w, d, h, w}
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  if (data_layout != DataLayout::kNHWC) {
-    col_shape_vec[0] = out_dims[1] / groups;
-    for (size_t j = 0; j < data_dim; ++j) {
-      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-      col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 2];
-    }
-  } else {
-    col_shape_vec[0] = out_dims[out_dims.size() - 1] / groups;
-    for (size_t j = 0; j < data_dim; ++j) {
-      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-      col_shape_vec[j + 1 + data_dim] = x_shape_vec[j + 1];
-    }
-  }
-  DDim col_shape(common::make_ddim(col_shape_vec));
-
-  // use col_matrix_shape in the gemm calculation
-  // size: (o_c/g * k_h * k_w, h * w) or (o_c/g * k_d * k_h * k_w, d * h * w)
-  DDim col_matrix_shape = flatten_to_2d(col_shape, data_dim + 1);
-
-  DenseTensor col;
-  col.Resize(col_shape);
-  dev_ctx.template Alloc<T>(&col);
-  // col_matrix shares the same piece of data with col,
-  // but will be reshaped into a two-dimensional matrix shape
-  // to call the matrix multiplication interface.
-  DenseTensor col_matrix;
-  col_matrix.ShareDataWith(col);
-  col_matrix.Resize(col_matrix_shape);
-
-  // out size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first
-  // out size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last
-  DDim out_shape = slice_ddim(out->dims(), 1, out->dims().size());
-
-  // x matrix size: (i_c, h * w) or (i_c, d * h * w) for channel_first
-  // x matrix size: (h * w, i_c) or (d * h * w, i_c) for channel_last
-  DDim x_matrix_shape;
-  if (data_layout != DataLayout::kNHWC) {
-    x_matrix_shape = {x_dims[1], col_matrix_shape[1]};
-  } else {
-    x_matrix_shape = {col_matrix_shape[1], x_dims[x_dims.size() - 1]};
-  }
-
-  // filter size: (i_c, o_c/g * k_h * k_w) or (i_c, o_c/g * k_d * k_h * k_w)
-  DDim filter_matrix_shape;
-  if (data_layout != DataLayout::kNHWC) {
-    filter_matrix_shape = {x_dims[1], col_matrix_shape[0]};
-  } else {
-    filter_matrix_shape = {x_dims[x_dims.size() - 1], col_matrix_shape[0]};
-  }
-  filter_.Resize(filter_matrix_shape);
-
-  dev_ctx.template Alloc<T>(out);
-
-  funcs::SetConstant<Context, T> set_zero;
-
-  auto blas = funcs::GetBlas<Context, T>(dev_ctx);
-  set_zero(dev_ctx, out, static_cast<T>(0));
-
-  int in_step = (data_layout != DataLayout::kNHWC
-                     ? static_cast<int>(x_dims[1]) / groups
-                     : static_cast<int>(x_dims[x_dims.size() - 1]) / groups);
-
-  int out_step =
-      (data_layout != DataLayout::kNHWC
-           ? static_cast<int>(out_dims[1]) / groups
-           : static_cast<int>(out_dims[out_dims.size() - 1]) / groups);
-  phi::funcs::Col2ImFunctor<phi::funcs::ColFormat::kCFO, Context, T> col2im;
-  phi::funcs::Col2VolFunctor<Context, T> col2vol;
-  funcs::ConcatFunctor<Context, T> concat_functor;
-
-  // convolution transpose: gemm + col2im or col2vol (similar to conv-backward
-  // on x)
-  size_t D = x.dims().size();
-  for (int i = 0; i < batch_size; i++) {
-    // batch with size (i_c, h * w) or (i_c, d * h * w) for channel_first
-    // batch with size (h * w, i_c) or (d * h * w, i_c) for channel_last
-    DenseTensor x_batch = x.Slice(i, i + 1).Resize(x_matrix_shape);
-
-    // out size: (o_c, o_h, o_w) or (o_c, o_d, o_h, o_w) for channel_first
-    // out size: (o_h, o_w, o_c) or (o_d, o_h, o_w, o_c) for channel_last
-    DenseTensor out_batch = out->Slice(i, i + 1).Resize(out_shape);
-
-    std::vector<DenseTensor> out_batch_vec;
-    for (int g = 0; g < groups; g++) {
-      int64_t start = g * in_step;
-      int64_t end = (g + 1) * in_step;
-      int axes = (data_layout != DataLayout::kNHWC ? 0 : 1);
-      DenseTensor filter_slice = filter_.Slice(g * in_step, (g + 1) * in_step);
-      DenseTensor in_slice, out_slice;
-
-      // col_matrix = filter_slice * x_slice
-      // of shape (o_c/g * k_h * k_w, h * w)
-      // or (o_c/g * k_d * k_h * k_w, d * h * w)
-      if (data_layout != DataLayout::kNHWC) {
-        in_slice = x_batch.Slice(g * in_step, (g + 1) * in_step);
-        out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-        blas.MatMul(filter_slice,
-                    true,
-                    in_slice,
-                    false,
-                    static_cast<T>(1.0),
-                    &col_matrix,
-                    static_cast<T>(0.0));
-      } else {
-        funcs::Slice<Context, T, 2>(
-            dev_ctx, &x_batch, &in_slice, start, end, axes);
-        start = g * out_step;
-        end = (g + 1) * out_step;
-        axes = D - 2;
-        if (D == 4U) {
-          funcs::Slice<Context, T, 3>(
-              dev_ctx, &out_batch, &out_slice, start, end, axes);
-        } else if (D == 5U) {
-          funcs::Slice<Context, T, 4>(
-              dev_ctx, &out_batch, &out_slice, start, end, axes);
-        }
-        blas.MatMul(filter_slice,
-                    true,
-                    in_slice,
-                    true,
-                    static_cast<T>(1.0),
-                    &col_matrix,
-                    static_cast<T>(0.0));
-      }
-
-      if (data_dim == 2U) {
-        // col2im: col_matrix -> dy from (o_c/g * k_h * k_w, h * w) to (o_c/g,
-        // o_h, o_w) or (o_h, o_w, o_c/g)
-        col2im(dev_ctx,
-               col,
-               dilations_,
-               strides,
-               std::vector<int>{
-                   paddings_[0], paddings_[2], paddings_[1], paddings_[3]},
-               &out_slice,
-               data_layout);
-      } else if (data_dim == 3U) {
-        // col2vol: col_matrix -> dy from (o_c/g * k_d * k_h * k_w, d * h * w)
-        // to (o_c/g, o_d, o_h, o_w) or (o_d, o_h, o_w, o_c/g)
-        col2vol(dev_ctx,
-                col,
-                dilations_,
-                strides,
-                paddings_,
-                &out_slice,
-                data_layout);
-      }
-      if (data_layout == DataLayout::kNHWC) {
-        out_batch_vec.push_back(out_slice);
-      }
-    }
-    if (data_layout == DataLayout::kNHWC) {
-      concat_functor(
-          dev_ctx, out_batch_vec, static_cast<int>(D - 2), &out_batch);
-    }
-  }
-}
-
-template <typename T, typename Context>
-void Conv2dTransposeKernel(const Context& dev_ctx,
-                           const DenseTensor& x,
-                           const DenseTensor& filter,
-                           const std::vector<int>& strides,
-                           const std::vector<int>& paddings,
-                           const std::vector<int>& output_padding UNUSED,
-                           const IntArray& output_size UNUSED,
-                           const std::string& padding_algorithm,
-                           int groups,
-                           const std::vector<int>& dilations,
-                           const std::string& data_format,
-                           DenseTensor* out) {
-  ConvTransposeRawKernel<T, Context>(dev_ctx,
-                                     x,
-                                     filter,
-                                     strides,
-                                     paddings,
-                                     padding_algorithm,
-                                     groups,
-                                     dilations,
-                                     data_format,
-                                     out);
-}
-
-template <typename T, typename Context>
-void Conv3dTransposeKernel(const Context& dev_ctx,
-                           const DenseTensor& x,
-                           const DenseTensor& filter,
-                           const std::vector<int>& strides,
-                           const std::vector<int>& paddings,
-                           const std::vector<int>& output_padding UNUSED,
-                           const std::vector<int>& output_size UNUSED,
-                           const std::string& padding_algorithm,
-                           int groups,
-                           const std::vector<int>& dilations,
-                           const std::string& data_format,
-                           DenseTensor* out) {
-  ConvTransposeRawKernel<T, Context>(dev_ctx,
-                                     x,
-                                     filter,
-                                     strides,
-                                     paddings,
-                                     padding_algorithm,
-                                     groups,
-                                     dilations,
-                                     data_format,
-                                     out);
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/deformable_conv_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/deformable_conv_grad_kernel_impl.h
deleted file mode 100644
index b9931a8997..0000000000
--- a/backends/metax_gpu/kernels/impl/deformable_conv_grad_kernel_impl.h
+++ /dev/null
@@ -1,365 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/common/hostdevice.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/kernels/empty_kernel.h"
-#include "paddle/phi/kernels/full_kernel.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
-
-namespace phi {
-
-template <typename T>
-HOSTDEVICE T DmcnGetGradientWeight(T argmax_h,
-                                   T argmax_w,
-                                   const int h,
-                                   const int w,
-                                   const int height,
-                                   const int width) {
-  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
-      argmax_w >= width) {
-    return 0;
-  }
-
-  int argmax_h_low = floor(argmax_h);
-  int argmax_w_low = floor(argmax_w);
-  int argmax_h_high = argmax_h_low + 1;
-  int argmax_w_high = argmax_w_low + 1;
-
-  T weight = 0;
-
-  weight = (h == argmax_h_low && w == argmax_w_low)
-               ? (h + 1 - argmax_h) * (w + 1 - argmax_w)
-               : weight;
-  weight = (h == argmax_h_low && w == argmax_w_high)
-               ? (h + 1 - argmax_h) * (argmax_w + 1 - w)
-               : weight;
-  weight = (h == argmax_h_high && w == argmax_w_low)
-               ? (argmax_h + 1 - h) * (w + 1 - argmax_w)
-               : weight;
-  weight = (h == argmax_h_high && w == argmax_w_high)
-               ? (argmax_h + 1 - h) * (argmax_w + 1 - w)
-               : weight;
-
-  return weight;
-}
-
-template <typename T>
-HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h,
-                                     T argmax_w,
-                                     const int height,
-                                     const int width,
-                                     const T* im_data,
-                                     const int data_width,
-                                     const int bp_dir) {
-  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
-      argmax_w >= width) {
-    return 0;
-  }
-
-  int argmax_h_low = floor(argmax_h);
-  int argmax_w_low = floor(argmax_w);
-  int argmax_h_high = argmax_h_low + 1;
-  int argmax_w_high = argmax_w_low + 1;
-
-  T weight = 0;
-
-  if (bp_dir == 0) {
-    weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
-                  ? -1 * (argmax_w_low + 1 - argmax_w) *
-                        im_data[argmax_h_low * data_width + argmax_w_low]
-                  : 0;
-
-    weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
-                  ? -1 * (argmax_w - argmax_w_low) *
-                        im_data[argmax_h_low * data_width + argmax_w_high]
-                  : 0;
-
-    weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
-                  ? (argmax_w_low + 1 - argmax_w) *
-                        im_data[argmax_h_high * data_width + argmax_w_low]
-                  : 0;
-    weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
-                  ? (argmax_w - argmax_w_low) *
-                        im_data[argmax_h_high * data_width + argmax_w_high]
-                  : 0;
-  } else if (bp_dir == 1) {
-    weight += (argmax_h_low >= 0 && argmax_w_low >= 0)
-                  ? -1 * (argmax_h_low + 1 - argmax_h) *
-                        im_data[argmax_h_low * data_width + argmax_w_low]
-                  : 0;
-    weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1)
-                  ? (argmax_h_low + 1 - argmax_h) *
-                        im_data[argmax_h_low * data_width + argmax_w_high]
-                  : 0;
-    weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0)
-                  ? -1 * (argmax_h - argmax_h_low) *
-                        im_data[argmax_h_high * data_width + argmax_w_low]
-                  : 0;
-    weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
-                  ? (argmax_h - argmax_h_low) *
-                        im_data[argmax_h_high * data_width + argmax_w_high]
-                  : 0;
-  }
-
-  return weight;
-}
-
-template <typename T, typename Context>
-void ModulatedDeformableCol2imCoord(const Context& dev_ctx,
-                                    const T* data_col,
-                                    const T* data_im,
-                                    const T* data_offset,
-                                    const T* data_mask,
-                                    const std::vector<int64_t>& im_shape,
-                                    const std::vector<int64_t>& col_shape,
-                                    const std::vector<int64_t>& kernel_shape,
-                                    const std::vector<int>& paddings,
-                                    const std::vector<int>& strides,
-                                    const std::vector<int>& dilations,
-                                    const int deformable_groups,
-                                    T* grad_offset,
-                                    T* grad_mask);
-
-template <typename T, typename Context>
-void ModulatedDeformableCol2im(const Context& dev_ctx,
-                               const T* data_col,
-                               const T* data_offset,
-                               const T* data_mask,
-                               const std::vector<int64_t>& im_shape,
-                               const std::vector<int64_t>& col_shape,
-                               const std::vector<int64_t>& kernel_shape,
-                               const std::vector<int>& pad,
-                               const std::vector<int>& stride,
-                               const std::vector<int>& dilation,
-                               const int deformable_group,
-                               T* grad_im);
-
-template <typename T, typename Context>
-void FilterGradAddup(const Context& dev_ctx,
-                     const int nthreads,
-                     const int n,
-                     const int height,
-                     const int width,
-                     const T* dweight_3d,
-                     T* filter_grad);
-
-template <typename T, typename Context>
-void DeformableConvGradKernel(const Context& dev_ctx,
-                              const DenseTensor& x,
-                              const DenseTensor& offset,
-                              const DenseTensor& filter,
-                              const paddle::optional<DenseTensor>& mask,
-                              const DenseTensor& out_grad,
-                              const std::vector<int>& strides,
-                              const std::vector<int>& paddings,
-                              const std::vector<int>& dilations,
-                              int deformable_groups,
-                              int groups,
-                              int im2col_step,
-                              DenseTensor* dx,
-                              DenseTensor* offset_grad,
-                              DenseTensor* filter_grad,
-                              DenseTensor* mask_grad) {
-  const int batch_size = static_cast<int>(x.dims()[0]);
-
-  DDim input_shape = common::slice_ddim(x.dims(), 1, x.dims().size());
-  std::vector<int64_t> input_shape_vec = common::vectorize(input_shape);
-  std::vector<int64_t> filter_shape_vec(common::vectorize(filter.dims()));
-  std::vector<int64_t> output_shape_vec(common::vectorize(out_grad.dims()));
-
-  std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
-  col_buffer_shape_vec[0] = x.dims()[1] * filter.dims()[2] * filter.dims()[3];
-  col_buffer_shape_vec[1] = im2col_step;
-  for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
-    col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
-  }
-  std::vector<int64_t> output_buffer_shape_vec(1);
-  output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
-                               output_shape_vec[2] * output_shape_vec[3];
-
-  DenseTensor col_buffer = Empty<T>(dev_ctx, col_buffer_shape_vec);
-  DenseTensor output_buffer;
-  output_buffer.ShareDataWith(out_grad).Resize(
-      common::make_ddim(output_buffer_shape_vec));
-
-  int64_t M =
-      input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3];
-  int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
-  int64_t K = output_shape_vec[1] / groups;
-
-  DDim weight_3d_shape = {groups, K, M};
-  DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K, N};
-  DDim col_buffer_3d_shape = {groups, M, N};
-  DDim filter_grad_shape = {groups, K, M};
-
-  DenseTensor weight_3d;
-  weight_3d.ShareDataWith(filter).Resize(weight_3d_shape);
-  DenseTensor out_grad_4d;
-  out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape);
-  DenseTensor col_buffer_3d;
-  col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
-
-  phi::funcs::SetConstant<Context, T> set_zero;
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-
-  int input_dim = x.numel() / x.dims()[0];
-  int input_offset_dim = offset.numel() / offset.dims()[0];
-  int input_mask_dim = mask ? mask->numel() / mask->dims()[0] : 0;
-
-  if (filter_grad) {
-    Full<T>(dev_ctx,
-            {filter_grad_shape.Get(), filter_grad_shape.size()},
-            0,
-            filter_grad);
-  }
-
-  if (dx) {
-    dev_ctx.template Alloc<T>(dx);
-    set_zero(dev_ctx, dx, static_cast<T>(0));
-  }
-
-  if (offset_grad) {
-    dev_ctx.template Alloc<T>(offset_grad);
-    set_zero(dev_ctx, offset_grad, static_cast<T>(0));
-
-    if (mask_grad) {
-      dev_ctx.template Alloc<T>(mask_grad);
-      set_zero(dev_ctx, mask_grad, static_cast<T>(0));
-    }
-  }
-
-  for (int i = 0; i < batch_size / im2col_step; ++i) {
-    DenseTensor out_grad_3d = out_grad_4d.Slice(i, i + 1).Resize(
-        common::slice_ddim(out_grad_4d.dims(), 1, out_grad_4d.dims().size()));
-    for (int g = 0; g < groups; ++g) {
-      DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
-          common::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size()));
-      DenseTensor out_grad_3d_slice = out_grad_3d.Slice(g, g + 1).Resize(
-          common::slice_ddim(out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
-      DenseTensor col_buffer_3d_slice =
-          col_buffer_3d.Slice(g, g + 1).Resize(common::slice_ddim(
-              col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-      blas.MatMul(weight_3d_slice,
-                  true,
-                  out_grad_3d_slice,
-                  false,
-                  T(1.0),
-                  &col_buffer_3d_slice,
-                  T(0.0));
-    }
-    col_buffer.Resize(common::make_ddim(col_buffer_shape_vec));
-
-    T* col_buffer_ptr = col_buffer.data<T>();
-    const T* input_ptr = x.data<T>();
-    const T* offset_ptr = offset.data<T>();
-    const T* mask_data_ptr =
-        mask ? mask->data<T>() + i * im2col_step * input_mask_dim : nullptr;
-    if (offset_grad) {
-      T* offset_grad_ptr = offset_grad->data<T>();
-      T* mask_grad_data_ptr =
-          mask_grad ? mask_grad->data<T>() + i * im2col_step * input_mask_dim
-                    : nullptr;
-      // get grad of offset and mask
-      ModulatedDeformableCol2imCoord(
-          dev_ctx,
-          col_buffer_ptr,
-          input_ptr + i * im2col_step * input_dim,
-          offset_ptr + i * im2col_step * input_offset_dim,
-          mask_data_ptr,
-          input_shape_vec,
-          col_buffer_shape_vec,
-          filter_shape_vec,
-          paddings,
-          strides,
-          dilations,
-          deformable_groups,
-          offset_grad_ptr + i * im2col_step * input_offset_dim,
-          mask_grad_data_ptr);
-    }
-    if (dx) {
-      T* dx_ptr = dx->data<T>();
-      // get grad of input
-      ModulatedDeformableCol2im(dev_ctx,
-                                col_buffer_ptr,
-                                offset_ptr + i * im2col_step * input_offset_dim,
-                                mask_data_ptr,
-                                input_shape_vec,
-                                col_buffer_shape_vec,
-                                filter_shape_vec,
-                                paddings,
-                                strides,
-                                dilations,
-                                deformable_groups,
-                                dx_ptr + i * im2col_step * input_dim);
-      dx->Resize(x.dims());
-    }
-
-    funcs::ModulatedDeformableIm2col(
-        dev_ctx,
-        input_ptr + i * im2col_step * input_dim,
-        offset_ptr + i * im2col_step * input_offset_dim,
-        mask_data_ptr,
-        input_shape_vec,
-        col_buffer_shape_vec,
-        filter_shape_vec,
-        paddings,
-        strides,
-        dilations,
-        deformable_groups,
-        col_buffer_ptr);
-
-    col_buffer_3d.Resize(col_buffer_3d_shape);
-
-    if (filter_grad) {
-      DenseTensor dweight_3d = Empty<T>(
-          dev_ctx, {filter_grad_shape.Get(), filter_grad_shape.size()});
-      for (int g = 0; g < groups; ++g) {
-        DenseTensor out_grad_3d_slice =
-            out_grad_3d.Slice(g, g + 1).Resize(common::slice_ddim(
-                out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
-        DenseTensor col_buffer_3d_slice =
-            col_buffer_3d.Slice(g, g + 1).Resize(common::slice_ddim(
-                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-        DenseTensor dweight_3d_slice = dweight_3d.Slice(g, g + 1).Resize(
-            common::slice_ddim(dweight_3d.dims(), 1, dweight_3d.dims().size()));
-
-        blas.MatMul(out_grad_3d_slice,
-                    false,
-                    col_buffer_3d_slice,
-                    true,
-                    T(1.0),
-                    &dweight_3d_slice,
-                    T(0.0));
-      }
-
-      // update grad of weights
-      FilterGradAddup<T>(dev_ctx,
-                         dweight_3d.numel(),
-                         groups,
-                         K,
-                         M,
-                         dweight_3d.data<T>(),
-                         filter_grad->data<T>());
-    }
-  }
-  if (filter_grad) {
-    filter_grad->Resize(filter.dims());
-  }
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h
deleted file mode 100644
index a627bbf702..0000000000
--- a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h
+++ /dev/null
@@ -1,339 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cmath>
-#include <random>
-
-#include "kernels/custom_kernel/elementwise.h"
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/common/amp_type_traits.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/kernels/dirichlet_kernel.h"
-#include "paddle/phi/kernels/elementwise_divide_kernel.h"
-#include "paddle/phi/kernels/funcs/broadcast_function.h"
-#include "paddle/phi/kernels/funcs/elementwise_functor.h"
-#include "paddle/phi/kernels/funcs/for_range.h"
-#include "paddle/phi/kernels/funcs/reduce_function.h"
-#include "paddle/phi/kernels/funcs/reduce_functor.h"
-#include "paddle/phi/kernels/reduce_sum_kernel.h"
-
-// ROCM hcc doesn't work well with using std:: in kernel functions
-#if defined(PADDLE_WITH_CUDA)
-#define COMPAT_EXP exp
-#define COMPAT_CEIL ceil
-#define COMPAT_FLOOR floor
-#define COMPAT_LOG log
-#define COMPAT_POW pow
-#define COMPAT_SQRT sqrt
-#define COMPAT_TAN tan
-#define COMPAT_ABS abs
-#define COMPAT_LOG1P log1p
-#else
-#define COMPAT_EXP std::exp
-#define COMPAT_CEIL std::ceil
-#define COMPAT_FLOOR std::floor
-#define COMPAT_LOG std::log
-#define COMPAT_POW std::pow
-#define COMPAT_SQRT std::sqrt
-#define COMPAT_TAN std::tan
-#define COMPAT_ABS std::abs
-#define COMPAT_LOG1P std::log1p
-#endif
-
-#ifdef PADDLE_WITH_CUDA
-#include <curand_kernel.h>
-#endif
-#ifdef PADDLE_WITH_HIP
-#include <hiprand_kernel.h>
-#endif
-
-#if defined(PADDLE_WITH_CUDA)
-using COMPAT_RANDSTATEPHILOX4_32_10_T = curandStatePhilox4_32_10_t;
-#define COMPAT_RAND_INIT curand_init
-#define COMPAT_RAND_UNIFORM curand_uniform
-#define COMPAT_RAND_NORMAL curand_normal
-#elif defined(PADDLE_WITH_HIP)
-using COMPAT_RANDSTATEPHILOX4_32_10_T = hiprandStatePhilox4_32_10_t;
-#define COMPAT_RAND_INIT hiprand_init
-#define COMPAT_RAND_UNIFORM hiprand_uniform
-#define COMPAT_RAND_NORMAL hiprand_normal
-#endif
-
-namespace phi {
-
-template <typename ScalarT, typename SamplerT>
-struct BaseSampler {
-  SamplerT sampler_;
-  HOSTDEVICE BaseSampler(const SamplerT& sampler) : sampler_(sampler) {}
-  HOSTDEVICE ScalarT sample() {
-    // Sometimes convert float to float16/bfloat16
-    return static_cast<ScalarT>(sampler_());
-  }
-};
-
-template <typename Context, typename T>
-struct GammaSampler {
-  void operator()(const Context& dev_ctx,
-                  const DenseTensor& alpha,
-                  DenseTensor* out);
-};
-
-template <typename Context, typename T>
-struct DirichletSampler {
-  void operator()(const Context& dev_ctx,
-                  const DenseTensor& alpha,
-                  DenseTensor* out);
-};
-
-// `sample_gamma` is d from Numpy's distributions.c, and add support for
-//  paddle data type and code style.
-//  Source MIT licensed:
-/* Copyright 2005 Robert Kern (robert.kern@gmail.com)
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-template <typename ScalarT,
-          typename AccscalarT,
-          typename UniformSamplerT,
-          typename NormalSamplerT>
-HOSTDEVICE ScalarT
-sample_gamma(ScalarT alpha,
-             BaseSampler<AccscalarT, UniformSamplerT> standard_uniform,
-             BaseSampler<AccscalarT, NormalSamplerT> standard_normal) {
-  using MPTypeScalar = typename phi::dtype::MPTypeTrait<ScalarT>::Type;
-  using MPTypeAccscalar = typename phi::dtype::MPTypeTrait<AccscalarT>::Type;
-
-  MPTypeAccscalar mp_scale = static_cast<MPTypeAccscalar>(1.0f);
-  MPTypeScalar mp_alpha = static_cast<MPTypeScalar>(alpha);
-
-  // Boost alpha for higher acceptance probability.
-  if (mp_alpha < 1.0f) {
-    if (mp_alpha == 0.f) return static_cast<ScalarT>(0.f);
-    MPTypeAccscalar mp_sample =
-        static_cast<MPTypeAccscalar>(standard_uniform.sample());
-    mp_scale *= COMPAT_POW(1 - mp_sample, 1.0f / mp_alpha);
-    mp_alpha += 1.0f;
-  }
-
-  // This implements the acceptance-rejection method of Marsaglia and Tsang
-  // (2000)
-  // doi:10.1145/358407.358414
-  const MPTypeAccscalar d = mp_alpha - 1.0f / 3.0f;
-  const MPTypeAccscalar c = 1.0f / COMPAT_SQRT(9.0f * d);
-  for (;;) {
-    MPTypeAccscalar x, y;
-    do {
-      x = static_cast<MPTypeAccscalar>(standard_normal.sample());
-      y = 1.0f + c * x;
-    } while (y <= 0);
-    const MPTypeAccscalar v = y * y * y;
-    const MPTypeAccscalar u =
-        1 - static_cast<MPTypeAccscalar>(standard_uniform.sample());
-    const MPTypeAccscalar xx = x * x;
-    if (u < 1.0f - 0.0331f * xx * xx)
-      return static_cast<ScalarT>(mp_scale * d * v);
-    if (COMPAT_LOG(u) < 0.5f * xx + d * (1.0f - v + COMPAT_LOG(v)))
-      return static_cast<ScalarT>(mp_scale * d * v);
-  }
-}
-
-template <typename T, typename UniformSamplerT, typename NormalSamplerT>
-struct GammaCPUFunctor {
-  GammaCPUFunctor(const T* alpha,
-                  T* gamma,
-                  BaseSampler<T, UniformSamplerT> uniform,
-                  BaseSampler<T, NormalSamplerT> normal)
-      : alpha_(alpha), gamma_(gamma), uniform_(uniform), normal_(normal) {}
-
-  HOST void operator()(int64_t index) {
-    auto sample = sample_gamma<T, T, UniformSamplerT, NormalSamplerT>(
-        alpha_[index], uniform_, normal_);
-    gamma_[index] = std::max(std::numeric_limits<T>::min(), sample);
-  }
-
-  const T* alpha_;
-  T* gamma_;
-  BaseSampler<T, UniformSamplerT> uniform_;
-  BaseSampler<T, NormalSamplerT> normal_;
-};
-
-template <typename T>
-struct GammaSampler<CPUContext, T> {
-  void operator()(const CPUContext& dev_ctx,
-                  const DenseTensor& alpha,
-                  DenseTensor* out) {
-    auto generator = dev_ctx.GetGenerator()->GetCPUEngine();
-
-    auto uniform = [&generator]() -> T {
-      std::uniform_real_distribution<T> u(0.0, 1.0);
-      return u(*generator);
-    };
-    BaseSampler<T, decltype(uniform)> standard_uniform(uniform);
-
-    auto normal = [&generator]() {
-      std::normal_distribution<T> n(0.0, 1.0);
-      return n(*generator);
-    };
-    BaseSampler<T, decltype(normal)> standard_normal(normal);
-
-    GammaCPUFunctor<T, decltype(uniform), decltype(normal)> gamma_functor(
-        alpha.data<T>(), out->data<T>(), standard_uniform, standard_normal);
-    funcs::ForRange<CPUContext> for_range(dev_ctx, out->numel());
-    for_range(gamma_functor);
-  }
-};
-
-template <typename T>
-struct DirichletSampler<CPUContext, T> {
-  void operator()(const CPUContext& dev_ctx,
-                  const DenseTensor& alpha,
-                  DenseTensor* out) {
-    // sample from K gamma distributions, where K=alpha.numel()
-    DenseTensor gamma_samples;
-    gamma_samples.Resize(alpha.dims());
-    dev_ctx.template Alloc<T>(&gamma_samples);
-
-    GammaSampler<CPUContext, T> gamma_sampler;
-    gamma_sampler(dev_ctx, alpha, &gamma_samples);
-
-    // normalize them into a simplex, along the last axis
-    DenseTensor gamma_sum;
-    auto new_shape = gamma_samples.dims();
-    new_shape[new_shape.size() - 1] = 1;
-    gamma_sum.Resize(new_shape);
-    dev_ctx.template Alloc<T>(&gamma_sum);
-
-    funcs::ReduceKernelImpl<CPUContext, T, T, funcs::SumFunctor>(
-        dev_ctx,
-        gamma_samples,
-        &gamma_sum,
-        {new_shape.size() - 1},
-        true,
-        false);
-
-    funcs::ElementwiseCompute<funcs::DivideFunctor<T>, T>(
-        dev_ctx, gamma_samples, gamma_sum, funcs::DivideFunctor<T>(), out);
-  }
-};
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-template <typename T>
-struct GammaCUDAFunctor {
-  GammaCUDAFunctor(const T* alpha, T* gamma, uint64_t seed, uint64_t offset)
-      : alpha_(alpha), gamma_(gamma), seed_(seed), offset_(offset) {}
-
-  DEVICE void operator()(int64_t index) {
-    // curand initialization
-    COMPAT_RANDSTATEPHILOX4_32_10_T state;
-    COMPAT_RAND_INIT(
-        /*seed=*/seed_, /*subsequence=*/index, /*offset=*/offset_, &state);
-
-    // sample
-    auto uniform_lambda = [&state]() { return COMPAT_RAND_UNIFORM(&state); };
-    BaseSampler<T, decltype(uniform_lambda)> standard_uniform(uniform_lambda);
-    auto normal_lambda = [&state]() { return COMPAT_RAND_NORMAL(&state); };
-    BaseSampler<T, decltype(normal_lambda)> standard_normal(normal_lambda);
-
-    auto sample =
-        sample_gamma<T, T, decltype(uniform_lambda), decltype(normal_lambda)>(
-            alpha_[index], standard_uniform, standard_normal);
-    gamma_[index] = std::max(std::numeric_limits<T>::min(), sample);
-  }
-
-  const T* alpha_;
-  T* gamma_;
-  const uint64_t seed_;
-  const uint64_t offset_;
-};
-
-template <typename T>
-struct GammaSampler<GPUContext, T> {
-  void operator()(const GPUContext& dev_ctx,
-                  const DenseTensor& alpha,
-                  DenseTensor* out) {
-    auto p_gen = dev_ctx.GetGenerator();
-    auto seed_and_offset = p_gen->IncrementOffset(10);  // hard-coded offset
-    auto seed = seed_and_offset.first;
-    auto offset = seed_and_offset.second;
-
-    GammaCUDAFunctor<T> gamma_functor(
-        alpha.data<T>(), out->data<T>(), seed, offset);
-    funcs::ForRange<GPUContext> for_range(dev_ctx, out->numel());
-    for_range(gamma_functor);
-  }
-};
-
-template <typename T>
-struct DirichletSampler<GPUContext, T> {
-  void operator()(const GPUContext& dev_ctx,
-                  const DenseTensor& alpha,
-                  DenseTensor* out) {
-    // sample from K gamma distributions, where K=alpha.numel()
-    DenseTensor gamma_samples;
-    gamma_samples.Resize(alpha.dims());
-    dev_ctx.template Alloc<T>(&gamma_samples);
-
-    GammaSampler<GPUContext, T> gamma_sampler;
-    gamma_sampler(dev_ctx, alpha, &gamma_samples);
-
-    // normalize them into a simplex, along the last axis
-    DenseTensor gamma_sum;
-    auto new_shape = gamma_samples.dims();
-    new_shape[new_shape.size() - 1] = 1;
-    gamma_sum.Resize(new_shape);
-    dev_ctx.template Alloc<T>(&gamma_sum);
-
-    phi::SumRawKernel<T, GPUContext>(dev_ctx,
-                                     gamma_samples,
-                                     {new_shape.size() - 1},
-                                     true,
-                                     false,
-                                     gamma_sum.dtype(),
-                                     &gamma_sum);
-    phi::DivideKernel<T, GPUContext>(dev_ctx, gamma_samples, gamma_sum, out);
-  }
-};
-#endif
-
-template <typename T, typename Context>
-void Dirichletkernel(const Context& dev_ctx,
-                     const DenseTensor& alpha,
-                     DenseTensor* out) {
-  dev_ctx.template Alloc<T>(out);
-  DirichletSampler<Context, T> sampler;
-  sampler(dev_ctx, alpha, out);
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/elementwise.h b/backends/metax_gpu/kernels/impl/elementwise.h
deleted file mode 100644
index b9f3d8af1c..0000000000
--- a/backends/metax_gpu/kernels/impl/elementwise.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/broadcast_function.h"
-#include "paddle/phi/kernels/funcs/common_shape.h"
-#include "paddle/phi/kernels/funcs/eigen/common.h"
-
-namespace phi {
-
-// FORWARD CODE
-
-// Add
-template <typename DevCtx, typename T, class Enable = void>
-struct SameDimsAddFunctor {
-  void operator()(const DevCtx& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& y,
-                  DenseTensor* z);
-};
-
-template <typename DevCtx, typename T>
-struct SameDimsAddFunctor<
-    DevCtx,
-    T,
-    typename std::enable_if<std::is_floating_point<T>::value>::type> {
-  void operator()(const DevCtx& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& y,
-                  DenseTensor* z) {
-    auto blas = phi::funcs::GetBlas<DevCtx, T>(dev_ctx);
-    blas.VADD(
-        x.numel(), x.data<T>(), y.data<T>(), dev_ctx.template Alloc<T>(z));
-  }
-};
-
-template <typename DevCtx, typename T>
-struct SameDimsAddFunctor<
-    DevCtx,
-    T,
-    typename std::enable_if<!std::is_floating_point<T>::value>::type> {
-  void operator()(const DevCtx& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& y,
-                  DenseTensor* z) {
-    dev_ctx.template Alloc<T>(z);
-    auto eigen_x = phi::EigenVector<T>::Flatten(x);
-    auto eigen_y = phi::EigenVector<T>::Flatten(y);
-    auto eigen_z = phi::EigenVector<T>::Flatten(*z);
-    auto& place = *dev_ctx.eigen_device();
-    eigen_z.device(place) = eigen_x + eigen_y;
-  }
-};
-
-// Subtract
-template <typename DevCtx, typename T, class Enable = void>
-struct SameDimsSubtractFunctor {
-  void operator()(const DevCtx& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& y,
-                  DenseTensor* z);
-};
-
-template <typename DevCtx, typename T>
-struct SameDimsSubtractFunctor<
-    DevCtx,
-    T,
-    typename std::enable_if<std::is_floating_point<T>::value>::type> {
-  void operator()(const DevCtx& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& y,
-                  DenseTensor* z) {
-    auto blas = phi::funcs::GetBlas<DevCtx, T>(dev_ctx);
-    blas.VSUB(
-        x.numel(), x.data<T>(), y.data<T>(), dev_ctx.template Alloc<T>(z));
-  }
-};
-
-template <typename DevCtx, typename T>
-struct SameDimsSubtractFunctor<
-    DevCtx,
-    T,
-    typename std::enable_if<!std::is_floating_point<T>::value>::type> {
-  void operator()(const DevCtx& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& y,
-                  DenseTensor* z) {
-    auto eigen_x = phi::EigenVector<T>::Flatten(x);
-    auto eigen_y = phi::EigenVector<T>::Flatten(y);
-    auto eigen_z = phi::EigenVector<T>::Flatten(*z);
-    auto& place = *dev_ctx.eigen_device();
-    eigen_z.device(place) = eigen_x - eigen_y;
-  }
-};
-
-// Divide
-template <typename DevCtx, typename T, class Enable = void>
-struct SameDimsDivideFunctor {
-  void operator()(const DevCtx& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& y,
-                  DenseTensor* z);
-};
-
-template <typename DevCtx, typename T>
-struct SameDimsDivideFunctor<
-    DevCtx,
-    T,
-    typename std::enable_if<!std::is_floating_point<T>::value>::type> {
-  void operator()(const DevCtx& dev_ctx UNUSED,
-                  const DenseTensor& x UNUSED,
-                  const DenseTensor& y UNUSED,
-                  DenseTensor* z UNUSED) {
-    common::errors::InvalidArgument(
-        "If use SameDimsDivideFunctor, template args(T) must be floating "
-        "point. ");
-  }
-};
-
-template <typename DevCtx, typename T>
-struct SameDimsDivideFunctor<
-    DevCtx,
-    T,
-    typename std::enable_if<std::is_floating_point<T>::value>::type> {
-  void operator()(const DevCtx& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& y,
-                  DenseTensor* z) {
-    auto blas = phi::funcs::GetBlas<DevCtx, T>(dev_ctx);
-    blas.VDIV(
-        x.numel(), x.data<T>(), y.data<T>(), dev_ctx.template Alloc<T>(z));
-  }
-};
-
-// Multiply
-template <typename DevCtx, typename T, class Enable = void>
-struct SameDimsMultiplyFunctor {
-  void operator()(const DevCtx& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& y,
-                  DenseTensor* z);
-};
-
-template <typename DevCtx, typename T>
-struct SameDimsMultiplyFunctor<
-    DevCtx,
-    T,
-    typename std::enable_if<std::is_floating_point<T>::value>::type> {
-  void operator()(const DevCtx& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& y,
-                  DenseTensor* z) {
-    auto blas = phi::funcs::GetBlas<DevCtx, T>(dev_ctx);
-    blas.VMUL(
-        x.numel(), x.data<T>(), y.data<T>(), dev_ctx.template Alloc<T>(z));
-  }
-};
-
-template <typename DevCtx, typename T>
-struct SameDimsMultiplyFunctor<
-    DevCtx,
-    T,
-    typename std::enable_if<!std::is_floating_point<T>::value>::type> {
-  void operator()(const DevCtx& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& y,
-                  DenseTensor* z) {
-    auto eigen_x = phi::EigenVector<T>::Flatten(x);
-    auto eigen_y = phi::EigenVector<T>::Flatten(y);
-    auto eigen_z = phi::EigenVector<T>::Flatten(*z);
-    auto& place = *dev_ctx.eigen_device();
-    eigen_z.device(place) = eigen_x * eigen_y;
-  }
-};
-
-template <typename Functor>
-struct SameDimsElementwiseCompute {
-  void operator()(const CPUContext& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& y,
-                  DenseTensor* z) {
-    Functor()(dev_ctx, x, y, z);
-  }
-};
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/flatten2_kernel_impl.h b/backends/metax_gpu/kernels/impl/flatten2_kernel_impl.h
deleted file mode 100644
index dc4059a722..0000000000
--- a/backends/metax_gpu/kernels/impl/flatten2_kernel_impl.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <vector>
-
-#include "paddle/phi/kernels/empty_kernel.h"
-#include "paddle/phi/kernels/flatten_grad_kernel.h"
-#include "paddle/phi/kernels/flatten_kernel.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/flatten2_utils.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void Flatten2Kernel(const Context &dev_ctx,
-                    const DenseTensor &x,
-                    int axis,
-                    DenseTensor *out,
-                    DenseTensor *x_shape) {
-  auto &axes = axis;
-
-  auto *in = &x;
-  auto x_dims = in->dims();
-
-  auto out_dims = common::make_ddim(phi::funcs::GetOutputShape(axes, x_dims));
-
-  dev_ctx.Alloc(out, x.dtype());
-  phi::Copy(dev_ctx, *in, dev_ctx.GetPlace(), false, out);
-  out->Resize(out_dims);
-}
-
-template <typename T, typename Context>
-void Flatten2GradKernel(const Context &dev_ctx,
-                        const DenseTensor &x,
-                        const DenseTensor &x_shape,
-                        const DenseTensor &out_grad,
-                        int axis,
-                        DenseTensor *x_grad) {
-  auto *d_x = x_grad;
-  auto *d_out = &out_grad;
-
-  auto xshape_dims = x_shape.dims();
-  auto x_dims = common::slice_ddim(xshape_dims, 1, xshape_dims.size());
-
-  dev_ctx.Alloc(x_grad, out_grad.dtype());
-  phi::Copy(dev_ctx, *d_out, dev_ctx.GetPlace(), false, d_x);
-  d_x->Resize(x_dims);
-}
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/gru_unit_kernel_impl.h b/backends/metax_gpu/kernels/impl/gru_unit_kernel_impl.h
deleted file mode 100644
index ef12141f91..0000000000
--- a/backends/metax_gpu/kernels/impl/gru_unit_kernel_impl.h
+++ /dev/null
@@ -1,340 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-
-#include "paddle/phi/backends/all_context.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/funcs/activation_functor.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/eigen/common.h"
-#include "paddle/utils/optional.h"
-namespace phi {
-
-enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 };
-
-template <typename T, typename Device, typename X, typename Y>
-void ActCompute(
-    const int act_type, const Device& d, X x, Y y, phi::Place place) {
-  if (act_type == identity) {
-    y.device(d) = x;
-  } else if (act_type == sigmoid) {
-    phi::funcs::SigmoidFunctor<T>()(d, x, y);
-  } else if (act_type == tanh) {
-    phi::funcs::TanhFunctor<T>()(d, x, y);
-  } else if (act_type == relu) {
-    if (place == phi::CPUPlace())
-      phi::funcs::ReluCPUFunctor<T>()(d, x, y);
-    else
-      phi::funcs::ReluCUDAFunctor<T>()(d, x, y);
-  } else {
-    PADDLE_THROW(common::errors::Unimplemented(
-        "Unsupported activation type, only supports identity, sigmoid, tanh "
-        "and relu."));
-  }
-}
-
-#define ACT_COMPUTE ActCompute<T>
-
-template <typename T, typename Context>
-void GRUUnitKernel(const Context& dev_ctx,
-                   const DenseTensor& input,
-                   const DenseTensor& hidden_prev,
-                   const DenseTensor& weight,
-                   const paddle::optional<DenseTensor>& bias,
-                   int activation,
-                   int gate_activation,
-                   bool origin_mode,
-                   DenseTensor* gate,
-                   DenseTensor* reset_hidden_prev,
-                   DenseTensor* hidden) {
-  auto* input_p = &input;
-  auto* hidden_prev_p = &hidden_prev;
-
-  dev_ctx.template Alloc<T>(gate);
-  dev_ctx.template Alloc<T>(reset_hidden_prev);
-  dev_ctx.template Alloc<T>(hidden);
-
-  int batch_size = input_p->dims()[0];
-  int frame_size = hidden_prev_p->dims()[1];
-
-  auto x = phi::EigenMatrix<T>::From(input);
-  auto h_p = phi::EigenMatrix<T>::From(hidden_prev);
-  auto g = phi::EigenMatrix<T>::From(*gate);
-  auto r_h_p = phi::EigenMatrix<T>::From(*reset_hidden_prev);
-  auto h = phi::EigenMatrix<T>::From(*hidden);
-  auto& place = *dev_ctx.eigen_device();
-
-  // calculate unactivated gate outputs
-  if (bias) {
-    auto b = phi::EigenMatrix<T>::From(bias.get());
-    g.device(place) =
-        x + b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
-                .broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
-  } else {
-    g.device(place) = x;
-  }
-  const T* hidden_prev_data = hidden_prev.data<T>();
-  const T* weight_data = weight.data<T>();
-  T* gate_data = gate->data<T>();
-  T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-  blas.GEMM(false,
-            false,
-            batch_size,
-            2 * frame_size,
-            frame_size,
-            1,
-            hidden_prev_data,
-            frame_size,
-            weight_data,
-            frame_size * 2,
-            1,
-            gate_data,
-            frame_size * 3);
-
-  // calculate activated gate
-  Eigen::array<int, 2> extents{{batch_size, frame_size}};
-  Eigen::array<int, 2> u_offsets{{0, 0}};
-  ACT_COMPUTE(gate_activation,
-              place,
-              g.slice(u_offsets, extents),
-              g.slice(u_offsets, extents),
-              dev_ctx.GetPlace());
-  auto u = g.slice(u_offsets, extents);  // update gate
-  Eigen::array<int, 2> r_offsets{{0, frame_size}};
-  ACT_COMPUTE(gate_activation,
-              place,
-              g.slice(r_offsets, extents),
-              g.slice(r_offsets, extents),
-              dev_ctx.GetPlace());
-  auto r = g.slice(r_offsets, extents);  // reset gate
-  r_h_p.device(place) = r * h_p;         // reset previous hidden state
-  blas.GEMM(false,
-            false,
-            batch_size,
-            frame_size,
-            frame_size,
-            1,
-            reset_hidden_prev_data,
-            frame_size,
-            weight_data + frame_size * frame_size * 2,
-            frame_size,
-            1,
-            gate_data + frame_size * 2,
-            frame_size * 3);
-
-  Eigen::array<int, 2> c_offsets{{0, frame_size * 2}};
-  ACT_COMPUTE(activation,
-              place,
-              g.slice(c_offsets, extents),
-              g.slice(c_offsets, extents),
-              dev_ctx.GetPlace());
-  auto c = g.slice(c_offsets, extents);  // output candidate
-
-  // calculate final output
-  if (origin_mode) {
-    h.device(place) = c + u * (h_p - c);  // (1 - u) * c + u * h_p
-  } else {
-    h.device(place) = u * (c - h_p) + h_p;  // u * c + (1 - u) * h_p
-  }
-}
-
-template <typename T,
-          typename Device,
-          typename X,
-          typename Y,
-          typename DX,
-          typename DY>
-void ActGradCompute(
-    const int act_type, const Device& d, X x, Y y, DX dx, DY dy) {
-  // x is dummy and won't be used even in Relu(use y instead)
-  if (act_type == identity)
-    dx.device(d) = dy;
-  else if (act_type == sigmoid)
-    phi::funcs::SigmoidGradFunctor<T>()(d, x, y, dy, dx);
-  else if (act_type == tanh)
-    phi::funcs::TanhGradFunctor<T>()(d, x, y, dy, dx);
-  else if (act_type == relu)
-    phi::funcs::ReluGradFunctor<T>()(d, x, y, dy, dx);
-  else
-    PADDLE_THROW(common::errors::Unimplemented(
-        "Unsupported activation type, only supports identity, sigmoid, tanh "
-        "and relu."));
-}
-
-#define ACT_GRAD_COMPUTE ActGradCompute<T>
-
-template <typename T, typename Context>
-void GRUUnitGradKernel(const Context& dev_ctx,
-                       const DenseTensor& input,
-                       const DenseTensor& hidden_prev,
-                       const DenseTensor& weight,
-                       const paddle::optional<DenseTensor>& bias,
-                       const DenseTensor& gate,
-                       const DenseTensor& reset_hidden_prev,
-                       const DenseTensor& hidden_grad,
-                       int activation,
-                       int gate_activation,
-                       bool origin_mode,
-                       DenseTensor* input_grad,
-                       DenseTensor* hidden_prev_grad,
-                       DenseTensor* weight_grad,
-                       DenseTensor* bias_grad) {
-  phi::DenseTensor gate_grad;
-  phi::DenseTensor reset_hidden_prev_grad;
-
-  const T* hidden_prev_data = hidden_prev.data<T>();
-  const T* weight_data = weight.data<T>();
-  gate_grad.Resize(input.dims());
-  T* gate_grad_data = dev_ctx.template Alloc<T>(&gate_grad);
-  const T* reset_hidden_prev_data = reset_hidden_prev.data<T>();
-  reset_hidden_prev_grad.Resize(reset_hidden_prev.dims());
-  T* reset_hidden_prev_grad_data =
-      dev_ctx.template Alloc<T>(&reset_hidden_prev_grad);
-
-  auto h_p = phi::EigenMatrix<T>::From(hidden_prev);
-  auto g = phi::EigenMatrix<T>::From(gate);
-  auto d_h = phi::EigenMatrix<T>::From(hidden_grad);
-  auto d_g = phi::EigenMatrix<T>::From(gate_grad);
-  auto d_r_h_p = phi::EigenMatrix<T>::From(reset_hidden_prev_grad);
-  auto& place = *dev_ctx.eigen_device();
-
-  int batch_size = input.dims()[0];
-  int frame_size = hidden_prev.dims()[1];
-
-  Eigen::array<int, 2> extents{{batch_size, frame_size}};
-  Eigen::array<int, 2> u_offsets{{0, 0}};
-  auto u = g.slice(u_offsets, extents);  // update gate
-  Eigen::array<int, 2> r_offsets{{0, frame_size}};
-  auto r = g.slice(r_offsets, extents);  // reset gate
-  Eigen::array<int, 2> c_offsets{{0, frame_size * 2}};
-  auto c = g.slice(c_offsets, extents);  // output candidate
-
-  // backward for unactivated update gate
-  if (origin_mode) {
-    ACT_GRAD_COMPUTE(gate_activation,
-                     place,
-                     u,
-                     u,
-                     d_g.slice(u_offsets, extents),
-                     d_h * (h_p - c));
-    // backward for unactivated output candidate
-    ACT_GRAD_COMPUTE(
-        activation, place, c, c, d_g.slice(c_offsets, extents), d_h * (1 - u));
-  } else {
-    ACT_GRAD_COMPUTE(gate_activation,
-                     place,
-                     u,
-                     u,
-                     d_g.slice(u_offsets, extents),
-                     d_h * (c - h_p));
-    // backward for unactivated output candidate
-    ACT_GRAD_COMPUTE(
-        activation, place, c, c, d_g.slice(c_offsets, extents), d_h * u);
-  }
-  // backward for reset_hidden_prev
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-  blas.GEMM(false,
-            true,
-            batch_size,
-            frame_size,
-            frame_size,
-            1,
-            gate_grad_data + frame_size * 2,
-            frame_size * 3,
-            weight_data + frame_size * frame_size * 2,
-            frame_size,
-            0,
-            reset_hidden_prev_grad_data,
-            frame_size);
-  // backward for unactivated reset gate
-  ACT_GRAD_COMPUTE(gate_activation,
-                   place,
-                   r,
-                   r,
-                   d_g.slice(r_offsets, extents),
-                   d_r_h_p * h_p);
-  // backward for weight
-  if (weight_grad) {
-    T* weight_grad_data = dev_ctx.template Alloc<T>(weight_grad);
-    // backward for state_weight
-    blas.GEMM(true,
-              false,
-              frame_size,
-              frame_size,
-              batch_size,
-              1,
-              reset_hidden_prev_data,
-              frame_size,
-              gate_grad_data + frame_size * 2,
-              frame_size * 3,
-              0,
-              weight_grad_data + frame_size * frame_size * 2,
-              frame_size);
-
-    // backward for update_gate_weight and reset_gate_weight
-    blas.GEMM(true,
-              false,
-              frame_size,
-              frame_size * 2,
-              batch_size,
-              1,
-              hidden_prev_data,
-              frame_size,
-              gate_grad_data,
-              frame_size * 3,
-              0,
-              weight_grad_data,
-              frame_size * 2);
-  }
-  // backward for hidden_prev
-  if (hidden_prev_grad) {
-    T* hidden_prev_grad_data = dev_ctx.template Alloc<T>(hidden_prev_grad);
-    auto d_h_p = phi::EigenMatrix<T>::From(*hidden_prev_grad);
-    if (origin_mode) {
-      d_h_p.device(place) = d_r_h_p * r + d_h * u;
-    } else {
-      d_h_p.device(place) = d_r_h_p * r + d_h * (1 - u);
-    }
-    blas.GEMM(false,
-              true,
-              batch_size,
-              frame_size,
-              frame_size * 2,
-              1,
-              gate_grad_data,
-              frame_size * 3,
-              weight_data,
-              frame_size * 2,
-              1,
-              hidden_prev_grad_data,
-              frame_size);
-  }
-  // backward for input
-  if (input_grad) {
-    dev_ctx.template Alloc<T>(input_grad);
-    auto d_x = phi::EigenMatrix<T>::From(*input_grad);
-    d_x.device(place) = d_g;
-  }
-  // backward for bias
-  if (bias_grad) {
-    dev_ctx.template Alloc<T>(bias_grad);
-    auto d_b = phi::EigenVector<T>::Flatten(*bias_grad);
-    d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
-  }
-}
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/index_select_impl.h b/backends/metax_gpu/kernels/impl/index_select_impl.h
deleted file mode 100644
index ac39cab270..0000000000
--- a/backends/metax_gpu/kernels/impl/index_select_impl.h
+++ /dev/null
@@ -1,187 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "glog/logging.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/tensor_utils.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/eigen/common.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace phi {
-
-template <typename Context, typename T, class Enable = void>
-struct IndexSelectAdd {
-  void operator()(const Context& dev_ctx UNUSED,
-                  int slice_size,
-                  const T* src_pointer,
-                  const T* p_pointer,
-                  T* dist_pointer) {
-    for (int i = 0; i < slice_size; i++) {
-      dist_pointer[i] = src_pointer[i] + p_pointer[i];
-    }
-  }
-};
-
-template <typename Context, typename T>
-struct IndexSelectAdd<
-    Context,
-    T,
-    typename std::enable_if<std::is_floating_point<T>::value>::type> {
-  void operator()(const Context& dev_ctx,
-                  int slice_size,
-                  const T* src_pointer,
-                  const T* p_pointer,
-                  T* dist_pointer) {
-    auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-    blas.VADD(slice_size, src_pointer, p_pointer, dist_pointer);
-  }
-};
-
-template <typename Context, typename T, typename IndexT = int>
-void IndexSelectInner(const Context& dev_ctx,
-                      DenseTensor* input,
-                      const DenseTensor& index,
-                      DenseTensor* output,
-                      int dim) {
-  auto input_dim = input->dims();
-  auto input_dim_size = input_dim.size();
-  auto output_dim = output->dims();
-  auto index_size = index.dims()[0];
-
-  DenseTensor index_cpu_copy;
-  if (index.place().GetType() != phi::AllocationType::CPU) {
-    phi::Copy(dev_ctx, index, phi::CPUPlace(), true, &index_cpu_copy);
-  }
-  const IndexT* index_data = index.place().GetType() == phi::AllocationType::CPU
-                                 ? index.data<IndexT>()
-                                 : index_cpu_copy.data<IndexT>();
-  dev_ctx.template Alloc<T>(output);
-
-  auto slice_size = 1;
-  for (auto i = dim + 1; i < input_dim_size; i++) {
-    slice_size *= input_dim[i];
-  }
-
-  auto outer_nums = 1;
-  for (auto i = 0; i < dim; i++) {
-    outer_nums *= input_dim[i];
-  }
-
-  for (int i = 0; i < index_size; i++) {
-    PADDLE_ENFORCE_GE(
-        index_data[i],
-        -input_dim[dim],
-        common::errors::InvalidArgument(
-            "Variable value (index) of OP(index_select) "
-            "expected >= %ld and < %ld, but got %ld. Please check input "
-            "value.",
-            -input_dim[dim],
-            input_dim[dim],
-            index_data[i]));
-    PADDLE_ENFORCE_LT(
-        index_data[i],
-        input_dim[dim],
-        common::errors::InvalidArgument(
-            "Variable value (index) of OP(index_select) "
-            "expected >= %ld and < %ld, but got %ld. Please check input "
-            "value.",
-            -input_dim[dim],
-            input_dim[dim],
-            index_data[i]));
-  }
-
-  VLOG(3) << "Index_Select_Debug; outer_nums: " << outer_nums
-          << "; slice_size: " << slice_size << "; index_size: " << index_size;
-
-  input->Resize(common::make_ddim({outer_nums, input_dim[dim], slice_size}));
-  output->Resize(common::make_ddim({outer_nums, index_size, slice_size}));
-
-  auto input_tensor = EigenTensor<T, 3>::From(*input);
-  auto output_tensor = EigenTensor<T, 3>::From(*output);
-
-  auto& place = *dev_ctx.eigen_device();
-
-  for (auto j = 0; j < index_size; j++) {
-    IndexT index_value = index_data[j];
-    if (index_value < 0) {
-      index_value += input_dim[dim];
-    }
-    auto output_t = output_tensor.chip(j, 1);
-    output_t.device(place) = input_tensor.chip(index_value, 1);
-  }
-  input->Resize(input_dim);
-  output->Resize(output_dim);
-}
-
-template <typename Context, typename T, typename IndexT = int>
-void IndexSelectGradInner(const Context& dev_ctx,
-                          const DenseTensor& out_grad,
-                          const DenseTensor& index,
-                          DenseTensor* x_grad,
-                          int dim) {
-  const T* input_data = out_grad.data<T>();
-  const IndexT* index_data = index.data<IndexT>();
-
-  const T* p_output = dev_ctx.template Alloc<T>(x_grad);
-  T* out_data = dev_ctx.template Alloc<T>(x_grad);
-
-  auto input_dim = out_grad.dims();
-  auto input_dim_size = input_dim.size();
-  auto output_dim = x_grad->dims();
-
-  phi::funcs::SetConstant<Context, T> set_constant;
-  set_constant(dev_ctx, x_grad, static_cast<T>(0.0));
-
-  auto slice_size = 1;
-  for (auto i = dim + 1; i < input_dim_size; i++) {
-    slice_size *= input_dim[i];
-  }
-
-  auto input_width = slice_size * input_dim[dim];
-  auto output_width = slice_size * output_dim[dim];
-
-  auto outer_nums = 1;
-  for (auto i = 0; i < dim; i++) {
-    outer_nums *= input_dim[i];
-  }
-
-  auto index_size = index.dims()[0];
-  VLOG(3) << "Index_Select_Grad_Debug; outer_nums: " << outer_nums
-          << "; slice_size: " << slice_size << "; input_width: " << input_width
-          << "; output_width: " << output_width
-          << "; index_size: " << index_size;
-
-  for (auto i = 0; i < outer_nums; i++) {
-    auto input_start_offset = i * input_width;
-    auto output_start_offset = i * output_width;
-
-    for (auto j = 0; j < index_size; j++) {
-      IndexT index_value = index_data[j];
-      if (index_value < 0) {
-        index_value += input_dim[dim];
-      }
-      auto src = input_data + input_start_offset + j * slice_size;
-      auto p_out = p_output + output_start_offset + index_value * slice_size;
-      auto dst = out_data + output_start_offset + index_value * slice_size;
-      IndexSelectAdd<Context, T> index_select_add;
-      index_select_add(dev_ctx, slice_size, src, p_out, dst);
-    }
-  }
-  x_grad->Resize(output_dim);
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/inverse_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/inverse_grad_kernel_impl.h
deleted file mode 100644
index 64b56f2cd1..0000000000
--- a/backends/metax_gpu/kernels/impl/inverse_grad_kernel_impl.h
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/kernels/complex_kernel.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/matrix_inverse.h"
-#include "paddle/phi/kernels/inverse_grad_kernel.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void InverseGradKernel(const Context& dev_ctx,
-                       const DenseTensor& out,
-                       const DenseTensor& out_grad,
-                       DenseTensor* in_grad) {
-  if (in_grad) {
-    dev_ctx.template Alloc<T>(in_grad);
-    if (out_grad.numel() == 0) {
-      return;
-    }
-    auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-
-    DenseTensor tmp_out;
-    tmp_out.Resize(out.dims());
-    dev_ctx.template Alloc<T>(&tmp_out);
-
-    if (IsComplexType(out.dtype())) {
-      DenseTensor out_conj;
-      out_conj.Resize(out.dims());
-      dev_ctx.template Alloc<T>(&out_conj);
-
-      phi::ConjKernel<T, Context>(dev_ctx, out, &out_conj);
-
-      auto mat_dim_a0 =
-          phi::funcs::CreateMatrixDescriptor(out_grad.dims(), 0, false);
-      auto mat_dim_b0 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true);
-      blas.MatMul(
-          out_grad, mat_dim_a0, out_conj, mat_dim_b0, T(1), &tmp_out, T(0));
-
-      auto mat_dim_a1 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true);
-      auto mat_dim_b1 =
-          phi::funcs::CreateMatrixDescriptor(tmp_out.dims(), 0, false);
-      blas.MatMul(
-          out_conj, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), in_grad, T(0));
-    } else {
-      auto mat_dim_a0 =
-          phi::funcs::CreateMatrixDescriptor(out_grad.dims(), 0, false);
-      auto mat_dim_b0 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true);
-      blas.MatMul(out_grad, mat_dim_a0, out, mat_dim_b0, T(1), &tmp_out, T(0));
-
-      auto mat_dim_a1 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true);
-      auto mat_dim_b1 =
-          phi::funcs::CreateMatrixDescriptor(tmp_out.dims(), 0, false);
-      blas.MatMul(out, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), in_grad, T(0));
-    }
-  }
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/load_kernel_impl.h b/backends/metax_gpu/kernels/impl/load_kernel_impl.h
deleted file mode 100644
index a3358e669b..0000000000
--- a/backends/metax_gpu/kernels/impl/load_kernel_impl.h
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-
-#include "kernels/funcs/dense_tensor_serialize.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/core/tensor_utils.h"
-#include "paddle/phi/kernels/cast_kernel.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void LoadKernel(const Context& dev_ctx,
-                const std::string& file_path,
-                int64_t seek,
-                const std::vector<int64_t>& shape,
-                bool load_as_fp16,
-                phi::DenseTensor* out) {
-  // FIXME(yuyang18): We save variable to local file now, but we should change
-  // it to save an output stream.
-  std::ifstream fin(file_path, std::ios::binary);
-  PADDLE_ENFORCE_EQ(
-      static_cast<bool>(fin),
-      true,
-      errors::Unavailable("Load operator fail to open file %s, please check "
-                          "whether the model file is complete or damaged.",
-                          file_path));
-  PADDLE_ENFORCE_NOT_NULL(
-      out,
-      errors::InvalidArgument("The variable to be loaded cannot be found."));
-
-  if (seek != -1) {
-    PADDLE_ENFORCE_GE(seek,
-                      0,
-                      errors::InvalidArgument(
-                          "seek with tensor must great than or equal to 0"));
-    phi::DeserializeFromStream(fin, out, dev_ctx, seek, shape);
-  } else {
-    phi::DeserializeFromStream(fin, out, dev_ctx);
-  }
-
-  auto in_dtype = out->dtype();
-  auto out_dtype = load_as_fp16 ? phi::DataType::FLOAT16 : in_dtype;
-  if (in_dtype != out_dtype) {
-    phi::CastKernel<T>(dev_ctx, *out, out_dtype, out);
-  }
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/lstm_kernel_impl.h b/backends/metax_gpu/kernels/impl/lstm_kernel_impl.h
deleted file mode 100644
index 4a061fe471..0000000000
--- a/backends/metax_gpu/kernels/impl/lstm_kernel_impl.h
+++ /dev/null
@@ -1,443 +0,0 @@
-// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <string>
-
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/detail/activation_functions.h"
-#include "paddle/phi/kernels/funcs/lstm_compute.h"
-#include "paddle/phi/kernels/funcs/lstm_utils.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void LSTMKernel(const Context& dev_ctx,
-                const DenseTensor& input,
-                const paddle::optional<DenseTensor>& h0,
-                const paddle::optional<DenseTensor>& c0,
-                const DenseTensor& weight,
-                const DenseTensor& bias,
-                bool use_peepholes,
-                bool is_reverse,
-                bool is_test,
-                const std::string& gate_activation,
-                const std::string& cell_activation,
-                const std::string& candidate_activation,
-                DenseTensor* hidden,
-                DenseTensor* cell,
-                DenseTensor* batch_gate,
-                DenseTensor* batch_cell_pre_act) {
-  auto* hidden_t0 = h0.get_ptr();
-  auto* cell_t0 = c0.get_ptr();
-
-  phi::DenseTensor* batch_gate_new = nullptr;
-  phi::DenseTensor batch_gate_temp;
-  if (is_test) {
-    batch_gate_new = &batch_gate_temp;
-    batch_gate_new->Resize(input.dims());
-  } else {
-    batch_gate_new = batch_gate;
-  }
-
-  dev_ctx.template Alloc<T>(batch_gate_new);
-  dev_ctx.template Alloc<T>(hidden);
-  dev_ctx.template Alloc<T>(cell);
-
-  phi::funcs::DenseTensor2BatchFunctor<Context, T> to_batch;
-  to_batch(dev_ctx, input, batch_gate_new, true, is_reverse);
-
-  auto in_dims = input.dims();
-  int frame_size = static_cast<int>(in_dims[1] / 4);
-  phi::DDim dims({in_dims[0], frame_size});
-
-  if (bias.initialized()) {
-    phi::DenseTensor b = bias;
-    b.Resize({bias.numel(), 1});
-    phi::DenseTensor gate_bias = b.Slice(0, 4 * frame_size);
-    phi::funcs::RowwiseAdd<Context, T> add_bias;
-    add_bias(dev_ctx, *batch_gate_new, gate_bias, batch_gate_new);
-  }
-
-  phi::funcs::LstmMetaValue<T> lstm_value;
-  if (bias.initialized() && use_peepholes) {
-    T* bias_data = const_cast<T*>(bias.data<T>());
-    // the code style in LstmMetaValue will be updated later.
-
-    lstm_value.check_ig = bias_data + 4 * frame_size;
-    lstm_value.check_fg = lstm_value.check_ig + frame_size;
-    lstm_value.check_og = lstm_value.check_fg + frame_size;
-  } else {
-    lstm_value.check_ig = nullptr;
-    lstm_value.check_fg = nullptr;
-    lstm_value.check_og = nullptr;
-  }
-  lstm_value.prev_state_value = nullptr;
-  phi::DenseTensor ordered_c0;
-
-  phi::Vector<size_t> order(batch_gate_new->lod()[2]);
-
-  if (cell_t0) {
-    // Since the batch computing for LSTM reorders the input sequence
-    // according to their length. The initialized cell state also needs
-    // to reorder.
-    ReorderInitState<Context, T>(dev_ctx, *cell_t0, order, &ordered_c0, true);
-    lstm_value.prev_state_value = ordered_c0.data<T>();
-  }
-
-  // Use the local variable as here.
-  phi::DenseTensor batch_hidden, batch_cell, batch_cell_pre_act_temp;
-  phi::DenseTensor* batch_cell_pre_act_p;
-  if (is_test) {
-    batch_cell_pre_act_p = &batch_cell_pre_act_temp;
-  } else {
-    batch_cell_pre_act_p = batch_cell_pre_act;
-  }
-  batch_hidden.Resize(dims);
-  batch_cell.Resize(dims);
-  dev_ctx.template Alloc<T>(&batch_hidden);
-  dev_ctx.template Alloc<T>(&batch_cell);
-  batch_cell_pre_act_p->Resize(dims);
-  dev_ctx.template Alloc<T>(batch_cell_pre_act_p);
-
-  auto batch_starts = batch_gate_new->lod()[0];
-  size_t num_batch = batch_starts.size() - 1;
-  auto gate_act = phi::funcs::detail::GetActivationType(gate_activation);
-  auto cell_act = phi::funcs::detail::GetActivationType(cell_activation);
-  auto cand_act = phi::funcs::detail::GetActivationType(candidate_activation);
-
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-  for (size_t n = 0; n < num_batch; n++) {
-    int bstart = static_cast<int>(batch_starts[n]);
-    int bend = static_cast<int>(batch_starts[n + 1]);
-
-    phi::DenseTensor gate_t = batch_gate_new->Slice(bstart, bend);
-    phi::DenseTensor out_t = batch_hidden.Slice(bstart, bend);
-    phi::DenseTensor cell_t = batch_cell.Slice(bstart, bend);
-    phi::DenseTensor cell_pre_act_t = batch_cell_pre_act_p->Slice(bstart, bend);
-
-    int cur_batch_size = bend - bstart;
-
-    if (n > 0) {
-      int pre_h_start = static_cast<int>(batch_starts[n - 1]);
-      int pre_h_end = pre_h_start + cur_batch_size;
-      auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
-      blas.MatMul(pre_hidden_t,
-                  false,
-                  weight,
-                  false,
-                  static_cast<T>(1.0),
-                  &gate_t,
-                  static_cast<T>(1.0));
-    } else if (hidden_t0 != nullptr) {
-      // If n == 0 and there is no initialized hidden state, that is to say
-      // the H0 is zeros, the calculation W_h * H0 will be skipped.
-      // If n == 0 and there is initialized hidden state, calculate W_h * H0.
-
-      // Since the batch computing for LSTM reorders the input sequence
-      // according to their length. The initialized hidden state also needs
-      // to reorder.
-      phi::DenseTensor ordered_h0;
-      ReorderInitState<Context, T>(
-          dev_ctx, *hidden_t0, order, &ordered_h0, true);
-      blas.MatMul(ordered_h0,
-                  false,
-                  weight,
-                  false,
-                  static_cast<T>(1.0),
-                  &gate_t,
-                  static_cast<T>(1.0));
-    }
-
-    lstm_value.gate_value = gate_t.data<T>();
-    lstm_value.output_value = out_t.data<T>();
-    lstm_value.state_value = cell_t.data<T>();
-    lstm_value.state_active_value = cell_pre_act_t.data<T>();
-    T cell_clip = 0.0;
-    phi::funcs::LstmUnitFunctor<Context, T>::compute(dev_ctx,
-                                                     lstm_value,
-                                                     frame_size,
-                                                     cur_batch_size,
-                                                     cell_clip,
-                                                     gate_act,
-                                                     cell_act,
-                                                     cand_act);
-    lstm_value.prev_state_value = lstm_value.state_value;
-  }
-
-  phi::funcs::Batch2DenseTensorFunctor<Context, T> to_seq;
-  batch_hidden.set_lod(batch_gate_new->lod());
-  // restore the output hidden in phi::DenseTensor from the batch hidden
-  to_seq(dev_ctx, batch_hidden, hidden);
-
-  batch_cell.set_lod(batch_gate_new->lod());
-  // restore the output cell state in phi::DenseTensor from the batch cell
-  to_seq(dev_ctx, batch_cell, cell);
-}
-
-template <typename T, typename Context>
-void LSTMGradKernel(const Context& dev_ctx,
-                    const DenseTensor& input_in,
-                    const paddle::optional<DenseTensor>& h0_in,
-                    const paddle::optional<DenseTensor>& c0_in,
-                    const DenseTensor& weight_in,
-                    const DenseTensor& bias_in,
-                    const DenseTensor& hidden_in,
-                    const DenseTensor& cell_in,
-                    const DenseTensor& batch_gate_in,
-                    const DenseTensor& batch_cell_pre_act_in,
-                    const DenseTensor& hidden_grad,
-                    bool use_peepholes,
-                    bool is_reverse,
-                    bool is_test,
-                    const std::string& gate_activation,
-                    const std::string& cell_activation,
-                    const std::string& candidate_activation,
-                    DenseTensor* input_grad,
-                    DenseTensor* h0_grad,
-                    DenseTensor* c0_grad,
-                    DenseTensor* weight_grad,
-                    DenseTensor* bias_grad) {
-  auto* input = &input_in;
-  auto* weight = &weight_in;
-  auto* bias = &bias_in;
-
-  auto* hidden_out = &hidden_in;
-  auto* cell_out = &cell_in;
-
-  auto* batch_gate = &batch_gate_in;
-  auto* batch_cell_pre_act = &batch_cell_pre_act_in;
-
-  auto* hidden_g = &hidden_grad;
-
-  auto* in_g = input_grad;
-  auto* weight_g = weight_grad;
-  auto* bias_g = bias_grad;
-
-  auto* h0 = h0_in.get_ptr();
-  auto* c0 = c0_in.get_ptr();
-
-  auto* h0_g = h0_grad;
-  auto* c0_g = c0_grad;
-
-  phi::funcs::SetConstant<Context, T> zero;
-  if (weight_g) {
-    dev_ctx.template Alloc<T>(weight_g);
-    zero(dev_ctx, weight_g, static_cast<T>(0.0));
-  }
-
-  // ordered_h0/c0 is the reordered hidden/cell initialization.
-  // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
-  // initialization.
-  phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
-  phi::Vector<size_t> order(batch_gate->lod()[2]);
-
-  if (c0) {
-    ReorderInitState<Context, T>(dev_ctx, *c0, order, &ordered_c0, true);
-  }
-  if (c0 && c0_g) {
-    ordered_c0_g.Resize(c0_g->dims());
-    dev_ctx.template Alloc<T>(&ordered_c0_g);
-  }
-
-  auto in_dims = input->dims();
-  auto out_dims = hidden_g->dims();
-  int frame_size = static_cast<int>(in_dims[1] / 4);
-  PADDLE_ENFORCE_EQ(frame_size,
-                    out_dims[1],
-                    common::errors::InvalidArgument(
-                        "The second dimension of Input(hidden_grad) should be "
-                        "%d, but received %d in LSTM@Grad operator.",
-                        frame_size,
-                        out_dims[1]));
-
-  phi::funcs::LstmMetaValue<T> lstm_value;
-  if (bias && use_peepholes) {
-    T* bias_data = const_cast<T*>(bias->data<T>());
-    lstm_value.check_ig = bias_data + 4 * frame_size;
-    lstm_value.check_fg = lstm_value.check_ig + frame_size;
-    lstm_value.check_og = lstm_value.check_fg + frame_size;
-  } else {
-    lstm_value.check_ig = nullptr;
-    lstm_value.check_fg = nullptr;
-    lstm_value.check_og = nullptr;
-  }
-
-  phi::funcs::LstmMetaGrad<T> lstm_grad;
-
-  if (bias && bias_g) {
-    dev_ctx.template Alloc<T>(bias_g);
-    zero(dev_ctx, bias_g, static_cast<T>(0.0));
-  }
-  if (bias && bias_g && use_peepholes) {
-    T* bias_g_data = bias_g->data<T>();
-    lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size;
-    lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size;
-    lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size;
-  } else {
-    lstm_grad.check_ig_grad = nullptr;
-    lstm_grad.check_fg_grad = nullptr;
-    lstm_grad.check_og_grad = nullptr;
-  }
-
-  phi::funcs::DenseTensor2BatchFunctor<Context, T> to_batch;
-
-  auto ToBatch = [&batch_gate, &to_batch](const Context& ctx,
-                                          const phi::DenseTensor& src,
-                                          const phi::DDim& dims,
-                                          phi::DenseTensor& dst) {
-    dst.Resize(dims);
-    ctx.template Alloc<T>(&dst);
-    dst.set_lod(batch_gate->lod());
-    to_batch(ctx, src, &dst, false);
-  };
-
-  phi::DenseTensor batch_hidden, batch_hidden_g, batch_cell;
-  ToBatch(dev_ctx, *hidden_out, out_dims, batch_hidden);
-  ToBatch(dev_ctx, *hidden_g, out_dims, batch_hidden_g);
-  ToBatch(dev_ctx, *cell_out, out_dims, batch_cell);
-
-  phi::DenseTensor batch_cell_g, batch_gate_g;
-  batch_cell_g.Resize(out_dims);
-  dev_ctx.template Alloc<T>(&batch_cell_g);
-  // TODO(qingqing) support the case output cell has gradient.
-  // to_batch(dev_ctx, *cell_g, batch_cell_g, false);
-  zero(dev_ctx, &batch_cell_g, static_cast<T>(0.0));
-  batch_gate_g.Resize(batch_gate->dims());
-  dev_ctx.template Alloc<T>(&batch_gate_g);
-  batch_gate_g.set_lod(batch_gate->lod());
-
-  auto gate_act = phi::funcs::detail::GetActivationType(gate_activation);
-  auto cell_act = phi::funcs::detail::GetActivationType(cell_activation);
-  auto cand_act = phi::funcs::detail::GetActivationType(candidate_activation);
-
-  auto batch_starts = batch_gate->lod()[0];
-  size_t num_batch = batch_starts.size() - 1;
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-  for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
-    int bstart = static_cast<int>(batch_starts[n]);
-    int bend = static_cast<int>(batch_starts[n + 1]);
-
-    phi::DenseTensor gate = batch_gate->Slice(bstart, bend);
-    phi::DenseTensor cell = batch_cell.Slice(bstart, bend);
-    phi::DenseTensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend);
-    lstm_value.gate_value = gate.data<T>();
-    lstm_value.state_value = cell.data<T>();
-    lstm_value.state_active_value = cell_pre_act.data<T>();
-
-    phi::DenseTensor out_g = batch_hidden_g.Slice(bstart, bend);
-    phi::DenseTensor gate_g = batch_gate_g.Slice(bstart, bend);
-    phi::DenseTensor cell_g = batch_cell_g.Slice(bstart, bend);
-    lstm_grad.state_grad = cell_g.data<T>();
-    lstm_grad.gate_grad = gate_g.data<T>();
-    lstm_grad.output_grad = out_g.data<T>();
-
-    if (n > 0) {
-      int bstart_pre = static_cast<int>(batch_starts[n - 1]);
-      phi::DenseTensor cell_pre = batch_cell.Slice(bstart_pre, bstart);
-      phi::DenseTensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart);
-      lstm_value.prev_state_value = cell_pre.data<T>();
-      lstm_grad.prev_state_grad = cell_pre_g.data<T>();
-    } else {
-      lstm_value.prev_state_value = c0 ? ordered_c0.data<T>() : nullptr;
-      lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
-    }
-
-    // lstm_value.output_value not used in bp, set to nullptr
-    // lstm_grad.state_active_grad not used in bp, set to nullptr
-    lstm_value.output_value = nullptr;
-    lstm_grad.state_active_grad = nullptr;
-    int cur_batch_size = bend - bstart;
-    T cell_clip = 0.0;
-    phi::funcs::LstmUnitGradFunctor<Context, T>::compute(dev_ctx,
-                                                         lstm_value,
-                                                         lstm_grad,
-                                                         frame_size,
-                                                         cur_batch_size,
-                                                         cell_clip,
-                                                         gate_act,
-                                                         cell_act,
-                                                         cand_act);
-
-    if (n > 0) {
-      int pre_h_start = static_cast<int>(batch_starts[n - 1]);
-      int pre_h_end = pre_h_start + cur_batch_size;
-      auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end);
-      blas.MatMul(gate_g,
-                  false,
-                  *weight,
-                  true,
-                  static_cast<T>(1.0),
-                  &pre_hidden_g,
-                  static_cast<T>(1.0));
-      if (weight_g) {
-        /* backward weight */
-        auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end);
-        blas.MatMul(pre_hidden,
-                    true,
-                    gate_g,
-                    false,
-                    static_cast<T>(1.0),
-                    weight_g,
-                    static_cast<T>(1.0));
-      }
-    } else {
-      if (h0 && weight_g) {
-        ReorderInitState<Context, T>(dev_ctx, *h0, order, &ordered_h0, true);
-        blas.MatMul(ordered_h0,
-                    true,
-                    gate_g,
-                    false,
-                    static_cast<T>(1.0),
-                    weight_g,
-                    static_cast<T>(1.0));
-      }
-      if (h0 && h0_g) {
-        ordered_h0_g.Resize(h0_g->dims());
-        dev_ctx.template Alloc<T>(&ordered_h0_g);
-        blas.MatMul(gate_g,
-                    false,
-                    *weight,
-                    true,
-                    static_cast<T>(1.0),
-                    &ordered_h0_g,
-                    static_cast<T>(0.0));
-      }
-    }
-  }
-
-  phi::funcs::Batch2DenseTensorFunctor<Context, T> to_seq;
-  if (in_g) {
-    /* backward data */
-    dev_ctx.template Alloc<T>(in_g);
-    to_seq(dev_ctx, batch_gate_g, in_g);
-  }
-  if (bias && bias_g) {
-    /* backward bias */
-    phi::DenseTensor b_g = *bias_g;
-    b_g.Resize({bias_g->numel(), 1});
-    phi::DenseTensor gate_bias_g = b_g.Slice(0, 4 * frame_size);
-    phi::funcs::ColwiseSum<Context, T> col_sum;
-    col_sum(dev_ctx, batch_gate_g, &gate_bias_g);
-  }
-
-  if (h0 && h0_g) {
-    ReorderInitState<Context, T>(dev_ctx, ordered_h0_g, order, h0_g, false);
-  }
-  if (c0 && c0_g) {
-    ReorderInitState<Context, T>(dev_ctx, ordered_c0_g, order, c0_g, false);
-  }
-}
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/lu_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/lu_grad_kernel_impl.h
deleted file mode 100644
index 5a2e5d48a1..0000000000
--- a/backends/metax_gpu/kernels/impl/lu_grad_kernel_impl.h
+++ /dev/null
@@ -1,308 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-#include "paddle/phi/kernels/impl/lu_kernel_impl.h"
-#include "paddle/phi/kernels/triangular_solve_kernel.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void LUGradKernel(const Context& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& out,
-                  const DenseTensor& pivots,
-                  const DenseTensor& out_grad,
-                  bool pivot UNUSED,
-                  DenseTensor* x_grad) {
-  dev_ctx.template Alloc<T>(x_grad);
-
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-
-  auto xdims = x.dims();
-  int xrank = xdims.size();
-  int64_t m = xdims[xrank - 2];
-  int64_t n = xdims[xrank - 1];
-  int64_t k = std::min(m, n);
-
-  DenseTensor L, U, L_narrow, U_narrow, L_narrow_mH, U_narrow_mH, grad_narrow;
-  LU_Unpack<Context, T>(dev_ctx, &out, &L, &U);
-
-  Tensor_narrow<Context, T>(dev_ctx, &L, &L_narrow, 0, k, 0, k);
-  Tensor_narrow<Context, T>(dev_ctx, &U, &U_narrow, 0, k, 0, k);
-  Tensor_narrow<Context, T>(dev_ctx, &out_grad, &grad_narrow, 0, k, 0, k);
-  auto graddims = grad_narrow.dims();
-
-  Tensor_Conj<Context, T>(dev_ctx, L_narrow, &L_narrow_mH);
-  Tensor_Conj<Context, T>(dev_ctx, U_narrow, &U_narrow_mH);
-  L_narrow_mH = Transpose2DTo6D<Context, T>(dev_ctx, L_narrow_mH);
-  U_narrow_mH = Transpose2DTo6D<Context, T>(dev_ctx, U_narrow_mH);
-
-  auto LmHdims = L_narrow_mH.dims();
-  auto UmHdims = U_narrow_mH.dims();
-
-  DenseTensor phi_L, phi_U, phi, psi;
-  phi_L.Resize(LmHdims);
-  dev_ctx.template Alloc<T>(&phi_L);
-  phi_U.Resize(UmHdims);
-  dev_ctx.template Alloc<T>(&phi_U);
-  auto mat_dim_l = phi::funcs::CreateMatrixDescriptor(LmHdims, 0, false);
-  auto mat_dim_u = phi::funcs::CreateMatrixDescriptor(UmHdims, 0, false);
-  auto mat_dim_g = phi::funcs::CreateMatrixDescriptor(graddims, 0, false);
-  blas.MatMul(L_narrow_mH,
-              mat_dim_l,
-              grad_narrow,
-              mat_dim_g,
-              static_cast<T>(1),
-              &phi_L,
-              static_cast<T>(0));
-
-  blas.MatMul(grad_narrow,
-              mat_dim_g,
-              U_narrow_mH,
-              mat_dim_u,
-              static_cast<T>(1),
-              &phi_U,
-              static_cast<T>(0));
-
-  auto phil_rank = LmHdims.size();
-  auto phiu_rank = UmHdims.size();
-  phi::funcs::ForRange<Context> l_for_range(dev_ctx, phi_L.numel());
-  phi::funcs::TrilTriuCompute<T> tril_computer(phi_L.data<T>(),
-                                               -1,
-                                               true,
-                                               LmHdims[phil_rank - 2],
-                                               LmHdims[phil_rank - 1],
-                                               phi_L.data<T>());
-  l_for_range(tril_computer);
-
-  phi::funcs::ForRange<Context> u_for_range(dev_ctx, phi_U.numel());
-  phi::funcs::TrilTriuCompute<T> triu_computer(phi_U.data<T>(),
-                                               0,
-                                               false,
-                                               UmHdims[phiu_rank - 2],
-                                               UmHdims[phiu_rank - 1],
-                                               phi_U.data<T>());
-  u_for_range(triu_computer);
-
-  Tensor_Add<Context, T>(dev_ctx, phi_L, phi_U, &phi);
-  psi.Resize(xdims);
-  dev_ctx.template Alloc<T>(&psi);
-  phi::funcs::SetConstant<Context, T> setter;
-  setter(dev_ctx, &psi, static_cast<T>(0));
-
-  std::vector<int64_t> axes = {xrank - 2, xrank - 1};
-  std::vector<int64_t> slice_starts(2, 0);
-  std::vector<int64_t> slice_ends(2, 0);
-  auto valuedims = common::vectorize(xdims);
-
-  DenseTensor Pmat;
-  Unpack_Pivot<Context, T>(dev_ctx, pivots, &Pmat, m, k);
-
-  if (m <= n) {
-    if (k < n) {
-      DenseTensor U_complement, U_grad_complement, phi_complement,
-          phi_complement_l;
-      Tensor_narrow<Context, T>(dev_ctx, &U, &U_complement, 0, k, k, n);
-      Tensor_narrow<Context, T>(
-          dev_ctx, &out_grad, &U_grad_complement, 0, k, k, n);
-      DenseTensor U_complement_mH =
-          Transpose2DTo6D<Context, T>(dev_ctx, U_complement);
-
-      Tensor_Conj<Context, T>(dev_ctx, U_complement_mH, &U_complement_mH);
-
-      auto mat_dim_g = phi::funcs::CreateMatrixDescriptor(
-          U_grad_complement.dims(), 0, false);
-      auto mat_dim_u =
-          phi::funcs::CreateMatrixDescriptor(U_complement_mH.dims(), 0, false);
-      auto phidims = UmHdims;
-      phidims[UmHdims.size() - 2] = k;
-      phidims[UmHdims.size() - 1] = k;
-      phi_complement.Resize(phidims);
-      dev_ctx.template Alloc<T>(&phi_complement);
-      blas.MatMul(U_grad_complement,
-                  mat_dim_g,
-                  U_complement_mH,
-                  mat_dim_u,
-                  static_cast<T>(1),
-                  &phi_complement,
-                  static_cast<T>(0));
-
-      phi_complement_l.Resize(phidims);
-      dev_ctx.template Alloc<T>(&phi_complement_l);
-      const auto H = phidims[phidims.size() - 2];
-      const auto W = phidims[phidims.size() - 1];
-      phi::funcs::ForRange<Context> x_for_range(dev_ctx,
-                                                phi_complement.numel());
-      phi::funcs::TrilTriuCompute<T> tril_computer(
-          phi_complement.data<T>(), -1, true, H, W, phi_complement_l.data<T>());
-      x_for_range(tril_computer);
-
-      Tensor_Sub<Context, T>(dev_ctx, phi, phi_complement_l, &phi);
-
-      slice_starts[0] = 0;
-      slice_starts[1] = k;
-      slice_ends[0] = k;
-      slice_ends[1] = n;
-      valuedims[xrank - 2] = k;
-      valuedims[xrank - 1] = n - k;
-      SetValueCompute_dispatch<Context, T>(dev_ctx,
-                                           &psi,
-                                           &U_grad_complement,
-                                           &psi,
-                                           axes,
-                                           &slice_starts,
-                                           &slice_ends,
-                                           valuedims,
-                                           xrank);
-    }
-
-    DenseTensor psi_principal, phi_mH, psi_tmp;
-    Tensor_Conj<Context, T>(dev_ctx, phi, &phi_mH);
-    phi_mH = Transpose2DTo6D<Context, T>(dev_ctx, phi_mH);
-
-    phi::TriangularSolveKernel<T, Context>(
-        dev_ctx, U_narrow, phi_mH, true, false, false, &psi_principal);
-
-    Tensor_Conj<Context, T>(dev_ctx, psi_principal, &psi_principal);
-    psi_principal = Transpose2DTo6D<Context, T>(dev_ctx, psi_principal);
-    slice_starts[0] = 0;
-    slice_starts[1] = 0;
-    slice_ends[0] = k;
-    slice_ends[1] = k;
-    valuedims[xrank - 2] = k;
-    valuedims[xrank - 1] = k;
-
-    SetValueCompute_dispatch<Context, T>(dev_ctx,
-                                         &psi,
-                                         &psi_principal,
-                                         &psi,
-                                         axes,
-                                         &slice_starts,
-                                         &slice_ends,
-                                         valuedims,
-                                         xrank);
-
-    phi::TriangularSolveKernel<T, Context>(
-        dev_ctx, L_narrow_mH, psi, true, false, true, &psi_tmp);
-
-    auto mat_dim_p = phi::funcs::CreateMatrixDescriptor(Pmat.dims(), 0, false);
-    auto mat_dim_b =
-        phi::funcs::CreateMatrixDescriptor(psi_tmp.dims(), 0, false);
-    blas.MatMul(Pmat,
-                mat_dim_p,
-                psi_tmp,
-                mat_dim_b,
-                static_cast<T>(1),
-                x_grad,
-                static_cast<T>(0));
-  } else {
-    DenseTensor L_complement, L_grad_complement, phi_complement,
-        phi_complement_u;
-    Tensor_narrow<Context, T>(dev_ctx, &L, &L_complement, k, m, 0, k);
-    Tensor_narrow<Context, T>(
-        dev_ctx, &out_grad, &L_grad_complement, k, m, 0, k);
-    DenseTensor L_complement_mH =
-        Transpose2DTo6D<Context, T>(dev_ctx, L_complement);
-    Tensor_Conj<Context, T>(dev_ctx, L_complement_mH, &L_complement_mH);
-
-    auto mat_dim_g =
-        phi::funcs::CreateMatrixDescriptor(L_grad_complement.dims(), 0, false);
-    auto mat_dim_u =
-        phi::funcs::CreateMatrixDescriptor(L_complement_mH.dims(), 0, false);
-    auto phidims = LmHdims;
-    phidims[LmHdims.size() - 2] = k;
-    phidims[LmHdims.size() - 1] = k;
-    phi_complement.Resize(phidims);
-    dev_ctx.template Alloc<T>(&phi_complement);
-    blas.MatMul(L_complement_mH,
-                mat_dim_u,
-                L_grad_complement,
-                mat_dim_g,
-                static_cast<T>(1),
-                &phi_complement,
-                static_cast<T>(0));
-
-    phi_complement_u.Resize(phidims);
-    dev_ctx.template Alloc<T>(&phi_complement_u);
-    const auto H = phidims[phidims.size() - 2];
-    const auto W = phidims[phidims.size() - 1];
-    phi::funcs::ForRange<Context> x_for_range(dev_ctx, phi_complement.numel());
-    phi::funcs::TrilTriuCompute<T> triu_computer(
-        phi_complement.data<T>(), 0, false, H, W, phi_complement_u.data<T>());
-    x_for_range(triu_computer);
-
-    Tensor_Sub<Context, T>(dev_ctx, phi, phi_complement_u, &phi);
-
-    slice_starts[0] = k;
-    slice_starts[1] = 0;
-    slice_ends[0] = m;
-    slice_ends[1] = k;
-    valuedims[xrank - 2] = m - k;
-    valuedims[xrank - 1] = k;
-    SetValueCompute_dispatch<Context, T>(dev_ctx,
-                                         &psi,
-                                         &L_grad_complement,
-                                         &psi,
-                                         axes,
-                                         &slice_starts,
-                                         &slice_ends,
-                                         valuedims,
-                                         xrank);
-    DenseTensor psi_principal, phi_mH, psi_tmp, U_narrow_mH;
-
-    phi::TriangularSolveKernel<T, Context>(
-        dev_ctx, L_narrow_mH, phi, true, false, true, &psi_principal);
-
-    slice_starts[0] = 0;
-    slice_starts[1] = 0;
-    slice_ends[0] = k;
-    slice_ends[1] = k;
-    valuedims[xrank - 2] = k;
-    valuedims[xrank - 1] = k;
-
-    SetValueCompute_dispatch<Context, T>(dev_ctx,
-                                         &psi,
-                                         &psi_principal,
-                                         &psi,
-                                         axes,
-                                         &slice_starts,
-                                         &slice_ends,
-                                         valuedims,
-                                         xrank);
-
-    psi_tmp.Resize(psi.dims());
-    dev_ctx.template Alloc<T>(&psi_tmp);
-    auto mat_dim_p = phi::funcs::CreateMatrixDescriptor(Pmat.dims(), 0, false);
-    auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(psi.dims(), 0, false);
-    blas.MatMul(Pmat,
-                mat_dim_p,
-                psi,
-                mat_dim_b,
-                static_cast<T>(1),
-                &psi_tmp,
-                static_cast<T>(0));
-    psi_tmp = Transpose2DTo6D<Context, T>(dev_ctx, psi_tmp);
-
-    Tensor_Conj<Context, T>(dev_ctx, U_narrow, &U_narrow_mH);
-    phi::TriangularSolveKernel<T, Context>(
-        dev_ctx, U_narrow_mH, psi_tmp, true, false, false, &psi);
-    *x_grad = Transpose2DTo6D<Context, T>(dev_ctx, psi);
-  }
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/lu_solve_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/lu_solve_grad_kernel_impl.h
deleted file mode 100644
index 24dee650df..0000000000
--- a/backends/metax_gpu/kernels/impl/lu_solve_grad_kernel_impl.h
+++ /dev/null
@@ -1,224 +0,0 @@
-// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/infermeta/binary.h"
-// #include "paddle/phi/paddle/phi/kernels/funcs/blas/blas.h"
-
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-#include "paddle/phi/kernels/funcs/matrix_solve.h"
-#include "paddle/phi/kernels/impl/lu_kernel_impl.h"
-#include "paddle/phi/kernels/lu_solve_grad_kernel.h"
-#include "paddle/phi/kernels/lu_solve_kernel.h"
-#include "paddle/phi/kernels/lu_unpack_kernel.h"
-#include "paddle/phi/kernels/triangular_solve_kernel.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-DenseTensor GetMH(const Context& dev_ctx, const DenseTensor x) {
-  DenseTensor x_mH;
-  phi::Tensor_Conj<Context, T>(dev_ctx, x, &x_mH);
-  return phi::Transpose2DTo6D<Context, T>(dev_ctx, x_mH);
-}
-
-template <typename T, typename Context>
-void LuSolveGradKernel(const Context& dev_ctx,
-                       const DenseTensor& b,
-                       const DenseTensor& lu,
-                       const DenseTensor& pivots,
-                       const DenseTensor& out,
-                       const DenseTensor& out_grad,
-                       const std::string& trans,
-                       DenseTensor* b_grad,
-                       DenseTensor* lu_grad) {
-  if (b_grad != nullptr) {
-    dev_ctx.template Alloc<T>(b_grad);
-    std::string trans_t = (trans == "N") ? "T" : "N";
-    phi::LuSolveKernel<T, Context>(
-        dev_ctx, out_grad, lu, pivots, trans_t, b_grad);
-  }
-
-  if (lu_grad != nullptr) {
-    dev_ctx.template Alloc<T>(lu_grad);
-
-    DenseTensor p, l, u, l_mH, u_mH;
-    MetaTensor meta_p(&p);
-    MetaTensor meta_l(&l);
-    MetaTensor meta_u(&u);
-    bool unpack_pivots = (trans == "N") ? false : true;
-    LUUnpackInferMeta(
-        lu, pivots, true, unpack_pivots, &meta_p, &meta_l, &meta_u);
-    phi::LUUnpackKernel<T, Context>(
-        dev_ctx, lu, pivots, true, unpack_pivots, &p, &l, &u);
-    l_mH = GetMH<T, Context>(dev_ctx, l);
-    u_mH = GetMH<T, Context>(dev_ctx, u);
-    if (trans == "N") {
-      // gR = U^{-H}op_2(-gX)op_2(X)^Ha
-      DenseTensor gR, psi_tmp, out_mH;
-      out_mH = GetMH<T, Context>(dev_ctx, out);
-
-      auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-      auto out_grad_dims = out_grad.dims();
-      auto mat_dim_l =
-          phi::funcs::CreateMatrixDescriptor(out_grad_dims, 0, false);
-      auto out_mH_dims = out_mH.dims();
-      auto mat_dim_g =
-          phi::funcs::CreateMatrixDescriptor(out_mH_dims, 0, false);
-      psi_tmp.Resize(lu.dims());
-      dev_ctx.template Alloc<T>(&psi_tmp);
-      blas.MatMul(out_grad,
-                  mat_dim_l,
-                  out_mH,
-                  mat_dim_g,
-                  static_cast<T>(-1),
-                  &psi_tmp,
-                  static_cast<T>(0));
-      phi::TriangularSolveKernel<T, Context>(
-          dev_ctx, u_mH, psi_tmp, false, false, false, &gR);
-
-      // gL = (L^{-H} gR U^H).tril(-1)
-      DenseTensor mul_tmp, gL;
-      auto gr_dims = gR.dims();
-      auto mat_dim_r = phi::funcs::CreateMatrixDescriptor(gr_dims, 0, false);
-      auto gu_dims = u_mH.dims();
-      auto mat_dim_u = phi::funcs::CreateMatrixDescriptor(gu_dims, 0, false);
-      mul_tmp.Resize(gr_dims);
-      dev_ctx.template Alloc<T>(&mul_tmp);
-      blas.MatMul(gR,
-                  mat_dim_r,
-                  u_mH,
-                  mat_dim_u,
-                  static_cast<T>(1),
-                  &mul_tmp,
-                  static_cast<T>(0));
-      phi::TriangularSolveKernel<T, Context>(
-          dev_ctx, l_mH, mul_tmp, true, false, true, &gL);
-
-      auto phil_rank = gL.dims().size();
-      auto phir_rank = gR.dims().size();
-      phi::funcs::ForRange<Context> l_for_range(dev_ctx, gL.numel());
-      phi::funcs::TrilTriuCompute<T> tril_computer(gL.data<T>(),
-                                                   -1,
-                                                   true,
-                                                   gL.dims()[phil_rank - 2],
-                                                   gL.dims()[phil_rank - 1],
-                                                   gL.data<T>());
-      l_for_range(tril_computer);
-
-      phi::funcs::ForRange<Context> r_for_range(dev_ctx, gR.numel());
-      phi::funcs::TrilTriuCompute<T> triu_computer(gR.data<T>(),
-                                                   0,
-                                                   false,
-                                                   gR.dims()[phir_rank - 2],
-                                                   gR.dims()[phir_rank - 1],
-                                                   gR.data<T>());
-      r_for_range(triu_computer);
-      Tensor_Add<Context, T>(dev_ctx, gL, gR, lu_grad);
-    } else {
-      DenseTensor gR, p_mT, tem_out, out_grad_mH, tem_out1, tem_out2, tem_out3,
-          gU;
-      p_mT = Transpose2DTo6D<Context, T>(dev_ctx, p);
-      auto PmTdims = p_mT.dims();
-      auto Outdims = out.dims();
-      auto mat_dim_p = phi::funcs::CreateMatrixDescriptor(PmTdims, 0, false);
-      auto mat_dim_o = phi::funcs::CreateMatrixDescriptor(Outdims, 0, false);
-      tem_out.Resize(Outdims);
-      dev_ctx.template Alloc<T>(&tem_out);
-      auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-      // gR = -P^T op_3(X)op_1(op_2(gX))P
-      blas.MatMul(p_mT,
-                  mat_dim_p,
-                  out,
-                  mat_dim_o,
-                  static_cast<T>(-1),
-                  &tem_out,
-                  static_cast<T>(0));
-      out_grad_mH = GetMH<T, Context>(dev_ctx, out_grad);
-      auto TemOutdims = tem_out.dims();
-      auto OutGradmHdims = out_grad_mH.dims();
-      auto mat_dim_tem_out =
-          phi::funcs::CreateMatrixDescriptor(TemOutdims, 0, false);
-      auto mat_dim_out_grad_mH =
-          phi::funcs::CreateMatrixDescriptor(OutGradmHdims, 0, false);
-      tem_out1.Resize(lu.dims());
-      dev_ctx.template Alloc<T>(&tem_out1);
-      blas.MatMul(tem_out,
-                  mat_dim_tem_out,
-                  out_grad_mH,
-                  mat_dim_out_grad_mH,
-                  static_cast<T>(1),
-                  &tem_out1,
-                  static_cast<T>(0));
-      auto TemOutdims1 = tem_out1.dims();
-      auto pdims = p.dims();
-      auto mat_dim_tem_out1 =
-          phi::funcs::CreateMatrixDescriptor(TemOutdims1, 0, false);
-      auto mat_dim_p1 = phi::funcs::CreateMatrixDescriptor(pdims, 0, false);
-      tem_out2.Resize(TemOutdims1);
-      dev_ctx.template Alloc<T>(&tem_out2);
-      blas.MatMul(tem_out1,
-                  mat_dim_tem_out1,
-                  p,
-                  mat_dim_p1,
-                  static_cast<T>(1),
-                  &tem_out2,
-                  static_cast<T>(0));
-      // gR = gR L^{-H}
-      phi::TriangularSolveKernel<T, Context>(
-          dev_ctx, l_mH, tem_out2, true, true, true, &gR);
-      // gU = (L^H gR U^{-H}).triu()
-      auto LmHdims = l_mH.dims();
-      auto gRdims = gR.dims();
-      auto mat_dim_l_mh = phi::funcs::CreateMatrixDescriptor(LmHdims, 0, false);
-      auto mat_dim_gr = phi::funcs::CreateMatrixDescriptor(gRdims, 0, false);
-      tem_out3.Resize(LmHdims);
-      dev_ctx.template Alloc<T>(&tem_out3);
-      blas.MatMul(l_mH,
-                  mat_dim_l_mh,
-                  gR,
-                  mat_dim_gr,
-                  static_cast<T>(1),
-                  &tem_out3,
-                  static_cast<T>(0));
-      phi::TriangularSolveKernel<T, Context>(
-          dev_ctx, u_mH, tem_out3, false, true, false, &gU);
-
-      auto phiu_rank = gU.dims().size();
-      auto phir_rank = gR.dims().size();
-      phi::funcs::ForRange<Context> l_for_range(dev_ctx, gR.numel());
-      phi::funcs::TrilTriuCompute<T> tril_computer(gR.data<T>(),
-                                                   -1,
-                                                   true,
-                                                   gR.dims()[phir_rank - 2],
-                                                   gR.dims()[phir_rank - 1],
-                                                   gR.data<T>());
-      l_for_range(tril_computer);
-
-      phi::funcs::ForRange<Context> r_for_range(dev_ctx, gU.numel());
-      phi::funcs::TrilTriuCompute<T> triu_computer(gU.data<T>(),
-                                                   0,
-                                                   false,
-                                                   gU.dims()[phiu_rank - 2],
-                                                   gU.dims()[phiu_rank - 1],
-                                                   gU.data<T>());
-      r_for_range(triu_computer);
-      Tensor_Add<Context, T>(dev_ctx, gR, gU, lu_grad);
-    }
-  }
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/multi_dot_kernel_impl.h b/backends/metax_gpu/kernels/impl/multi_dot_kernel_impl.h
deleted file mode 100644
index 7ba97234cc..0000000000
--- a/backends/metax_gpu/kernels/impl/multi_dot_kernel_impl.h
+++ /dev/null
@@ -1,477 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/kernels/full_kernel.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-namespace phi {
-
-template <typename Context, typename T>
-inline DenseTensor MatMul(const Context& dev_ctx,
-                          const DenseTensor& matrix_a,
-                          const DenseTensor& matrix_b,
-                          const phi::DDim& a_dim,
-                          const phi::DDim& b_dim) {
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-
-  DenseTensor matrix_c;
-  phi::DDim c_dim = common::make_ddim({a_dim[0], b_dim[1]});
-  matrix_c.Resize(c_dim);
-  dev_ctx.template Alloc<T>(&matrix_c);
-
-  auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, false);
-  auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, false);
-  const T alpha = static_cast<T>(1.0);
-  blas.MatMul(matrix_a.data<T>(),
-              mat_dim_a,
-              matrix_b.data<T>(),
-              mat_dim_b,
-              alpha,
-              matrix_c.data<T>(),
-              T(0));
-  return matrix_c;
-}
-
-/**
- * @brief Recursively calculate matrix multiplication according to the optimal
- * order
- * Let k = order[i,j], then ins[i...j] = ins[i...k] * ins[k+1 ...j]
- *
- * @param
- * ins: the input tensors
- * ins_dims: the shape of ins after reshape
- * order: the optimal order
- * i: the left of sub chain
- * j: the right of sub chain
- * save_result: set true by backward
- * results: save the intermediate result during backward
- */
-template <typename Context, typename T>
-inline DenseTensor MatChainMul(const Context& dev_ctx,
-                               const std::vector<const DenseTensor*>& ins,
-                               const std::vector<phi::DDim>& ins_dims,
-                               const std::vector<uint64_t>& order,
-                               const uint64_t i,
-                               const uint64_t j,
-                               const bool save_result,
-                               std::vector<DenseTensor>* results) {
-  if (i == j) {
-    return *ins[i];
-  }
-
-  const auto A = MatChainMul<Context, T>(dev_ctx,
-                                         ins,
-                                         ins_dims,
-                                         order,
-                                         i,
-                                         order[i * ins.size() + j],
-                                         save_result,
-                                         results);
-  phi::DDim a_dim = A.dims();
-  if (i == order[i * ins.size() + j]) {
-    a_dim = ins_dims[i];
-  }
-
-  const auto B = MatChainMul<Context, T>(dev_ctx,
-                                         ins,
-                                         ins_dims,
-                                         order,
-                                         order[i * ins.size() + j] + 1,
-                                         j,
-                                         save_result,
-                                         results);
-  phi::DDim b_dim = B.dims();
-  if (j == order[i * ins.size() + j] + 1) {
-    b_dim = ins_dims[j];
-  }
-
-  auto result = MatMul<Context, T>(dev_ctx, A, B, a_dim, b_dim);
-  if (save_result) {
-    (*results)[i * ins.size() + j] = result;
-  }
-  return result;
-}
-
-/**
- * @brief get the optimal order
- */
-template <typename Context, typename T>
-std::vector<uint64_t> GetOrder(const std::vector<const DenseTensor*>& ins,
-                               const std::vector<phi::DDim>& ins_dims) {
-  auto n = ins.size();
-  // p: save the ins shape, the ins[i] shape is (p[i], p[i+1])
-  std::vector<uint64_t> p(n + 1);
-  for (uint64_t i = 0; i < n; i++) {
-    p[i] = ins_dims[i][0];
-  }
-  p[n] = ins_dims[n - 1][1];
-
-  // m[i, j]: save the lowest cost for multiplying ins[i...j]
-  std::vector<uint64_t> m(n * n, 0);
-  // define ins[i...j] means multiplying matrices from ins[i] to ins[j]
-  // order[i, j] = k, this means that ins[i...k] and ins[k...j] fist and then
-  // multiply the resulting matrices is the optimal order for ins[i...j]
-  std::vector<uint64_t> order(n * n);
-  for (uint64_t l = 1; l < n; l++) {
-    for (uint64_t i = 0; i < n - l; i++) {
-      auto j = i + l;
-      m[i * n + j] = 0xffffffff;
-      for (uint64_t k = i; k < j; k++) {
-        uint64_t q =
-            m[i * n + k] + m[(k + 1) * n + j] + p[i] * p[k + 1] * p[j + 1];
-        if (q < m[i * n + j]) {
-          m[i * n + j] = q;
-          order[i * n + j] = k;
-        }
-      }
-    }
-  }
-  return order;
-}
-
-template <typename Context, typename T>
-static inline DenseTensor MultiDotMatChainOrder(
-    const Context& dev_ctx,
-    const std::vector<const DenseTensor*>& ins,
-    const std::vector<phi::DDim>& ins_dims,
-    const bool save_result,
-    std::vector<DenseTensor>* results) {
-  auto order = GetOrder<Context, T>(ins, ins_dims);
-  return MatChainMul<Context, T>(
-      dev_ctx, ins, ins_dims, order, 0, ins.size() - 1, save_result, results);
-}
-
-template <typename Context, typename T>
-inline void GetDims(const std::vector<const DenseTensor*>& ins,
-                    std::vector<phi::DDim>* ins_dims) {
-  const auto n = ins.size();
-  for (size_t i = 0; i < n; i++) {
-    (*ins_dims)[i] = ins[i]->dims();
-    if (i == 0 && (*ins_dims)[i].size() == 1) {
-      (*ins_dims)[i] = common::make_ddim({1, (*ins_dims)[i][0]});
-    } else if (i == n - 1 && (*ins_dims)[i].size() == 1) {
-      (*ins_dims)[i] = common::make_ddim({(*ins_dims)[i][0], 1});
-    }
-  }
-}
-
-template <typename T, typename Context>
-void MultiDotKernel(const Context& dev_ctx,
-                    const std::vector<const DenseTensor*>& x,
-                    DenseTensor* out) {
-  auto ins = x;
-  dev_ctx.template Alloc<T>(out);
-
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-
-  auto n = ins.size();
-  std::vector<phi::DDim> ins_dims(n);
-  GetDims<Context, T>(ins, &ins_dims);
-
-  // If any numel is 0, then return.
-  bool size_0 = false;
-  for (size_t i = 0; i < n; i++) {
-    if (x[i]->numel() == 0) size_0 = true;
-  }
-  if (size_0) {
-    // For example: [2, 0], [0, 4] -> [2, 4]
-    if (out && out->numel() > 0) {
-      phi::Full<T, Context>(
-          dev_ctx, phi::IntArray(common::vectorize(out->dims())), 0, out);
-    }
-    return;
-  }
-  const T scale = static_cast<T>(1.0);
-  if (n == 2) {
-    auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false);
-    auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false);
-    blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, out, T(0));
-  } else if (n == 3) {
-    const auto Ma = ins_dims[0][0];
-    const auto Ka = ins_dims[0][1];
-    const auto Nb = ins_dims[1][1];
-    const auto Nc = ins_dims[2][1];
-    const uint64_t cost1 = Ma * Nb * (Ka + Nc);
-    const uint64_t cost2 = Ka * Nc * (Nb + Ma);
-    auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false);
-    auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false);
-    auto mat_dim_c = phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false);
-    if (cost1 < cost2) {
-      DenseTensor tmp_out;
-      phi::DDim tmp_dim = common::make_ddim({Ma, Nb});
-      tmp_out.Resize(tmp_dim);
-      dev_ctx.template Alloc<T>(&tmp_out);
-      blas.MatMul(
-          *ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out, T(0));
-      auto mat_dim_tmp = phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false);
-      blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0));
-    } else {
-      DenseTensor tmp_out;
-      phi::DDim tmp_dim = common::make_ddim({Ka, Nc});
-      tmp_out.Resize(tmp_dim);
-      dev_ctx.template Alloc<T>(&tmp_out);
-      blas.MatMul(
-          *ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out, T(0));
-      auto mat_dim_tmp = phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false);
-      blas.MatMul(*ins[0], mat_dim_a, tmp_out, mat_dim_tmp, scale, out, T(0));
-    }
-  } else {
-    std::vector<DenseTensor> results;
-    const auto tmp = MultiDotMatChainOrder<Context, T>(
-        dev_ctx, ins, ins_dims, false, &results);
-    auto out_dim = out->dims();
-    *out = tmp;
-    out->Resize(out_dim);
-  }
-
-  // auto& theta_grad = out;
-  // T* th_data = ctx.template Alloc<T>(theta_grad);
-}
-
-/**
- * @brief calculate dA and dB
- * dA = dout * transpose(B)
- * dB = transpose(A) * dout
- */
-template <typename Context, typename T>
-void CalcGrad(const Context& dev_ctx,
-              const DenseTensor& dout,
-              const DenseTensor& A,
-              const DenseTensor& B,
-              const phi::DDim& dout_dim,
-              const phi::DDim& a_dim,
-              const phi::DDim& b_dim,
-              DenseTensor* dA,
-              DenseTensor* dB) {
-  auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false);
-  auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, true);
-  auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, true);
-  T alpha = static_cast<T>(1.0);
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-  blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0));
-  blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0));
-}
-
-/**
- * @brief calculate multi matrix multiplication grad by a chain order
- * @param
- * dout: the grad of multi matrix multiplication out
- * dx: the out grad of inputs
- * ins: the input tensors
- * ins_dims: the shape of ins after reshape
- * order: the optimal order
- * i: the left of sub chain
- * j: the right of sub chain
- * results: the intermediate result of forward
- */
-template <typename Context, typename T>
-void MatChainMulGrad(const Context& dev_ctx,
-                     const DenseTensor& dout,
-                     std::vector<DenseTensor*>* dx,
-                     const std::vector<const DenseTensor*>& ins,
-                     const phi::DDim& dout_dim,
-                     const std::vector<phi::DDim>& ins_dims,
-                     const std::vector<uint64_t>& order,
-                     const uint64_t i,
-                     const uint64_t j,
-                     const std::vector<DenseTensor>& results) {
-  if (i == j) {
-    *((*dx)[i]) = dout;
-    return;
-  }
-
-  const auto n = ins.size();
-  const auto right = order[i * n + j];
-  const auto left = order[i * n + j] + 1;
-  // get the multi result of left sub chain
-  const auto* A = &results[i * n + right];
-  phi::DDim a_dim = A->dims();
-  if (i == right) {
-    A = ins[i];
-    a_dim = ins_dims[i];
-  }
-  // get the multi result of right sub chain
-  const auto* B = &results[left * n + j];
-  phi::DDim b_dim = B->dims();
-  if (left == j) {
-    B = ins[j];
-    b_dim = ins_dims[j];
-  }
-  DenseTensor dA, dB;
-  dA.Resize({dout_dim[0], b_dim[0]});
-  dB.Resize({a_dim[1], dout_dim[1]});
-  dev_ctx.template Alloc<T>(&dA);
-  dev_ctx.template Alloc<T>(&dB);
-
-  CalcGrad<Context, T>(dev_ctx, dout, *A, *B, dout_dim, a_dim, b_dim, &dA, &dB);
-  MatChainMulGrad<Context, T>(
-      dev_ctx, dA, dx, ins, dA.dims(), ins_dims, order, i, right, results);
-  MatChainMulGrad<Context, T>(
-      dev_ctx, dB, dx, ins, dB.dims(), ins_dims, order, left, j, results);
-}
-
-template <typename Context, typename T>
-void MultiDotGradMatChainOrder(const Context& dev_ctx,
-                               const DenseTensor& dout,
-                               const std::vector<const DenseTensor*>& ins,
-                               const phi::DDim& dout_dim,
-                               const std::vector<phi::DDim>& ins_dims,
-                               std::vector<DenseTensor*>* dx) {
-  auto order = GetOrder<Context, T>(ins, ins_dims);
-  auto n = ins.size();
-  std::vector<DenseTensor> results(n * n);
-  MatChainMul<Context, T>(
-      dev_ctx, ins, ins_dims, order, 0, n - 1, true, &results);
-  MatChainMulGrad<Context, T>(
-      dev_ctx, dout, dx, ins, dout_dim, ins_dims, order, 0, n - 1, results);
-}
-
-template <typename T, typename Context>
-void MultiDotGradKernel(const Context& dev_ctx,
-                        const std::vector<const DenseTensor*>& x,
-                        const DenseTensor& out_grad,
-                        std::vector<DenseTensor*> x_grad) {
-  auto ins = x;
-  auto dout = out_grad;
-  auto dx = x_grad;
-
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-
-  bool size_0 = false;
-  const auto n = ins.size();
-  for (size_t i = 0; i < n; i++) {
-    dev_ctx.template Alloc<T>(dx[i]);
-
-    if (dx[i]->numel() == 0) {
-      size_0 = true;
-    }
-  }
-  if (size_0) {
-    for (size_t i = 0; i < n; i++) {
-      if (dx[i]->numel() > 0) {
-        phi::Full<T, Context>(
-            dev_ctx, phi::IntArray(common::vectorize(dx[i]->dims())), 0, dx[i]);
-      }
-    }
-    return;
-  }
-
-  std::vector<phi::DDim> ins_dims(n);
-  GetDims<Context, T>(ins, &ins_dims);
-
-  phi::DDim dout_dim = dout.dims();
-  if (ins[0]->dims().size() == 1 && ins[n - 1]->dims().size() == 1) {
-    dout_dim = common::make_ddim({1, 1});
-  } else if (ins[0]->dims().size() == 1) {
-    if (dout_dim.size() == 1) {
-      dout_dim = common::make_ddim({1, dout_dim[0]});
-    }
-  } else if (ins[n - 1]->dims().size() == 1) {
-    if (dout_dim.size() == 1) {
-      dout_dim = common::make_ddim({dout_dim[0], 1});
-    }
-  }
-
-  T alpha = static_cast<T>(1);
-  auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false);
-  if (n == 2) {
-    CalcGrad<Context, T>(dev_ctx,
-                         dout,
-                         *ins[0],
-                         *ins[1],
-                         dout_dim,
-                         ins_dims[0],
-                         ins_dims[1],
-                         dx[0],
-                         dx[1]);
-  } else if (n == 3) {
-    const auto Ma = ins_dims[0][0];
-    const auto Ka = ins_dims[0][1];
-    const auto Nb = ins_dims[1][1];
-    const auto Nc = ins_dims[2][1];
-    const uint64_t cost1 = Ma * Nb * (Ka + Nc);
-    const uint64_t cost2 = Ka * Nc * (Nb + Ma);
-    auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false);
-    auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false);
-    auto mat_dim_c = phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false);
-    if (cost1 < cost2) {
-      DenseTensor tmp_out, tmp_dout;
-      tmp_out.Resize({Ma, Nb});
-      dev_ctx.template Alloc<T>(&tmp_out);
-      tmp_dout.Resize({mat_dim_dout.height_, Nb});
-      dev_ctx.template Alloc<T>(&tmp_dout);
-      blas.MatMul(
-          *ins[0], mat_dim_a, *ins[1], mat_dim_b, alpha, &tmp_out, T(0));
-      CalcGrad<Context, T>(dev_ctx,
-                           dout,
-                           tmp_out,
-                           *ins[2],
-                           dout_dim,
-                           tmp_out.dims(),
-                           ins_dims[2],
-                           &tmp_dout,
-                           dx[2]);
-      CalcGrad<Context, T>(dev_ctx,
-                           tmp_dout,
-                           *ins[0],
-                           *ins[1],
-                           tmp_dout.dims(),
-                           ins_dims[0],
-                           ins_dims[1],
-                           dx[0],
-                           dx[1]);
-    } else {
-      DenseTensor tmp_out, tmp_dout;
-      tmp_out.Resize({Ka, Nc});
-      dev_ctx.template Alloc<T>(&tmp_out);
-      tmp_dout.Resize({Ka, mat_dim_dout.width_});
-      dev_ctx.template Alloc<T>(&tmp_dout);
-      blas.MatMul(
-          *ins[1], mat_dim_b, *ins[2], mat_dim_c, alpha, &tmp_out, T(0));
-      CalcGrad<Context, T>(dev_ctx,
-                           dout,
-                           *ins[0],
-                           tmp_out,
-                           dout_dim,
-                           ins_dims[0],
-                           tmp_dout.dims(),
-                           dx[0],
-                           &tmp_dout);
-      CalcGrad<Context, T>(dev_ctx,
-                           tmp_dout,
-                           *ins[1],
-                           *ins[2],
-                           tmp_dout.dims(),
-                           ins_dims[1],
-                           ins_dims[2],
-                           dx[1],
-                           dx[2]);
-    }
-  } else {
-    MultiDotGradMatChainOrder<Context, T>(
-        dev_ctx, dout, ins, dout_dim, ins_dims, &dx);
-    // if x's shape is: [3] [3, 4] [4]
-    // dx's shape will be: [1, 3] [3, 4] [4, 1]
-    if (ins[n - 1]->dims().size() == 1) {
-      dx[n - 1]->Resize({dx[n - 1]->dims()[0]});
-    }
-    if (ins[0]->dims().size() == 1) {
-      dx[0]->Resize({dx[0]->dims()[1]});
-    }
-  }
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/mv_kernel_impl.h b/backends/metax_gpu/kernels/impl/mv_kernel_impl.h
deleted file mode 100644
index 4baee25a09..0000000000
--- a/backends/metax_gpu/kernels/impl/mv_kernel_impl.h
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void MvKernel(const Context& dev_ctx,
-              const DenseTensor& x,
-              const DenseTensor& vec,
-              DenseTensor* out) {
-  const auto& dim_x = x.dims();
-
-  // get data ptr
-  const T* x_data = x.data<T>();
-  const T* vec_data = vec.data<T>();
-  T* out_data = dev_ctx.template Alloc<T>(out);
-
-  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-
-  blas.GEMV(false,
-            dim_x[0],
-            dim_x[1],
-            static_cast<T>(1),
-            x_data,
-            vec_data,
-            static_cast<T>(0),
-            out_data);
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/repeat_interleave_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/repeat_interleave_grad_kernel_impl.h
deleted file mode 100644
index e2c5d6c242..0000000000
--- a/backends/metax_gpu/kernels/impl/repeat_interleave_grad_kernel_impl.h
+++ /dev/null
@@ -1,217 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "kernels/impl/index_select_impl.h"
-#include "paddle/phi/common/data_type.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/kernels/repeat_interleave_grad_kernel.h"
-#if defined(__NVCC__) || defined(__HIPCC__)
-#include "paddle/phi/backends/gpu/gpu_primitives.h"
-#include "paddle/phi/kernels/primitive/functor_primitives.h"
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#else
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#endif
-#endif
-
-#include "paddle/phi/kernels/funcs/repeat_tensor2index_tensor.h"
-
-namespace phi {
-
-#if defined(__NVCC__) || defined(__HIPCC__)
-using phi::PADDLE_CUDA_NUM_THREADS;
-
-template <typename T, typename IndexT>
-__global__ void index_select_grad_cuda_kernel(const T* output_grad,
-                                              T* input_grad,
-                                              const IndexT* index,
-                                              int64_t N,
-                                              int64_t stride,
-                                              int64_t size,
-                                              int64_t delta) {
-  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= N) {
-    return;
-  }
-
-  int64_t pre_idx = idx / (stride * size);
-  int64_t dim_idx = idx % (stride * size) / stride;
-  IndexT src_dim_idx = index[dim_idx];
-  int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride;
-  phi::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]);
-}
-
-template <typename T>
-__global__ void index_select_grad_init(T* input_grad, int64_t N) {
-  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= N) {
-    return;
-  }
-  input_grad[idx] = 0.0;
-}
-#endif
-template <typename T, typename Context>
-void RepeatInterleaveWithTensorIndexGradKernel(
-    const Context& dev_ctx,
-    const DenseTensor& x,
-    const DenseTensor& repeats_tensor,
-    const DenseTensor& out_grad,
-    int dim,
-    DenseTensor* x_grad) {
-  auto input_dim = x_grad->dims();
-  if (dim < 0) {
-    dim += static_cast<int>(input_dim.size());
-  }
-
-  DenseTensor index;
-  PADDLE_ENFORCE_EQ(repeats_tensor.dims()[0] == x_grad->dims()[dim],
-                    true,
-                    common::errors::InvalidArgument(
-                        "The length of Input(RepeatsTensor) must be the "
-                        "same as length of Input(X) in axis. "
-                        "But received: [%s], required: [%d].",
-                        repeats_tensor.dims()[0],
-                        x_grad->dims()[dim]));
-
-  const auto& index_type = repeats_tensor.dtype();
-
-  bool index_type_match =
-      index_type == DataType::INT32 || index_type == DataType::INT64;
-  PADDLE_ENFORCE_EQ(index_type_match,
-                    true,
-                    common::errors::InvalidArgument(
-                        "Input(Repeats) holds the wrong type, it holds %s, but "
-                        "desires to be %s or %s",
-                        DataTypeToString(index_type),
-                        DataTypeToString(DataType::INT32),
-                        DataTypeToString(DataType::INT64)));
-#if defined(__NVCC__) || defined(__HIPCC__)
-
-  auto output_dim = out_grad.dims();
-  auto stride_dim = common::stride(input_dim);
-  int64_t stride = stride_dim[dim];
-  int64_t size = output_dim[dim];
-  int64_t delta = input_dim[dim] - size;
-  int64_t numel = x_grad->numel();
-  int64_t out_nums = out_grad.numel();
-  auto* out_grad_data = out_grad.data<T>();
-  dev_ctx.template Alloc<T>(x_grad);
-  auto* in_grad_data = x_grad->data<T>();
-  auto stream = dev_ctx.stream();
-  index_select_grad_init<T>
-      <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-         PADDLE_CUDA_NUM_THREADS,
-         0,
-         stream>>>(in_grad_data, numel);
-
-  if (index_type == DataType::INT64) {
-    phi::funcs::RepeatsTensor2IndexTensorFunctor<Context, int64_t>()(
-        dev_ctx, repeats_tensor, &index);
-    int64_t index_nums = index.numel();
-
-    const int64_t* index_data = index.data<int64_t>();
-    index_select_grad_cuda_kernel<T, int64_t>
-        <<<(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-           PADDLE_CUDA_NUM_THREADS,
-           0,
-           stream>>>(out_grad_data,
-                     in_grad_data,
-                     index_data,
-                     out_nums,
-                     stride,
-                     size,
-                     delta);
-  } else {
-    phi::funcs::RepeatsTensor2IndexTensorFunctor<Context, int>()(
-        dev_ctx, repeats_tensor, &index);
-    int64_t index_nums = index.numel();
-
-    const int* index_data = index.data<int>();
-    index_select_grad_cuda_kernel<T, int>
-        <<<(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-           PADDLE_CUDA_NUM_THREADS,
-           0,
-           stream>>>(out_grad_data,
-                     in_grad_data,
-                     index_data,
-                     out_nums,
-                     stride,
-                     size,
-                     delta);
-  }
-#endif
-}
-
-template <typename T, typename Context>
-void RepeatInterleaveGradKernel(const Context& dev_ctx,
-                                const DenseTensor& x,
-                                const DenseTensor& out_grad,
-                                int repeats,
-                                int dim,
-                                DenseTensor* x_grad) {
-  if (x_grad && x_grad->numel() == 0) {
-    dev_ctx.template Alloc<T>(x_grad);
-    return;
-  }
-  auto input_dim = x_grad->dims();
-  if (dim < 0) {
-    dim += input_dim.size();
-  }
-
-  DenseTensor index;
-#if defined(__NVCC__) || defined(__HIPCC__)
-  auto output_dim = out_grad.dims();
-  auto stride_dim = common::stride(input_dim);
-  int64_t stride = stride_dim[dim];
-  int64_t size = output_dim[dim];
-  int64_t delta = input_dim[dim] - size;
-  int64_t numel = x_grad->numel();
-  int64_t out_nums = out_grad.numel();
-  auto* out_grad_data = out_grad.data<T>();
-  dev_ctx.template Alloc<T>(x_grad);
-  auto* in_grad_data = x_grad->data<T>();
-  auto stream = dev_ctx.stream();
-  index_select_grad_init<T>
-      <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-         PADDLE_CUDA_NUM_THREADS,
-         0,
-         stream>>>(in_grad_data, numel);
-  int64_t index_size = x_grad->dims()[dim] * repeats;
-  std::vector<int> index_vec(index_size);
-  for (int i = 0; i < x_grad->dims()[dim]; i++) {
-    std::fill_n(index_vec.begin() + i * repeats, repeats, i);
-  }
-  index.Resize(common::make_ddim({index_size}));
-  phi::TensorFromVector<int>(index_vec, dev_ctx, &index);
-
-  const int* index_data = index.data<int>();
-  int64_t index_nums = index.numel();
-  index_select_grad_cuda_kernel<T, int>
-      <<<(out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-         PADDLE_CUDA_NUM_THREADS,
-         0,
-         stream>>>(out_grad_data,
-                   in_grad_data,
-                   index_data,
-                   out_nums,
-                   stride,
-                   size,
-                   delta);
-#endif
-}
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/repeat_interleave_kernel_impl.h b/backends/metax_gpu/kernels/impl/repeat_interleave_kernel_impl.h
deleted file mode 100644
index 041bc1f9e5..0000000000
--- a/backends/metax_gpu/kernels/impl/repeat_interleave_kernel_impl.h
+++ /dev/null
@@ -1,243 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/core/dense_tensor.h"
-// #include "paddle/phi/kernels/cpu/index_select_impl.h"
-#include "kernels/impl/index_select_impl.h"
-#include "paddle/phi/kernels/funcs/repeat_tensor2index_tensor.h"
-#include "paddle/phi/kernels/repeat_interleave_kernel.h"
-#if defined(__NVCC__) || defined(__HIPCC__)
-#include "paddle/phi/backends/gpu/gpu_decls.h"
-#include "paddle/phi/backends/gpu/gpu_info.h"
-#include "paddle/phi/backends/gpu/gpu_primitives.h"
-#include "paddle/phi/backends/gpu/gpu_resources.h"
-#include "paddle/phi/kernels/primitive/functor_primitives.h"
-#endif
-
-namespace phi {
-
-#if defined(__NVCC__) || defined(__HIPCC__)
-using phi::PADDLE_CUDA_NUM_THREADS;
-template <typename T, typename IndexT>
-__global__ void index_select_cuda_kernel(const T* input,
-                                         T* output,
-                                         const IndexT* index,
-                                         int64_t N,
-                                         int64_t stride,
-                                         int64_t size,
-                                         int64_t delta) {
-  const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= N) {
-    return;
-  }
-  const int64_t stride_size = stride * size;
-
-  const int64_t pre_idx = idx / stride_size;
-  const int64_t remainder = idx % stride_size;
-  const int64_t dim_idx = remainder / stride;
-
-  const IndexT src_dim_idx = index[dim_idx];
-
-  const int64_t input_idx =
-      idx + ((delta * pre_idx) + (src_dim_idx - dim_idx)) * stride;
-  output[idx] = input[input_idx];
-}
-#endif
-
-template <typename T, typename Context>
-void RepeatInterleaveKernel(const Context& dev_ctx,
-                            const DenseTensor& x,
-                            int repeats,
-                            int dim,
-                            DenseTensor* out) {
-  PADDLE_ENFORCE_GT(repeats,
-                    0,
-                    common::errors::InvalidArgument(
-                        "repeats must grater than 0, but got %d", repeats));
-  if (out && out->numel() == 0) {
-    dev_ctx.template Alloc<T>(out);
-    return;
-  }
-  auto place = dev_ctx.GetPlace();
-  auto cpu_place = phi::CPUPlace();
-
-  auto input_dim = x.dims();
-  if (dim < 0) {
-    dim += input_dim.size();
-  }
-
-  DenseTensor index;
-  int64_t index_size = input_dim[dim] * repeats;
-  std::vector<int> index_vec(index_size);
-  for (int i = 0; i < input_dim[dim]; i++) {
-    std::fill_n(index_vec.begin() + i * repeats, repeats, i);
-  }
-  index.Resize(common::make_ddim({index_size}));
-  if (place == cpu_place) {
-    DenseTensor x_copy = x;
-    phi::TensorFromVector<int>(index_vec, dev_ctx, &index);
-
-    auto output_dim = common::vectorize(x.dims());
-    output_dim[dim] = index_size;
-    out->Resize(common::make_ddim(output_dim));
-    phi::IndexSelectInner<Context, T, int>(dev_ctx, &x_copy, index, out, dim);
-#if defined(__NVCC__) || defined(__HIPCC__)
-  } else {
-    auto stride_dim = common::stride(input_dim);
-    int64_t stride = stride_dim[dim];
-    phi::TensorFromVector<int>(index_vec, dev_ctx, &index);
-    auto stream = dev_ctx.stream();
-    auto output_dim = common::vectorize(x.dims());
-    output_dim[dim] = index_size;
-    out->Resize(common::make_ddim(output_dim));
-    dev_ctx.template Alloc<T>(out);
-    auto* out_data = out->data<T>();
-    int64_t numel = out->numel();
-    int64_t size = output_dim[dim];
-    int64_t delta = input_dim[dim] - size;
-
-    const int* index_data = index.data<int>();
-    index_select_cuda_kernel<T, int>
-        <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-           PADDLE_CUDA_NUM_THREADS,
-           0,
-           stream>>>(
-            x.data<T>(), out_data, index_data, numel, stride, size, delta);
-  }
-#else
-  }
-#endif
-}
-
-template <typename T, typename Context>
-void RepeatInterleaveWithTensorIndexKernel(const Context& dev_ctx,
-                                           const DenseTensor& x,
-                                           const DenseTensor& repeats_tensor,
-                                           int dim,
-                                           DenseTensor* out) {
-  auto place = dev_ctx.GetPlace();
-  auto cpu_place = phi::CPUPlace();
-
-  auto input_dim = x.dims();
-  if (dim < 0) {
-    dim += input_dim.size();
-  }
-  DenseTensor index;
-  PADDLE_ENFORCE_EQ(repeats_tensor.dims()[0] == x.dims()[dim],
-                    true,
-                    common::errors::InvalidArgument(
-                        "The length of Input(RepeatsTensor) must be the "
-                        "same as length of Input(X) in axis. "
-                        "But received: [%s], required: [%d].",
-                        repeats_tensor.dims()[0],
-                        x.dims()[dim]));
-  const auto& index_type = repeats_tensor.dtype();
-  bool index_type_match =
-      index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64;
-  PADDLE_ENFORCE_EQ(
-      index_type_match,
-      true,
-      common::errors::InvalidArgument(
-          "Input(RepeatsTensor) holds the wrong type, it holds %s, but "
-          "desires to be %s or %s",
-          DataTypeToString(index_type),
-          DataTypeToString(phi::DataType::INT32),
-          DataTypeToString(phi::DataType::INT64)));
-
-  if (x.numel() == 0) {
-    // infer out shape
-    if (index_type == phi::DataType::INT32) {
-      phi::funcs::RepeatsTensor2IndexTensorFunctor<Context, int>()(
-          dev_ctx, repeats_tensor, &index);
-
-    } else if (index_type == phi::DataType::INT64) {
-      phi::funcs::RepeatsTensor2IndexTensorFunctor<Context, int64_t>()(
-          dev_ctx, repeats_tensor, &index);
-    }
-    auto output_dim = common::vectorize(x.dims());
-    output_dim[dim] = index.dims()[0];
-    out->Resize(common::make_ddim(output_dim));
-    dev_ctx.template Alloc<T>(out);
-    return;
-  }
-  if (place == cpu_place) {
-    auto x_copy = x;
-    if (index_type == phi::DataType::INT32) {
-      phi::funcs::RepeatsTensor2IndexTensorFunctor<Context, int>()(
-          dev_ctx, repeats_tensor, &index);
-      auto output_dim = common::vectorize(x.dims());
-      output_dim[dim] = index.dims()[0];
-      out->Resize(common::make_ddim(output_dim));
-      IndexSelectInner<Context, T, int>(dev_ctx, &x_copy, index, out, dim);
-    } else if (index_type == phi::DataType::INT64) {
-      phi::funcs::RepeatsTensor2IndexTensorFunctor<Context, int64_t>()(
-          dev_ctx, repeats_tensor, &index);
-      auto output_dim = common::vectorize(x.dims());
-      output_dim[dim] = index.dims()[0];
-      out->Resize(common::make_ddim(output_dim));
-      IndexSelectInner<Context, T, int64_t>(dev_ctx, &x_copy, index, out, dim);
-    }
-#if defined(__NVCC__) || defined(__HIPCC__)
-  } else {
-    auto stride_dim = common::stride(input_dim);
-    int64_t stride = stride_dim[dim];
-    auto stream = dev_ctx.stream();
-    auto* in_data = x.data<T>();
-    if (index_type == phi::DataType::INT64) {
-      phi::funcs::RepeatsTensor2IndexTensorFunctor<Context, int64_t>()(
-          dev_ctx, repeats_tensor, &index);
-
-      const int64_t* index_data = index.data<int64_t>();
-      auto output_dim = common::vectorize(x.dims());
-      output_dim[dim] = index.dims()[0];
-      out->Resize(common::make_ddim(output_dim));
-      T* out_data = dev_ctx.template Alloc<T>(out);
-      int64_t numel = out->numel();
-      int64_t size = output_dim[dim];
-      int64_t delta = input_dim[dim] - size;
-
-      index_select_cuda_kernel<T, int64_t>
-          <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-             PADDLE_CUDA_NUM_THREADS,
-             0,
-             stream>>>(
-              in_data, out_data, index_data, numel, stride, size, delta);
-    } else {
-      phi::funcs::RepeatsTensor2IndexTensorFunctor<Context, int>()(
-          dev_ctx, repeats_tensor, &index);
-
-      const int* index_data = index.data<int>();
-      auto output_dim = common::vectorize(x.dims());
-      output_dim[dim] = index.dims()[0];
-      out->Resize(common::make_ddim(output_dim));
-      T* out_data = dev_ctx.template Alloc<T>(out);
-      int64_t numel = out->numel();
-      int64_t size = output_dim[dim];
-      int64_t delta = input_dim[dim] - size;
-      index_select_cuda_kernel<T, int>
-          <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS,
-             PADDLE_CUDA_NUM_THREADS,
-             0,
-             stream>>>(
-              in_data, out_data, index_data, numel, stride, size, delta);
-    }
-  }
-#else
-  }
-#endif
-}
-
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/standard_gamma_kernel_impl.h b/backends/metax_gpu/kernels/impl/standard_gamma_kernel_impl.h
deleted file mode 100644
index 2667cf2cae..0000000000
--- a/backends/metax_gpu/kernels/impl/standard_gamma_kernel_impl.h
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "kernels/impl/dirichlet_kernel_impl.h"
-#include "paddle/phi/kernels/standard_gamma_kernel.h"
-
-namespace phi {
-template <typename T, typename Context>
-void StandardGammaKernel(const Context& dev_ctx,
-                         const DenseTensor& alpha,
-                         DenseTensor* out) {
-  dev_ctx.template Alloc<T>(out);
-  GammaSampler<Context, T> sampler;
-  sampler(dev_ctx, alpha, out);
-}
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/stft_kernel_impl.h b/backends/metax_gpu/kernels/impl/stft_kernel_impl.h
deleted file mode 100644
index 46fce91937..0000000000
--- a/backends/metax_gpu/kernels/impl/stft_kernel_impl.h
+++ /dev/null
@@ -1,101 +0,0 @@
-// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <vector>
-
-#include "kernels/impl/elementwise.h"
-#include "paddle/phi/common/complex.h"
-#include "paddle/phi/common/type_traits.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/kernels/funcs/elementwise_base.h"
-#include "paddle/phi/kernels/funcs/elementwise_functor.h"
-#include "paddle/phi/kernels/funcs/fft.h"
-#include "paddle/phi/kernels/funcs/fft_fill_conj.h"
-#include "paddle/phi/kernels/funcs/frame_functor.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void StftKernel(const Context& dev_ctx,
-                const DenseTensor& x,
-                const DenseTensor& window,
-                int n_fft,
-                int hop_length,
-                bool normalized,
-                bool onesided,
-                DenseTensor* out) {
-  using C = phi::dtype::complex<T>;
-
-  dev_ctx.template Alloc<C>(out);
-
-  const size_t x_rank = x.dims().size();
-  const size_t out_rank = out->dims().size();
-
-  const int n_frames = out->dims()[out_rank - 1];
-  const int seq_length = x.dims()[x_rank - 1];
-
-  std::vector<int64_t> axes = {1};
-
-  // Frame
-  phi::DenseTensor frames;
-  phi::DDim frames_dims(out->dims());
-  frames_dims.at(axes.back()) = n_fft;
-  frames.Resize(frames_dims);
-  dev_ctx.template Alloc<T>(&frames);
-  phi::funcs::FrameFunctor<Context, T>()(dev_ctx,
-                                         &x,
-                                         &frames,
-                                         seq_length,
-                                         n_fft,
-                                         n_frames,
-                                         hop_length,
-                                         /*is_grad*/ false);
-
-  // Window
-  phi::DenseTensor frames_w;
-  frames_w.Resize(frames_dims);
-  dev_ctx.template Alloc<T>(&frames_w);
-  phi::funcs::ElementwiseCompute<phi::funcs::MultiplyFunctor<T>, T, T>(
-      dev_ctx,
-      frames,
-      window,
-      phi::funcs::MultiplyFunctor<T>(),
-      &frames_w,
-      axes.back());
-
-  // FFTR2C
-  phi::funcs::FFTNormMode normalization;
-  if (normalized) {
-    normalization = phi::funcs::get_norm_from_string("ortho", true);
-  } else {
-    normalization = phi::funcs::get_norm_from_string("backward", true);
-  }
-  phi::funcs::FFTR2CFunctor<Context, T, C> fft_r2c_func;
-
-  if (onesided) {
-    fft_r2c_func(dev_ctx, frames_w, out, axes, normalization, true);
-  } else {
-    phi::DDim onesided_dims(out->dims());
-    const int64_t onesided_axis_size = out->dims().at(axes.back()) / 2 + 1;
-    onesided_dims.at(axes.back()) = onesided_axis_size;
-    phi::DenseTensor onesided_out;
-    onesided_out.Resize(onesided_dims);
-    dev_ctx.template Alloc<T>(&onesided_out);
-    fft_r2c_func(dev_ctx, frames_w, &onesided_out, axes, normalization, true);
-    phi::funcs::FFTFillConj<Context, C>(dev_ctx, &onesided_out, out, axes);
-  }
-}
-}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/triangular_solve_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/triangular_solve_grad_kernel_impl.h
deleted file mode 100644
index ad656b7a6c..0000000000
--- a/backends/metax_gpu/kernels/impl/triangular_solve_grad_kernel_impl.h
+++ /dev/null
@@ -1,147 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/core/tensor_utils.h"
-#include "paddle/phi/kernels/empty_kernel.h"
-#include "paddle/phi/kernels/full_kernel.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/common_shape.h"
-#include "paddle/phi/kernels/funcs/complex_functors.h"
-#include "paddle/phi/kernels/funcs/for_range.h"
-#include "paddle/phi/kernels/funcs/matrix_reduce.h"
-#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
-#include "paddle/phi/kernels/triangular_solve_grad_kernel.h"
-#include "paddle/phi/kernels/triangular_solve_kernel.h"
-namespace phi {
-
-template <typename T, typename Context>
-void TriangularSolveGradKernel(const Context& dev_ctx,
-                               const DenseTensor& x,
-                               const DenseTensor& y,
-                               const DenseTensor& out,
-                               const DenseTensor& dout,
-                               bool upper,
-                               bool transpose,
-                               bool unitriangular,
-                               DenseTensor* dx,
-                               DenseTensor* dy) {
-  if (out.numel() == 0) {
-    if (dx) {
-      phi::Full<T, Context>(
-          dev_ctx, phi::IntArray(common::vectorize(dx->dims())), 0, dx);
-    }
-    if (dy) {
-      phi::Full<T, Context>(
-          dev_ctx, phi::IntArray(common::vectorize(dy->dims())), 0, dy);
-    }
-    return;
-  }
-  std::vector<int64_t> x_bst_dims_vec;
-  std::vector<int64_t> y_bst_dims_vec;
-  std::tie(x_bst_dims_vec, y_bst_dims_vec) =
-      funcs::MatrixGetBroadcastDims(x, y);
-
-  IntArray y_bst_dims_array(y_bst_dims_vec);
-  DenseTensor dy_bst = phi::Empty<T, Context>(dev_ctx, y_bst_dims_array);
-  if (dy) {
-    // calculate x's conjugate for complex
-    DenseTensor x_conj;
-    x_conj.Resize(x.dims());
-
-    phi::funcs::ForRange<Context> x_for_range(dev_ctx, x.numel());
-    phi::funcs::ConjFunctor<T> x_functor(
-        x.data<T>(), x.numel(), dev_ctx.template Alloc<T>(&x_conj));
-    x_for_range(x_functor);
-
-    // reuse forward to get dy_bst, and the result has been broadcasted already.
-    TriangularSolveKernel<T, Context>(
-        dev_ctx, x_conj, dout, upper, !transpose, unitriangular, &dy_bst);
-
-    dy->Resize(y.dims());
-    dev_ctx.template Alloc<T>(dy);
-    if (dy_bst.dims() == y.dims()) {
-      Copy<Context>(dev_ctx, dy_bst, dev_ctx.GetPlace(), false, dy);
-    } else {
-      funcs::MatrixReduceSumFunctor<T, Context> functor;
-      functor(dev_ctx, dy_bst, dy);
-      dy->Resize(y.dims());
-    }
-  }
-
-  IntArray x_bst_dims_array(x_bst_dims_vec);
-  DenseTensor dx_bst = phi::Empty<T, Context>(dev_ctx, x_bst_dims_array);
-  if (dx) {
-    // calculate x's conjugate for complex
-    DenseTensor out_conj;
-    out_conj.Resize(out.dims());
-
-    phi::funcs::ForRange<Context> out_for_range(dev_ctx, out.numel());
-    phi::funcs::ConjFunctor<T> out_functor(
-        out.data<T>(), out.numel(), dev_ctx.template Alloc<T>(&out_conj));
-    out_for_range(out_functor);
-
-    auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
-    if (transpose) {
-      auto mat_dim_a =
-          phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, false);
-      auto mat_dim_b =
-          phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, true);
-      blas.MatMul(out_conj,
-                  mat_dim_a,
-                  dy_bst,
-                  mat_dim_b,
-                  static_cast<T>(-1),
-                  &dx_bst,
-                  static_cast<T>(0));
-    } else {
-      auto mat_dim_a =
-          phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, false);
-      auto mat_dim_b =
-          phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, true);
-      blas.MatMul(dy_bst,
-                  mat_dim_a,
-                  out_conj,
-                  mat_dim_b,
-                  static_cast<T>(-1),
-                  &dx_bst,
-                  static_cast<T>(0));
-    }
-
-    // get upper or lower triangular
-    DenseTensor dx_bst_upper =
-        phi::Empty<T, Context>(dev_ctx, x_bst_dims_array);
-
-    const auto& dims = dx_bst.dims();
-    const auto H = dims[dims.size() - 2];
-    const auto W = dims[dims.size() - 1];
-    phi::funcs::ForRange<Context> x_for_range(dev_ctx, dx_bst.numel());
-    phi::funcs::TrilTriuCompute<T> tril_triu_functor(
-        dx_bst.data<T>(), unitriangular, !upper, H, W, dx_bst_upper.data<T>());
-    x_for_range(tril_triu_functor);
-
-    dx->Resize(x.dims());
-    dev_ctx.template Alloc<T>(dx);
-    if (dx_bst.dims() == x.dims()) {
-      Copy<Context>(dev_ctx, dx_bst_upper, dev_ctx.GetPlace(), false, dx);
-    } else {
-      funcs::MatrixReduceSumFunctor<T, Context> functor;
-      functor(dev_ctx, dx_bst_upper, dx);
-      dx->Resize(x.dims());
-    }
-  }
-}
-
-}  // namespace phi