From 0f243c47dcfa03efb15564a03390662c6a35226a Mon Sep 17 00:00:00 2001 From: co63oc <4617245+co63oc@users.noreply.github.com> Date: Tue, 21 Apr 2026 08:40:51 +0800 Subject: [PATCH 1/3] update gcu kernels --- backends/gcu/kernels/accuracy_kernel.cc | 40 +- backends/gcu/kernels/activation_kernels.cc | 257 ++++++------- backends/gcu/kernels/adam_kernel.cc | 132 +++---- backends/gcu/kernels/add_n_kernel.cc | 6 +- backends/gcu/kernels/arange_kernel.cc | 21 +- backends/gcu/kernels/arg_min_max_kernels.cc | 28 +- backends/gcu/kernels/argsort_kernel.cc | 18 +- backends/gcu/kernels/assign_kernel.cc | 20 +- backends/gcu/kernels/atan2_kernel.cc | 8 +- backends/gcu/kernels/batch_norm_kernel.cc | 160 ++++---- backends/gcu/kernels/binary_kernels.cc | 244 ++++++------ backends/gcu/kernels/bitwise_kernel.cc | 34 +- backends/gcu/kernels/c_identity_kernel.cc | 4 +- backends/gcu/kernels/cast_kernel.cc | 30 +- backends/gcu/kernels/cholesky_kernel.cc | 4 +- backends/gcu/kernels/clip_kernel.cc | 10 +- backends/gcu/kernels/compare_kernels.cc | 122 +++--- backends/gcu/kernels/concat_kernel.cc | 21 +- backends/gcu/kernels/contiguous_kernel.cc | 6 +- backends/gcu/kernels/conv_kernel.cc | 135 ++++--- backends/gcu/kernels/conv_transpose_kernel.cc | 95 +++-- backends/gcu/kernels/copysign_kernel.cc | 6 +- backends/gcu/kernels/cross_entropy_kernel.cc | 16 +- backends/gcu/kernels/cross_kernel.cc | 6 +- backends/gcu/kernels/cumulate_kernel.cc | 48 ++- backends/gcu/kernels/diag_kernel.cc | 8 +- backends/gcu/kernels/diagonal_kernel.cc | 6 +- backends/gcu/kernels/dropout_kernel.cc | 24 +- backends/gcu/kernels/einsum_kernel.cc | 62 ++-- backends/gcu/kernels/embedding_kernel.cc | 57 ++- backends/gcu/kernels/expand_as_kernel.cc | 10 +- backends/gcu/kernels/expand_kernel.cc | 6 +- backends/gcu/kernels/eye_kernel.cc | 4 +- backends/gcu/kernels/fc_kernel.cc | 16 +- backends/gcu/kernels/flatten_kernel.cc | 16 +- backends/gcu/kernels/flip_kernel.cc | 4 +- backends/gcu/kernels/full_kernel.cc | 38 +- backends/gcu/kernels/funcs/common_ops.cc | 350 +++++++++--------- backends/gcu/kernels/funcs/common_ops.h | 98 ++--- backends/gcu/kernels/funcs/gcu_kernel_funcs.h | 2 +- .../gcu/kernels/funcs/gcu_layout_funcs.cc | 80 ++-- backends/gcu/kernels/funcs/gcu_layout_funcs.h | 61 ++- backends/gcu/kernels/funcs/op_utils.cc | 40 +- backends/gcu/kernels/funcs/op_utils.h | 93 +++-- .../gcu/kernels/funcs/topsaten_op_launch.h | 125 ++++--- .../gcu/kernels/funcs/topsaten_op_utils.cc | 55 ++- .../gcu/kernels/funcs/topsaten_op_utils.h | 8 +- .../kernels/fused_conv2d_add_act_kernel.cc | 28 +- .../fused_fc_elementwise_layernorm_kernel.cc | 48 +-- backends/gcu/kernels/gather_kernel.cc | 41 +- backends/gcu/kernels/gather_nd_kernel.cc | 36 +- backends/gcu/kernels/gaussian_kernel.cc | 10 +- backends/gcu/kernels/grid_sample_kernel.cc | 6 +- backends/gcu/kernels/huber_loss_kernel.cc | 16 +- backends/gcu/kernels/increment_kernel.cc | 4 +- backends/gcu/kernels/index_add_kernel.cc | 8 +- backends/gcu/kernels/index_put_kernel.cc | 17 +- backends/gcu/kernels/index_sample_kernel.cc | 36 +- backends/gcu/kernels/index_select_kernel.cc | 6 +- backends/gcu/kernels/instance_norm_kernel.cc | 39 +- backends/gcu/kernels/interpolate_kernels.cc | 6 +- backends/gcu/kernels/is_empty_kernel.cc | 6 +- backends/gcu/kernels/isclose_kernel.cc | 10 +- backends/gcu/kernels/isfinite_kernel.cc | 6 +- backends/gcu/kernels/isinf_kernel.cc | 6 +- backends/gcu/kernels/isnan_kernel.cc | 6 +- backends/gcu/kernels/label_smooth_kernel.cc | 6 +- backends/gcu/kernels/layer_norm_kernel.cc | 64 ++-- backends/gcu/kernels/lerp_kernel.cc | 8 +- backends/gcu/kernels/llama_stub_kernels.cc | 103 +++--- backends/gcu/kernels/log_loss_kernel.cc | 16 +- backends/gcu/kernels/log_softmax_kernel.cc | 10 +- backends/gcu/kernels/logcumsumexp_kernel.cc | 6 +- backends/gcu/kernels/logical_kernels.cc | 72 ++-- backends/gcu/kernels/logsumexp_kernel.cc | 4 +- backends/gcu/kernels/masked_select_kernel.cc | 18 +- backends/gcu/kernels/matmul_kernel.cc | 32 +- backends/gcu/kernels/mean_all_kernel.cc | 10 +- backends/gcu/kernels/memcpy_kernels.cc | 16 +- backends/gcu/kernels/merged_adam_kernel.cc | 95 +++-- .../gcu/kernels/merged_momentum_kernel.cc | 41 +- backends/gcu/kernels/meshgrid_kernel.cc | 8 +- backends/gcu/kernels/momentum_kernel.cc | 20 +- .../gcu/kernels/multiclass_nms3_kernel.cc | 105 +++--- backends/gcu/kernels/multinomial_kernel.cc | 30 +- backends/gcu/kernels/nms_kernel.cc | 16 +- backends/gcu/kernels/numel_kernel.cc | 6 +- backends/gcu/kernels/one_hot_kernel.cc | 26 +- backends/gcu/kernels/pool2d_kernel.cc | 23 +- backends/gcu/kernels/prior_box_kernel.cc | 8 +- backends/gcu/kernels/randperm_kernel.cc | 12 +- backends/gcu/kernels/reduce_kernels.cc | 96 +++-- backends/gcu/kernels/reshape_kernel.cc | 22 +- backends/gcu/kernels/rmsprop_kernel.cc | 32 +- backends/gcu/kernels/rnn_kernel.cc | 103 +++--- backends/gcu/kernels/roi_align_kernel.cc | 18 +- backends/gcu/kernels/roll_kernel.cc | 4 +- backends/gcu/kernels/scale_kernel.cc | 27 +- backends/gcu/kernels/scatter_kernel.cc | 34 +- backends/gcu/kernels/set_value_kernel.cc | 30 +- ...igmoid_cross_entropy_with_logits_kernel.cc | 14 +- backends/gcu/kernels/sign_kernel.cc | 4 +- backends/gcu/kernels/slice_kernel.cc | 12 +- backends/gcu/kernels/softmax_kernel.cc | 10 +- backends/gcu/kernels/split_kernel.cc | 14 +- .../gcu/kernels/squared_l2_norm_kernel.cc | 4 +- backends/gcu/kernels/squeeze_kernel.cc | 16 +- backends/gcu/kernels/stack_kernel.cc | 32 +- backends/gcu/kernels/strided_copy_kernel.cc | 10 +- backends/gcu/kernels/strided_slice_kernel.cc | 6 +- backends/gcu/kernels/swiglu_kernel.cc | 10 +- backends/gcu/kernels/take_along_axis.cc | 2 +- backends/gcu/kernels/temporal_shift_kernel.cc | 40 +- backends/gcu/kernels/tile_kernel.cc | 9 +- backends/gcu/kernels/top_p_sampling_kernel.cc | 10 +- backends/gcu/kernels/topk_kernel.cc | 18 +- .../gcu/kernels/transfer_layout_kernel.cc | 4 +- backends/gcu/kernels/transpose_kernel.cc | 8 +- backends/gcu/kernels/tril_triu_kernel.cc | 32 +- backends/gcu/kernels/trunc_kernel.cc | 4 +- .../truncated_gaussian_random_kernel.cc | 10 +- backends/gcu/kernels/uniform_kernel.cc | 14 +- backends/gcu/kernels/unsqueeze_kernel.cc | 16 +- .../gcu/kernels/weight_quantize_kernel.cc | 38 +- backends/gcu/kernels/where_kernel.cc | 27 +- 125 files changed, 2186 insertions(+), 2268 deletions(-) diff --git a/backends/gcu/kernels/accuracy_kernel.cc b/backends/gcu/kernels/accuracy_kernel.cc index 109ac24027b..387fe5bbb6e 100644 --- a/backends/gcu/kernels/accuracy_kernel.cc +++ b/backends/gcu/kernels/accuracy_kernel.cc @@ -19,12 +19,12 @@ namespace custom_kernel { template void AccuracyRawKernel(const Context& dev_ctx, - const phi::DenseTensor& infer_out, - const phi::DenseTensor& indices, - const phi::DenseTensor& label, - phi::DenseTensor* accuracy, - phi::DenseTensor* correct, - phi::DenseTensor* total) { + const DenseTensor& infer_out, + const DenseTensor& indices, + const DenseTensor& label, + DenseTensor* accuracy, + DenseTensor* correct, + DenseTensor* total) { PADDLE_GCU_KERNEL_TRACE("accuracy"); dev_ctx.template Alloc(accuracy); dev_ctx.template Alloc(correct); @@ -47,36 +47,34 @@ void AccuracyRawKernel(const Context& dev_ctx, auto indices_i32 = MaybeCreateOrTrans64To32bits(dev_ctx, indices); auto label_i32 = MaybeCreateOrTrans64To32bits(dev_ctx, label); - phi::DenseTensorMeta equal_out_meta = {phi::DataType::BOOL, - infer_out.dims()}; + DenseTensorMeta equal_out_meta = {DataType::BOOL, infer_out.dims()}; auto equal_out = custom_kernel::TensorEmpty(dev_ctx, equal_out_meta); LAUNCH_TOPSATENOP(topsatenEq, dev_ctx, equal_out, indices_i32, label_i32); auto equal_out_f = - custom_kernel::Cast(dev_ctx, equal_out, phi::DataType::FLOAT32); + custom_kernel::Cast(dev_ctx, equal_out, DataType::FLOAT32); // correct: reduce_max + reduce_sum - phi::DenseTensorMeta correct_max_meta = {phi::DataType::FLOAT32, - phi::make_ddim({num_samples})}; + DenseTensorMeta correct_max_meta = {DataType::FLOAT32, + phi::make_ddim({num_samples})}; auto correct_max = custom_kernel::TensorEmpty(dev_ctx, correct_max_meta); int axis = 1; bool keep_dim = false; LAUNCH_TOPSATENOP( topsatenMax, dev_ctx, correct_max, equal_out_f, axis, keep_dim); - phi::DenseTensorMeta correct_sum_meta = {phi::DataType::FLOAT32, - correct->dims()}; + DenseTensorMeta correct_sum_meta = {DataType::FLOAT32, correct->dims()}; auto correct_sum = custom_kernel::TensorEmpty(dev_ctx, correct_sum_meta); LAUNCH_TOPSATENOP( - topsatenSum, dev_ctx, correct_sum, correct_max, phi::DataType::FLOAT32); - custom_kernel::Cast(dev_ctx, correct_sum, phi::DataType::INT32, correct); + topsatenSum, dev_ctx, correct_sum, correct_max, DataType::FLOAT32); + custom_kernel::Cast(dev_ctx, correct_sum, DataType::INT32, correct); // total FillGcuTensorWithConstant( total, dev_ctx, static_cast(num_samples)); // accuracy - phi::DenseTensorMeta total_f_meta = {phi::DataType::FLOAT32, total->dims()}; + DenseTensorMeta total_f_meta = {DataType::FLOAT32, total->dims()}; auto total_f = custom_kernel::TensorEmpty(dev_ctx, total_f_meta); FillGcuTensorWithConstant( &total_f, dev_ctx, static_cast(num_samples)); @@ -118,9 +116,9 @@ PD_REGISTER_PLUGIN_KERNEL(accuracy, phi::dtype::bfloat16, phi::dtype::float16, int) { - kernel->InputAt(1).SetDataType(phi::DataType::INT64); - kernel->InputAt(2).SetDataType(phi::DataType::INT64); - kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(1).SetDataType(phi::DataType::INT32); - kernel->OutputAt(2).SetDataType(phi::DataType::INT32); + kernel->InputAt(1).SetDataType(DataType::INT64); + kernel->InputAt(2).SetDataType(DataType::INT64); + kernel->OutputAt(0).SetDataType(DataType::FLOAT32); + kernel->OutputAt(1).SetDataType(DataType::INT32); + kernel->OutputAt(2).SetDataType(DataType::INT32); } diff --git a/backends/gcu/kernels/activation_kernels.cc b/backends/gcu/kernels/activation_kernels.cc index f1e7291013b..74061871884 100644 --- a/backends/gcu/kernels/activation_kernels.cc +++ b/backends/gcu/kernels/activation_kernels.cc @@ -19,38 +19,37 @@ namespace custom_kernel { template extern void ScaleKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& in_scale, const phi::Scalar& in_bias, bool bias_after_scale, - phi::DenseTensor* out); + DenseTensor* out); template extern void ClipKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& min, const phi::Scalar& max, - phi::DenseTensor* out); + DenseTensor* out); -#define DEFINE_UNARY_AOT_ACTIVATION_KERNEL(name, functor_prefix) \ - template \ - void functor_prefix##Kernel(const Context& dev_ctx, \ - const phi::DenseTensor& x, \ - phi::DenseTensor* out) { \ - PADDLE_GCU_KERNEL_TRACE(#name); \ - if (LaunchAOTKernel()) { \ - dev_ctx.template Alloc(out); \ - LAUNCH_TOPSATENOP(topsaten##functor_prefix, dev_ctx, *out, x); \ - } else { /* kernel impl base on JIT */ \ - THROW_JIT_UNIMPLEMENTED(); \ - } \ +#define DEFINE_UNARY_AOT_ACTIVATION_KERNEL(name, functor_prefix) \ + template \ + void functor_prefix##Kernel( \ + const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ + PADDLE_GCU_KERNEL_TRACE(#name); \ + if (LaunchAOTKernel()) { \ + dev_ctx.template Alloc(out); \ + LAUNCH_TOPSATENOP(topsaten##functor_prefix, dev_ctx, *out, x); \ + } else { /* kernel impl base on JIT */ \ + THROW_JIT_UNIMPLEMENTED(); \ + } \ } template void ActivationBaseKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const GcuAttributeMap& attrs, - phi::DenseTensor* out, + DenseTensor* out, const std::string& op_type) { dev_ctx.template Alloc(out); @@ -73,10 +72,10 @@ void ActivationBaseKernel(const Context& dev_ctx, template void ActivationGradBaseKernel(const Context& dev_ctx, const std::string& x_name, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& dout, const GcuAttributeMap& attrs, - phi::DenseTensor* dx, + DenseTensor* dx, const std::string& op_type) { dev_ctx.template Alloc(dx); @@ -99,9 +98,7 @@ void ActivationGradBaseKernel(const Context& dev_ctx, } template -void AbsKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { +void AbsKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("abs"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -114,9 +111,9 @@ void AbsKernel(const Context& dev_ctx, template void AbsGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("abs_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -127,9 +124,7 @@ void AbsGradKernel(const Context& dev_ctx, } template -void CosKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { +void CosKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("cos"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -142,9 +137,9 @@ void CosKernel(const Context& dev_ctx, template void CosGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("cos_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -155,9 +150,7 @@ void CosGradKernel(const Context& dev_ctx, } template -void SinKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { +void SinKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("sin"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -170,8 +163,8 @@ void SinKernel(const Context& dev_ctx, template void AtanKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("atan"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -184,9 +177,9 @@ void AtanKernel(const Context& dev_ctx, template void AtanGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("atan_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -197,9 +190,7 @@ void AtanGradKernel(const Context& dev_ctx, } template -void ExpKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { +void ExpKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("exp"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -212,9 +203,9 @@ void ExpKernel(const Context& dev_ctx, template void ExpGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("exp_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -226,8 +217,8 @@ void ExpGradKernel(const Context& dev_ctx, template void FloorKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("floor"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -240,8 +231,8 @@ void FloorKernel(const Context& dev_ctx, template void FloorGradKernel(const Context& dev_ctx, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& dout, + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("floor_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -267,8 +258,8 @@ void FloorGradKernel(const Context& dev_ctx, template void CeilKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("ceil"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -281,8 +272,8 @@ void CeilKernel(const Context& dev_ctx, template void SwishKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("swish"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -294,9 +285,9 @@ void SwishKernel(const Context& dev_ctx, template void SwishGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("swish_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -311,8 +302,8 @@ void SwishGradKernel(const Context& dev_ctx, template void ReluKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("relu"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -324,9 +315,9 @@ void ReluKernel(const Context& dev_ctx, template void ReluGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("relu_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -338,8 +329,8 @@ void ReluGradKernel(const Context& dev_ctx, template void Relu6Kernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("relu6"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -351,9 +342,9 @@ void Relu6Kernel(const Context& dev_ctx, template void Relu6GradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("relu6_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -365,9 +356,9 @@ void Relu6GradKernel(const Context& dev_ctx, template void LeakyReluKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, double alpha, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("leaky_relu"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -385,10 +376,10 @@ void LeakyReluKernel(const Context& dev_ctx, template void LeakyReluGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& dout, double alpha, - phi::DenseTensor* dx) { + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("leaky_relu_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -403,9 +394,9 @@ void LeakyReluGradKernel(const Context& dev_ctx, template void GeluKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, bool approximate, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("gelu"); if (LaunchAOTKernel()) { const char* gelu_approximate = "none"; @@ -427,10 +418,10 @@ void GeluKernel(const Context& dev_ctx, template void GeluGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& dout, bool approximate, - phi::DenseTensor* dx) { + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("gelu_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -447,8 +438,8 @@ void GeluGradKernel(const Context& dev_ctx, template void TanhKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("tanh"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -461,9 +452,9 @@ void TanhKernel(const Context& dev_ctx, template void TanhGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("tanh_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -475,8 +466,8 @@ void TanhGradKernel(const Context& dev_ctx, template void SigmoidKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("sigmoid"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -488,9 +479,9 @@ void SigmoidKernel(const Context& dev_ctx, template void SigmoidGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("sigmoid_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -502,8 +493,8 @@ void SigmoidGradKernel(const Context& dev_ctx, template void SqrtKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("sqrt"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -515,9 +506,7 @@ void SqrtKernel(const Context& dev_ctx, } template -void LogKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { +void LogKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("log"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -530,9 +519,9 @@ void LogKernel(const Context& dev_ctx, template void LogGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("log_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -544,9 +533,9 @@ void LogGradKernel(const Context& dev_ctx, template void PowKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& factor_scalar, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("pow"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -561,10 +550,10 @@ void PowKernel(const Context& dev_ctx, template void PowGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& dout, const phi::Scalar& factor_scalar, - phi::DenseTensor* dx) { + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("pow_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -578,8 +567,8 @@ void PowGradKernel(const Context& dev_ctx, template void SquareKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("square"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -592,9 +581,9 @@ void SquareKernel(const Context& dev_ctx, template void SquareGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("square_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -606,10 +595,10 @@ void SquareGradKernel(const Context& dev_ctx, template void Hard_SigmoidKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float slope, float offset, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("hard_sigmoid"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -630,21 +619,21 @@ void Hard_SigmoidKernel(const Context& dev_ctx, template void HardSigmoidKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float slope, float offset, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("hardsigmoid"); Hard_SigmoidKernel(dev_ctx, x, slope, offset, out); } template void HardSigmoidGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, + const DenseTensor& out, + const DenseTensor& dout, float slope, float offset, - phi::DenseTensor* dx) { + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("hard_sigmoid_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -660,8 +649,8 @@ void HardSigmoidGradKernel(const Context& dev_ctx, template void HardSwishKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("hard_swish"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -673,9 +662,9 @@ void HardSwishKernel(const Context& dev_ctx, template void HardSwishGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("hard_swish_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -694,9 +683,9 @@ void HardSwishGradKernel(const Context& dev_ctx, template void LogitKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float eps, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("logit"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -710,9 +699,9 @@ void LogitKernel(const Context& dev_ctx, template void CeluKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float alpha, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("celu"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -726,9 +715,9 @@ void CeluKernel(const Context& dev_ctx, template void HardShrinkKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float threshold, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("hard_shrink"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -742,9 +731,9 @@ void HardShrinkKernel(const Context& dev_ctx, template void SoftShrinkKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float lambda, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("softshrink"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -758,10 +747,10 @@ void SoftShrinkKernel(const Context& dev_ctx, template void SoftplusKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, double beta, double threshold, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("softplus"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -776,10 +765,10 @@ void SoftplusKernel(const Context& dev_ctx, template void HardtanhKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float t_min, float t_max, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("hardtanh"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -794,9 +783,9 @@ void HardtanhKernel(const Context& dev_ctx, template void EluKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float alpha, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("elu"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -813,9 +802,9 @@ void EluKernel(const Context& dev_ctx, template void RoundKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const int decimals, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("round"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); diff --git a/backends/gcu/kernels/adam_kernel.cc b/backends/gcu/kernels/adam_kernel.cc index 4258154f5da..be40d8e5b09 100644 --- a/backends/gcu/kernels/adam_kernel.cc +++ b/backends/gcu/kernels/adam_kernel.cc @@ -19,23 +19,23 @@ namespace custom_kernel { template void AdamBaseKernel(const Context& dev_ctx, - const phi::DenseTensor& param, - const phi::DenseTensor& grad, - const phi::DenseTensor& learning_rate, - const phi::DenseTensor& moment1, - const phi::DenseTensor& moment2, - const phi::DenseTensor& beta1_pow_in, - const phi::DenseTensor& beta2_pow_in, - const paddle::optional& master_param, - const paddle::optional& skip_update, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow_in, + const DenseTensor& beta2_pow_in, + const paddle::optional& master_param, + const paddle::optional& skip_update, const GcuAttributeMap& attrs, bool use_global_beta_pow, - phi::DenseTensor* param_out, - phi::DenseTensor* moment1_out, - phi::DenseTensor* moment2_out, - phi::DenseTensor* beta1_pow_out, - phi::DenseTensor* beta2_pow_out, - phi::DenseTensor* master_param_out, + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_out, const std::string& op_type) { bool skip_update_ = false; if (skip_update.is_initialized()) { @@ -62,8 +62,8 @@ void AdamBaseKernel(const Context& dev_ctx, return; } - phi::DenseTensor* beta1_pow = const_cast(&beta1_pow_in); - phi::DenseTensor* beta2_pow = const_cast(&beta2_pow_in); + DenseTensor* beta1_pow = const_cast(&beta1_pow_in); + DenseTensor* beta2_pow = const_cast(&beta2_pow_in); VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; @@ -73,8 +73,8 @@ void AdamBaseKernel(const Context& dev_ctx, // beta1_pow and beta2_pow may on CPU and not transform // place. - phi::DenseTensor beta1_pow_tmp; - phi::DenseTensor beta2_pow_tmp; + DenseTensor beta1_pow_tmp; + DenseTensor beta2_pow_tmp; if (beta1_pow->place().GetType() == phi::AllocationType::CPU) { T beta1 = *beta1_pow->data(); beta1_pow_tmp.Resize({1}); @@ -108,23 +108,23 @@ void AdamBaseKernel(const Context& dev_ctx, inputs["Beta1Pow"] = {beta1_pow}; inputs["Beta2Pow"] = {beta2_pow}; - phi::DenseTensor param_out_tmp; + DenseTensor param_out_tmp; param_out_tmp.set_meta(param_out->meta()); dev_ctx.template Alloc(¶m_out_tmp); - phi::DenseTensor moment1_out_tmp; + DenseTensor moment1_out_tmp; moment1_out_tmp.set_meta(moment1_out->meta()); dev_ctx.template Alloc(&moment1_out_tmp); - phi::DenseTensor moment2_out_tmp; + DenseTensor moment2_out_tmp; moment2_out_tmp.set_meta(moment2_out->meta()); dev_ctx.template Alloc(&moment2_out_tmp); - phi::DenseTensor beta1_pow_out_tmp; + DenseTensor beta1_pow_out_tmp; beta1_pow_out_tmp.set_meta(beta1_pow_out->meta()); dev_ctx.template Alloc(&beta1_pow_out_tmp); - phi::DenseTensor beta2_pow_out_tmp; + DenseTensor beta2_pow_out_tmp; beta2_pow_out_tmp.set_meta(beta2_pow_out->meta()); dev_ctx.template Alloc(&beta2_pow_out_tmp); @@ -172,15 +172,15 @@ void AdamBaseKernel(const Context& dev_ctx, template void AdamKernel(const Context& dev_ctx, - const phi::DenseTensor& param, - const phi::DenseTensor& grad, - const phi::DenseTensor& learning_rate, - const phi::DenseTensor& moment1, - const phi::DenseTensor& moment2, - const phi::DenseTensor& beta1_pow_in, - const phi::DenseTensor& beta2_pow_in, - const paddle::optional& master_param, - const paddle::optional& skip_update, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow_in, + const DenseTensor& beta2_pow_in, + const paddle::optional& master_param, + const paddle::optional& skip_update, const phi::Scalar& beta1_in, const phi::Scalar& beta2_in, const phi::Scalar& epsilon_in, @@ -188,12 +188,12 @@ void AdamKernel(const Context& dev_ctx, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow, - phi::DenseTensor* param_out, - phi::DenseTensor* moment1_out, - phi::DenseTensor* moment2_out, - phi::DenseTensor* beta1_pow_out, - phi::DenseTensor* beta2_pow_out, - phi::DenseTensor* master_param_out) { + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_out) { PADDLE_GCU_KERNEL_TRACE("adam"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -228,15 +228,15 @@ void AdamKernel(const Context& dev_ctx, template void AdamwKernel(const Context& dev_ctx, - const phi::DenseTensor& param, - const phi::DenseTensor& grad, - const phi::DenseTensor& learning_rate, - const phi::DenseTensor& moment1, - const phi::DenseTensor& moment2, - const phi::DenseTensor& beta1_pow_in, - const phi::DenseTensor& beta2_pow_in, - const paddle::optional& master_param, - const paddle::optional& skip_update, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& learning_rate, + const DenseTensor& moment1, + const DenseTensor& moment2, + const DenseTensor& beta1_pow_in, + const DenseTensor& beta2_pow_in, + const paddle::optional& master_param, + const paddle::optional& skip_update, const phi::Scalar& beta1_in, const phi::Scalar& beta2_in, const phi::Scalar& epsilon_in, @@ -247,12 +247,12 @@ void AdamwKernel(const Context& dev_ctx, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow, - phi::DenseTensor* param_out, - phi::DenseTensor* moment1_out, - phi::DenseTensor* moment2_out, - phi::DenseTensor* beta1_pow_out, - phi::DenseTensor* beta2_pow_out, - phi::DenseTensor* master_param_out) { + DenseTensor* param_out, + DenseTensor* moment1_out, + DenseTensor* moment2_out, + DenseTensor* beta1_pow_out, + DenseTensor* beta2_pow_out, + DenseTensor* master_param_out) { PADDLE_GCU_KERNEL_TRACE("adamw"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -301,12 +301,12 @@ void AdamwKernel(const Context& dev_ctx, // kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); // kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); // kernel->InputAt(8).SetBackend(phi::Backend::ALL_BACKEND); -// if (kernel_key.dtype() == phi::DataType::FLOAT16) { -// kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); -// kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); -// kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); -// kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); -// kernel->OutputAt(5).SetDataType(phi::DataType::FLOAT32); +// if (kernel_key.dtype() == DataType::FLOAT16) { +// kernel->OutputAt(1).SetDataType(DataType::FLOAT32); +// kernel->OutputAt(2).SetDataType(DataType::FLOAT32); +// kernel->OutputAt(3).SetDataType(DataType::FLOAT32); +// kernel->OutputAt(4).SetDataType(DataType::FLOAT32); +// kernel->OutputAt(5).SetDataType(DataType::FLOAT32); // } // kernel->OutputAt(3).SetBackend(phi::Backend::UNDEFINED); // kernel->OutputAt(4).SetBackend(phi::Backend::UNDEFINED); @@ -323,12 +323,12 @@ void AdamwKernel(const Context& dev_ctx, // kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); // kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); // kernel->InputAt(8).SetBackend(phi::Backend::ALL_BACKEND); -// if (kernel_key.dtype() == phi::DataType::FLOAT16) { -// kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); -// kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); -// kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); -// kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); -// kernel->OutputAt(5).SetDataType(phi::DataType::FLOAT32); +// if (kernel_key.dtype() == DataType::FLOAT16) { +// kernel->OutputAt(1).SetDataType(DataType::FLOAT32); +// kernel->OutputAt(2).SetDataType(DataType::FLOAT32); +// kernel->OutputAt(3).SetDataType(DataType::FLOAT32); +// kernel->OutputAt(4).SetDataType(DataType::FLOAT32); +// kernel->OutputAt(5).SetDataType(DataType::FLOAT32); // } // kernel->OutputAt(3).SetBackend(phi::Backend::UNDEFINED); // kernel->OutputAt(4).SetBackend(phi::Backend::UNDEFINED); diff --git a/backends/gcu/kernels/add_n_kernel.cc b/backends/gcu/kernels/add_n_kernel.cc index 6f2d5c1f820..5edf0da06d4 100644 --- a/backends/gcu/kernels/add_n_kernel.cc +++ b/backends/gcu/kernels/add_n_kernel.cc @@ -19,8 +19,8 @@ namespace custom_kernel { template void AddNKernel(const Context& dev_ctx, - const std::vector& x, - phi::DenseTensor* out) { + const std::vector& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("add_n"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { @@ -40,7 +40,7 @@ void AddNKernel(const Context& dev_ctx, TensorValueMap inputs; std::vector names; names.reserve(x.size()); - std::vector values; + std::vector values; values.reserve(x.size()); for (size_t i = 0; i < x.size(); ++i) { names.emplace_back(std::string("x_") + std::to_string(i)); diff --git a/backends/gcu/kernels/arange_kernel.cc b/backends/gcu/kernels/arange_kernel.cc index ee0f868dcd4..94d569d80f0 100644 --- a/backends/gcu/kernels/arange_kernel.cc +++ b/backends/gcu/kernels/arange_kernel.cc @@ -49,7 +49,7 @@ void ArangeKernel(const Context& dev_ctx, const phi::Scalar& start, const phi::Scalar& end, const phi::Scalar& step, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("arange"); // VLOG(6) << "[HOST_KERNEL] Impl on host for arange"; T start_value = start.to(); @@ -61,8 +61,7 @@ void ArangeKernel(const Context& dev_ctx, dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { - phi::DenseTensor output_t = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor output_t = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); topsatenTensor output = CreateTopsatenTensor(output_t); topsatenScalar_t start_s = ScalarToTopsatenScalar(start); @@ -102,23 +101,23 @@ void ArangeKernel(const Context& dev_ctx, template void ArangeTensorKernel(const Context& dev_ctx, - const phi::DenseTensor& start_t, - const phi::DenseTensor& end_t, - const phi::DenseTensor& step_t, - phi::DenseTensor* out) { + const DenseTensor& start_t, + const DenseTensor& end_t, + const DenseTensor& step_t, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("arange_tensor"); // VLOG(6) << "[HOST_KERNEL] Impl on host for arange_tensor"; - phi::DenseTensor n; + DenseTensor n; n.Resize(start_t.dims()); T* n_data = dev_ctx.template HostAlloc(&n); - TensorCopy(dev_ctx, start_t, true, &n, phi::CPUPlace()); + TensorCopy(dev_ctx, start_t, true, &n, CPUPlace()); T start = n_data[0]; - TensorCopy(dev_ctx, end_t, true, &n, phi::CPUPlace()); + TensorCopy(dev_ctx, end_t, true, &n, CPUPlace()); T end = n_data[0]; - TensorCopy(dev_ctx, step_t, true, &n, phi::CPUPlace()); + TensorCopy(dev_ctx, step_t, true, &n, CPUPlace()); T step = n_data[0]; custom_kernel::ArangeKernel( diff --git a/backends/gcu/kernels/arg_min_max_kernels.cc b/backends/gcu/kernels/arg_min_max_kernels.cc index ea2aa6fa345..ada6839fabd 100644 --- a/backends/gcu/kernels/arg_min_max_kernels.cc +++ b/backends/gcu/kernels/arg_min_max_kernels.cc @@ -19,12 +19,12 @@ namespace custom_kernel { template void ArgMinMaxKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& axis, bool keepdims, bool flatten, - phi::DataType dtype, - phi::DenseTensor* out, + DataType dtype, + DenseTensor* out, const std::string& op_type) { dev_ctx.Alloc(out, out->dtype()); @@ -52,17 +52,16 @@ void ArgMinMaxKernel(const Context& dev_ctx, template void ArgMinKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& axis, bool keepdims, bool flatten, - phi::DataType dtype, - phi::DenseTensor* out) { + DataType dtype, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("argmin"); if (LaunchAOTKernel()) { dev_ctx.Alloc(out, out->dtype()); - phi::DenseTensor output = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); int64_t rank = x.dims().size(); int64_t axis_value = axis.to(); @@ -94,17 +93,16 @@ void ArgMinKernel(const Context& dev_ctx, template void ArgMaxKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& axis, bool keepdims, bool flatten, - phi::DataType dtype, - phi::DenseTensor* out) { + DataType dtype, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("argmax"); if (LaunchAOTKernel()) { dev_ctx.Alloc(out, out->dtype()); - phi::DenseTensor output = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); int64_t rank = x.dims().size(); int64_t axis_value = axis.to(); @@ -144,7 +142,7 @@ PD_REGISTER_PLUGIN_KERNEL(argmin, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(DataType::UNDEFINED); } PD_REGISTER_PLUGIN_KERNEL(argmax, @@ -155,5 +153,5 @@ PD_REGISTER_PLUGIN_KERNEL(argmax, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(DataType::UNDEFINED); } diff --git a/backends/gcu/kernels/argsort_kernel.cc b/backends/gcu/kernels/argsort_kernel.cc index 195bd8657a0..225b2253d6c 100644 --- a/backends/gcu/kernels/argsort_kernel.cc +++ b/backends/gcu/kernels/argsort_kernel.cc @@ -19,12 +19,12 @@ namespace custom_kernel { template void ArgsortKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, bool descending, bool stable, - phi::DenseTensor* output, - phi::DenseTensor* indices) { + DenseTensor* output, + DenseTensor* indices) { PADDLE_GCU_KERNEL_TRACE("argsort"); dev_ctx.template Alloc(output); dev_ctx.template Alloc(indices); @@ -34,7 +34,7 @@ void ArgsortKernel(const Context& dev_ctx, axis += x.dims().size(); } - phi::DenseTensor indices_out = + DenseTensor indices_out = MaybeCreateOrTrans64To32bits(dev_ctx, *indices, false); LAUNCH_TOPSATENOP(topsatenSort, @@ -75,13 +75,13 @@ void ArgsortKernel(const Context& dev_ctx, template void ArgsortGradKernel(const Context& dev_ctx, - const phi::DenseTensor& indices, - const phi::DenseTensor& x, - const phi::DenseTensor& out_grad, + const DenseTensor& indices, + const DenseTensor& x, + const DenseTensor& out_grad, int axis, bool descending, bool stable, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { PADDLE_GCU_KERNEL_TRACE("argsort_grad"); dev_ctx.template Alloc(x_grad); if (LaunchAOTKernel()) { @@ -126,7 +126,7 @@ PD_REGISTER_PLUGIN_KERNEL(argsort, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(1).SetDataType(phi::DataType::INT64); + kernel->OutputAt(1).SetDataType(DataType::INT64); } PD_REGISTER_PLUGIN_KERNEL(argsort_grad, diff --git a/backends/gcu/kernels/assign_kernel.cc b/backends/gcu/kernels/assign_kernel.cc index 6a76914337d..18398a47572 100644 --- a/backends/gcu/kernels/assign_kernel.cc +++ b/backends/gcu/kernels/assign_kernel.cc @@ -19,8 +19,8 @@ namespace custom_kernel { template void AssignKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("assign"); VLOG(6) << "[HOST_KERNEL] Impl on host for assign"; dev_ctx.template Alloc(out); @@ -29,8 +29,8 @@ void AssignKernel(const Context& dev_ctx, template void AssignRawKernel(const Context& dev_ctx, - const paddle::optional& x, - phi::DenseTensor* out) { + const paddle::optional& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("assign_raw"); VLOG(6) << "[HOST_KERNEL] Impl on host for assign_raw"; if (x) { @@ -44,8 +44,8 @@ void AssignRawKernel(const Context& dev_ctx, template void AssignArrayKernel(const Context& dev_ctx, - const std::vector& x, - std::vector out) { + const std::vector& x, + std::vector out) { PADDLE_GCU_KERNEL_TRACE("assign_array"); VLOG(6) << "[HOST_KERNEL] Impl on host for assign_array"; for (size_t i = 0; i < x.size(); ++i) { @@ -57,7 +57,7 @@ template typename std::enable_if::value>::type CopyVectorToTensor( const Context& dev_ctx, const std::vector& values, - phi::DenseTensor* out) { + DenseTensor* out) { // If attribute value dtype is vector, it will be converted to // vector. at the same time, we can not use vector to hold // the value, because the c++ use bit value to replace byte value. @@ -82,7 +82,7 @@ template typename std::enable_if::value>::type CopyVectorToTensor( const Context& dev_ctx, const std::vector& values, - phi::DenseTensor* out) { + DenseTensor* out) { std::vector> assign_values; assign_values.reserve(values.size()); for (const auto& val : values) { @@ -95,9 +95,9 @@ typename std::enable_if::value>::type CopyVectorToTensor( template void AssignValueKernel(const Context& dev_ctx, const std::vector& shape, - phi::DataType dtype, + DataType dtype, const std::vector& values, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("assign_value"); VLOG(6) << "[HOST_KERNEL] Impl on host for assign_value"; auto template_dtype = phi::CppTypeToDataType::Type(); diff --git a/backends/gcu/kernels/atan2_kernel.cc b/backends/gcu/kernels/atan2_kernel.cc index 12349e7fcad..c0cd98e7c8e 100644 --- a/backends/gcu/kernels/atan2_kernel.cc +++ b/backends/gcu/kernels/atan2_kernel.cc @@ -18,9 +18,9 @@ namespace custom_kernel { template void Atan2Kernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("atan2"); PADDLE_ENFORCE_LE( x.numel(), @@ -50,5 +50,5 @@ PD_REGISTER_PLUGIN_KERNEL(atan2, double, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(DataType::UNDEFINED); } diff --git a/backends/gcu/kernels/batch_norm_kernel.cc b/backends/gcu/kernels/batch_norm_kernel.cc index 2cb8455cb87..31cc1b2f9b2 100644 --- a/backends/gcu/kernels/batch_norm_kernel.cc +++ b/backends/gcu/kernels/batch_norm_kernel.cc @@ -19,23 +19,23 @@ namespace custom_kernel { template void BatchNormKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& running_mean, - const phi::DenseTensor& running_var, - const paddle::optional& scale, - const paddle::optional& bias, + const DenseTensor& x, + const DenseTensor& running_mean, + const DenseTensor& running_var, + const paddle::optional& scale, + const paddle::optional& bias, bool is_test, float momentum, float epsilon, const std::string& data_layout_str, bool use_global_stats, bool trainable_stats, - phi::DenseTensor* y, - phi::DenseTensor* mean_out, - phi::DenseTensor* variance_out, - phi::DenseTensor* saved_mean, - phi::DenseTensor* saved_variance, - phi::DenseTensor* reserve_space) { + DenseTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out, + DenseTensor* saved_mean, + DenseTensor* saved_variance, + DenseTensor* reserve_space) { PADDLE_GCU_KERNEL_TRACE("batch_norm"); PADDLE_ENFORCE_EQ(data_layout_str == "NCHW" || data_layout_str == "NHWC", true, @@ -62,8 +62,8 @@ void BatchNormKernel(const Context& dev_ctx, auto* scale_ptr = scale.get_ptr(); auto* bias_ptr = bias.get_ptr(); - phi::DenseTensor new_scale; - phi::DenseTensor new_bias; + DenseTensor new_scale; + DenseTensor new_bias; if (scale_ptr) { new_scale = scale.get(); } else { @@ -91,18 +91,16 @@ void BatchNormKernel(const Context& dev_ctx, dev_ctx.Alloc(saved_variance, saved_variance->dtype()); if (LaunchAOTKernel()) { - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor scale_x = MaybeCreateOrTrans64To32bits(dev_ctx, new_scale); - phi::DenseTensor bias_x = MaybeCreateOrTrans64To32bits(dev_ctx, new_bias); - phi::DenseTensor mean_x = - MaybeCreateOrTrans64To32bits(dev_ctx, running_mean); - phi::DenseTensor variance_x = - MaybeCreateOrTrans64To32bits(dev_ctx, running_var); - - phi::DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *y, false); - phi::DenseTensor saved_mean_output = + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor scale_x = MaybeCreateOrTrans64To32bits(dev_ctx, new_scale); + DenseTensor bias_x = MaybeCreateOrTrans64To32bits(dev_ctx, new_bias); + DenseTensor mean_x = MaybeCreateOrTrans64To32bits(dev_ctx, running_mean); + DenseTensor variance_x = MaybeCreateOrTrans64To32bits(dev_ctx, running_var); + + DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *y, false); + DenseTensor saved_mean_output = MaybeCreateOrTrans64To32bits(dev_ctx, *saved_mean, false); - phi::DenseTensor saved_variance_output = + DenseTensor saved_variance_output = MaybeCreateOrTrans64To32bits(dev_ctx, *saved_variance, false); double momentum_d = 1.0 - momentum; @@ -130,11 +128,11 @@ void BatchNormKernel(const Context& dev_ctx, *variance_out = running_var; } else { // kernel impl base on JIT - phi::DenseTensor mean_out_tmp; + DenseTensor mean_out_tmp; mean_out_tmp.set_meta(mean_out->meta()); dev_ctx.template Alloc(&mean_out_tmp); - phi::DenseTensor variance_out_tmp; + DenseTensor variance_out_tmp; variance_out_tmp.set_meta(variance_out->meta()); dev_ctx.template Alloc(&variance_out_tmp); @@ -198,26 +196,25 @@ void BatchNormKernel(const Context& dev_ctx, } template -void BatchNormGradKernel( - const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& scale, - const paddle::optional& bias, - const paddle::optional& mean, - const paddle::optional& variance, - const phi::DenseTensor& saved_mean, - const phi::DenseTensor& saved_variance, - const paddle::optional& reserve_space, - const phi::DenseTensor& y_grad, - float momentum, - float epsilon, - const std::string& data_layout_str, - bool is_test, - bool use_global_stats, - bool trainable_statistics, - phi::DenseTensor* x_grad, - phi::DenseTensor* scale_grad, - phi::DenseTensor* bias_grad) { +void BatchNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& scale, + const paddle::optional& bias, + const paddle::optional& mean, + const paddle::optional& variance, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const paddle::optional& reserve_space, + const DenseTensor& y_grad, + float momentum, + float epsilon, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { PADDLE_GCU_KERNEL_TRACE("batch_norm_grad"); const auto& x_dims = x.dims(); int C = 1; @@ -230,8 +227,8 @@ void BatchNormGradKernel( auto* scale_ptr = scale.get_ptr(); auto* bias_ptr = bias.get_ptr(); - phi::DenseTensor new_scale; - phi::DenseTensor new_bias; + DenseTensor new_scale; + DenseTensor new_bias; if (scale_ptr) { new_scale = scale.get(); } else { @@ -303,17 +300,17 @@ void BatchNormGradKernel( template void BatchNormInferKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& mean, - const phi::DenseTensor& variance, - const phi::DenseTensor& scale, - const phi::DenseTensor& bias, + const DenseTensor& x, + const DenseTensor& mean, + const DenseTensor& variance, + const DenseTensor& scale, + const DenseTensor& bias, float momentum, float epsilon, const std::string& data_layout_str, - phi::DenseTensor* y, - phi::DenseTensor* mean_out, - phi::DenseTensor* variance_out) { + DenseTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out) { PADDLE_GCU_KERNEL_TRACE("batch_norm_infer"); const auto& x_dims = x.dims(); PADDLE_ENFORCE_EQ( @@ -335,17 +332,16 @@ void BatchNormInferKernel(const Context& dev_ctx, dev_ctx.Alloc(variance_out, variance_out->dtype()); if (LaunchAOTKernel()) { - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor scale_x = MaybeCreateOrTrans64To32bits(dev_ctx, scale); - phi::DenseTensor bias_x = MaybeCreateOrTrans64To32bits(dev_ctx, bias); - phi::DenseTensor mean_x = MaybeCreateOrTrans64To32bits(dev_ctx, mean); - phi::DenseTensor variance_x = - MaybeCreateOrTrans64To32bits(dev_ctx, variance); - - phi::DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *y, false); - phi::DenseTensor mean_output = + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor scale_x = MaybeCreateOrTrans64To32bits(dev_ctx, scale); + DenseTensor bias_x = MaybeCreateOrTrans64To32bits(dev_ctx, bias); + DenseTensor mean_x = MaybeCreateOrTrans64To32bits(dev_ctx, mean); + DenseTensor variance_x = MaybeCreateOrTrans64To32bits(dev_ctx, variance); + + DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *y, false); + DenseTensor mean_output = MaybeCreateOrTrans64To32bits(dev_ctx, *mean_out, false); - phi::DenseTensor variance_output = + DenseTensor variance_output = MaybeCreateOrTrans64To32bits(dev_ctx, *variance_out, false); double momentum_d = 1.0 - momentum; @@ -412,15 +408,15 @@ PD_REGISTER_PLUGIN_KERNEL(batch_norm, double, phi::dtype::bfloat16, phi::dtype::float16) { - if (kernel_key.dtype() == phi::DataType::FLOAT16) { - kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); // mean - kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); // variance - kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); // scale - kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); // bias - kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // mean_out - kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // variance_out - kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); // saved_mean - kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); // saved_variance + if (kernel_key.dtype() == DataType::FLOAT16) { + kernel->InputAt(1).SetDataType(DataType::FLOAT32); // mean + kernel->InputAt(2).SetDataType(DataType::FLOAT32); // variance + kernel->InputAt(3).SetDataType(DataType::FLOAT32); // scale + kernel->InputAt(4).SetDataType(DataType::FLOAT32); // bias + kernel->OutputAt(1).SetDataType(DataType::FLOAT32); // mean_out + kernel->OutputAt(2).SetDataType(DataType::FLOAT32); // variance_out + kernel->OutputAt(3).SetDataType(DataType::FLOAT32); // saved_mean + kernel->OutputAt(4).SetDataType(DataType::FLOAT32); // saved_variance } } @@ -432,10 +428,10 @@ PD_REGISTER_PLUGIN_KERNEL(batch_norm_grad, double, phi::dtype::bfloat16, phi::dtype::float16) { - if (kernel_key.dtype() == phi::DataType::FLOAT16) { - kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad - kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad - kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad + if (kernel_key.dtype() == DataType::FLOAT16) { + kernel->OutputAt(0).SetDataType(DataType::FLOAT32); // x_grad + kernel->OutputAt(1).SetDataType(DataType::FLOAT32); // scale_grad + kernel->OutputAt(2).SetDataType(DataType::FLOAT32); // bias_grad } } @@ -447,8 +443,8 @@ PD_REGISTER_PLUGIN_KERNEL(batch_norm_infer, double, phi::dtype::bfloat16, phi::dtype::float16) { - if (kernel_key.dtype() == phi::DataType::FLOAT16) { - kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // mean_out - kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // variance_out + if (kernel_key.dtype() == DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(DataType::FLOAT32); // mean_out + kernel->OutputAt(2).SetDataType(DataType::FLOAT32); // variance_out } } diff --git a/backends/gcu/kernels/binary_kernels.cc b/backends/gcu/kernels/binary_kernels.cc index a19e2c83b66..6acdff096aa 100644 --- a/backends/gcu/kernels/binary_kernels.cc +++ b/backends/gcu/kernels/binary_kernels.cc @@ -47,13 +47,13 @@ inline phi::DDim GetDimsWithAxis(const phi::DDim& x_dims, return phi::make_ddim(y_shape); } -std::vector PaddingDims(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const int axis) { +std::vector PaddingDims(const DenseTensor& x, + const DenseTensor& y, + const int axis) { auto x_dims = x.dims(); auto y_dims = y.dims(); - phi::DenseTensor x_tensor(x); - phi::DenseTensor y_tensor(y); + DenseTensor x_tensor(x); + DenseTensor y_tensor(y); auto fixed_axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); @@ -66,14 +66,14 @@ std::vector PaddingDims(const phi::DenseTensor& x, return {x_tensor, y_tensor}; } -bool NeedTranspose(const phi::DenseTensor& tensor) { +bool NeedTranspose(const DenseTensor& tensor) { auto dims = common::vectorize(tensor.dims()); return std::count(dims.begin(), dims.end(), 1) < 3; } bool UnifyLayout(const phi::CustomContext& dev_ctx, - phi::DenseTensor& x, // NOLINT - phi::DenseTensor& y) { // NOLINT + DenseTensor& x, // NOLINT + DenseTensor& y) { // NOLINT if (!EnableTransposeOptimize()) { return false; } @@ -116,10 +116,10 @@ bool UnifyLayout(const phi::CustomContext& dev_ctx, template void ElementBaseKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out, + DenseTensor* out, const std::string& op_type) { dev_ctx.template Alloc(out); @@ -146,12 +146,12 @@ void ElementBaseKernel(const Context& dev_ctx, template void ElementBaseGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, int axis, - phi::DenseTensor* dx, - phi::DenseTensor* dy, + DenseTensor* dx, + DenseTensor* dy, const std::string& op_type) { TensorNameMap input_names; input_names["X"] = {"x"}; @@ -164,11 +164,11 @@ void ElementBaseGradKernel(const Context& dev_ctx, VLOG(6) << op_type << " input y shape: " << y.dims().to_str() << " initialized: " << y.initialized(); - phi::DenseTensor input_x_tmp; - phi::DenseTensor input_y_tmp; + DenseTensor input_x_tmp; + DenseTensor input_y_tmp; - phi::DenseTensor* input_x = const_cast(&x); - phi::DenseTensor* input_y = const_cast(&y); + DenseTensor* input_x = const_cast(&x); + DenseTensor* input_y = const_cast(&y); if (!x.initialized()) { input_x_tmp.set_meta(x.meta()); dev_ctx.template Alloc(&input_x_tmp); @@ -207,21 +207,20 @@ void ElementBaseGradKernel(const Context& dev_ctx, template void AddRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("add_raw"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); auto padding_shapes = PaddingDims(x, y, axis); auto scalar = phi::Scalar(1.0f); - phi::DenseTensor input_x = + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, padding_shapes[0]); - phi::DenseTensor input_y = + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, padding_shapes[1]); - phi::DenseTensor output_z = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor output_z = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); // VLOG(6) << "Transpose debug, AddKernel add_raw input_x:" // << custom_kernel::TensorDetailsToString(input_x); @@ -249,9 +248,9 @@ void AddRawKernel(const Context& dev_ctx, template void AddKernel(const Context& dev_ctx, - const phi::DenseTensor& x, // NHWC - const phi::DenseTensor& y, // NCHW - phi::DenseTensor* out) { + const DenseTensor& x, // NHWC + const DenseTensor& y, // NCHW + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("add"); custom_kernel::AddRawKernel(dev_ctx, x, y, -1, out); VLOG(6) << "Transpose debug, AddKernel output:" @@ -260,12 +259,12 @@ void AddKernel(const Context& dev_ctx, template void AddGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, int axis, - phi::DenseTensor* dx, - phi::DenseTensor* dy) { + DenseTensor* dx, + DenseTensor* dy) { PADDLE_GCU_KERNEL_TRACE("add_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -277,17 +276,16 @@ void AddGradKernel(const Context& dev_ctx, template void SubtractKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("subtract"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); auto scalar = phi::Scalar(1.0f); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); - phi::DenseTensor output = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); LAUNCH_TOPSATENOP(topsatenSub, dev_ctx, output, input_x, input_y, scalar); MaybeTransResult(dev_ctx, output, out); @@ -298,12 +296,12 @@ void SubtractKernel(const Context& dev_ctx, template void SubtractGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, int axis, - phi::DenseTensor* dx, - phi::DenseTensor* dy) { + DenseTensor* dx, + DenseTensor* dy) { PADDLE_GCU_KERNEL_TRACE("subtract_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -315,19 +313,18 @@ void SubtractGradKernel(const Context& dev_ctx, template void MultiplyKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("multiply"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); auto padding_shapes = PaddingDims(x, y, -1); - phi::DenseTensor input_x = + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, padding_shapes[0]); - phi::DenseTensor input_y = + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, padding_shapes[1]); - phi::DenseTensor output_z = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor output_z = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); // VLOG(6) << "Transpose debug, MultiplyKernel input_x:" // << custom_kernel::TensorDetailsToString(input_x); @@ -354,12 +351,12 @@ void MultiplyKernel(const Context& dev_ctx, template void MultiplyGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, int axis, - phi::DenseTensor* dx, - phi::DenseTensor* dy) { + DenseTensor* dx, + DenseTensor* dy) { PADDLE_GCU_KERNEL_TRACE("multiply_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -371,9 +368,9 @@ void MultiplyGradKernel(const Context& dev_ctx, template void DivideKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("divide"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -386,13 +383,13 @@ void DivideKernel(const Context& dev_ctx, template void DivideGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, int axis, - phi::DenseTensor* dx, - phi::DenseTensor* dy) { + DenseTensor* dx, + DenseTensor* dy) { PADDLE_GCU_KERNEL_TRACE("divide_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -404,16 +401,15 @@ void DivideGradKernel(const Context& dev_ctx, template void MinimumKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("minimum"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); - phi::DenseTensor output_z = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor output_z = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); LAUNCH_TOPSATENOP(topsatenMinimum, dev_ctx, output_z, input_x, input_y); MaybeTransResult(dev_ctx, output_z, out); @@ -424,11 +420,11 @@ void MinimumKernel(const Context& dev_ctx, template void MinimumGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& dout, - phi::DenseTensor* dx, - phi::DenseTensor* dy) { + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy) { PADDLE_GCU_KERNEL_TRACE("minimum_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -440,16 +436,15 @@ void MinimumGradKernel(const Context& dev_ctx, template void MaximumKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("maximum"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); - phi::DenseTensor output_z = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor output_z = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); LAUNCH_TOPSATENOP(topsatenMaximum, dev_ctx, output_z, input_x, input_y); MaybeTransResult(dev_ctx, output_z, out); @@ -460,11 +455,11 @@ void MaximumKernel(const Context& dev_ctx, template void MaximumGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& dout, - phi::DenseTensor* dx, - phi::DenseTensor* dy) { + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy) { PADDLE_GCU_KERNEL_TRACE("maximum_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -476,16 +471,15 @@ void MaximumGradKernel(const Context& dev_ctx, template void ElementwisePowKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("elementwise_pow"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); - phi::DenseTensor output_z = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor output_z = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); LAUNCH_TOPSATENOP(topsatenPow, dev_ctx, output_z, input_x, input_y); MaybeTransResult(dev_ctx, output_z, out); @@ -496,16 +490,15 @@ void ElementwisePowKernel(const Context& dev_ctx, template void RemainderKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("remainder"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); - phi::DenseTensor output_z = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor output_z = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); LAUNCH_TOPSATENOP(topsatenRemainder, dev_ctx, output_z, input_x, input_y); MaybeTransResult(dev_ctx, output_z, out); @@ -516,16 +509,15 @@ void RemainderKernel(const Context& dev_ctx, template void FloorDivideKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("floor_divide"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); - phi::DenseTensor output_z = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor output_z = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); static const char* const rounding_mode = "floor"; LAUNCH_TOPSATENOP( topsatenDiv, dev_ctx, output_z, input_x, input_y, rounding_mode); @@ -538,16 +530,15 @@ void FloorDivideKernel(const Context& dev_ctx, template void FMaxKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("fmax"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); - phi::DenseTensor output_z = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor output_z = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); LAUNCH_TOPSATENOP(topsatenFmax, dev_ctx, output_z, input_x, input_y); MaybeTransResult(dev_ctx, output_z, out); @@ -558,16 +549,15 @@ void FMaxKernel(const Context& dev_ctx, template void FMinKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("fmin"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); - phi::DenseTensor output_z = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor output_z = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); LAUNCH_TOPSATENOP(topsatenFmin, dev_ctx, output_z, input_x, input_y); MaybeTransResult(dev_ctx, output_z, out); diff --git a/backends/gcu/kernels/bitwise_kernel.cc b/backends/gcu/kernels/bitwise_kernel.cc index 8975fa54fc8..4f0de93941b 100644 --- a/backends/gcu/kernels/bitwise_kernel.cc +++ b/backends/gcu/kernels/bitwise_kernel.cc @@ -18,9 +18,9 @@ namespace custom_kernel { template void BitwiseAndKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("bitwise_and"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { @@ -33,8 +33,8 @@ void BitwiseAndKernel(const Context& dev_ctx, template void BitwiseNotKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("bitwise_not"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { @@ -69,9 +69,9 @@ void BitwiseNotKernel(const Context& dev_ctx, template void BitwiseOrKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("bitwise_or"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { @@ -84,9 +84,9 @@ void BitwiseOrKernel(const Context& dev_ctx, template void BitwiseXorKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("bitwise_xor"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { @@ -99,10 +99,10 @@ void BitwiseXorKernel(const Context& dev_ctx, template void BitwiseLeftShiftKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, bool is_arithmetic, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("bitwise_left_shift"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { @@ -115,10 +115,10 @@ void BitwiseLeftShiftKernel(const Context& dev_ctx, template void BitwiseRightShiftKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, bool is_arithmetic, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("bitwise_right_shift"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { diff --git a/backends/gcu/kernels/c_identity_kernel.cc b/backends/gcu/kernels/c_identity_kernel.cc index ee5765a9090..cb4ffd81eb9 100644 --- a/backends/gcu/kernels/c_identity_kernel.cc +++ b/backends/gcu/kernels/c_identity_kernel.cc @@ -19,11 +19,11 @@ namespace custom_kernel { template void CIdentityKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int ring_id, bool use_calc_stream, bool use_model_parallel, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("c_identity"); PADDLE_ENFORCE_GE( ring_id, diff --git a/backends/gcu/kernels/cast_kernel.cc b/backends/gcu/kernels/cast_kernel.cc index 71b1110f302..d038c4a0278 100644 --- a/backends/gcu/kernels/cast_kernel.cc +++ b/backends/gcu/kernels/cast_kernel.cc @@ -18,35 +18,35 @@ namespace custom_kernel { template void CastKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DataType dtype, - phi::DenseTensor* out) { + const DenseTensor& x, + DataType dtype, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("cast"); if (LaunchAOTKernel()) { custom_kernel::Cast(dev_ctx, x, dtype, out); } else { // kernel impl base on JIT - if (dtype == phi::DataType::FLOAT32) { + if (dtype == DataType::FLOAT32) { dev_ctx.template Alloc(out); - } else if (dtype == phi::DataType::FLOAT64) { + } else if (dtype == DataType::FLOAT64) { dev_ctx.template Alloc(out); - } else if (dtype == phi::DataType::FLOAT16) { + } else if (dtype == DataType::FLOAT16) { dev_ctx.template Alloc(out); - } else if (dtype == phi::DataType::INT16) { + } else if (dtype == DataType::INT16) { dev_ctx.template Alloc(out); - } else if (dtype == phi::DataType::INT32) { + } else if (dtype == DataType::INT32) { dev_ctx.template Alloc(out); - } else if (dtype == phi::DataType::INT64) { + } else if (dtype == DataType::INT64) { dev_ctx.template Alloc(out); - } else if (dtype == phi::DataType::BOOL) { + } else if (dtype == DataType::BOOL) { dev_ctx.template Alloc(out); - } else if (dtype == phi::DataType::UINT8) { + } else if (dtype == DataType::UINT8) { dev_ctx.template Alloc(out); - } else if (dtype == phi::DataType::INT8) { + } else if (dtype == DataType::INT8) { dev_ctx.template Alloc(out); - } else if (dtype == phi::DataType::COMPLEX64) { + } else if (dtype == DataType::COMPLEX64) { dev_ctx.template Alloc>(out); - } else if (dtype == phi::DataType::COMPLEX128) { + } else if (dtype == DataType::COMPLEX128) { dev_ctx.template Alloc>(out); } else { phi::errors::InvalidArgument("Unsupported cast dtype %s", dtype); @@ -88,5 +88,5 @@ PD_REGISTER_PLUGIN_KERNEL(cast, int32_t, int64_t, bool) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(DataType::UNDEFINED); } diff --git a/backends/gcu/kernels/cholesky_kernel.cc b/backends/gcu/kernels/cholesky_kernel.cc index 1d067d5c1ff..8f5fb6ceafc 100644 --- a/backends/gcu/kernels/cholesky_kernel.cc +++ b/backends/gcu/kernels/cholesky_kernel.cc @@ -18,9 +18,9 @@ namespace custom_kernel { template void CholeskyKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, bool upper, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("cholesky"); if (LaunchAOTKernel()) { diff --git a/backends/gcu/kernels/clip_kernel.cc b/backends/gcu/kernels/clip_kernel.cc index 3285660891c..b1d356f9036 100644 --- a/backends/gcu/kernels/clip_kernel.cc +++ b/backends/gcu/kernels/clip_kernel.cc @@ -19,10 +19,10 @@ namespace custom_kernel { template void ClipKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& min, const phi::Scalar& max, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("clip"); dev_ctx.template Alloc(out); @@ -64,11 +64,11 @@ void ClipKernel(const Context& dev_ctx, template void ClipGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& dout, const phi::Scalar& min, const phi::Scalar& max, - phi::DenseTensor* dx) { + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("clip_grad"); dev_ctx.template Alloc(dx); if (LaunchAOTKernel()) { diff --git a/backends/gcu/kernels/compare_kernels.cc b/backends/gcu/kernels/compare_kernels.cc index 129ce3b7a76..fcab639396d 100644 --- a/backends/gcu/kernels/compare_kernels.cc +++ b/backends/gcu/kernels/compare_kernels.cc @@ -28,10 +28,10 @@ void CheckParam(const std::string& name, int axis, size_t rank) { template void CompareBaseKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out, + DenseTensor* out, const std::string& op_type) { dev_ctx.template Alloc(out); @@ -58,16 +58,16 @@ void CompareBaseKernel(const Context& dev_ctx, template void EqualKernelRaw(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("equal_raw"); if (LaunchAOTKernel()) { CheckParam("equal_raw", axis, x.dims().size()); dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); LAUNCH_TOPSATENOP(topsatenEq, dev_ctx, *out, input_x, input_y); } else { // kernel impl base on JIT @@ -77,14 +77,14 @@ void EqualKernelRaw(const Context& dev_ctx, template void EqualKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("equal"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); LAUNCH_TOPSATENOP(topsatenEq, dev_ctx, *out, input_x, input_y); } else { // kernel impl base on JIT @@ -94,14 +94,14 @@ void EqualKernel(const Context& dev_ctx, template void NotEqualKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("not_equal"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); LAUNCH_TOPSATENOP(topsatenNe, dev_ctx, *out, input_x, input_y); } else { // kernel impl base on JIT @@ -111,14 +111,14 @@ void NotEqualKernel(const Context& dev_ctx, template void LessEqualKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("less_equal"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); LAUNCH_TOPSATENOP(topsatenLe, dev_ctx, *out, input_x, input_y); } else { // kernel impl base on JIT @@ -128,16 +128,16 @@ void LessEqualKernel(const Context& dev_ctx, template void LessThanKernelRaw(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("less_than_raw"); if (LaunchAOTKernel()) { CheckParam("less_than_raw", axis, x.dims().size()); dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); LAUNCH_TOPSATENOP(topsatenLt, dev_ctx, *out, input_x, input_y); } else { // kernel impl base on JIT CompareBaseKernel(dev_ctx, x, y, -1, out, "less_than"); @@ -146,14 +146,14 @@ void LessThanKernelRaw(const Context& dev_ctx, template void LessThanKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("less_than"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); LAUNCH_TOPSATENOP(topsatenLt, dev_ctx, *out, input_x, input_y); } else { // kernel impl base on JIT @@ -163,16 +163,16 @@ void LessThanKernel(const Context& dev_ctx, template void GreaterEqualKernelRaw(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("greater_equal_raw"); if (LaunchAOTKernel()) { CheckParam("greater_equal_raw", axis, x.dims().size()); dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); LAUNCH_TOPSATENOP(topsatenGe, dev_ctx, *out, input_x, input_y); } else { // kernel impl base on JIT CompareBaseKernel(dev_ctx, x, y, -1, out, "greater_equal"); @@ -181,14 +181,14 @@ void GreaterEqualKernelRaw(const Context& dev_ctx, template void GreaterEqualKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("greater_equal"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); LAUNCH_TOPSATENOP(topsatenGe, dev_ctx, *out, input_x, input_y); } else { // kernel impl base on JIT CompareBaseKernel(dev_ctx, x, y, -1, out, "greater_equal"); @@ -197,14 +197,14 @@ void GreaterEqualKernel(const Context& dev_ctx, template void GreaterThanKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("greater_than"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); LAUNCH_TOPSATENOP(topsatenGt, dev_ctx, *out, input_x, input_y); } else { // kernel impl base on JIT @@ -214,18 +214,18 @@ void GreaterThanKernel(const Context& dev_ctx, } // namespace custom_kernel -#define PD_REGISTER_COMPARE_KERNEL(name, func) \ - PD_REGISTER_PLUGIN_KERNEL(name, \ - gcu, \ - ALL_LAYOUT, \ - custom_kernel::func##Kernel, \ - bool, \ - int, \ - int64_t, \ - float, \ - phi::dtype::bfloat16, \ - phi::dtype::float16) { \ - kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); \ +#define PD_REGISTER_COMPARE_KERNEL(name, func) \ + PD_REGISTER_PLUGIN_KERNEL(name, \ + gcu, \ + ALL_LAYOUT, \ + custom_kernel::func##Kernel, \ + bool, \ + int, \ + int64_t, \ + float, \ + phi::dtype::bfloat16, \ + phi::dtype::float16) { \ + kernel->OutputAt(0).SetDataType(DataType::BOOL); \ } #define PD_REGISTER_COMPARE_RAW_KERNEL(name, func) \ @@ -239,7 +239,7 @@ void GreaterThanKernel(const Context& dev_ctx, float, \ phi::dtype::bfloat16, \ phi::dtype::float16) { \ - kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); \ + kernel->OutputAt(0).SetDataType(DataType::BOOL); \ } PD_REGISTER_COMPARE_KERNEL(less_than, LessThan) diff --git a/backends/gcu/kernels/concat_kernel.cc b/backends/gcu/kernels/concat_kernel.cc index 62bd71bc53b..1cd035e0c38 100644 --- a/backends/gcu/kernels/concat_kernel.cc +++ b/backends/gcu/kernels/concat_kernel.cc @@ -19,15 +19,15 @@ namespace custom_kernel { template void ConcatKernel(const Context& dev_ctx, - const std::vector& ins, + const std::vector& ins, const phi::Scalar& axis_scalar, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("concat"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { bool use_nhwc = false; - std::vector input_tensors; + std::vector input_tensors; for (const auto& in : ins) { input_tensors.emplace_back(MaybeCreateOrTrans64To32bits(dev_ctx, *in)); if (EnableTransposeOptimize() && (!use_nhwc) && @@ -35,8 +35,7 @@ void ConcatKernel(const Context& dev_ctx, use_nhwc = true; } } - phi::DenseTensor output = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); if (use_nhwc) { PdCustomNHWCRepresentAsAtenNHWC(output, true); } @@ -80,7 +79,7 @@ void ConcatKernel(const Context& dev_ctx, TensorValueMap inputs; std::vector names; names.reserve(ins.size()); - std::vector values; + std::vector values; values.reserve(ins.size()); for (size_t i = 0; i < ins.size(); ++i) { names.emplace_back(std::string("x_") + std::to_string(i)); @@ -105,10 +104,10 @@ void ConcatKernel(const Context& dev_ctx, template void ConcatGradKernel(const Context& dev_ctx, - const std::vector& ins, - const phi::DenseTensor& dout, + const std::vector& ins, + const DenseTensor& dout, const phi::Scalar& axis_scalar, - std::vector outs) { + std::vector outs) { PADDLE_GCU_KERNEL_TRACE("concat_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -118,7 +117,7 @@ void ConcatGradKernel(const Context& dev_ctx, { std::vector names; names.reserve(ins.size()); - std::vector values; + std::vector values; values.reserve(ins.size()); for (size_t i = 0; i < ins.size(); ++i) { names.emplace_back(std::string("x_") + std::to_string(i)); @@ -136,7 +135,7 @@ void ConcatGradKernel(const Context& dev_ctx, { std::vector names; names.reserve(outs.size()); - std::vector values; + std::vector values; values.reserve(outs.size()); for (size_t i = 0; i < outs.size(); ++i) { if ((outs[i] != nullptr) && (outs[i]->numel() != 0UL)) { diff --git a/backends/gcu/kernels/contiguous_kernel.cc b/backends/gcu/kernels/contiguous_kernel.cc index e709a8f0541..26117dd7bf1 100644 --- a/backends/gcu/kernels/contiguous_kernel.cc +++ b/backends/gcu/kernels/contiguous_kernel.cc @@ -19,10 +19,10 @@ namespace custom_kernel { template void ContiguousKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - phi::DenseTensor* out) { + const DenseTensor& input, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("contiguous"); - phi::DenseTensorMeta meta = input.meta(); + DenseTensorMeta meta = input.meta(); meta.strides = meta.calc_strides(meta.dims); meta.offset = 0; out->set_meta(meta); diff --git a/backends/gcu/kernels/conv_kernel.cc b/backends/gcu/kernels/conv_kernel.cc index cbd248a27f0..bc30ccecb48 100644 --- a/backends/gcu/kernels/conv_kernel.cc +++ b/backends/gcu/kernels/conv_kernel.cc @@ -22,15 +22,15 @@ static std::unordered_set g_depthwise_conv2d_weights_nhwc; template void GcuConvKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& filter, + const DenseTensor& input, + const DenseTensor& filter, const std::vector& strides, const std::vector& paddings, const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, - phi::DenseTensor* out, + DenseTensor* out, const std::string& op_type) { dev_ctx.template Alloc(out); @@ -62,17 +62,17 @@ void GcuConvKernel(const Context& dev_ctx, template void GcuConvGradKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& filter, - const phi::DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, - phi::DenseTensor* input_grad, - phi::DenseTensor* filter_grad, + DenseTensor* input_grad, + DenseTensor* filter_grad, const std::string& op_type) { TensorNameMap input_names; input_names["Input"] = {"input"}; @@ -113,7 +113,7 @@ template void Conv2dBiasKernel(const Context& dev_ctx, const DenseTensor& input, const DenseTensor& filter, - const paddle::optional& bias, + const paddle::optional& bias, const std::vector& strides, const std::vector& paddings, const std::string& padding_algorithm, @@ -123,18 +123,17 @@ void Conv2dBiasKernel(const Context& dev_ctx, DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("conv2d_bias"); if (LaunchAOTKernel()) { - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, input); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, input); // if (data_format == "NHWC") { // input_x = custom_kernel::Transpose(dev_ctx, input, {0, 3, 1, 2}); // } dev_ctx.template Alloc(out); - phi::DenseTensor filter_x = MaybeCreateOrTrans64To32bits(dev_ctx, filter); - phi::DenseTensor output = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor filter_x = MaybeCreateOrTrans64To32bits(dev_ctx, filter); + DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); - // phi::DenseTensor input_x = input; - // phi::DenseTensor filter_x = filter; - // phi::DenseTensor output = *out; + // DenseTensor input_x = input; + // DenseTensor filter_x = filter; + // DenseTensor output = *out; // update paddings and dilations according to padding_algorithm std::vector paddings_vec = paddings; @@ -166,7 +165,7 @@ void Conv2dBiasKernel(const Context& dev_ctx, PdCustomNHWCRepresentAsAtenNHWC(output, true); if (g_conv2d_weights_nhwc.count(filter.data()) == 0) { auto filter_trans = NCHWTransToPdCustomNHWC(dev_ctx, filter); - phi::DenseTensor* filter_ptr = const_cast(&filter); + DenseTensor* filter_ptr = const_cast(&filter); TensorCopy(dev_ctx, filter_trans, false, filter_ptr); g_conv2d_weights_nhwc.emplace(filter.data()); VLOG(6) << "Transpose debug, trans filter for conv2d."; @@ -179,12 +178,12 @@ void Conv2dBiasKernel(const Context& dev_ctx, } } - phi::DenseTensor input_bias; + DenseTensor input_bias; if (bias) { input_bias = bias.get(); } else { - auto meta = phi::DenseTensorMeta(input.dtype(), - phi::make_ddim({filter_x.dims().at(0)})); + auto meta = DenseTensorMeta(input.dtype(), + phi::make_ddim({filter_x.dims().at(0)})); input_bias = TensorZeros(dev_ctx, meta); } @@ -249,33 +248,32 @@ void Conv2dKernel(const Context& dev_ctx, const std::string& data_format, DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("conv2d"); - custom_kernel::Conv2dBiasKernel( - dev_ctx, - input, - filter, - paddle::optional(), - strides, - paddings, - padding_algorithm, - dilations, - groups, - data_format, - out); + custom_kernel::Conv2dBiasKernel(dev_ctx, + input, + filter, + paddle::optional(), + strides, + paddings, + padding_algorithm, + dilations, + groups, + data_format, + out); } template void Conv2DGradKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& filter, - const phi::DenseTensor& output_grad, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& output_grad, const std::vector& strides, const std::vector& paddings, const std::string& padding_algorithm, const std::vector& dilations, int groups, const std::string& data_format, - phi::DenseTensor* input_grad, - phi::DenseTensor* filter_grad) { + DenseTensor* input_grad, + DenseTensor* filter_grad) { PADDLE_GCU_KERNEL_TRACE("conv2d_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -298,29 +296,29 @@ void Conv2DGradKernel(const Context& dev_ctx, template void DepthwiseConv2dKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& filter, + const DenseTensor& input, + const DenseTensor& filter, const std::vector& strides, const std::vector& paddings, const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("depthwise_conv2d"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - // phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, input); + // DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, input); // if (data_format == "NHWC") { // input_x = custom_kernel::Transpose(dev_ctx, input, {0, 3, 1, 2}); // } - // phi::DenseTensor filter_x = MaybeCreateOrTrans64To32bits(dev_ctx, - // filter); phi::DenseTensor output = + // DenseTensor filter_x = MaybeCreateOrTrans64To32bits(dev_ctx, + // filter); DenseTensor output = // MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); - phi::DenseTensor input_x = input; - phi::DenseTensor filter_x = filter; - phi::DenseTensor output = *out; + DenseTensor input_x = input; + DenseTensor filter_x = filter; + DenseTensor output = *out; // update paddings and dilations according to padding_algorithm std::vector paddings_vec = paddings; @@ -352,7 +350,7 @@ void DepthwiseConv2dKernel(const Context& dev_ctx, PdCustomNHWCRepresentAsAtenNHWC(output, true); if (g_depthwise_conv2d_weights_nhwc.count(filter.data()) == 0) { auto filter_trans = NCHWTransToPdCustomNHWC(dev_ctx, filter); - phi::DenseTensor* filter_ptr = const_cast(&filter); + DenseTensor* filter_ptr = const_cast(&filter); TensorCopy(dev_ctx, filter_trans, false, filter_ptr); g_depthwise_conv2d_weights_nhwc.emplace(filter.data()); VLOG(6) << "Transpose debug, trans filter for depthwise_conv2d."; @@ -365,8 +363,8 @@ void DepthwiseConv2dKernel(const Context& dev_ctx, } } - auto meta = phi::DenseTensorMeta(input.dtype(), - phi::make_ddim({filter.dims().at(0)})); + auto meta = + DenseTensorMeta(input.dtype(), phi::make_ddim({filter.dims().at(0)})); auto bias = TensorZeros(dev_ctx, meta); std::vector strides_v = {strides.begin(), strides.end()}; @@ -411,17 +409,17 @@ void DepthwiseConv2dKernel(const Context& dev_ctx, template void DepthwiseConv2dGradKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& filter, - const phi::DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, - phi::DenseTensor* input_grad, - phi::DenseTensor* filter_grad) { + DenseTensor* input_grad, + DenseTensor* filter_grad) { PADDLE_GCU_KERNEL_TRACE("depthwise_conv2d_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -444,22 +442,21 @@ void DepthwiseConv2dGradKernel(const Context& dev_ctx, template void Conv3dKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& filter, + const DenseTensor& input, + const DenseTensor& filter, const std::vector& strides, const std::vector& paddings, const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("conv3d"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, input); - phi::DenseTensor filter_x = MaybeCreateOrTrans64To32bits(dev_ctx, filter); - phi::DenseTensor output = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, input); + DenseTensor filter_x = MaybeCreateOrTrans64To32bits(dev_ctx, filter); + DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); // update paddings and dilations according to padding_algorithm std::vector paddings_vec = paddings; @@ -491,7 +488,7 @@ void Conv3dKernel(const Context& dev_ctx, PdCustomNHWCRepresentAsAtenNHWC(output, true); if (g_conv3d_weights_nhwc.count(filter.data()) == 0) { auto filter_trans = NCHWTransToPdCustomNHWC(dev_ctx, filter); - phi::DenseTensor* filter_ptr = const_cast(&filter); + DenseTensor* filter_ptr = const_cast(&filter); TensorCopy(dev_ctx, filter_trans, false, filter_ptr); g_conv3d_weights_nhwc.emplace(filter.data()); VLOG(6) << "Transpose debug, trans filter for conv3d."; @@ -504,8 +501,8 @@ void Conv3dKernel(const Context& dev_ctx, } } - auto meta = phi::DenseTensorMeta(input.dtype(), - phi::make_ddim({filter_x.dims().at(0)})); + auto meta = + DenseTensorMeta(input.dtype(), phi::make_ddim({filter_x.dims().at(0)})); auto bias = TensorZeros(dev_ctx, meta); std::vector strides_v = {strides.begin(), strides.end()}; @@ -561,17 +558,17 @@ void Conv3dKernel(const Context& dev_ctx, template void Conv3dGradKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& filter, - const phi::DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, const std::string& padding_algorithm, int groups, const std::vector& dilations, const std::string& data_format, - phi::DenseTensor* input_grad, - phi::DenseTensor* filter_grad) { + DenseTensor* input_grad, + DenseTensor* filter_grad) { PADDLE_GCU_KERNEL_TRACE("conv3d_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); diff --git a/backends/gcu/kernels/conv_transpose_kernel.cc b/backends/gcu/kernels/conv_transpose_kernel.cc index 9f17b997723..012eba9b4c1 100644 --- a/backends/gcu/kernels/conv_transpose_kernel.cc +++ b/backends/gcu/kernels/conv_transpose_kernel.cc @@ -22,8 +22,8 @@ static std::unordered_set g_conv3d_transpose_weights_nhwc; template void ConvTransposeRawKernel(const std::string& conv_type, const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& filter, + const DenseTensor& x, + const DenseTensor& filter, const std::vector& strides, const std::vector& paddings, const std::vector& output_padding, @@ -32,7 +32,7 @@ void ConvTransposeRawKernel(const std::string& conv_type, int groups, const std::vector& dilations, const std::string& data_format, - phi::DenseTensor* out) { + DenseTensor* out) { dev_ctx.template Alloc(out); TensorNameMap input_names; @@ -40,8 +40,8 @@ void ConvTransposeRawKernel(const std::string& conv_type, input_names["Filter"] = {"filter"}; TensorValueMap inputs; - inputs["Input"] = {const_cast(&x)}; - inputs["Filter"] = {const_cast(&filter)}; + inputs["Input"] = {const_cast(&x)}; + inputs["Filter"] = {const_cast(&filter)}; TensorNameMap output_names; output_names["Out"] = {"out"}; @@ -68,9 +68,9 @@ void ConvTransposeRawKernel(const std::string& conv_type, template void ConvTransposeGradRawKernel(const std::string& conv_grad_type, const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& filter, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, const std::vector& strides, const std::vector& paddings, const std::vector& output_padding, @@ -79,8 +79,8 @@ void ConvTransposeGradRawKernel(const std::string& conv_grad_type, int groups, const std::vector& dilations, const std::string& data_format, - phi::DenseTensor* dx, - phi::DenseTensor* dfilter) { + DenseTensor* dx, + DenseTensor* dfilter) { dev_ctx.template Alloc(dx); dev_ctx.template Alloc(dfilter); @@ -90,9 +90,9 @@ void ConvTransposeGradRawKernel(const std::string& conv_grad_type, input_names[GradVarName("Output")] = {"dout"}; TensorValueMap inputs; - inputs["Input"] = {const_cast(&x)}; - inputs["Filter"] = {const_cast(&filter)}; - inputs[GradVarName("Output")] = {const_cast(&dout)}; + inputs["Input"] = {const_cast(&x)}; + inputs["Filter"] = {const_cast(&filter)}; + inputs[GradVarName("Output")] = {const_cast(&dout)}; TensorNameMap output_names; output_names[GradVarName("Input")] = {"dx"}; @@ -125,9 +125,9 @@ void ConvTransposeGradRawKernel(const std::string& conv_grad_type, template void Conv2dTransposeBiasKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& filter, - const paddle::optional& bias, + const DenseTensor& x, + const DenseTensor& filter, + const paddle::optional& bias, const std::vector& strides, const std::vector& paddings, const std::vector& output_padding UNUSED, @@ -136,14 +136,13 @@ void Conv2dTransposeBiasKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format UNUSED, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("conv2d_transpose_bias"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor filter_x = MaybeCreateOrTrans64To32bits(dev_ctx, filter); - phi::DenseTensor output = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor filter_x = MaybeCreateOrTrans64To32bits(dev_ctx, filter); + DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); // update paddings and dilations according to padding_algorithm std::vector paddings_vec = paddings; @@ -175,7 +174,7 @@ void Conv2dTransposeBiasKernel(const Context& dev_ctx, PdCustomNHWCRepresentAsAtenNHWC(output, true); if (g_conv2d_transpose_weights_nhwc.count(filter.data()) == 0) { auto filter_trans = NCHWTransToPdCustomNHWC(dev_ctx, filter); - phi::DenseTensor* filter_ptr = const_cast(&filter); + DenseTensor* filter_ptr = const_cast(&filter); TensorCopy(dev_ctx, filter_trans, false, filter_ptr); g_conv2d_transpose_weights_nhwc.emplace(filter.data()); VLOG(6) << "Transpose debug, trans filter for conv2d_transpose_bias."; @@ -188,12 +187,12 @@ void Conv2dTransposeBiasKernel(const Context& dev_ctx, } } - phi::DenseTensor input_bias; + DenseTensor input_bias; if (bias) { input_bias = bias.get(); } else { - auto meta = phi::DenseTensorMeta(x.dtype(), - phi::make_ddim({filter_x.dims().at(1)})); + auto meta = + DenseTensorMeta(x.dtype(), phi::make_ddim({filter_x.dims().at(1)})); input_bias = TensorZeros(dev_ctx, meta); } @@ -257,8 +256,8 @@ void Conv2dTransposeBiasKernel(const Context& dev_ctx, template void Conv2dTransposeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& filter, + const DenseTensor& x, + const DenseTensor& filter, const std::vector& strides, const std::vector& paddings, const std::vector& output_padding UNUSED, @@ -267,13 +266,13 @@ void Conv2dTransposeKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format UNUSED, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("conv2d_transpose"); custom_kernel::Conv2dTransposeBiasKernel( dev_ctx, x, filter, - paddle::optional(), + paddle::optional(), strides, paddings, output_padding, @@ -287,9 +286,9 @@ void Conv2dTransposeKernel(const Context& dev_ctx, template void Conv2dTransposeGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& filter, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, const std::vector& strides, const std::vector& paddings, const std::vector& output_padding, @@ -298,8 +297,8 @@ void Conv2dTransposeGradKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - phi::DenseTensor* dx, - phi::DenseTensor* dfilter) { + DenseTensor* dx, + DenseTensor* dfilter) { PADDLE_GCU_KERNEL_TRACE("conv2d_transpose_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -324,8 +323,8 @@ void Conv2dTransposeGradKernel(const Context& dev_ctx, template void Conv3dTransposeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& filter, + const DenseTensor& x, + const DenseTensor& filter, const std::vector& strides, const std::vector& paddings, const std::vector& output_padding, @@ -334,15 +333,15 @@ void Conv3dTransposeKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("conv3d_transpose"); if (LaunchAOTKernel()) { // The aten operator library does not support conv3d_transpose THROW_AOT_UNIMPLEMENTED(); // dev_ctx.template Alloc(out); - // phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - // phi::DenseTensor filter_x = MaybeCreateOrTrans64To32bits(dev_ctx, - // filter); phi::DenseTensor output = + // DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + // DenseTensor filter_x = MaybeCreateOrTrans64To32bits(dev_ctx, + // filter); DenseTensor output = // MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); // // update paddings and dilations according to padding_algorithm @@ -374,8 +373,8 @@ void Conv3dTransposeKernel(const Context& dev_ctx, // PdCustomNHWCRepresentAsAtenNHWC(output, true); // if (g_conv3d_transpose_weights_nhwc.count(filter.data()) == 0) { // auto filter_trans = NCHWTransToPdCustomNHWC(dev_ctx, filter); - // phi::DenseTensor* filter_ptr = - // const_cast(&filter); TensorCopy(dev_ctx, + // DenseTensor* filter_ptr = + // const_cast(&filter); TensorCopy(dev_ctx, // filter_trans, false, filter_ptr); // g_conv3d_transpose_weights_nhwc.emplace(filter.data()); // VLOG(6) << "Transpose debug, trans filter for conv3d_transpose."; @@ -388,7 +387,7 @@ void Conv3dTransposeKernel(const Context& dev_ctx, // } // } - // auto meta = phi::DenseTensorMeta(x.dtype(), + // auto meta = DenseTensorMeta(x.dtype(), // phi::make_ddim({filter_x.dims().at(1)})); // auto bias = TensorZeros(dev_ctx, meta); @@ -440,9 +439,9 @@ void Conv3dTransposeKernel(const Context& dev_ctx, template void Conv3dTransposeGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& filter, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, const std::vector& strides, const std::vector& paddings, const std::vector& output_padding, @@ -451,8 +450,8 @@ void Conv3dTransposeGradKernel(const Context& dev_ctx, int groups, const std::vector& dilations, const std::string& data_format, - phi::DenseTensor* dx, - phi::DenseTensor* dfilter) { + DenseTensor* dx, + DenseTensor* dfilter) { PADDLE_GCU_KERNEL_TRACE("conv3d_transpose_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); diff --git a/backends/gcu/kernels/copysign_kernel.cc b/backends/gcu/kernels/copysign_kernel.cc index 3ace9c9c39c..6050ab450a1 100644 --- a/backends/gcu/kernels/copysign_kernel.cc +++ b/backends/gcu/kernels/copysign_kernel.cc @@ -18,9 +18,9 @@ namespace custom_kernel { template void CopySignKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("copysign"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { diff --git a/backends/gcu/kernels/cross_entropy_kernel.cc b/backends/gcu/kernels/cross_entropy_kernel.cc index b73f83944a3..fcbb041f0cd 100644 --- a/backends/gcu/kernels/cross_entropy_kernel.cc +++ b/backends/gcu/kernels/cross_entropy_kernel.cc @@ -19,15 +19,15 @@ namespace custom_kernel { template void CrossEntropyWithSoftmaxKernel(const Context& dev_ctx, - const phi::DenseTensor& logits, - const phi::DenseTensor& labels, + const DenseTensor& logits, + const DenseTensor& labels, bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis, - phi::DenseTensor* softmax, - phi::DenseTensor* loss) { + DenseTensor* softmax, + DenseTensor* loss) { PADDLE_GCU_KERNEL_TRACE("cross_entropy_with_softmax"); dev_ctx.template Alloc(loss); dev_ctx.template Alloc(softmax); @@ -70,15 +70,15 @@ void CrossEntropyWithSoftmaxKernel(const Context& dev_ctx, template void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx, - const phi::DenseTensor& labels, - const phi::DenseTensor& softmax, - const phi::DenseTensor& loss_grad, + const DenseTensor& labels, + const DenseTensor& softmax, + const DenseTensor& loss_grad, bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis, - phi::DenseTensor* logits_grad) { + DenseTensor* logits_grad) { PADDLE_GCU_KERNEL_TRACE("cross_entropy_with_softmax_grad"); dev_ctx.template Alloc(logits_grad); diff --git a/backends/gcu/kernels/cross_kernel.cc b/backends/gcu/kernels/cross_kernel.cc index 7ae27d0292c..502190c1a90 100644 --- a/backends/gcu/kernels/cross_kernel.cc +++ b/backends/gcu/kernels/cross_kernel.cc @@ -18,10 +18,10 @@ namespace custom_kernel { template void CrossKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("copysign"); int64_t dim = axis; diff --git a/backends/gcu/kernels/cumulate_kernel.cc b/backends/gcu/kernels/cumulate_kernel.cc index 3599274e7ab..0a8d695382f 100644 --- a/backends/gcu/kernels/cumulate_kernel.cc +++ b/backends/gcu/kernels/cumulate_kernel.cc @@ -17,16 +17,16 @@ namespace custom_kernel { template void CumsumKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& axis_scalar, bool flatten, bool exclusive, bool reverse, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("cumsum"); if (LaunchAOTKernel()) { auto axis = axis_scalar.to(); - phi::DenseTensor input_tensor(x); + DenseTensor input_tensor(x); if (flatten) { PADDLE_ENFORCE_EQ( axis, @@ -55,11 +55,11 @@ void CumsumKernel(const Context& dev_ctx, template void CummaxKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, - phi::DataType dtype, - phi::DenseTensor* out, - phi::DenseTensor* indices) { + DataType dtype, + DenseTensor* out, + DenseTensor* indices) { PADDLE_GCU_KERNEL_TRACE("cummax"); if (LaunchAOTKernel()) { if (axis < 0) { @@ -68,17 +68,16 @@ void CummaxKernel(const Context& dev_ctx, dev_ctx.template Alloc(out); - phi::DenseTensor indices_out; - if (dtype == phi::DataType::INT64) { + DenseTensor indices_out; + if (dtype == DataType::INT64) { dev_ctx.template Alloc(indices); indices_out = MaybeCreateOrTrans64To32bits(dev_ctx, *indices, false); - } else if (dtype == phi::DataType::INT32) { + } else if (dtype == DataType::INT32) { dev_ctx.template Alloc(indices); indices_out = *indices; } else { - PADDLE_THROW( - phi::errors::InvalidArgument("Unsupported indices dtype: %s.", - phi::DataTypeToString(dtype).c_str())); + PADDLE_THROW(phi::errors::InvalidArgument( + "Unsupported indices dtype: %s.", DataTypeToString(dtype).c_str())); } LAUNCH_TOPSATENOP(topsatenCummax, dev_ctx, *out, indices_out, x, axis); MaybeTransResult(dev_ctx, indices_out, indices); @@ -90,11 +89,11 @@ void CummaxKernel(const Context& dev_ctx, template void CumminKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, - phi::DataType dtype, - phi::DenseTensor* out, - phi::DenseTensor* indices) { + DataType dtype, + DenseTensor* out, + DenseTensor* indices) { PADDLE_GCU_KERNEL_TRACE("cummin"); if (LaunchAOTKernel()) { if (axis < 0) { @@ -103,17 +102,16 @@ void CumminKernel(const Context& dev_ctx, dev_ctx.template Alloc(out); - phi::DenseTensor indices_out; - if (dtype == phi::DataType::INT64) { + DenseTensor indices_out; + if (dtype == DataType::INT64) { dev_ctx.template Alloc(indices); indices_out = MaybeCreateOrTrans64To32bits(dev_ctx, *indices, false); - } else if (dtype == phi::DataType::INT32) { + } else if (dtype == DataType::INT32) { dev_ctx.template Alloc(indices); indices_out = *indices; } else { - PADDLE_THROW( - phi::errors::InvalidArgument("Unsupported indices dtype: %s.", - phi::DataTypeToString(dtype).c_str())); + PADDLE_THROW(phi::errors::InvalidArgument( + "Unsupported indices dtype: %s.", DataTypeToString(dtype).c_str())); } LAUNCH_TOPSATENOP(topsatenCummin, dev_ctx, *out, indices_out, x, axis); MaybeTransResult(dev_ctx, indices_out, indices); @@ -125,11 +123,11 @@ void CumminKernel(const Context& dev_ctx, template void CumprodKernel(const Context& dev_ctx, - const phi::DenseTensor& input, + const DenseTensor& input, int dim, bool exclusive, bool reverse, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("cumprod"); if (LaunchAOTKernel()) { if (dim < 0) { diff --git a/backends/gcu/kernels/diag_kernel.cc b/backends/gcu/kernels/diag_kernel.cc index 50fd9a2dba3..22ab44dbdc1 100644 --- a/backends/gcu/kernels/diag_kernel.cc +++ b/backends/gcu/kernels/diag_kernel.cc @@ -35,11 +35,11 @@ void DiagKernel(const Context& dev_ctx, } else { LAUNCH_TOPSATENOP(topsatenDiag, dev_ctx, *out, x, offset); - phi::DenseTensor mask_tmp = custom_kernel::TensorEmpty( - dev_ctx, {phi::DataType::BOOL, out->dims()}); + DenseTensor mask_tmp = + custom_kernel::TensorEmpty(dev_ctx, {DataType::BOOL, out->dims()}); - phi::DenseTensor cpu_tensor; - phi::DenseTensorMeta cpu_meta = {phi::DataType::BOOL, out->dims()}; + DenseTensor cpu_tensor; + DenseTensorMeta cpu_meta = {DataType::BOOL, out->dims()}; cpu_tensor.set_meta(cpu_meta); bool* host_mask = dev_ctx.template HostAlloc(&cpu_tensor); for (size_t i = 0; i < mask_tmp.numel(); i++) { diff --git a/backends/gcu/kernels/diagonal_kernel.cc b/backends/gcu/kernels/diagonal_kernel.cc index e3c4869ed66..d7d5d70c3dd 100644 --- a/backends/gcu/kernels/diagonal_kernel.cc +++ b/backends/gcu/kernels/diagonal_kernel.cc @@ -18,15 +18,15 @@ namespace custom_kernel { template void DiagonalKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int offset, int axis1, int axis2, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("diagonal"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { - phi::DenseTensor out_tmp; + DenseTensor out_tmp; out_tmp.set_meta(out->meta()); auto out_tmp_tensor = CreateTopsatenTensorWithoutInitialized(out_tmp); diff --git a/backends/gcu/kernels/dropout_kernel.cc b/backends/gcu/kernels/dropout_kernel.cc index f2fde115d91..3221a51845e 100644 --- a/backends/gcu/kernels/dropout_kernel.cc +++ b/backends/gcu/kernels/dropout_kernel.cc @@ -20,7 +20,7 @@ namespace custom_kernel { template inline void GetSeedDataAndIncrement( const Context& dev_ctx, - const paddle::optional& seed_tensor, + const paddle::optional& seed_tensor, const bool is_fix_seed, const int seed_val, const int offset, @@ -28,7 +28,7 @@ inline void GetSeedDataAndIncrement( uint64_t* increment) { auto gen_custom = dev_ctx.GetGenerator(); if (seed_tensor) { - phi::DenseTensor seed_cpu_tensor; + DenseTensor seed_cpu_tensor; TensorCopy( dev_ctx, seed_tensor.get(), true, &seed_cpu_tensor, phi::CustomPlace()); *seed_data = static_cast(seed_cpu_tensor.data()[0]); @@ -48,8 +48,8 @@ inline void GetSeedDataAndIncrement( template inline std::pair GetSeedOffset( const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& seed_tensor, + const DenseTensor& x, + const paddle::optional& seed_tensor, int seed, bool fix_seed) { // Refer to the implementation of GPU dropout at: @@ -71,15 +71,15 @@ inline std::pair GetSeedOffset( template void DropoutKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& seed_tensor, + const DenseTensor& x, + const paddle::optional& seed_tensor, const phi::Scalar& p, bool is_test, const std::string& mode, int seed, bool fix_seed, - phi::DenseTensor* out, - phi::DenseTensor* mask) { + DenseTensor* out, + DenseTensor* mask) { PADDLE_GCU_KERNEL_TRACE("dropout"); dev_ctx.template Alloc(out); if (mask) { @@ -152,12 +152,12 @@ void DropoutKernel(const Context& dev_ctx, template void DropoutGradKernel(const Context& dev_ctx, - const phi::DenseTensor& mask, - const phi::DenseTensor& dout, + const DenseTensor& mask, + const DenseTensor& dout, const phi::Scalar& p, bool is_test, const std::string& mode, - phi::DenseTensor* dx) { + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("dropout_grad"); dev_ctx.template Alloc(dx); @@ -203,7 +203,7 @@ PD_REGISTER_PLUGIN_KERNEL(dropout, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(1).SetDataType(phi::DataType::UINT8); + kernel->OutputAt(1).SetDataType(DataType::UINT8); } PD_REGISTER_PLUGIN_KERNEL(dropout_grad, diff --git a/backends/gcu/kernels/einsum_kernel.cc b/backends/gcu/kernels/einsum_kernel.cc index cc004fc1d3f..483f11898fa 100644 --- a/backends/gcu/kernels/einsum_kernel.cc +++ b/backends/gcu/kernels/einsum_kernel.cc @@ -17,11 +17,11 @@ namespace custom_kernel { template void EinsumKernel(const Context& dev_ctx, - const std::vector& inputs, + const std::vector& inputs, const std::string& equation, - phi::DenseTensor* out, - std::vector cache, - std::vector xshape UNUSED) { + DenseTensor* out, + std::vector cache, + std::vector xshape UNUSED) { PADDLE_GCU_KERNEL_TRACE("einsum"); if (LaunchAOTKernel()) { ContextPinnedGuard ctx_pinned_guard(dev_ctx); @@ -54,36 +54,30 @@ void EinsumKernel(const Context& dev_ctx, inputs.size())); // Only float16 is supported, other data types will fallback to CPU. PADDLE_ENFORCE_EQ(inputs[0]->dtype(), - phi::DataType::FLOAT16, + DataType::FLOAT16, phi::errors::InvalidArgument( "Only float16 is supported, but got %s.", - phi::DataTypeToString(inputs[0]->dtype()).c_str())); - std::vector inputs_gcu_tmp(inputs.size()); - std::vector cache_gcu_tmp(cache.size()); - std::vector inputs_cpu(inputs.size()); - std::vector cache_cpu(cache.size()); - std::vector cache_out_gcu(cache.size()); + DataTypeToString(inputs[0]->dtype()).c_str())); + std::vector inputs_gcu_tmp(inputs.size()); + std::vector cache_gcu_tmp(cache.size()); + std::vector inputs_cpu(inputs.size()); + std::vector cache_cpu(cache.size()); + std::vector cache_out_gcu(cache.size()); - std::vector inputs_f32(inputs.size(), nullptr); - std::vector cache_f32(cache.size(), nullptr); - phi::DenseTensor out_cpu_f32; + std::vector inputs_f32(inputs.size(), nullptr); + std::vector cache_f32(cache.size(), nullptr); + DenseTensor out_cpu_f32; // convert inputs for (size_t i = 0; i < inputs.size(); ++i) { if (inputs[i] != nullptr) { - phi::DenseTensorMeta gcu_meta(phi::DataType::FLOAT32, - inputs[i]->dims()); + DenseTensorMeta gcu_meta(DataType::FLOAT32, inputs[i]->dims()); inputs_gcu_tmp[i].set_meta(gcu_meta); if (inputs[i]->initialized()) { - custom_kernel::Cast(dev_ctx, - *(inputs[i]), - phi::DataType::FLOAT32, - &inputs_gcu_tmp[i]); - TensorCopy(dev_ctx, - inputs_gcu_tmp[i], - false, - &inputs_cpu[i], - phi::CPUPlace()); + custom_kernel::Cast( + dev_ctx, *(inputs[i]), DataType::FLOAT32, &inputs_gcu_tmp[i]); + TensorCopy( + dev_ctx, inputs_gcu_tmp[i], false, &inputs_cpu[i], CPUPlace()); } inputs_f32[i] = &inputs_cpu[i]; } @@ -92,13 +86,13 @@ void EinsumKernel(const Context& dev_ctx, // convert cache for (size_t i = 0; i < cache.size(); ++i) { if (cache[i] != nullptr) { - phi::DenseTensorMeta gcu_meta(phi::DataType::FLOAT32, cache[i]->dims()); + DenseTensorMeta gcu_meta(DataType::FLOAT32, cache[i]->dims()); cache_gcu_tmp[i].set_meta(gcu_meta); if (cache[i]->initialized()) { custom_kernel::Cast( - dev_ctx, *(cache[i]), phi::DataType::FLOAT32, &cache_gcu_tmp[i]); + dev_ctx, *(cache[i]), DataType::FLOAT32, &cache_gcu_tmp[i]); TensorCopy( - dev_ctx, cache_gcu_tmp[i], false, &cache_cpu[i], phi::CPUPlace()); + dev_ctx, cache_gcu_tmp[i], false, &cache_cpu[i], CPUPlace()); } cache_f32[i] = &cache_cpu[i]; } @@ -108,26 +102,26 @@ void EinsumKernel(const Context& dev_ctx, dev_ctx.Wait(); // call the CPU implementation - phi::CPUContext dev_ctx_cpu; + CPUContext dev_ctx_cpu; dev_ctx_cpu.SetAllocator(&(dev_ctx.GetHostAllocator())); dev_ctx_cpu.SetHostAllocator(&(dev_ctx.GetHostAllocator())); - phi::DenseTensorMeta cpu_meta(phi::DataType::FLOAT32, out->dims()); + DenseTensorMeta cpu_meta(DataType::FLOAT32, out->dims()); out_cpu_f32.set_meta(cpu_meta); - phi::EinsumKernel( + phi::EinsumKernel( dev_ctx_cpu, inputs_f32, equation, &out_cpu_f32, cache_f32, xshape); dev_ctx.Wait(); // convert result - phi::DenseTensor out_gcu_f32; + DenseTensor out_gcu_f32; TensorCopy(dev_ctx, out_cpu_f32, false, &out_gcu_f32); - custom_kernel::Cast(dev_ctx, out_gcu_f32, phi::DataType::FLOAT16, out); + custom_kernel::Cast(dev_ctx, out_gcu_f32, DataType::FLOAT16, out); // convert cache for (size_t i = 0; i < cache.size(); ++i) { if (cache[i] != nullptr && cache[i]->initialized()) { TensorCopy(dev_ctx, *(cache_f32[i]), false, &cache_out_gcu[i]); custom_kernel::Cast( - dev_ctx, cache_out_gcu[i], phi::DataType::FLOAT16, cache[i]); + dev_ctx, cache_out_gcu[i], DataType::FLOAT16, cache[i]); } } dev_ctx.Wait(); diff --git a/backends/gcu/kernels/embedding_kernel.cc b/backends/gcu/kernels/embedding_kernel.cc index 8b33a2e8df4..0bd554243ca 100644 --- a/backends/gcu/kernels/embedding_kernel.cc +++ b/backends/gcu/kernels/embedding_kernel.cc @@ -20,31 +20,31 @@ template void FullKernel(const Context& dev_ctx, const phi::IntArray& shape, const phi::Scalar& val, - phi::DataType dtype, - phi::DenseTensor* out); + DataType dtype, + DenseTensor* out); template void EqualKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out); + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); template void WhereKernel(const Context& dev_ctx, - const phi::DenseTensor& condition, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out); + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); template void EmbeddingKernel(const Context& dev_ctx, - const phi::DenseTensor& inputx, - const phi::DenseTensor& weight, + const DenseTensor& inputx, + const DenseTensor& weight, int64_t padding_idx, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("embedding"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { - phi::DenseTensor x = MaybeCreateOrTrans64To32bits(dev_ctx, inputx); + DenseTensor x = MaybeCreateOrTrans64To32bits(dev_ctx, inputx); LAUNCH_TOPSATENOP( topsatenEmbedding, dev_ctx, *out, weight, x, -1, false, false); if (padding_idx == -1) { @@ -52,38 +52,37 @@ void EmbeddingKernel(const Context& dev_ctx, } PADDLE_ENFORCE_EQ( x.dtype(), - phi::DataType::INT32, + DataType::INT32, phi::errors::Unimplemented( "The input tensor's dtype should be INT32 but get %s.", - phi::DataTypeToString(x.dtype()).c_str())); + DataTypeToString(x.dtype()).c_str())); // padding_idx is not -1 // implement padding_idx by using where kernel // out = x_brd == padding_idx ? 0 : topsatenEmbedding(weight, x, -1) - phi::DenseTensor pad_tensor; - phi::DenseTensor zero_tensor; - phi::DenseTensor mask_tensor; + DenseTensor pad_tensor; + DenseTensor zero_tensor; + DenseTensor mask_tensor; phi::IntArray shape(common::vectorize(out->dims())); - phi::DenseTensorMeta meta_info = x.meta(); + DenseTensorMeta meta_info = x.meta(); meta_info.dims = out->dims(); - meta_info.strides = phi::DenseTensorMeta::calc_strides(meta_info.dims); + meta_info.strides = DenseTensorMeta::calc_strides(meta_info.dims); pad_tensor.set_meta(meta_info); zero_tensor.set_meta(meta_info); custom_kernel::FullKernel( dev_ctx, shape, phi::Scalar(padding_idx), x.dtype(), &pad_tensor); custom_kernel::FullKernel( dev_ctx, shape, phi::Scalar(0), x.dtype(), &zero_tensor); - meta_info.dtype = phi::DataType::BOOL; + meta_info.dtype = DataType::BOOL; mask_tensor.set_meta(meta_info); - phi::DenseTensor x_brd; + DenseTensor x_brd; // x firstly expand to the same shape with pad_tensor for broadcast by // adding a new dimension on last - phi::DenseTensor x_expand = x; + DenseTensor x_expand = x; auto x_expand_meta = x_expand.meta(); auto x_expand_shape = common::vectorize(x_expand_meta.dims); x_expand_shape.push_back(1); x_expand_meta.dims = common::make_ddim(x_expand_shape); - x_expand_meta.strides = - phi::DenseTensorMeta::calc_strides(x_expand_meta.dims); + x_expand_meta.strides = DenseTensorMeta::calc_strides(x_expand_meta.dims); x_expand.set_meta(x_expand_meta); x_brd.set_meta(pad_tensor.meta()); dev_ctx.Alloc(&x_brd, x_brd.dtype()); @@ -125,11 +124,11 @@ void EmbeddingKernel(const Context& dev_ctx, template void EmbeddingGradKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& weight, - const phi::DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& weight, + const DenseTensor& out_grad, int64_t padding_idx, - phi::DenseTensor* weight_grad) { + DenseTensor* weight_grad) { PADDLE_GCU_KERNEL_TRACE("embedding_grad"); dev_ctx.template Alloc(weight_grad); diff --git a/backends/gcu/kernels/expand_as_kernel.cc b/backends/gcu/kernels/expand_as_kernel.cc index 6d19fa6d7be..6a5e8abc02a 100644 --- a/backends/gcu/kernels/expand_as_kernel.cc +++ b/backends/gcu/kernels/expand_as_kernel.cc @@ -21,16 +21,16 @@ namespace custom_kernel { template extern void ExpandKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& shape, - phi::DenseTensor* out); + DenseTensor* out); template void ExpandAsKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& y, + const DenseTensor& x, + const paddle::optional& y, const std::vector& target_shape_64, - phi::DenseTensor* out) { + DenseTensor* out) { std::vector target_shape = std::vector(target_shape_64.begin(), target_shape_64.end()); PADDLE_GCU_KERNEL_TRACE("expand_as"); diff --git a/backends/gcu/kernels/expand_kernel.cc b/backends/gcu/kernels/expand_kernel.cc index 01c2b1dbdcf..a67a042eb36 100644 --- a/backends/gcu/kernels/expand_kernel.cc +++ b/backends/gcu/kernels/expand_kernel.cc @@ -19,14 +19,14 @@ namespace custom_kernel { template void ExpandKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& shape, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("expand"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - phi::DenseTensor as_strides_out; + DenseTensor as_strides_out; auto x_tensor = CreateTopsatenTensor(x); auto out_tensor = CreateTopsatenTensor(*out); auto view_out_tensor = CreateTopsatenTensor(as_strides_out); diff --git a/backends/gcu/kernels/eye_kernel.cc b/backends/gcu/kernels/eye_kernel.cc index 3fb6fd735e4..cd8bb7602e8 100644 --- a/backends/gcu/kernels/eye_kernel.cc +++ b/backends/gcu/kernels/eye_kernel.cc @@ -20,8 +20,8 @@ template void EyeKernel(const Context& dev_ctx, const phi::Scalar& rows, const phi::Scalar& columns, - phi::DataType dtype, - phi::DenseTensor* out) { + DataType dtype, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("eye"); dev_ctx.template Alloc(out); diff --git a/backends/gcu/kernels/fc_kernel.cc b/backends/gcu/kernels/fc_kernel.cc index a439f208a2d..3a430584620 100644 --- a/backends/gcu/kernels/fc_kernel.cc +++ b/backends/gcu/kernels/fc_kernel.cc @@ -17,7 +17,7 @@ namespace custom_kernel { namespace { -void AdjustStrides(phi::DenseTensor& tensor) { // NOLINT +void AdjustStrides(DenseTensor& tensor) { // NOLINT size_t rank = tensor.dims().size(); if (rank <= 1) { return; @@ -32,18 +32,18 @@ void AdjustStrides(phi::DenseTensor& tensor) { // NOLINT template void FCKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& w, - const paddle::optional& bias, + const DenseTensor& input, + const DenseTensor& w, + const paddle::optional& bias, const int in_num_col_dims, const std::string& activation_type, const bool padding_weights, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("fc"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); - const phi::DenseTensor x_matrix = + const DenseTensor x_matrix = input.dims().size() > 2 ? phi::ReshapeToMatrix(input, in_num_col_dims) : input; @@ -55,12 +55,12 @@ void FCKernel(const Context& dev_ctx, auto w_trans = w; AdjustStrides(w_trans); - phi::DenseTensor fc_bias; + DenseTensor fc_bias; if (bias) { fc_bias = bias.get(); } else { auto meta = - phi::DenseTensorMeta(input.dtype(), phi::make_ddim({w.dims().at(0)})); + DenseTensorMeta(input.dtype(), phi::make_ddim({w.dims().at(0)})); fc_bias = TensorZeros(dev_ctx, meta); } LAUNCH_TOPSATENOP( diff --git a/backends/gcu/kernels/flatten_kernel.cc b/backends/gcu/kernels/flatten_kernel.cc index ffecba92e14..390cc8b9a96 100644 --- a/backends/gcu/kernels/flatten_kernel.cc +++ b/backends/gcu/kernels/flatten_kernel.cc @@ -18,10 +18,10 @@ namespace custom_kernel { template void FlattenKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int start_axis UNUSED, int stop_axis UNUSED, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("flatten"); if (LaunchAOTKernel()) { VLOG(6) << "[HOST_KERNEL] Impl on host for flatten"; @@ -60,11 +60,11 @@ void FlattenKernel(const Context& dev_ctx, template void FlattenWithXShapeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int start_axis, int stop_axis, - phi::DenseTensor* out, - phi::DenseTensor* xshape) { + DenseTensor* out, + DenseTensor* xshape) { PADDLE_GCU_KERNEL_TRACE("flatten_with_xshape"); if (LaunchAOTKernel()) { custom_kernel::FlattenKernel( @@ -105,8 +105,8 @@ void FlattenWithXShapeKernel(const Context& dev_ctx, template void FlattenGradKernel(const Context& dev_ctx, - const phi::DenseTensor& xshape, - const phi::DenseTensor& out_grad, + const DenseTensor& xshape, + const DenseTensor& out_grad, DenseTensor* x_grad) { PADDLE_GCU_KERNEL_TRACE("flatten_grad"); dev_ctx.template Alloc(x_grad); @@ -114,7 +114,7 @@ void FlattenGradKernel(const Context& dev_ctx, if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); } else { // kernel impl base on JIT - phi::DenseTensor* tmp_tensor = nullptr; + DenseTensor* tmp_tensor = nullptr; TensorNameMap input_names; input_names["XShape"] = {"xshape"}; diff --git a/backends/gcu/kernels/flip_kernel.cc b/backends/gcu/kernels/flip_kernel.cc index 14a3bf0b9fe..ec0a3fbc1c7 100644 --- a/backends/gcu/kernels/flip_kernel.cc +++ b/backends/gcu/kernels/flip_kernel.cc @@ -19,9 +19,9 @@ namespace custom_kernel { template void FlipKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axis, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("flip"); dev_ctx.template Alloc(out); diff --git a/backends/gcu/kernels/full_kernel.cc b/backends/gcu/kernels/full_kernel.cc index 8a2b1c8040a..edacf71b9d4 100644 --- a/backends/gcu/kernels/full_kernel.cc +++ b/backends/gcu/kernels/full_kernel.cc @@ -21,8 +21,8 @@ template void FullKernel(const Context& dev_ctx, const phi::IntArray& shape, const phi::Scalar& val, - phi::DataType dtype, - phi::DenseTensor* out) { + DataType dtype, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("full"); if (LaunchAOTKernel()) { auto shape_vec = shape.GetData(); @@ -32,13 +32,11 @@ void FullKernel(const Context& dev_ctx, auto out_dim = phi::make_ddim(shape_vec); out->ResizeAndAllocate(out_dim); dev_ctx.template Alloc(out); - phi::DenseTensor output(*out); - if (out->dtype() == phi::DataType::BOOL || - out->dtype() == phi::DataType::INT32 || - out->dtype() == phi::DataType::INT64 || - out->dtype() == phi::DataType::FLOAT64) { + DenseTensor output(*out); + if (out->dtype() == DataType::BOOL || out->dtype() == DataType::INT32 || + out->dtype() == DataType::INT64 || out->dtype() == DataType::FLOAT64) { auto meta = out->meta(); - meta.dtype = phi::DataType::FLOAT32; + meta.dtype = DataType::FLOAT32; output.set_meta(meta); dev_ctx.template Alloc(&output); } @@ -48,10 +46,8 @@ void FullKernel(const Context& dev_ctx, } // topsatenFull not support bool or int32 yet. LAUNCH_TOPSATENOP(topsatenFull, dev_ctx, output, shape_vec, val); - if (out->dtype() == phi::DataType::BOOL || - out->dtype() == phi::DataType::INT32 || - out->dtype() == phi::DataType::INT64 || - out->dtype() == phi::DataType::FLOAT64) { + if (out->dtype() == DataType::BOOL || out->dtype() == DataType::INT32 || + out->dtype() == DataType::INT64 || out->dtype() == DataType::FLOAT64) { custom_kernel::Cast(dev_ctx, output, out->dtype(), out); } @@ -67,10 +63,10 @@ void FullKernel(const Context& dev_ctx, template void FullLikeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& val, - phi::DataType dtype, - phi::DenseTensor* out) { + DataType dtype, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("full_like"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -111,13 +107,13 @@ void FullLikeKernel(const Context& dev_ctx, template void FullBatchSizeLikeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& shape, const phi::Scalar& val, - phi::DataType dtype, + DataType dtype, int x_batch_size_dim, int out_batch_size_dim, - phi::DenseTensor* out) { + DenseTensor* out) { if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -134,10 +130,10 @@ void FullBatchSizeLikeKernel(const Context& dev_ctx, template void FullWithTensorKernel(const Context& dev_ctx, - const phi::DenseTensor& value, + const DenseTensor& value, const phi::IntArray& shape, - phi::DataType dtype, - phi::DenseTensor* out) { + DataType dtype, + DenseTensor* out) { if (LaunchAOTKernel()) { out->Resize(common::make_ddim(shape.GetData())); custom_kernel::FullKernel( diff --git a/backends/gcu/kernels/funcs/common_ops.cc b/backends/gcu/kernels/funcs/common_ops.cc index 84b1851d29c..50e8b861273 100644 --- a/backends/gcu/kernels/funcs/common_ops.cc +++ b/backends/gcu/kernels/funcs/common_ops.cc @@ -16,9 +16,9 @@ namespace custom_kernel { namespace { -std::unordered_map kDataTypeTrans64To32 = { - {phi::DataType::INT64, phi::DataType::INT32}, - {phi::DataType::FLOAT64, phi::DataType::FLOAT32}, +std::unordered_map kDataTypeTrans64To32 = { + {DataType::INT64, DataType::INT32}, + {DataType::FLOAT64, DataType::FLOAT32}, }; std::vector InferBroadcastDimMap( @@ -56,23 +56,22 @@ std::vector InferBroadcastDimMap( return dim_maps; } -inline std::string GetDataTypePairKey(const phi::DataType src_type, - const phi::DataType dst_type) { - return phi::DataTypeToString(src_type) + "_to_" + - phi::DataTypeToString(dst_type); +inline std::string GetDataTypePairKey(const DataType src_type, + const DataType dst_type) { + return DataTypeToString(src_type) + "_to_" + DataTypeToString(dst_type); } } // namespace -phi::DenseTensor MaybeCreateOrTrans( +DenseTensor MaybeCreateOrTrans( const phi::CustomContext& dev_ctx, - const phi::DenseTensor& src, - const std::unordered_map& tans_map, + const DenseTensor& src, + const std::unordered_map& tans_map, bool need_cast) { auto src_dtype = src.dtype(); if (tans_map.count(src_dtype) == 0) { return src; } - phi::DenseTensor dst; + DenseTensor dst; if (need_cast) { custom_kernel::Cast(dev_ctx, src, tans_map.at(src_dtype), &dst); } else { @@ -84,24 +83,24 @@ phi::DenseTensor MaybeCreateOrTrans( return dst; } -phi::DenseTensor MaybeCreateOrTrans64To32bits(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& src, - bool need_cast) { +DenseTensor MaybeCreateOrTrans64To32bits(const phi::CustomContext& dev_ctx, + const DenseTensor& src, + bool need_cast) { return MaybeCreateOrTrans(dev_ctx, src, kDataTypeTrans64To32, need_cast); } -phi::DenseTensor MaybeCreateOrTransFp16ToFp32(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& src, - bool need_cast) { - static const std::unordered_map kFp16ToFp32 = { - {phi::DataType::FLOAT16, phi::DataType::FLOAT32}, +DenseTensor MaybeCreateOrTransFp16ToFp32(const phi::CustomContext& dev_ctx, + const DenseTensor& src, + bool need_cast) { + static const std::unordered_map kFp16ToFp32 = { + {DataType::FLOAT16, DataType::FLOAT32}, }; return MaybeCreateOrTrans(dev_ctx, src, kFp16ToFp32, need_cast); } void MaybeTransResult(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& result, - phi::DenseTensor* dst) { + const DenseTensor& result, + DenseTensor* dst) { auto dst_dtype = dst->dtype(); if (dst_dtype == result.dtype()) { return; @@ -110,14 +109,14 @@ void MaybeTransResult(const phi::CustomContext& dev_ctx, } void Broadcast(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& src, - phi::DenseTensor* dst) { + const DenseTensor& src, + DenseTensor* dst) { if (src.numel() <= 0) { VLOG(1) << "Common op Broadcast, src numel:" << src.numel() << ", will do nothing."; return; } - phi::DenseTensor as_strides_out; + DenseTensor as_strides_out; auto src_tensor = CreateTopsatenTensor(src); auto dst_tensor = CreateTopsatenTensor(*dst); auto view_out_tensor = CreateTopsatenTensor(as_strides_out); @@ -138,127 +137,127 @@ void Broadcast(const phi::CustomContext& dev_ctx, topsatenCopy, dev_ctx, abstract_info, dst_tensor, view_out_tensor, false); } -phi::DenseTensor Broadcast(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& src, - const std::vector& output_shapes) { - auto meta = phi::DenseTensorMeta(src.dtype(), phi::make_ddim(output_shapes)); - phi::DenseTensor dst = TensorEmpty(dev_ctx, meta); +DenseTensor Broadcast(const phi::CustomContext& dev_ctx, + const DenseTensor& src, + const std::vector& output_shapes) { + auto meta = DenseTensorMeta(src.dtype(), phi::make_ddim(output_shapes)); + DenseTensor dst = TensorEmpty(dev_ctx, meta); Broadcast(dev_ctx, src, &dst); return dst; } namespace { -bool IsCastSupport(const phi::DataType src_type, const phi::DataType dst_type) { +bool IsCastSupport(const DataType src_type, const DataType dst_type) { static const std::unordered_set kSupportedCast = { // ******************* bool convert ***************** // // bool <--> int32 - GetDataTypePairKey(phi::DataType::BOOL, phi::DataType::INT32), - GetDataTypePairKey(phi::DataType::INT32, phi::DataType::BOOL), + GetDataTypePairKey(DataType::BOOL, DataType::INT32), + GetDataTypePairKey(DataType::INT32, DataType::BOOL), // bool <--> float16 - GetDataTypePairKey(phi::DataType::BOOL, phi::DataType::FLOAT16), - GetDataTypePairKey(phi::DataType::FLOAT16, phi::DataType::BOOL), + GetDataTypePairKey(DataType::BOOL, DataType::FLOAT16), + GetDataTypePairKey(DataType::FLOAT16, DataType::BOOL), // bool <--> float32 - GetDataTypePairKey(phi::DataType::BOOL, phi::DataType::FLOAT32), - GetDataTypePairKey(phi::DataType::FLOAT32, phi::DataType::BOOL), + GetDataTypePairKey(DataType::BOOL, DataType::FLOAT32), + GetDataTypePairKey(DataType::FLOAT32, DataType::BOOL), // ******************** 64 bits ******************** // // int32 <--> int64 - GetDataTypePairKey(phi::DataType::INT32, phi::DataType::INT64), - GetDataTypePairKey(phi::DataType::INT64, phi::DataType::INT32), + GetDataTypePairKey(DataType::INT32, DataType::INT64), + GetDataTypePairKey(DataType::INT64, DataType::INT32), // ***************** int to float16 **************** // // int32 <--> float16 - GetDataTypePairKey(phi::DataType::INT32, phi::DataType::FLOAT16), - GetDataTypePairKey(phi::DataType::FLOAT16, phi::DataType::INT32), + GetDataTypePairKey(DataType::INT32, DataType::FLOAT16), + GetDataTypePairKey(DataType::FLOAT16, DataType::INT32), // ***************** int to float32 *************** // // int8 <--> float32 - GetDataTypePairKey(phi::DataType::INT8, phi::DataType::FLOAT32), - GetDataTypePairKey(phi::DataType::FLOAT32, phi::DataType::INT8), + GetDataTypePairKey(DataType::INT8, DataType::FLOAT32), + GetDataTypePairKey(DataType::FLOAT32, DataType::INT8), // int16 <--> float32 - GetDataTypePairKey(phi::DataType::INT16, phi::DataType::FLOAT32), - GetDataTypePairKey(phi::DataType::FLOAT32, phi::DataType::INT16), + GetDataTypePairKey(DataType::INT16, DataType::FLOAT32), + GetDataTypePairKey(DataType::FLOAT32, DataType::INT16), // int32 <--> float32 - GetDataTypePairKey(phi::DataType::INT32, phi::DataType::FLOAT32), - GetDataTypePairKey(phi::DataType::FLOAT32, phi::DataType::INT32), + GetDataTypePairKey(DataType::INT32, DataType::FLOAT32), + GetDataTypePairKey(DataType::FLOAT32, DataType::INT32), // ***************** float convert ***************** // // float16 <--> float32 - GetDataTypePairKey(phi::DataType::FLOAT16, phi::DataType::FLOAT32), - GetDataTypePairKey(phi::DataType::FLOAT32, phi::DataType::FLOAT16), + GetDataTypePairKey(DataType::FLOAT16, DataType::FLOAT32), + GetDataTypePairKey(DataType::FLOAT32, DataType::FLOAT16), // bfloat16 <--> float32 - GetDataTypePairKey(phi::DataType::BFLOAT16, phi::DataType::FLOAT32), - GetDataTypePairKey(phi::DataType::FLOAT32, phi::DataType::BFLOAT16), + GetDataTypePairKey(DataType::BFLOAT16, DataType::FLOAT32), + GetDataTypePairKey(DataType::FLOAT32, DataType::BFLOAT16), // ***************** int convert ****************** // // int8 <--> int16 - GetDataTypePairKey(phi::DataType::INT8, phi::DataType::INT16), - GetDataTypePairKey(phi::DataType::INT16, phi::DataType::INT8), + GetDataTypePairKey(DataType::INT8, DataType::INT16), + GetDataTypePairKey(DataType::INT16, DataType::INT8), // int8 <--> int32 - GetDataTypePairKey(phi::DataType::INT8, phi::DataType::INT32), - GetDataTypePairKey(phi::DataType::INT32, phi::DataType::INT8), + GetDataTypePairKey(DataType::INT8, DataType::INT32), + GetDataTypePairKey(DataType::INT32, DataType::INT8), // int16 <--> int32 - GetDataTypePairKey(phi::DataType::INT16, phi::DataType::INT32), - GetDataTypePairKey(phi::DataType::INT32, phi::DataType::INT16), + GetDataTypePairKey(DataType::INT16, DataType::INT32), + GetDataTypePairKey(DataType::INT32, DataType::INT16), // uint8 <--> uint16 - GetDataTypePairKey(phi::DataType::UINT8, phi::DataType::UINT16), - GetDataTypePairKey(phi::DataType::UINT16, phi::DataType::UINT8), + GetDataTypePairKey(DataType::UINT8, DataType::UINT16), + GetDataTypePairKey(DataType::UINT16, DataType::UINT8), // uint8 <--> uint32 - GetDataTypePairKey(phi::DataType::UINT8, phi::DataType::UINT32), - GetDataTypePairKey(phi::DataType::UINT32, phi::DataType::UINT8), + GetDataTypePairKey(DataType::UINT8, DataType::UINT32), + GetDataTypePairKey(DataType::UINT32, DataType::UINT8), // uint16 <--> uint16 - GetDataTypePairKey(phi::DataType::UINT16, phi::DataType::UINT32), - GetDataTypePairKey(phi::DataType::UINT32, phi::DataType::UINT16), + GetDataTypePairKey(DataType::UINT16, DataType::UINT32), + GetDataTypePairKey(DataType::UINT32, DataType::UINT16), }; return (kSupportedCast.count(GetDataTypePairKey(src_type, dst_type)) > 0); } -phi::DataType IntermediateDtypeToCast(const phi::DataType src_type, - const phi::DataType dst_type) { - static const std::unordered_map +DataType IntermediateDtypeToCast(const DataType src_type, + const DataType dst_type) { + static const std::unordered_map kSupportedIndirectCast = { // float32 <--> int64 - {GetDataTypePairKey(phi::DataType::FLOAT32, phi::DataType::INT64), - phi::DataType::INT32}, - {GetDataTypePairKey(phi::DataType::INT64, phi::DataType::FLOAT32), - phi::DataType::INT32}, + {GetDataTypePairKey(DataType::FLOAT32, DataType::INT64), + DataType::INT32}, + {GetDataTypePairKey(DataType::INT64, DataType::FLOAT32), + DataType::INT32}, // float16 <--> int64 - {GetDataTypePairKey(phi::DataType::FLOAT16, phi::DataType::INT64), - phi::DataType::INT32}, - {GetDataTypePairKey(phi::DataType::INT64, phi::DataType::FLOAT16), - phi::DataType::INT32}, + {GetDataTypePairKey(DataType::FLOAT16, DataType::INT64), + DataType::INT32}, + {GetDataTypePairKey(DataType::INT64, DataType::FLOAT16), + DataType::INT32}, // bool <--> int64 - {GetDataTypePairKey(phi::DataType::BOOL, phi::DataType::INT64), - phi::DataType::INT32}, - {GetDataTypePairKey(phi::DataType::INT64, phi::DataType::BOOL), - phi::DataType::INT32}, + {GetDataTypePairKey(DataType::BOOL, DataType::INT64), + DataType::INT32}, + {GetDataTypePairKey(DataType::INT64, DataType::BOOL), + DataType::INT32}, // int8 <--> int64 - {GetDataTypePairKey(phi::DataType::INT8, phi::DataType::INT64), - phi::DataType::INT32}, - {GetDataTypePairKey(phi::DataType::INT64, phi::DataType::INT8), - phi::DataType::INT32}, + {GetDataTypePairKey(DataType::INT8, DataType::INT64), + DataType::INT32}, + {GetDataTypePairKey(DataType::INT64, DataType::INT8), + DataType::INT32}, // int16 <--> int64 - {GetDataTypePairKey(phi::DataType::INT16, phi::DataType::INT64), - phi::DataType::INT32}, - {GetDataTypePairKey(phi::DataType::INT64, phi::DataType::INT16), - phi::DataType::INT32}, + {GetDataTypePairKey(DataType::INT16, DataType::INT64), + DataType::INT32}, + {GetDataTypePairKey(DataType::INT64, DataType::INT16), + DataType::INT32}, }; auto key = GetDataTypePairKey(src_type, dst_type); return ((kSupportedIndirectCast.count(key) > 0) ? kSupportedIndirectCast.at(key) - : phi::DataType::UNDEFINED); + : DataType::UNDEFINED); } void CastImpl(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const phi::DataType& dtype, - phi::DenseTensor* out) { + const DenseTensor& x, + const DataType& dtype, + DenseTensor* out) { auto meta = x.meta(); meta.dtype = dtype; out->set_meta(meta); @@ -286,34 +285,33 @@ void CastImpl(const phi::CustomContext& dev_ctx, topsaten_format); } -#define FOR_EACH_DATA_TYPE(_) \ - _(bool, phi::DataType::BOOL) \ - _(int8_t, phi::DataType::INT8) \ - _(uint8_t, phi::DataType::UINT8) \ - _(int16_t, phi::DataType::INT16) \ - _(uint16_t, phi::DataType::UINT16) \ - _(int32_t, phi::DataType::INT32) \ - _(uint32_t, phi::DataType::UINT32) \ - _(int64_t, phi::DataType::INT64) \ - _(uint64_t, phi::DataType::UINT64) \ - _(phi::bfloat16, phi::DataType::BFLOAT16) \ - _(phi::float16, phi::DataType::FLOAT16) \ - _(float, phi::DataType::FLOAT32) \ - _(double, phi::DataType::FLOAT64) \ - _(phi::complex64, phi::DataType::COMPLEX64) \ - _(phi::complex128, phi::DataType::COMPLEX128) \ - _(phi::pstring, phi::DataType::PSTRING) - -#define CALL_CPU_CAST_KERNEL(cpp_type, data_type) \ - case data_type: \ - phi::CastKernel( \ - dev_ctx_cpu, x_cpu, dtype, out_cpu); \ +#define FOR_EACH_DATA_TYPE(_) \ + _(bool, DataType::BOOL) \ + _(int8_t, DataType::INT8) \ + _(uint8_t, DataType::UINT8) \ + _(int16_t, DataType::INT16) \ + _(uint16_t, DataType::UINT16) \ + _(int32_t, DataType::INT32) \ + _(uint32_t, DataType::UINT32) \ + _(int64_t, DataType::INT64) \ + _(uint64_t, DataType::UINT64) \ + _(phi::bfloat16, DataType::BFLOAT16) \ + _(phi::float16, DataType::FLOAT16) \ + _(float, DataType::FLOAT32) \ + _(double, DataType::FLOAT64) \ + _(phi::complex64, DataType::COMPLEX64) \ + _(phi::complex128, DataType::COMPLEX128) \ + _(phi::pstring, DataType::PSTRING) + +#define CALL_CPU_CAST_KERNEL(cpp_type, data_type) \ + case data_type: \ + phi::CastKernel(dev_ctx_cpu, x_cpu, dtype, out_cpu); \ break; -void CallCPUCastImpl(const phi::CPUContext& dev_ctx_cpu, - const phi::DenseTensor& x_cpu, - const phi::DataType& dtype, - phi::DenseTensor* out_cpu) { +void CallCPUCastImpl(const CPUContext& dev_ctx_cpu, + const DenseTensor& x_cpu, + const DataType& dtype, + DenseTensor* out_cpu) { switch (x_cpu.dtype()) { FOR_EACH_DATA_TYPE(CALL_CPU_CAST_KERNEL) } } @@ -321,9 +319,9 @@ void CallCPUCastImpl(const phi::CPUContext& dev_ctx_cpu, #undef FOR_EACH_DATA_TYPE void CastCPUImpl(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const phi::DataType& dtype, - phi::DenseTensor* out) { + const DenseTensor& x, + const DataType& dtype, + DenseTensor* out) { auto meta = x.meta(); meta.dtype = dtype; out->set_meta(meta); @@ -335,16 +333,16 @@ void CastCPUImpl(const phi::CustomContext& dev_ctx, // 1. Copy x to CPU ContextPinnedGuard ctx_pinned_guard(dev_ctx); - phi::DenseTensor x_cpu; + DenseTensor x_cpu; x_cpu.set_meta(x.meta()); - TensorCopy(dev_ctx, x, false, &x_cpu, phi::CPUPlace()); + TensorCopy(dev_ctx, x, false, &x_cpu, CPUPlace()); dev_ctx.Wait(); // 2. Call the CPU implementation - phi::CPUContext dev_ctx_cpu; + CPUContext dev_ctx_cpu; dev_ctx_cpu.SetAllocator(&(dev_ctx.GetHostAllocator())); dev_ctx_cpu.SetHostAllocator(&(dev_ctx.GetHostAllocator())); - phi::DenseTensor out_cpu; + DenseTensor out_cpu; out_cpu.set_meta(meta); CallCPUCastImpl(dev_ctx_cpu, x_cpu, dtype, &out_cpu); dev_ctx.Wait(); @@ -356,9 +354,9 @@ void CastCPUImpl(const phi::CustomContext& dev_ctx, } // namespace void Cast(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const phi::DataType& dtype, - phi::DenseTensor* out) { + const DenseTensor& x, + const DataType& dtype, + DenseTensor* out) { std::string key = "convert_" + GetDataTypePairKey(x.dtype(), dtype); PADDLE_GCU_KERNEL_TRACE(key); auto meta = x.meta(); @@ -374,42 +372,40 @@ void Cast(const phi::CustomContext& dev_ctx, return; } auto media_type = IntermediateDtypeToCast(x.dtype(), dtype); - if (media_type != phi::DataType::UNDEFINED) { - VLOG(3) << "Cast intermediately, convert " - << phi::DataTypeToString(x.dtype()) << " to " - << phi::DataTypeToString(media_type) << " to " - << phi::DataTypeToString(dtype); - phi::DenseTensor tmp; + if (media_type != DataType::UNDEFINED) { + VLOG(3) << "Cast intermediately, convert " << DataTypeToString(x.dtype()) + << " to " << DataTypeToString(media_type) << " to " + << DataTypeToString(dtype); + DenseTensor tmp; CastImpl(dev_ctx, x, media_type, &tmp); CastImpl(dev_ctx, tmp, dtype, out); } else { VLOG(3) << "[CPU_KERNEL] Use CastCPUImpl, convert " - << phi::DataTypeToString(x.dtype()) << " to " - << phi::DataTypeToString(dtype); + << DataTypeToString(x.dtype()) << " to " << DataTypeToString(dtype); CastCPUImpl(dev_ctx, x, dtype, out); } } -phi::DenseTensor Cast(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const phi::DataType& dtype) { - phi::DenseTensor out; +DenseTensor Cast(const phi::CustomContext& dev_ctx, + const DenseTensor& x, + const DataType& dtype) { + DenseTensor out; Cast(dev_ctx, x, dtype, &out); return out; } -phi::DenseTensor CastOrCopyToPinnedMemory(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const phi::DataType& dtype) { +DenseTensor CastOrCopyToPinnedMemory(const phi::CustomContext& dev_ctx, + const DenseTensor& x, + const DataType& dtype) { std::string key = "pinned_convert_" + GetDataTypePairKey(x.dtype(), dtype); PADDLE_GCU_KERNEL_TRACE(key); ContextPinnedGuard ctx_pinned_guard(dev_ctx); - // phi::DenseTensor cast_out = x; + // DenseTensor cast_out = x; // if (x.dtype() != dtype) { // cast_out = custom_kernel::Cast(dev_ctx, x, dtype); // } - // phi::DenseTensor out; + // DenseTensor out; // out.set_meta(cast_out.meta()); // dev_ctx.HostAlloc(&out, out.dtype()); @@ -426,7 +422,7 @@ phi::DenseTensor CastOrCopyToPinnedMemory(const phi::CustomContext& dev_ctx, return x; } - phi::DenseTensor out; + DenseTensor out; auto meta = x.meta(); meta.dtype = dtype; out.set_meta(meta); @@ -435,7 +431,7 @@ phi::DenseTensor CastOrCopyToPinnedMemory(const phi::CustomContext& dev_ctx, // if (x.dtype() == dtype) { // VLOG(3) << "CastOrCopyToPinnedMemory, will copy D2D, dtype:" - // << phi::DataTypeToString(dtype); + // << DataTypeToString(dtype); // C_Device_st device; // device.id = x.place().GetDeviceId(); // C_Stream stream = static_cast(dev_ctx.stream()); @@ -465,56 +461,56 @@ phi::DenseTensor CastOrCopyToPinnedMemory(const phi::CustomContext& dev_ctx, return out; } -phi::DenseTensor ReshapeWithoutCopy(const phi::DenseTensor& src, - const std::vector& out_shapes) { +DenseTensor ReshapeWithoutCopy(const DenseTensor& src, + const std::vector& out_shapes) { PADDLE_ENFORCE_EQ( src.numel(), phi::product(phi::make_ddim(out_shapes)), phi::errors::InvalidArgument( "The memory size before and after reshape should be the same.")); - phi::DenseTensor dst(src); + DenseTensor dst(src); dst.Resize(phi::make_ddim(out_shapes)); return dst; } -phi::DenseTensor TensorEmpty(const phi::CustomContext& dev_ctx, - const phi::DenseTensorMeta& meta) { - phi::DenseTensor output_tensor; +DenseTensor TensorEmpty(const phi::CustomContext& dev_ctx, + const DenseTensorMeta& meta) { + DenseTensor output_tensor; output_tensor.set_meta(meta); dev_ctx.Alloc(&output_tensor, output_tensor.dtype()); return output_tensor; } -phi::DenseTensor TensorOnes(const phi::CustomContext& dev_ctx, - const phi::DenseTensorMeta& meta) { - phi::DenseTensor out = TensorEmpty(dev_ctx, meta); +DenseTensor TensorOnes(const phi::CustomContext& dev_ctx, + const DenseTensorMeta& meta) { + DenseTensor out = TensorEmpty(dev_ctx, meta); auto shape = phi::vectorize(meta.dims); LAUNCH_TOPSATENOP(topsatenOnes, dev_ctx, out, shape, meta.dtype); return out; } -phi::DenseTensor TensorZeros(const phi::CustomContext& dev_ctx, - const phi::DenseTensorMeta& meta) { - phi::DenseTensor out = TensorEmpty(dev_ctx, meta); +DenseTensor TensorZeros(const phi::CustomContext& dev_ctx, + const DenseTensorMeta& meta) { + DenseTensor out = TensorEmpty(dev_ctx, meta); auto shape = phi::vectorize(meta.dims); LAUNCH_TOPSATENOP(topsatenZeros, dev_ctx, out, shape, meta.dtype); return out; } -phi::DenseTensor Add(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensorMeta& out_meta) { - phi::DenseTensor out = TensorEmpty(dev_ctx, out_meta); +DenseTensor Add(const phi::CustomContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensorMeta& out_meta) { + DenseTensor out = TensorEmpty(dev_ctx, out_meta); phi::Scalar scalar(1.0f); LAUNCH_TOPSATENOP(topsatenAdd, dev_ctx, out, x, y, scalar); return out; } -phi::DenseTensor Add(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y) { - phi::DenseTensor out; +DenseTensor Add(const phi::CustomContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor out; phi::MetaTensor meta_out(out); phi::ElementwiseInferMeta(x, y, &meta_out); out.Resize(meta_out.dims()); @@ -524,20 +520,20 @@ phi::DenseTensor Add(const phi::CustomContext& dev_ctx, return out; } -phi::DenseTensor Subtract(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensorMeta& out_meta) { - phi::DenseTensor out = TensorEmpty(dev_ctx, out_meta); +DenseTensor Subtract(const phi::CustomContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensorMeta& out_meta) { + DenseTensor out = TensorEmpty(dev_ctx, out_meta); phi::Scalar scalar(1.0f); LAUNCH_TOPSATENOP(topsatenSub, dev_ctx, out, x, y, scalar); return out; } -phi::DenseTensor Subtract(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y) { - phi::DenseTensor out; +DenseTensor Subtract(const phi::CustomContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor out; phi::MetaTensor meta_out(out); phi::ElementwiseInferMeta(x, y, &meta_out); out.Resize(meta_out.dims()); @@ -548,10 +544,10 @@ phi::DenseTensor Subtract(const phi::CustomContext& dev_ctx, } void SliceBase(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axes, const std::vector& starts, - phi::DenseTensor* out) { + DenseTensor* out) { std::vector sizes(phi::vectorize(out->dims())); std::vector strides(phi::vectorize(x.strides())); int64_t offset = 0; @@ -563,7 +559,7 @@ void SliceBase(const phi::CustomContext& dev_ctx, dev_ctx.Alloc(out, out->dtype()); } - phi::DenseTensor as_strides_out; + DenseTensor as_strides_out; auto x_tensor = CreateTopsatenTensor(x); auto out_tensor = CreateTopsatenTensor(*out); auto view_out_tensor = CreateTopsatenTensor(as_strides_out); diff --git a/backends/gcu/kernels/funcs/common_ops.h b/backends/gcu/kernels/funcs/common_ops.h index 4c282e42989..065e0c1a3a3 100644 --- a/backends/gcu/kernels/funcs/common_ops.h +++ b/backends/gcu/kernels/funcs/common_ops.h @@ -19,80 +19,80 @@ namespace custom_kernel { -phi::DenseTensor MaybeCreateOrTrans( +DenseTensor MaybeCreateOrTrans( const phi::CustomContext& dev_ctx, - const phi::DenseTensor& src, - const std::unordered_map& tans_map, + const DenseTensor& src, + const std::unordered_map& tans_map, bool need_cast = true); -phi::DenseTensor MaybeCreateOrTrans64To32bits(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& src, - bool need_cast = true); +DenseTensor MaybeCreateOrTrans64To32bits(const phi::CustomContext& dev_ctx, + const DenseTensor& src, + bool need_cast = true); -phi::DenseTensor MaybeCreateOrTransFp16ToFp32(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& src, - bool need_cast = true); +DenseTensor MaybeCreateOrTransFp16ToFp32(const phi::CustomContext& dev_ctx, + const DenseTensor& src, + bool need_cast = true); void MaybeTransResult(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& result, - phi::DenseTensor* dst); + const DenseTensor& result, + DenseTensor* dst); void Broadcast(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& src, - phi::DenseTensor* dst); + const DenseTensor& src, + DenseTensor* dst); -phi::DenseTensor Broadcast(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& src, - const std::vector& output_shapes); +DenseTensor Broadcast(const phi::CustomContext& dev_ctx, + const DenseTensor& src, + const std::vector& output_shapes); void Cast(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const phi::DataType& dtype, - phi::DenseTensor* out); + const DenseTensor& x, + const DataType& dtype, + DenseTensor* out); -phi::DenseTensor Cast(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const phi::DataType& dtype); +DenseTensor Cast(const phi::CustomContext& dev_ctx, + const DenseTensor& x, + const DataType& dtype); -phi::DenseTensor CastOrCopyToPinnedMemory(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const phi::DataType& dtype); +DenseTensor CastOrCopyToPinnedMemory(const phi::CustomContext& dev_ctx, + const DenseTensor& x, + const DataType& dtype); -phi::DenseTensor ReshapeWithoutCopy(const phi::DenseTensor& src, - const std::vector& out_shapes); +DenseTensor ReshapeWithoutCopy(const DenseTensor& src, + const std::vector& out_shapes); -phi::DenseTensor TensorEmpty(const phi::CustomContext& dev_ctx, - const phi::DenseTensorMeta& meta); +DenseTensor TensorEmpty(const phi::CustomContext& dev_ctx, + const DenseTensorMeta& meta); -phi::DenseTensor TensorOnes(const phi::CustomContext& dev_ctx, - const phi::DenseTensorMeta& meta); +DenseTensor TensorOnes(const phi::CustomContext& dev_ctx, + const DenseTensorMeta& meta); -phi::DenseTensor TensorZeros(const phi::CustomContext& dev_ctx, - const phi::DenseTensorMeta& meta); +DenseTensor TensorZeros(const phi::CustomContext& dev_ctx, + const DenseTensorMeta& meta); // meta reuse ops -phi::DenseTensor Add(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensorMeta& out_meta); +DenseTensor Add(const phi::CustomContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensorMeta& out_meta); -phi::DenseTensor Add(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y); +DenseTensor Add(const phi::CustomContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y); -phi::DenseTensor Subtract(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensorMeta& out_meta); +DenseTensor Subtract(const phi::CustomContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensorMeta& out_meta); -phi::DenseTensor Subtract(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y); +DenseTensor Subtract(const phi::CustomContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y); void SliceBase(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axes, const std::vector& starts, - phi::DenseTensor* out); + DenseTensor* out); } // namespace custom_kernel diff --git a/backends/gcu/kernels/funcs/gcu_kernel_funcs.h b/backends/gcu/kernels/funcs/gcu_kernel_funcs.h index 43771743d83..3497d3c8290 100644 --- a/backends/gcu/kernels/funcs/gcu_kernel_funcs.h +++ b/backends/gcu/kernels/funcs/gcu_kernel_funcs.h @@ -29,7 +29,7 @@ __FUNCTION__)) namespace custom_kernel { -using DenseTensor = phi::DenseTensor; +using DenseTensor = DenseTensor; using TensorNameMap = std::map>; using TensorValueMap = std::map>; diff --git a/backends/gcu/kernels/funcs/gcu_layout_funcs.cc b/backends/gcu/kernels/funcs/gcu_layout_funcs.cc index 6f6a6c1ceaf..a1b84572db4 100644 --- a/backends/gcu/kernels/funcs/gcu_layout_funcs.cc +++ b/backends/gcu/kernels/funcs/gcu_layout_funcs.cc @@ -60,7 +60,7 @@ bool EnableTransposeOptimize() { return false; } -void SetLayout(phi::DenseTensor& tensor, // NOLINT +void SetLayout(DenseTensor& tensor, // NOLINT const common::DataLayout& layout) { auto meta = tensor.meta(); meta.layout = layout; @@ -68,23 +68,23 @@ void SetLayout(phi::DenseTensor& tensor, // NOLINT } void Transpose(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axis, - phi::DenseTensor* out) { + DenseTensor* out) { auto x_perm = x; PermutedShapeAndStrides(x_perm, axis); LAUNCH_TOPSATENOP(topsatenCopy, dev_ctx, *out, x_perm, false); } -phi::DenseTensor Transpose(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const std::vector& axis) { +DenseTensor Transpose(const phi::CustomContext& dev_ctx, + const DenseTensor& x, + const std::vector& axis) { // infer dst shape std::vector src_dims = phi::vectorize(x.dims()); std::vector dst_dims = ReorderVector(src_dims, axis); - phi::DenseTensor dst_tensor; - phi::DenseTensorMeta meta(x.dtype(), phi::make_ddim(dst_dims)); + DenseTensor dst_tensor; + DenseTensorMeta meta(x.dtype(), phi::make_ddim(dst_dims)); dst_tensor.set_meta(meta); dev_ctx.Alloc(&dst_tensor, dst_tensor.dtype()); Transpose(dev_ctx, x, axis, &dst_tensor); @@ -111,22 +111,21 @@ phi::DenseTensor Transpose(const phi::CustomContext& dev_ctx, // 5. Only the layout of PdCustomNHWC and AtenNHWC is expressed as kNHWC, Note // that the layout of PdOriginNHWC uses the default value kNCHW. // -bool DataPdCustomNHWC(const phi::DenseTensor& tensor) { +bool DataPdCustomNHWC(const DenseTensor& tensor) { return (EnableTransposeOptimize() && tensor.layout() == common::DataLayout::kNHWC); } -bool DataPdCustomNHWC(const std::vector& tensors) { +bool DataPdCustomNHWC(const std::vector& tensors) { return (EnableTransposeOptimize() && - std::any_of(tensors.begin(), - tensors.end(), - [](const phi::DenseTensor& tensor) { - return tensor.layout() == common::DataLayout::kNHWC; - })); + std::any_of( + tensors.begin(), tensors.end(), [](const DenseTensor& tensor) { + return tensor.layout() == common::DataLayout::kNHWC; + })); } // //////////////// Permuted funcs //////////////// -void PermutedShapeWithcontiguousStrides(phi::DenseTensor& tensor, // NOLINT +void PermutedShapeWithcontiguousStrides(DenseTensor& tensor, // NOLINT const std::vector& permutation, const common::DataLayout& layout) { auto meta = tensor.meta(); @@ -140,12 +139,12 @@ void PermutedShapeWithcontiguousStrides(phi::DenseTensor& tensor, // NOLINT tensor.set_meta(meta); } -void RecoverPdCustomNHWCMeta(phi::DenseTensor& tensor) { // NOLINT +void RecoverPdCustomNHWCMeta(DenseTensor& tensor) { // NOLINT PermutedShapeWithcontiguousStrides( tensor, layout_trans::kNCHW_to_NHWC, common::DataLayout::kNCHW); } -void PermutedStridesWithoutShape(phi::DenseTensor& tensor, // NOLINT +void PermutedStridesWithoutShape(DenseTensor& tensor, // NOLINT const std::vector& shape_perm, const std::vector& strides_perm, const common::DataLayout& layout) { @@ -165,7 +164,7 @@ void PermutedStridesWithoutShape(phi::DenseTensor& tensor, // NOLINT tensor.set_meta(meta); } -void PermutedShapeAndStrides(phi::DenseTensor& tensor, // NOLINT +void PermutedShapeAndStrides(DenseTensor& tensor, // NOLINT const std::vector& permutation, const common::DataLayout& layout) { auto meta = tensor.meta(); @@ -184,8 +183,8 @@ void PermutedShapeAndStrides(phi::DenseTensor& tensor, // NOLINT } // //////////////// Transpose funcs //////////////// -phi::DenseTensor NCHWTransToPdOriginNHWC(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x) { +DenseTensor NCHWTransToPdOriginNHWC(const phi::CustomContext& dev_ctx, + const DenseTensor& x) { PADDLE_ENFORCE_EQ( x.layout(), common::DataLayout::kNCHW, @@ -194,8 +193,8 @@ phi::DenseTensor NCHWTransToPdOriginNHWC(const phi::CustomContext& dev_ctx, return out; // shape is NHWC, strides is NHWC, contiguous } -phi::DenseTensor NCHWTransToPdCustomNHWC(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x) { +DenseTensor NCHWTransToPdCustomNHWC(const phi::CustomContext& dev_ctx, + const DenseTensor& x) { auto out = NCHWTransToPdOriginNHWC(dev_ctx, x); auto meta = x.meta(); meta.layout = common::DataLayout::kNHWC; @@ -203,28 +202,28 @@ phi::DenseTensor NCHWTransToPdCustomNHWC(const phi::CustomContext& dev_ctx, return out; } -phi::DenseTensor NCHWTransToAtenNHWC(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x) { +DenseTensor NCHWTransToAtenNHWC(const phi::CustomContext& dev_ctx, + const DenseTensor& x) { auto out = NCHWTransToPdCustomNHWC(dev_ctx, x); PdCustomNHWCRepresentAsAtenNHWC(out); return out; } -phi::DenseTensor PdCustomNHWCTransToNCHW(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x) { +DenseTensor PdCustomNHWCTransToNCHW(const phi::CustomContext& dev_ctx, + const DenseTensor& x) { PADDLE_ENFORCE_EQ( x.layout(), common::DataLayout::kNHWC, phi::errors::InvalidArgument("Layout of x should be PdCustomNHWC.")); - phi::DenseTensor tensor = x; // shape is NCHW, strides is NCHW, contiguous + DenseTensor tensor = x; // shape is NCHW, strides is NCHW, contiguous RecoverPdCustomNHWCMeta(tensor); tensor = custom_kernel::Transpose(dev_ctx, tensor, layout_trans::kNHWC_to_NCHW); return tensor; // shape is NCHW, strides is NCHW, contiguous } -phi::DenseTensor PdOriginNHWCTransToNCHW(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x) { +DenseTensor PdOriginNHWCTransToNCHW(const phi::CustomContext& dev_ctx, + const DenseTensor& x) { PADDLE_ENFORCE_EQ( x.layout(), common::DataLayout::kNCHW, @@ -234,18 +233,17 @@ phi::DenseTensor PdOriginNHWCTransToNCHW(const phi::CustomContext& dev_ctx, } // //////////////// Represent funcs //////////////// -phi::DenseTensor NoNeedTransNCHWRepresentAsOriginNHWC( - const phi::DenseTensor& x) { +DenseTensor NoNeedTransNCHWRepresentAsOriginNHWC(const DenseTensor& x) { PADDLE_ENFORCE_EQ( x.layout(), common::DataLayout::kNCHW, phi::errors::InvalidArgument("Layout of x should be origin NHWC.")); - phi::DenseTensor tensor = x; + DenseTensor tensor = x; RecoverPdCustomNHWCMeta(tensor); return tensor; } -void PdCustomNHWCRepresentAsAtenNHWC(phi::DenseTensor& x, // NOLINT +void PdCustomNHWCRepresentAsAtenNHWC(DenseTensor& x, // NOLINT bool weight_or_output) { if (!weight_or_output) { PADDLE_ENFORCE_EQ( @@ -261,7 +259,7 @@ void PdCustomNHWCRepresentAsAtenNHWC(phi::DenseTensor& x, // NOLINT common::DataLayout::kNHWC); } -void AtenNHWCRepresentAsPdCustomNHWC(phi::DenseTensor& x, // NOLINT +void AtenNHWCRepresentAsPdCustomNHWC(DenseTensor& x, // NOLINT bool raw_output) { if (!raw_output) { PADDLE_ENFORCE_EQ( @@ -277,7 +275,7 @@ void AtenNHWCRepresentAsPdCustomNHWC(phi::DenseTensor& x, // NOLINT x.set_meta(meta); } -void OriginNHWCRepresentAsAtenNHWC(phi::DenseTensor& x) { // NOLINT +void OriginNHWCRepresentAsAtenNHWC(DenseTensor& x) { // NOLINT // PADDLE_ENFORCE_EQ( // x.layout(), // common::DataLayout::kNCHW, @@ -288,7 +286,7 @@ void OriginNHWCRepresentAsAtenNHWC(phi::DenseTensor& x) { // NOLINT x, layout_trans::kNHWC_to_NCHW, common::DataLayout::kNHWC); } -void AtenNHWCRepresentAsOriginNHWC(phi::DenseTensor& x) { // NOLINT +void AtenNHWCRepresentAsOriginNHWC(DenseTensor& x) { // NOLINT PADDLE_ENFORCE_EQ( x.layout(), common::DataLayout::kNHWC, @@ -299,7 +297,7 @@ void AtenNHWCRepresentAsOriginNHWC(phi::DenseTensor& x) { // NOLINT x, layout_trans::kNCHW_to_NHWC, common::DataLayout::kNCHW); } -void PdCustomNHWCRepresentAsOriginNHWC(phi::DenseTensor& x, // NOLINT +void PdCustomNHWCRepresentAsOriginNHWC(DenseTensor& x, // NOLINT bool raw_output) { if (!raw_output) { PADDLE_ENFORCE_EQ( @@ -312,7 +310,7 @@ void PdCustomNHWCRepresentAsOriginNHWC(phi::DenseTensor& x, // NOLINT RecoverPdCustomNHWCMeta(x); } -void OriginNHWCRepresentAsPdCustomNHWC(phi::DenseTensor& x) { // NOLINT +void OriginNHWCRepresentAsPdCustomNHWC(DenseTensor& x) { // NOLINT PADDLE_ENFORCE_EQ( x.layout(), common::DataLayout::kNCHW, @@ -323,8 +321,8 @@ void OriginNHWCRepresentAsPdCustomNHWC(phi::DenseTensor& x) { // NOLINT x, layout_trans::kNHWC_to_NCHW, common::DataLayout::kNHWC); } -void RepresentPdCustomNHWC(phi::DenseTensor& x) { // NOLINT - x.Resize(x.dims()); // calc contiguous strides +void RepresentPdCustomNHWC(DenseTensor& x) { // NOLINT + x.Resize(x.dims()); // calc contiguous strides auto meta = x.meta(); meta.layout = common::DataLayout::kNHWC; x.set_meta(meta); diff --git a/backends/gcu/kernels/funcs/gcu_layout_funcs.h b/backends/gcu/kernels/funcs/gcu_layout_funcs.h index 607e2a6f512..d55b4b1377d 100644 --- a/backends/gcu/kernels/funcs/gcu_layout_funcs.h +++ b/backends/gcu/kernels/funcs/gcu_layout_funcs.h @@ -24,76 +24,75 @@ const std::vector kNHWC_to_NCHW = {0, 3, 1, 2}; bool EnableTransposeOptimize(); -void SetLayout(phi::DenseTensor& tensor, // NOLINT +void SetLayout(DenseTensor& tensor, // NOLINT const common::DataLayout& layout); void Transpose(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axis, - phi::DenseTensor* out); + DenseTensor* out); -phi::DenseTensor Transpose(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x, - const std::vector& axis); +DenseTensor Transpose(const phi::CustomContext& dev_ctx, + const DenseTensor& x, + const std::vector& axis); -bool DataPdCustomNHWC(const phi::DenseTensor& tensor); +bool DataPdCustomNHWC(const DenseTensor& tensor); -bool DataPdCustomNHWC(const std::vector& tensors); +bool DataPdCustomNHWC(const std::vector& tensors); // //////////////// Permuted funcs //////////////// void PermutedShapeWithcontiguousStrides( - phi::DenseTensor& tensor, // NOLINT + DenseTensor& tensor, // NOLINT const std::vector& permutation, const common::DataLayout& layout = common::DataLayout::kNCHW); -void RecoverPdCustomNHWCMeta(phi::DenseTensor& tensor); // NOLINT +void RecoverPdCustomNHWCMeta(DenseTensor& tensor); // NOLINT void PermutedStridesWithoutShape( - phi::DenseTensor& tensor, // NOLINT + DenseTensor& tensor, // NOLINT const std::vector& shape_perm, const std::vector& strides_perm, const common::DataLayout& layout = common::DataLayout::kNCHW); void PermutedShapeAndStrides( - phi::DenseTensor& tensor, // NOLINT + DenseTensor& tensor, // NOLINT const std::vector& permutation, const common::DataLayout& layout = common::DataLayout::kNCHW); // //////////////// Transpose funcs //////////////// -phi::DenseTensor NCHWTransToPdOriginNHWC(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x); +DenseTensor NCHWTransToPdOriginNHWC(const phi::CustomContext& dev_ctx, + const DenseTensor& x); -phi::DenseTensor NCHWTransToPdCustomNHWC(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x); +DenseTensor NCHWTransToPdCustomNHWC(const phi::CustomContext& dev_ctx, + const DenseTensor& x); -phi::DenseTensor NCHWTransToAtenNHWC(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x); +DenseTensor NCHWTransToAtenNHWC(const phi::CustomContext& dev_ctx, + const DenseTensor& x); -phi::DenseTensor PdCustomNHWCTransToNCHW(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x); +DenseTensor PdCustomNHWCTransToNCHW(const phi::CustomContext& dev_ctx, + const DenseTensor& x); -phi::DenseTensor PdOriginNHWCTransToNCHW(const phi::CustomContext& dev_ctx, - const phi::DenseTensor& x); +DenseTensor PdOriginNHWCTransToNCHW(const phi::CustomContext& dev_ctx, + const DenseTensor& x); // //////////////// Represent funcs //////////////// -phi::DenseTensor NoNeedTransNCHWRepresentAsOriginNHWC( - const phi::DenseTensor& x); +DenseTensor NoNeedTransNCHWRepresentAsOriginNHWC(const DenseTensor& x); -void PdCustomNHWCRepresentAsAtenNHWC(phi::DenseTensor& x, // NOLINT +void PdCustomNHWCRepresentAsAtenNHWC(DenseTensor& x, // NOLINT bool weight_or_output = false); -void AtenNHWCRepresentAsPdCustomNHWC(phi::DenseTensor& x, // NOLINT +void AtenNHWCRepresentAsPdCustomNHWC(DenseTensor& x, // NOLINT bool raw_output = false); -void OriginNHWCRepresentAsAtenNHWC(phi::DenseTensor& x); // NOLINT +void OriginNHWCRepresentAsAtenNHWC(DenseTensor& x); // NOLINT -void AtenNHWCRepresentAsOriginNHWC(phi::DenseTensor& x); // NOLINT +void AtenNHWCRepresentAsOriginNHWC(DenseTensor& x); // NOLINT -void PdCustomNHWCRepresentAsOriginNHWC(phi::DenseTensor& x, // NOLINT +void PdCustomNHWCRepresentAsOriginNHWC(DenseTensor& x, // NOLINT bool raw_output = false); -void OriginNHWCRepresentAsPdCustomNHWC(phi::DenseTensor& x); // NOLINT +void OriginNHWCRepresentAsPdCustomNHWC(DenseTensor& x); // NOLINT -void RepresentPdCustomNHWC(phi::DenseTensor& x); // NOLINT +void RepresentPdCustomNHWC(DenseTensor& x); // NOLINT } // namespace custom_kernel diff --git a/backends/gcu/kernels/funcs/op_utils.cc b/backends/gcu/kernels/funcs/op_utils.cc index ed478ccb36a..2a2a975b35d 100644 --- a/backends/gcu/kernels/funcs/op_utils.cc +++ b/backends/gcu/kernels/funcs/op_utils.cc @@ -22,9 +22,9 @@ PHI_DECLARE_bool(use_stride_kernel); namespace custom_kernel { -void *GcuDataPtr(const phi::DenseTensor &tensor) { +void *GcuDataPtr(const DenseTensor &tensor) { if (tensor.initialized()) { - auto contiguous_strides = phi::DenseTensorMeta::calc_strides(tensor.dims()); + auto contiguous_strides = DenseTensorMeta::calc_strides(tensor.dims()); bool is_contiguous = (tensor.strides() == contiguous_strides); auto tensor_tmp = tensor; if (!is_contiguous && !FLAGS_use_stride_kernel) { @@ -37,10 +37,10 @@ void *GcuDataPtr(const phi::DenseTensor &tensor) { return nullptr; } -std::string TensorToString(const phi::DenseTensor &tensor) { +std::string TensorToString(const DenseTensor &tensor) { std::stringstream ss; ss << "DenseTensor<"; - ss << phi::DataTypeToString(tensor.dtype()) << ", "; + ss << DataTypeToString(tensor.dtype()) << ", "; ss << "Shape(" << tensor.dims() << "), "; ss << "layout:" << tensor.layout() << ", "; if (tensor.initialized()) { @@ -53,7 +53,7 @@ std::string TensorToString(const phi::DenseTensor &tensor) { return ss.str(); } -std::string TensorVectorToString(const std::vector &tensors) { +std::string TensorVectorToString(const std::vector &tensors) { std::stringstream ss; ss << "{"; if (tensors.size() == 0) { @@ -68,11 +68,11 @@ std::string TensorVectorToString(const std::vector &tensors) { return ss.str(); } -std::string TensorDetailsToString(const phi::DenseTensor &tensor) { +std::string TensorDetailsToString(const DenseTensor &tensor) { std::stringstream ss; ss << "DenseTensor<"; if (tensor.initialized()) { - ss << phi::DataTypeToString(tensor.dtype()) << ", "; + ss << DataTypeToString(tensor.dtype()) << ", "; ss << tensor.place() << ", "; ss << "dims(" << tensor.dims() << "), "; ss << "strides(" << tensor.strides() << "), "; @@ -88,42 +88,42 @@ std::string TensorDetailsToString(const phi::DenseTensor &tensor) { std::string ScalarToString(const phi::Scalar &scalar_value) { std::stringstream ss; auto scalar_type = scalar_value.dtype(); - ss << "Scalar<" << phi::DataTypeToString(scalar_type) << ", "; + ss << "Scalar<" << DataTypeToString(scalar_type) << ", "; switch (scalar_type) { - case phi::DataType::BOOL: + case DataType::BOOL: ss << scalar_value.to(); break; - case phi::DataType::UINT8: + case DataType::UINT8: ss << scalar_value.to(); break; - case phi::DataType::INT8: + case DataType::INT8: ss << scalar_value.to(); break; - case phi::DataType::INT16: + case DataType::INT16: ss << scalar_value.to(); break; - case phi::DataType::INT32: + case DataType::INT32: ss << scalar_value.to(); break; - case phi::DataType::INT64: + case DataType::INT64: ss << scalar_value.to(); break; - case phi::DataType::FLOAT16: + case DataType::FLOAT16: ss << scalar_value.to(); break; - case phi::DataType::BFLOAT16: + case DataType::BFLOAT16: ss << scalar_value.to(); break; - case phi::DataType::FLOAT32: + case DataType::FLOAT32: ss << scalar_value.to(); break; - case phi::DataType::FLOAT64: + case DataType::FLOAT64: ss << scalar_value.to(); break; default: { PADDLE_THROW(phi::errors::Unimplemented( "ScalarToTopsatenScalar, unsupported data type %s", - phi::DataTypeToString(scalar_type).c_str())); + DataTypeToString(scalar_type).c_str())); break; } } @@ -167,7 +167,7 @@ std::vector InferSize(const std::vector &a, } std::vector ComputeBroadcastShape( - const std::vector operands) { + const std::vector operands) { auto operands_size = operands.size(); PADDLE_ENFORCE_GT( operands_size, diff --git a/backends/gcu/kernels/funcs/op_utils.h b/backends/gcu/kernels/funcs/op_utils.h index a713a99add9..ad9ff49e31d 100644 --- a/backends/gcu/kernels/funcs/op_utils.h +++ b/backends/gcu/kernels/funcs/op_utils.h @@ -22,13 +22,13 @@ #include "runtime/runtime.h" namespace custom_kernel { -void *GcuDataPtr(const phi::DenseTensor &tensor); +void *GcuDataPtr(const DenseTensor &tensor); -std::string TensorToString(const phi::DenseTensor &tensor); +std::string TensorToString(const DenseTensor &tensor); -std::string TensorVectorToString(const std::vector &tensors); +std::string TensorVectorToString(const std::vector &tensors); -std::string TensorDetailsToString(const phi::DenseTensor &tensor); +std::string TensorDetailsToString(const DenseTensor &tensor); std::string ScalarToString(const phi::Scalar &scalar_value); @@ -38,7 +38,7 @@ std::vector InferSize(const std::vector &a, const std::vector &b); std::vector ComputeBroadcastShape( - const std::vector operands); + const std::vector operands); void GcuOpMaybeStreamSync(const phi::DeviceContext &dev_ctx); @@ -54,9 +54,8 @@ struct aot_op_variable_info { }; template <> -struct aot_op_variable_info { - aot_op_variable_info(const phi::DenseTensor &tensor, - const std::string &name) { +struct aot_op_variable_info { + aot_op_variable_info(const DenseTensor &tensor, const std::string &name) { std::stringstream ss; ss << "[" << name << ":" << TensorToString(tensor) << "]; "; info = ss.str(); @@ -66,12 +65,12 @@ struct aot_op_variable_info { }; template <> -struct aot_op_variable_info> { - aot_op_variable_info(const paddle::optional &opt_tensor, +struct aot_op_variable_info> { + aot_op_variable_info(const paddle::optional &opt_tensor, const std::string &name) { std::stringstream ss; if (opt_tensor) { - ss << aot_op_variable_info(opt_tensor.get(), name).info; + ss << aot_op_variable_info(opt_tensor.get(), name).info; } else { ss << "OPTIONAL_NULLPTR_TENSOR"; } @@ -82,15 +81,14 @@ struct aot_op_variable_info> { }; template <> -struct aot_op_variable_info> { - aot_op_variable_info(const std::vector &tensor_list, +struct aot_op_variable_info> { + aot_op_variable_info(const std::vector &tensor_list, const std::string &name) { std::stringstream ss; ss << "[" << name << ":{"; for (int64_t i = 0; i < tensor_list.size(); ++i) { std::string tensor_name = "list_tensor_" + std::to_string(i); - ss << aot_op_variable_info(tensor_list[i], tensor_name) - .info; + ss << aot_op_variable_info(tensor_list[i], tensor_name).info; } ss << "}]; "; info = ss.str(); @@ -100,15 +98,15 @@ struct aot_op_variable_info> { }; template <> -struct aot_op_variable_info *> { - aot_op_variable_info(const std::vector *tensor_list, +struct aot_op_variable_info *> { + aot_op_variable_info(const std::vector *tensor_list, const std::string &name) { std::stringstream ss; ss << "[" << name << ":{"; for (int64_t i = 0; i < tensor_list->size(); ++i) { std::string tensor_name = "list_tensor_" + std::to_string(i); - ss << aot_op_variable_info(*(tensor_list->at(i)), - tensor_name) + ss << aot_op_variable_info(*(tensor_list->at(i)), + tensor_name) .info; } ss << "}]; "; @@ -201,11 +199,10 @@ struct aot_op_variable_info> { }; template <> -struct aot_op_variable_info { - aot_op_variable_info(const phi::DataType &data_type, - const std::string &name) { +struct aot_op_variable_info { + aot_op_variable_info(const DataType &data_type, const std::string &name) { std::stringstream ss; - ss << "[" << name << ":" << phi::DataTypeToString(data_type) << "]; "; + ss << "[" << name << ":" << DataTypeToString(data_type) << "]; "; info = ss.str(); } @@ -239,8 +236,8 @@ struct aot_op_abstract_info { }; template <> -struct aot_op_abstract_info { - explicit aot_op_abstract_info(const phi::DenseTensor &tensor) { +struct aot_op_abstract_info { + explicit aot_op_abstract_info(const DenseTensor &tensor) { std::stringstream ss; ss << "DenseTensor<"; if (tensor.initialized()) { @@ -256,12 +253,12 @@ struct aot_op_abstract_info { }; template <> -struct aot_op_abstract_info> { +struct aot_op_abstract_info> { explicit aot_op_abstract_info( - const paddle::optional &opt_tensor) { + const paddle::optional &opt_tensor) { std::stringstream ss; if (opt_tensor) { - ss << aot_op_abstract_info(opt_tensor.get()).info; + ss << aot_op_abstract_info(opt_tensor.get()).info; } else { ss << "; "; } @@ -272,13 +269,12 @@ struct aot_op_abstract_info> { }; template <> -struct aot_op_abstract_info> { - explicit aot_op_abstract_info( - const std::vector &tensor_list) { +struct aot_op_abstract_info> { + explicit aot_op_abstract_info(const std::vector &tensor_list) { std::stringstream ss; ss << "ListTensor<"; for (int64_t i = 0; i < tensor_list.size(); ++i) { - ss << aot_op_abstract_info(tensor_list[i]).info; + ss << aot_op_abstract_info(tensor_list[i]).info; } ss << ">; "; info = ss.str(); @@ -288,13 +284,13 @@ struct aot_op_abstract_info> { }; template <> -struct aot_op_abstract_info> { +struct aot_op_abstract_info> { explicit aot_op_abstract_info( - const std::vector &tensor_list) { + const std::vector &tensor_list) { std::stringstream ss; ss << "ListTensor<"; for (int64_t i = 0; i < tensor_list.size(); ++i) { - ss << aot_op_abstract_info(*(tensor_list[i])).info; + ss << aot_op_abstract_info(*(tensor_list[i])).info; } ss << ">; "; info = ss.str(); @@ -304,13 +300,12 @@ struct aot_op_abstract_info> { }; template <> -struct aot_op_abstract_info> { - explicit aot_op_abstract_info( - const std::vector &tensor_list) { +struct aot_op_abstract_info> { + explicit aot_op_abstract_info(const std::vector &tensor_list) { std::stringstream ss; ss << "ListTensor<"; for (int64_t i = 0; i < tensor_list.size(); ++i) { - ss << aot_op_abstract_info(*(tensor_list[i])).info; + ss << aot_op_abstract_info(*(tensor_list[i])).info; } ss << ">; "; info = ss.str(); @@ -396,10 +391,10 @@ struct aot_op_abstract_info> { }; template <> -struct aot_op_abstract_info { - explicit aot_op_abstract_info(const phi::DataType &data_type) { +struct aot_op_abstract_info { + explicit aot_op_abstract_info(const DataType &data_type) { std::stringstream ss; - ss << "DType<" << phi::DataTypeToString(data_type) << ">; "; + ss << "DType<" << DataTypeToString(data_type) << ">; "; info = ss.str(); } @@ -449,18 +444,18 @@ inline std::string GetAbstractInfo(const std::string &op_name, return op_info; } -inline bool IsNarrowType(const phi::DataType &dtype) { - return dtype == phi::DataType::FLOAT64 || dtype == phi::DataType::INT64; +inline bool IsNarrowType(const DataType &dtype) { + return dtype == DataType::FLOAT64 || dtype == DataType::INT64; } -inline void WarnTypeNarrow(const phi::DataType &dtype) { - if (dtype == phi::DataType::FLOAT64) { +inline void WarnTypeNarrow(const DataType &dtype) { + if (dtype == DataType::FLOAT64) { LOG_FIRST_N(WARNING, 1) - << "GCU not support " << phi::DataTypeToString(dtype) + << "GCU not support " << DataTypeToString(dtype) << ", use float32 replace, maybe lead to unexpected overflow issues."; - } else if (dtype == phi::DataType::INT64) { + } else if (dtype == DataType::INT64) { LOG_FIRST_N(WARNING, 1) - << "GCU not support " << phi::DataTypeToString(dtype) + << "GCU not support " << DataTypeToString(dtype) << ", use int32 replace, maybe lead to unexpected overflow issues."; } } diff --git a/backends/gcu/kernels/funcs/topsaten_op_launch.h b/backends/gcu/kernels/funcs/topsaten_op_launch.h index 0b0d25dd16d..cab190baf53 100644 --- a/backends/gcu/kernels/funcs/topsaten_op_launch.h +++ b/backends/gcu/kernels/funcs/topsaten_op_launch.h @@ -28,8 +28,8 @@ struct topsaten_variable { }; template <> -struct topsaten_variable { - explicit topsaten_variable(const phi::DenseTensor& tensor) { +struct topsaten_variable { + explicit topsaten_variable(const DenseTensor& tensor) { value = CreateTopsatenTensor(tensor); } @@ -37,9 +37,8 @@ struct topsaten_variable { }; template <> -struct topsaten_variable> { - explicit topsaten_variable( - const paddle::optional& opt_tensor) { +struct topsaten_variable> { + explicit topsaten_variable(const paddle::optional& opt_tensor) { value = OptionalTensorToTopsatenTensor(opt_tensor); } @@ -47,8 +46,8 @@ struct topsaten_variable> { }; template <> -struct topsaten_variable> { - explicit topsaten_variable(const std::vector& tensor_list) { +struct topsaten_variable> { + explicit topsaten_variable(const std::vector& tensor_list) { for (int64_t i = 0; i < tensor_list.size(); ++i) { value.emplace_back(CreateTopsatenTensor(tensor_list[i])); } @@ -103,8 +102,8 @@ struct topsaten_variable> { }; template <> -struct topsaten_variable { - explicit topsaten_variable(const phi::DataType& data_type) { +struct topsaten_variable { + explicit topsaten_variable(const DataType& data_type) { value = DataTypeToTopsatenDataType(data_type); } @@ -145,10 +144,10 @@ struct topsaten_variable> { template \ auto launch_##topsatenop(const phi::CustomContext& dev_ctx, \ const std::string& abstract_info, \ - phi::DenseTensor& out, \ + DenseTensor& out, \ const Args&... args) { \ auto stream = static_cast(dev_ctx.stream()); \ - auto xout = topsaten_variable(out); \ + auto xout = topsaten_variable(out); \ topsatenStatus_t status; \ { \ GCU_AOT_KERNEL_TRACE(abstract_info); \ @@ -184,12 +183,12 @@ struct topsaten_variable> { template \ auto launch_##topsatenop(const phi::CustomContext& dev_ctx, \ const std::string& abstract_info, \ - phi::DenseTensor& out1, \ - phi::DenseTensor& out2, \ + DenseTensor& out1, \ + DenseTensor& out2, \ const Args&... args) { \ auto stream = static_cast(dev_ctx.stream()); \ - auto xout1 = topsaten_variable(out1); \ - auto xout2 = topsaten_variable(out2); \ + auto xout1 = topsaten_variable(out1); \ + auto xout2 = topsaten_variable(out2); \ topsatenStatus_t status; \ { \ GCU_AOT_KERNEL_TRACE(abstract_info); \ @@ -206,14 +205,14 @@ struct topsaten_variable> { template \ auto launch_##topsatenop(const phi::CustomContext& dev_ctx, \ const std::string& abstract_info, \ - phi::DenseTensor& out1, \ - phi::DenseTensor& out2, \ - phi::DenseTensor& out3, \ + DenseTensor& out1, \ + DenseTensor& out2, \ + DenseTensor& out3, \ const Args&... args) { \ auto stream = static_cast(dev_ctx.stream()); \ - auto xout1 = topsaten_variable(out1); \ - auto xout2 = topsaten_variable(out2); \ - auto xout3 = topsaten_variable(out3); \ + auto xout1 = topsaten_variable(out1); \ + auto xout2 = topsaten_variable(out2); \ + auto xout3 = topsaten_variable(out3); \ topsatenStatus_t status; \ { \ GCU_AOT_KERNEL_TRACE(abstract_info); \ @@ -231,10 +230,10 @@ struct topsaten_variable> { template \ auto launch_##topsatenop(const phi::CustomContext& dev_ctx, \ const std::string& abstract_info, \ - std::vector& out, \ + std::vector& out, \ const Args&... args) { \ auto stream = static_cast(dev_ctx.stream()); \ - auto xout = topsaten_variable>(out); \ + auto xout = topsaten_variable>(out); \ topsatenStatus_t status; \ { \ GCU_AOT_KERNEL_TRACE(abstract_info); \ @@ -257,46 +256,46 @@ struct topsaten_variable> { #define DEFINE_LAUNCH_TOPSATENOP_OUT2_VLLM(topsatenop) \ DEFINE_LAUNCH_TOPSATENOP_OUT2_WITH_NAMESPACE(topsvllm, topsatenop) -#define DEFINE_LAUNCH_TOPSATENOP_NATIVE_BATCH_NORM(topsatenop) \ - template \ - auto launch_##topsatenop(const phi::CustomContext& dev_ctx, \ - const std::string& abstract_info, \ - phi::DenseTensor& out, \ - phi::DenseTensor& save_mean, \ - phi::DenseTensor& save_var, \ - const phi::DenseTensor& x, \ - const phi::DenseTensor& scale, \ - const phi::DenseTensor& bias, \ - phi::DenseTensor& run_mean, \ - phi::DenseTensor& run_var, \ - bool training, \ - double momentum, \ - double eps) { \ - auto stream = static_cast(dev_ctx.stream()); \ - auto xout1 = topsaten_variable(out); \ - auto xout2 = topsaten_variable(save_mean); \ - auto xout3 = topsaten_variable(save_var); \ - auto xout4 = topsaten_variable(run_mean); \ - auto xout5 = topsaten_variable(run_var); \ - topsatenStatus_t status; \ - { \ - GCU_AOT_KERNEL_TRACE(abstract_info); \ - status = topsaten::topsatenop( \ - xout1.value, \ - xout2.value, \ - xout3.value, \ - topsaten_variable(x).value, \ - topsaten_variable(scale).value, \ - topsaten_variable(bias).value, \ - xout4.value, \ - xout5.value, \ - training, \ - momentum, \ - eps, \ - stream); \ - GcuOpMaybeStreamSync(dev_ctx); \ - } \ - return status; \ +#define DEFINE_LAUNCH_TOPSATENOP_NATIVE_BATCH_NORM(topsatenop) \ + template \ + auto launch_##topsatenop(const phi::CustomContext& dev_ctx, \ + const std::string& abstract_info, \ + DenseTensor& out, \ + DenseTensor& save_mean, \ + DenseTensor& save_var, \ + const DenseTensor& x, \ + const DenseTensor& scale, \ + const DenseTensor& bias, \ + DenseTensor& run_mean, \ + DenseTensor& run_var, \ + bool training, \ + double momentum, \ + double eps) { \ + auto stream = static_cast(dev_ctx.stream()); \ + auto xout1 = topsaten_variable(out); \ + auto xout2 = topsaten_variable(save_mean); \ + auto xout3 = topsaten_variable(save_var); \ + auto xout4 = topsaten_variable(run_mean); \ + auto xout5 = topsaten_variable(run_var); \ + topsatenStatus_t status; \ + { \ + GCU_AOT_KERNEL_TRACE(abstract_info); \ + status = \ + topsaten::topsatenop(xout1.value, \ + xout2.value, \ + xout3.value, \ + topsaten_variable(x).value, \ + topsaten_variable(scale).value, \ + topsaten_variable(bias).value, \ + xout4.value, \ + xout5.value, \ + training, \ + momentum, \ + eps, \ + stream); \ + GcuOpMaybeStreamSync(dev_ctx); \ + } \ + return status; \ } #define LAUNCH_TOPSATENOP(topsatenop, dev_ctx, topsatenop_args...) \ diff --git a/backends/gcu/kernels/funcs/topsaten_op_utils.cc b/backends/gcu/kernels/funcs/topsaten_op_utils.cc index 279b8c0fb03..9bf1fdad7fd 100644 --- a/backends/gcu/kernels/funcs/topsaten_op_utils.cc +++ b/backends/gcu/kernels/funcs/topsaten_op_utils.cc @@ -19,8 +19,7 @@ namespace custom_kernel { -topsatenTensor CreateTopsatenTensor(const phi::DenseTensor &tensor, - bool pinned) { +topsatenTensor CreateTopsatenTensor(const DenseTensor &tensor, bool pinned) { if (UNLIKELY(!tensor.initialized())) { VLOG(6) << "Create default topsatenTensor."; return topsatenTensor(); @@ -55,7 +54,7 @@ topsatenTensor CreateTopsatenTensor(const phi::DenseTensor &tensor, } topsatenTensor OptionalTensorToTopsatenTensor( - const paddle::optional &opt_tensor) { + const paddle::optional &opt_tensor) { if (opt_tensor) { return CreateTopsatenTensor(opt_tensor.get()); } else { @@ -64,7 +63,7 @@ topsatenTensor OptionalTensorToTopsatenTensor( } topsatenTensor CreateTopsatenTensorWithoutInitialized( - const phi::DenseTensor &tensor) { + const DenseTensor &tensor) { PADDLE_ENFORCE_EQ( tensor.initialized(), false, @@ -93,31 +92,31 @@ topsatenTensor CreateTopsatenTensorWithoutInitialized( return xt; } -topsatenDataType_t DataTypeToTopsatenDataType(const phi::DataType &dtype) { +topsatenDataType_t DataTypeToTopsatenDataType(const DataType &dtype) { switch (dtype) { - case phi::DataType::BOOL: + case DataType::BOOL: return TOPSATEN_DATA_PRED; - case phi::DataType::UINT8: + case DataType::UINT8: return TOPSATEN_DATA_U8; - case phi::DataType::INT8: + case DataType::INT8: return TOPSATEN_DATA_I8; - case phi::DataType::INT16: + case DataType::INT16: return TOPSATEN_DATA_I16; - case phi::DataType::INT32: + case DataType::INT32: return TOPSATEN_DATA_I32; - case phi::DataType::INT64: + case DataType::INT64: return TOPSATEN_DATA_I64; - case phi::DataType::FLOAT16: + case DataType::FLOAT16: return TOPSATEN_DATA_FP16; - case phi::DataType::BFLOAT16: + case DataType::BFLOAT16: return TOPSATEN_DATA_BF16; - case phi::DataType::FLOAT32: + case DataType::FLOAT32: return TOPSATEN_DATA_FP32; - case phi::DataType::FLOAT64: + case DataType::FLOAT64: return TOPSATEN_DATA_F64; default: { - PADDLE_THROW(phi::errors::Unimplemented( - "Unsupported data type %s", phi::DataTypeToString(dtype).c_str())); + PADDLE_THROW(phi::errors::Unimplemented("Unsupported data type %s", + DataTypeToString(dtype).c_str())); return TOPSATEN_DATA_FP32; } } @@ -127,50 +126,50 @@ topsatenScalar_t ScalarToTopsatenScalar(const phi::Scalar &scalar_value) { topsatenScalar_t xvalue; auto scalar_type = scalar_value.dtype(); switch (scalar_type) { - case phi::DataType::BOOL: + case DataType::BOOL: xvalue.dtype = TOPSATEN_DATA_PRED; xvalue.ival = scalar_value.to(); break; - case phi::DataType::UINT8: + case DataType::UINT8: xvalue.dtype = TOPSATEN_DATA_U8; xvalue.ival = scalar_value.to(); break; - case phi::DataType::INT8: + case DataType::INT8: xvalue.dtype = TOPSATEN_DATA_I8; xvalue.ival = scalar_value.to(); break; - case phi::DataType::INT16: + case DataType::INT16: xvalue.dtype = TOPSATEN_DATA_I16; xvalue.ival = scalar_value.to(); break; - case phi::DataType::INT32: + case DataType::INT32: xvalue.dtype = TOPSATEN_DATA_I32; xvalue.ival = scalar_value.to(); break; - case phi::DataType::INT64: + case DataType::INT64: xvalue.dtype = TOPSATEN_DATA_I64; xvalue.ival = scalar_value.to(); break; - case phi::DataType::FLOAT16: + case DataType::FLOAT16: xvalue.dtype = TOPSATEN_DATA_FP16; xvalue.fval = scalar_value.to(); break; - case phi::DataType::BFLOAT16: + case DataType::BFLOAT16: xvalue.dtype = TOPSATEN_DATA_BF16; xvalue.fval = scalar_value.to(); break; - case phi::DataType::FLOAT32: + case DataType::FLOAT32: xvalue.dtype = TOPSATEN_DATA_FP32; xvalue.fval = scalar_value.to(); break; - case phi::DataType::FLOAT64: + case DataType::FLOAT64: xvalue.dtype = TOPSATEN_DATA_F64; xvalue.fval = scalar_value.to(); break; default: { PADDLE_THROW(phi::errors::Unimplemented( "ScalarToTopsatenScalar, unsupported data type %s", - phi::DataTypeToString(scalar_type).c_str())); + DataTypeToString(scalar_type).c_str())); break; } } diff --git a/backends/gcu/kernels/funcs/topsaten_op_utils.h b/backends/gcu/kernels/funcs/topsaten_op_utils.h index d704338239b..8f0f0ca0791 100644 --- a/backends/gcu/kernels/funcs/topsaten_op_utils.h +++ b/backends/gcu/kernels/funcs/topsaten_op_utils.h @@ -18,16 +18,16 @@ namespace custom_kernel { -topsatenTensor CreateTopsatenTensor(const phi::DenseTensor &tensor, +topsatenTensor CreateTopsatenTensor(const DenseTensor &tensor, bool pinned = false); topsatenTensor OptionalTensorToTopsatenTensor( - const paddle::optional &opt_tensor); + const paddle::optional &opt_tensor); topsatenTensor CreateTopsatenTensorWithoutInitialized( - const phi::DenseTensor &tensor); + const DenseTensor &tensor); -topsatenDataType_t DataTypeToTopsatenDataType(const phi::DataType &dtype); +topsatenDataType_t DataTypeToTopsatenDataType(const DataType &dtype); topsatenScalar_t ScalarToTopsatenScalar(const phi::Scalar &scalar_value); diff --git a/backends/gcu/kernels/fused_conv2d_add_act_kernel.cc b/backends/gcu/kernels/fused_conv2d_add_act_kernel.cc index ff58826b434..d3feea23ed2 100644 --- a/backends/gcu/kernels/fused_conv2d_add_act_kernel.cc +++ b/backends/gcu/kernels/fused_conv2d_add_act_kernel.cc @@ -19,16 +19,16 @@ namespace custom_kernel { static std::unordered_set g_weights_nhwc; template extern void AddKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out); + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); template void FusedConv2dAddActKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& filter, - const phi::DenseTensor& bias, - const paddle::optional& residual, + const DenseTensor& input, + const DenseTensor& filter, + const DenseTensor& bias, + const paddle::optional& residual, const std::vector& strides, const std::vector& paddings, const std::string& padding_algorithm, @@ -40,17 +40,17 @@ void FusedConv2dAddActKernel(const Context& dev_ctx, bool exhaustive_search, int workspace_size_MB, float fuse_alpha, - phi::DenseTensor* output, - std::vector outputs) { + DenseTensor* output, + std::vector outputs) { PADDLE_GCU_KERNEL_TRACE("fused_conv2d_add_act"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(output); - phi::DenseTensor input_perm = input; - phi::DenseTensor filter_perm = filter; - phi::DenseTensor conv_out_perm = *output; - phi::DenseTensor residual_perm; + DenseTensor input_perm = input; + DenseTensor filter_perm = filter; + DenseTensor conv_out_perm = *output; + DenseTensor residual_perm; if (residual) { residual_perm = residual.get(); } @@ -89,7 +89,7 @@ void FusedConv2dAddActKernel(const Context& dev_ctx, PdCustomNHWCRepresentAsAtenNHWC(conv_out_perm, true); if (g_weights_nhwc.count(filter.data()) == 0) { auto filter_trans = NCHWTransToPdCustomNHWC(dev_ctx, filter); - phi::DenseTensor* filter_ptr = const_cast(&filter); + DenseTensor* filter_ptr = const_cast(&filter); TensorCopy(dev_ctx, filter_trans, false, filter_ptr); g_weights_nhwc.emplace(filter.data()); VLOG(6) << "Transpose debug, trans filter for fused_conv2d_add_act."; diff --git a/backends/gcu/kernels/fused_fc_elementwise_layernorm_kernel.cc b/backends/gcu/kernels/fused_fc_elementwise_layernorm_kernel.cc index 208a68fabc6..67e3c76ffba 100644 --- a/backends/gcu/kernels/fused_fc_elementwise_layernorm_kernel.cc +++ b/backends/gcu/kernels/fused_fc_elementwise_layernorm_kernel.cc @@ -19,55 +19,55 @@ namespace custom_kernel { template extern void FCKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& w, - const paddle::optional& bias, + const DenseTensor& input, + const DenseTensor& w, + const paddle::optional& bias, const int in_num_col_dims, const std::string& activation_type, const bool padding_weights, - phi::DenseTensor* out); + DenseTensor* out); template extern void AddKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out); + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); template void LayerNormKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& scale_opt, - const paddle::optional& bias_opt, + const DenseTensor& x, + const paddle::optional& scale_opt, + const paddle::optional& bias_opt, float epsilon, int begin_norm_axis, - phi::DenseTensor* out, - phi::DenseTensor* mean, - phi::DenseTensor* variance); + DenseTensor* out, + DenseTensor* mean, + DenseTensor* variance); template void FusedFCElementwiseLayerNormKernel( const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& w, - const phi::DenseTensor& y, - const paddle::optional& bias0, - const paddle::optional& scale, - const paddle::optional& bias1, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& y, + const paddle::optional& bias0, + const paddle::optional& scale, + const paddle::optional& bias1, const int x_num_col_dims, const std::string& activation_type, const float epsilon, const int begin_norm_axis, - phi::DenseTensor* out, - phi::DenseTensor* mean, - phi::DenseTensor* variance) { + DenseTensor* out, + DenseTensor* mean, + DenseTensor* variance) { PADDLE_GCU_KERNEL_TRACE("fused_fc_elementwise_layernorm"); if (LaunchAOTKernel()) { - phi::DenseTensor fc_out = TensorEmpty(dev_ctx, out->meta()); + DenseTensor fc_out = TensorEmpty(dev_ctx, out->meta()); custom_kernel::FCKernel( dev_ctx, x, w, bias0, x_num_col_dims, activation_type, false, &fc_out); if (mean != nullptr && variance != nullptr) { - phi::DenseTensor add_out = TensorEmpty(dev_ctx, out->meta()); + DenseTensor add_out = TensorEmpty(dev_ctx, out->meta()); custom_kernel::AddKernel(dev_ctx, y, fc_out, &add_out); custom_kernel::LayerNormKernel(dev_ctx, add_out, diff --git a/backends/gcu/kernels/gather_kernel.cc b/backends/gcu/kernels/gather_kernel.cc index 8ef45660429..1669b2b22d2 100644 --- a/backends/gcu/kernels/gather_kernel.cc +++ b/backends/gcu/kernels/gather_kernel.cc @@ -19,19 +19,18 @@ namespace custom_kernel { template void GatherKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& index, + const DenseTensor& x, + const DenseTensor& index, const phi::Scalar& axis, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("gather"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { VLOG(6) << "GatherKernel, x dims:" << x.dims() << ", out dims:" << out->dims(); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor output = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); auto gather_axis = axis.to(); if (gather_axis < 0) { gather_axis += x.dims().size(); @@ -40,22 +39,22 @@ void GatherKernel(const Context& dev_ctx, topsatenIndexSelect, dev_ctx, output, input_x, gather_axis, index); MaybeTransResult(dev_ctx, output, out); - // phi::DenseTensor input_x(x); - // phi::DenseTensor input_index(index); - // phi::DenseTensor output(*out); + // DenseTensor input_x(x); + // DenseTensor input_index(index); + // DenseTensor output(*out); - // if (x.dtype() == phi::DataType::INT64) { - // input_x = custom_kernel::Cast(dev_ctx, x, phi::DataType::INT32); + // if (x.dtype() == DataType::INT64) { + // input_x = custom_kernel::Cast(dev_ctx, x, DataType::INT32); // } - // if (index.dtype() == phi::DataType::INT64) { + // if (index.dtype() == DataType::INT64) { // input_index = custom_kernel::Cast(dev_ctx, index, - // phi::DataType::INT32); + // DataType::INT32); // } - // if (out->dtype() == phi::DataType::INT64) { + // if (out->dtype() == DataType::INT64) { // auto meta = out->meta(); - // meta.dtype = phi::DataType::INT32; + // meta.dtype = DataType::INT32; // output.set_meta(meta); // dev_ctx.template Alloc(&output, output.dtype()); // } @@ -120,8 +119,8 @@ void GatherKernel(const Context& dev_ctx, // indices_are_sorted, // unique_indices); - // if (out->dtype() == phi::DataType::INT64) { - // custom_kernel::Cast(dev_ctx, output, phi::DataType::INT64, out); + // if (out->dtype() == DataType::INT64) { + // custom_kernel::Cast(dev_ctx, output, DataType::INT64, out); // } } else { // kernel impl base on JIT @@ -151,11 +150,11 @@ void GatherKernel(const Context& dev_ctx, template void GatherGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& index, - const phi::DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, const phi::Scalar& axis, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { PADDLE_GCU_KERNEL_TRACE("gather_grad"); dev_ctx.template Alloc(x_grad); diff --git a/backends/gcu/kernels/gather_nd_kernel.cc b/backends/gcu/kernels/gather_nd_kernel.cc index c58025e164f..fb9f7bce9df 100644 --- a/backends/gcu/kernels/gather_nd_kernel.cc +++ b/backends/gcu/kernels/gather_nd_kernel.cc @@ -19,9 +19,9 @@ namespace custom_kernel { template void GatherNdKernel(const Context &dev_ctx, - const phi::DenseTensor &x, - const phi::DenseTensor &index, - phi::DenseTensor *out) { + const DenseTensor &x, + const DenseTensor &index, + DenseTensor *out) { PADDLE_GCU_KERNEL_TRACE("gather_nd"); dev_ctx.template Alloc(out); @@ -44,22 +44,22 @@ void GatherNdKernel(const Context &dev_ctx, if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); - // phi::DenseTensor input_x(x); - // phi::DenseTensor input_index(index); - // phi::DenseTensor output(*out); + // DenseTensor input_x(x); + // DenseTensor input_index(index); + // DenseTensor output(*out); - // if (x.dtype() == phi::DataType::INT64) { - // input_x = custom_kernel::Cast(dev_ctx, x, phi::DataType::INT32); + // if (x.dtype() == DataType::INT64) { + // input_x = custom_kernel::Cast(dev_ctx, x, DataType::INT32); // } - // if (index.dtype() == phi::DataType::INT64) { + // if (index.dtype() == DataType::INT64) { // input_index = custom_kernel::Cast(dev_ctx, index, - // phi::DataType::INT32); + // DataType::INT32); // } - // if (out->dtype() == phi::DataType::INT64) { + // if (out->dtype() == DataType::INT64) { // auto meta = out->meta(); - // meta.dtype = phi::DataType::INT32; + // meta.dtype = DataType::INT32; // output.set_meta(meta); // dev_ctx.template Alloc(&output, output.dtype()); // } @@ -129,8 +129,8 @@ void GatherNdKernel(const Context &dev_ctx, // indices_are_sorted, // unique_indices); - // if (out->dtype() == phi::DataType::INT64) { - // custom_kernel::Cast(dev_ctx, output, phi::DataType::INT64, out); + // if (out->dtype() == DataType::INT64) { + // custom_kernel::Cast(dev_ctx, output, DataType::INT64, out); // } } else { // kernel impl base on JIT @@ -164,10 +164,10 @@ void GatherNdKernel(const Context &dev_ctx, template void GatherNdGradKernel(const Context &dev_ctx, - const phi::DenseTensor &x, - const phi::DenseTensor &index, - const phi::DenseTensor &dout, - phi::DenseTensor *dx) { + const DenseTensor &x, + const DenseTensor &index, + const DenseTensor &dout, + DenseTensor *dx) { PADDLE_GCU_KERNEL_TRACE("gather_nd_grad"); auto x_dims = dx->dims(); dev_ctx.template Alloc(dx); diff --git a/backends/gcu/kernels/gaussian_kernel.cc b/backends/gcu/kernels/gaussian_kernel.cc index de1954b57ea..fe03a3e505f 100644 --- a/backends/gcu/kernels/gaussian_kernel.cc +++ b/backends/gcu/kernels/gaussian_kernel.cc @@ -24,8 +24,8 @@ void GaussianKernel(const Context& ctx, float mean, float std, int seed, - phi::DataType dtype, - phi::DenseTensor* out) { + DataType dtype, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("gaussian"); if (LaunchAOTKernel()) { @@ -45,11 +45,11 @@ void GaussianKernel(const Context& ctx, ContextPinnedGuard ctx_pinned_guard(ctx); VLOG(6) << "[HOST_KERNEL] Impl on host for gaussian"; VLOG(6) << "Enter GaussianKernel with mean:" << mean << ", std:" << std - << ", seed:" << seed << ", dtype:" << phi::DataTypeToString(dtype); + << ", seed:" << seed << ", dtype:" << DataTypeToString(dtype); ctx.template Alloc(out); - phi::DenseTensor cpu_tensor; - phi::DenseTensorMeta cpu_meta = {out->dtype(), out->dims()}; + DenseTensor cpu_tensor; + DenseTensorMeta cpu_meta = {out->dtype(), out->dims()}; cpu_tensor.set_meta(cpu_meta); T* cpu_data = ctx.template HostAlloc(&cpu_tensor); std::normal_distribution::Type> dist( diff --git a/backends/gcu/kernels/grid_sample_kernel.cc b/backends/gcu/kernels/grid_sample_kernel.cc index fa6b5f93424..adcf83b5c60 100644 --- a/backends/gcu/kernels/grid_sample_kernel.cc +++ b/backends/gcu/kernels/grid_sample_kernel.cc @@ -48,12 +48,12 @@ int64_t GridSamplePaddingMode(const std::string& mode) { template void GridSampleKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& grid, + const DenseTensor& x, + const DenseTensor& grid, const std::string& mode, const std::string& padding_mode, bool align_corners, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("grid_sample"); dev_ctx.template Alloc(out); diff --git a/backends/gcu/kernels/huber_loss_kernel.cc b/backends/gcu/kernels/huber_loss_kernel.cc index b0988f1886b..26f90278923 100644 --- a/backends/gcu/kernels/huber_loss_kernel.cc +++ b/backends/gcu/kernels/huber_loss_kernel.cc @@ -19,11 +19,11 @@ namespace custom_kernel { template void HuberLossKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& label, + const DenseTensor& input, + const DenseTensor& label, float delta, - phi::DenseTensor* out, - phi::DenseTensor* residual) { + DenseTensor* out, + DenseTensor* residual) { PADDLE_GCU_KERNEL_TRACE("huber_loss"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -70,11 +70,11 @@ void HuberLossKernel(const Context& dev_ctx, template void HuberLossGradKernel(const Context& dev_ctx, - const phi::DenseTensor& residual, - const phi::DenseTensor& dout, + const DenseTensor& residual, + const DenseTensor& dout, float delta, - phi::DenseTensor* dx, - phi::DenseTensor* dy) { + DenseTensor* dx, + DenseTensor* dy) { PADDLE_GCU_KERNEL_TRACE("huber_loss_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); diff --git a/backends/gcu/kernels/increment_kernel.cc b/backends/gcu/kernels/increment_kernel.cc index 37245009f40..6d5412ec2a8 100644 --- a/backends/gcu/kernels/increment_kernel.cc +++ b/backends/gcu/kernels/increment_kernel.cc @@ -19,9 +19,9 @@ namespace custom_kernel { template void IncrementKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, float step, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("increment"); dev_ctx.template Alloc(out); diff --git a/backends/gcu/kernels/index_add_kernel.cc b/backends/gcu/kernels/index_add_kernel.cc index 4d0f636f77e..006a7acbb1d 100644 --- a/backends/gcu/kernels/index_add_kernel.cc +++ b/backends/gcu/kernels/index_add_kernel.cc @@ -18,11 +18,11 @@ namespace custom_kernel { template void IndexAddKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& index, - const phi::DenseTensor& add_value, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& add_value, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("index_add"); if (LaunchAOTKernel()) { diff --git a/backends/gcu/kernels/index_put_kernel.cc b/backends/gcu/kernels/index_put_kernel.cc index 02420beff7a..c43fb781cd2 100644 --- a/backends/gcu/kernels/index_put_kernel.cc +++ b/backends/gcu/kernels/index_put_kernel.cc @@ -17,15 +17,15 @@ namespace custom_kernel { template void IndexPutKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const std::vector& indices, - const phi::DenseTensor& value, + const DenseTensor& x, + const std::vector& indices, + const DenseTensor& value, bool accumulate, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("index_put"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { - std::vector input_indices; + std::vector input_indices; for (const auto& index : indices) { input_indices.emplace_back(MaybeCreateOrTrans64To32bits(dev_ctx, *index)); } @@ -33,12 +33,11 @@ void IndexPutKernel(const Context& dev_ctx, for (const auto& tensor : input_indices) { indices_tensors.emplace_back(CreateTopsatenTensor(tensor)); } - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_value = MaybeCreateOrTrans64To32bits(dev_ctx, value); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_value = MaybeCreateOrTrans64To32bits(dev_ctx, value); auto input_tensor = CreateTopsatenTensor(input_x); auto value_tensor = CreateTopsatenTensor(input_value); - phi::DenseTensor output = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); auto out_tensor = CreateTopsatenTensor(output); std::string abstract_info = custom_kernel::GetAbstractInfo("topsatenIndexPut", diff --git a/backends/gcu/kernels/index_sample_kernel.cc b/backends/gcu/kernels/index_sample_kernel.cc index 93eb259c176..f545515afdd 100644 --- a/backends/gcu/kernels/index_sample_kernel.cc +++ b/backends/gcu/kernels/index_sample_kernel.cc @@ -17,15 +17,15 @@ namespace custom_kernel { template extern void GatherNdKernel(const Context &dev_ctx, - const phi::DenseTensor &x, - const phi::DenseTensor &index, - phi::DenseTensor *out); + const DenseTensor &x, + const DenseTensor &index, + DenseTensor *out); template void IndexSampleGather(const Context &dev_ctx, - const phi::DenseTensor &input, - const phi::DenseTensor &index, - phi::DenseTensor *output) { + const DenseTensor &input, + const DenseTensor &index, + DenseTensor *output) { auto index_dims = index.dims(); auto input_dims = input.dims(); auto batch_size = input_dims[0]; @@ -63,7 +63,7 @@ void IndexSampleGather(const Context &dev_ctx, } } - phi::DenseTensor gather_index; + DenseTensor gather_index; TensorFromVector(dev_ctx, gather_index_vec, dev_ctx, &gather_index); // dev_ctx.Wait(); gather_index.Resize({batch_size, index_length, 2}); @@ -73,30 +73,30 @@ void IndexSampleGather(const Context &dev_ctx, template void IndexSampleKernel(const Context &dev_ctx, - const phi::DenseTensor &x, - const phi::DenseTensor &index, - phi::DenseTensor *out) { + const DenseTensor &x, + const DenseTensor &index, + DenseTensor *out) { PADDLE_GCU_KERNEL_TRACE("index_sample"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { - phi::DenseTensor input_index = MaybeCreateOrTrans64To32bits(dev_ctx, index); + DenseTensor input_index = MaybeCreateOrTrans64To32bits(dev_ctx, index); bool sparse_grad = false; int64_t axis = (x.dims().size() > 1) ? (x.dims().size() - 1) : 0; LAUNCH_TOPSATENOP( topsatenGather, dev_ctx, *out, x, axis, input_index, sparse_grad); // auto index_type = index.dtype(); - // bool index_type_match = index_type == phi::DataType::INT32 || - // index_type == phi::DataType::INT64; + // bool index_type_match = index_type == DataType::INT32 || + // index_type == DataType::INT64; // PADDLE_ENFORCE_EQ(index_type_match, true, // phi::errors::InvalidArgument( // "Input(Index) holds the wrong type, it holds %s, // but " "desires to be %s or %s", - // phi::DataTypeToString(index_type).c_str(), - // phi::DataTypeToString(phi::DataType::INT32).c_str(), - // phi::DataTypeToString(phi::DataType::INT64).c_str())); - // if (index_type == phi::DataType::INT32) { + // DataTypeToString(index_type).c_str(), + // DataTypeToString(DataType::INT32).c_str(), + // DataTypeToString(DataType::INT64).c_str())); + // if (index_type == DataType::INT32) { // IndexSampleGather(dev_ctx, x, index, out); - // } else if (index_type == phi::DataType::INT64) { + // } else if (index_type == DataType::INT64) { // IndexSampleGather(dev_ctx, x, index, out); // } diff --git a/backends/gcu/kernels/index_select_kernel.cc b/backends/gcu/kernels/index_select_kernel.cc index 5219f976247..0f750a3270d 100644 --- a/backends/gcu/kernels/index_select_kernel.cc +++ b/backends/gcu/kernels/index_select_kernel.cc @@ -18,10 +18,10 @@ namespace custom_kernel { template void IndexSelectKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& index, + const DenseTensor& x, + const DenseTensor& index, int dim, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("index_select"); if (LaunchAOTKernel()) { diff --git a/backends/gcu/kernels/instance_norm_kernel.cc b/backends/gcu/kernels/instance_norm_kernel.cc index f3418015072..d1891dbbf82 100644 --- a/backends/gcu/kernels/instance_norm_kernel.cc +++ b/backends/gcu/kernels/instance_norm_kernel.cc @@ -18,13 +18,13 @@ namespace custom_kernel { template void InstanceNormKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& scale, - const paddle::optional& bias, + const DenseTensor& x, + const paddle::optional& scale, + const paddle::optional& bias, float epsilon_f, - phi::DenseTensor* y, - phi::DenseTensor* saved_mean, - phi::DenseTensor* saved_variance) { + DenseTensor* y, + DenseTensor* saved_mean, + DenseTensor* saved_variance) { PADDLE_GCU_KERNEL_TRACE("instance_norm"); dev_ctx.template Alloc(y); // The upper caller does not use these two outputs. @@ -36,16 +36,16 @@ void InstanceNormKernel(const Context& dev_ctx, } if (LaunchAOTKernel()) { - phi::DenseTensor new_scale; - phi::DenseTensor new_bias; + DenseTensor new_scale; + DenseTensor new_bias; if (scale.get_ptr()) { new_scale = scale.get(); } if (bias.get_ptr()) { new_bias = bias.get(); } - const phi::DenseTensor running_mean_null; - const phi::DenseTensor running_var_null; + const DenseTensor running_mean_null; + const DenseTensor running_var_null; // OpAtenInstancehNorm Expected running_mean and running_var exist when // training is false. const bool use_input_stats = true; @@ -124,17 +124,16 @@ void InstanceNormKernel(const Context& dev_ctx, template void InstanceNormGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& scale, - const paddle::optional& bias - UNUSED, - const phi::DenseTensor& saved_mean, - const phi::DenseTensor& saved_variance, - const phi::DenseTensor& d_y, + const DenseTensor& x, + const paddle::optional& scale, + const paddle::optional& bias UNUSED, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const DenseTensor& d_y, float epsilon, - phi::DenseTensor* d_x, - phi::DenseTensor* d_scale, - phi::DenseTensor* d_bias) { + DenseTensor* d_x, + DenseTensor* d_scale, + DenseTensor* d_bias) { PADDLE_GCU_KERNEL_TRACE("instance_norm_grad"); dev_ctx.template Alloc(d_x); dev_ctx.template Alloc(d_scale); diff --git a/backends/gcu/kernels/interpolate_kernels.cc b/backends/gcu/kernels/interpolate_kernels.cc index dd5b5360f2e..718d3991130 100644 --- a/backends/gcu/kernels/interpolate_kernels.cc +++ b/backends/gcu/kernels/interpolate_kernels.cc @@ -124,11 +124,11 @@ void InterpolateKernel( dim_out = {n, out_h, out_w, c}; } - phi::DenseTensorMeta out_meta(output->dtype(), dim_out); + DenseTensorMeta out_meta(output->dtype(), dim_out); output->set_meta(out_meta); ctx.template Alloc(output); if (LaunchAOTKernel()) { - phi::DenseTensor output_perm = *output; + DenseTensor output_perm = *output; if (DataPdCustomNHWC(x)) { PADDLE_ENFORCE_EQ(data_layout, "NCHW", @@ -280,7 +280,7 @@ void InterpolateGradKernel( auto tensors = size_tensor.get(); std::vector in_names; in_names.reserve(tensors.size()); - std::vector in_tensors; + std::vector in_tensors; in_tensors.reserve(tensors.size()); for (size_t i = 0; i < tensors.size(); ++i) { in_names.emplace_back(std::string("size_tensor_") + std::to_string(i)); diff --git a/backends/gcu/kernels/is_empty_kernel.cc b/backends/gcu/kernels/is_empty_kernel.cc index e27af220eca..c565f60bef6 100644 --- a/backends/gcu/kernels/is_empty_kernel.cc +++ b/backends/gcu/kernels/is_empty_kernel.cc @@ -19,8 +19,8 @@ namespace custom_kernel { template void IsEmptyKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { dev_ctx.template HostAlloc(out)[0] = (phi::product(x.dims()) == 0); } @@ -37,5 +37,5 @@ PD_REGISTER_PLUGIN_KERNEL(is_empty, int64_t, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); + kernel->OutputAt(0).SetDataType(DataType::BOOL); } diff --git a/backends/gcu/kernels/isclose_kernel.cc b/backends/gcu/kernels/isclose_kernel.cc index 96099f549d3..a0a4b59f1ed 100644 --- a/backends/gcu/kernels/isclose_kernel.cc +++ b/backends/gcu/kernels/isclose_kernel.cc @@ -18,21 +18,21 @@ namespace custom_kernel { template void IscloseKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, const phi::Scalar& rtol, const phi::Scalar& atol, bool equal_nan, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("isclose"); PADDLE_ENFORCE_EQ( rtol.dtype(), - phi::DataType::FLOAT64, + DataType::FLOAT64, phi::errors::InvalidArgument("Input(Rtol) type must be double")); PADDLE_ENFORCE_EQ( atol.dtype(), - phi::DataType::FLOAT64, + DataType::FLOAT64, phi::errors::InvalidArgument("Input(Atol) type must be double")); dev_ctx.template Alloc(out); diff --git a/backends/gcu/kernels/isfinite_kernel.cc b/backends/gcu/kernels/isfinite_kernel.cc index 76414cc2307..47bb2fd700f 100644 --- a/backends/gcu/kernels/isfinite_kernel.cc +++ b/backends/gcu/kernels/isfinite_kernel.cc @@ -18,8 +18,8 @@ namespace custom_kernel { template void IsFiniteKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("isfinite"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { @@ -38,5 +38,5 @@ PD_REGISTER_PLUGIN_KERNEL(isfinite, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); + kernel->OutputAt(0).SetDataType(DataType::BOOL); } diff --git a/backends/gcu/kernels/isinf_kernel.cc b/backends/gcu/kernels/isinf_kernel.cc index ccae51bb1fd..470fba64bcf 100644 --- a/backends/gcu/kernels/isinf_kernel.cc +++ b/backends/gcu/kernels/isinf_kernel.cc @@ -18,8 +18,8 @@ namespace custom_kernel { template void IsinfKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("isinf"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { @@ -53,5 +53,5 @@ PD_REGISTER_PLUGIN_KERNEL(isinf, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); + kernel->OutputAt(0).SetDataType(DataType::BOOL); } diff --git a/backends/gcu/kernels/isnan_kernel.cc b/backends/gcu/kernels/isnan_kernel.cc index 985a2771f4f..17084f3ad17 100644 --- a/backends/gcu/kernels/isnan_kernel.cc +++ b/backends/gcu/kernels/isnan_kernel.cc @@ -18,8 +18,8 @@ namespace custom_kernel { template void IsNanKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("isnan"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { @@ -38,5 +38,5 @@ PD_REGISTER_PLUGIN_KERNEL(isnan, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); + kernel->OutputAt(0).SetDataType(DataType::BOOL); } diff --git a/backends/gcu/kernels/label_smooth_kernel.cc b/backends/gcu/kernels/label_smooth_kernel.cc index b303a403214..2ddb25e54d8 100644 --- a/backends/gcu/kernels/label_smooth_kernel.cc +++ b/backends/gcu/kernels/label_smooth_kernel.cc @@ -18,10 +18,10 @@ namespace custom_kernel { template void LabelSmoothKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& dist, + const DenseTensor& x, + const paddle::optional& dist, float epsilon, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("label_smooth"); if (dist) { PADDLE_THROW( diff --git a/backends/gcu/kernels/layer_norm_kernel.cc b/backends/gcu/kernels/layer_norm_kernel.cc index 5aa53660b04..883ee8f2def 100644 --- a/backends/gcu/kernels/layer_norm_kernel.cc +++ b/backends/gcu/kernels/layer_norm_kernel.cc @@ -18,14 +18,14 @@ namespace custom_kernel { template void LayerNormKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& scale_opt, - const paddle::optional& bias_opt, + const DenseTensor& x, + const paddle::optional& scale_opt, + const paddle::optional& bias_opt, float epsilon, int begin_norm_axis, - phi::DenseTensor* out, - phi::DenseTensor* mean, - phi::DenseTensor* variance) { + DenseTensor* out, + DenseTensor* mean, + DenseTensor* variance) { PADDLE_GCU_KERNEL_TRACE("layer_norm"); dev_ctx.template Alloc(out); @@ -33,51 +33,47 @@ void LayerNormKernel(const Context& dev_ctx, dev_ctx.template Alloc(variance); if (LaunchAOTKernel()) { - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); std::vector scale_bias_shape; for (int64_t i = begin_norm_axis; i < x.dims().size(); ++i) { scale_bias_shape.push_back(x.dims().at(i)); } - phi::DenseTensor scale_opt_x; + DenseTensor scale_opt_x; if (scale_opt.get_ptr() != nullptr) { scale_opt_x = ReshapeWithoutCopy(scale_opt.get(), scale_bias_shape); } else { - auto meta = - phi::DenseTensorMeta(x.dtype(), phi::make_ddim(scale_bias_shape)); + auto meta = DenseTensorMeta(x.dtype(), phi::make_ddim(scale_bias_shape)); scale_opt_x = TensorOnes(dev_ctx, meta); } - phi::DenseTensor bias_opt_x; + DenseTensor bias_opt_x; if (bias_opt.get_ptr() != nullptr) { bias_opt_x = ReshapeWithoutCopy(bias_opt.get(), scale_bias_shape); } else { - auto meta = - phi::DenseTensorMeta(x.dtype(), phi::make_ddim(scale_bias_shape)); + auto meta = DenseTensorMeta(x.dtype(), phi::make_ddim(scale_bias_shape)); bias_opt_x = TensorZeros(dev_ctx, meta); } - if (x.dtype() == phi::DataType::FLOAT16) { - if (scale_opt_x.dtype() != phi::DataType::FLOAT16) { + if (x.dtype() == DataType::FLOAT16) { + if (scale_opt_x.dtype() != DataType::FLOAT16) { scale_opt_x = - custom_kernel::Cast(dev_ctx, scale_opt_x, phi::DataType::FLOAT16); + custom_kernel::Cast(dev_ctx, scale_opt_x, DataType::FLOAT16); } - if (bias_opt_x.dtype() != phi::DataType::FLOAT16) { + if (bias_opt_x.dtype() != DataType::FLOAT16) { bias_opt_x = - custom_kernel::Cast(dev_ctx, bias_opt_x, phi::DataType::FLOAT16); + custom_kernel::Cast(dev_ctx, bias_opt_x, DataType::FLOAT16); } } - phi::DenseTensor weight = - MaybeCreateOrTrans64To32bits(dev_ctx, scale_opt_x); - phi::DenseTensor bias = MaybeCreateOrTrans64To32bits(dev_ctx, bias_opt_x); + DenseTensor weight = MaybeCreateOrTrans64To32bits(dev_ctx, scale_opt_x); + DenseTensor bias = MaybeCreateOrTrans64To32bits(dev_ctx, bias_opt_x); - phi::DenseTensor output = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); - phi::DenseTensor mean_output = + DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor mean_output = MaybeCreateOrTrans64To32bits(dev_ctx, *mean, false); - phi::DenseTensor variance_output = + DenseTensor variance_output = MaybeCreateOrTrans64To32bits(dev_ctx, *variance, false); double epsilon_d = epsilon; @@ -155,17 +151,17 @@ void LayerNormKernel(const Context& dev_ctx, template void LayerNormGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& scale_opt, - const paddle::optional& bias, - const phi::DenseTensor& mean, - const phi::DenseTensor& variance, - const phi::DenseTensor& out_grad, + const DenseTensor& x, + const paddle::optional& scale_opt, + const paddle::optional& bias, + const DenseTensor& mean, + const DenseTensor& variance, + const DenseTensor& out_grad, float epsilon, int begin_norm_axis, - phi::DenseTensor* x_grad, - phi::DenseTensor* scale_grad, - phi::DenseTensor* bias_grad) { + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { PADDLE_GCU_KERNEL_TRACE("layer_norm_grad"); dev_ctx.template Alloc(x_grad); diff --git a/backends/gcu/kernels/lerp_kernel.cc b/backends/gcu/kernels/lerp_kernel.cc index ce607a4c30d..327a5fa9799 100644 --- a/backends/gcu/kernels/lerp_kernel.cc +++ b/backends/gcu/kernels/lerp_kernel.cc @@ -19,10 +19,10 @@ namespace custom_kernel { template void LerpKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& weight, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("lerp"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { diff --git a/backends/gcu/kernels/llama_stub_kernels.cc b/backends/gcu/kernels/llama_stub_kernels.cc index 09a6302a704..e52accae477 100644 --- a/backends/gcu/kernels/llama_stub_kernels.cc +++ b/backends/gcu/kernels/llama_stub_kernels.cc @@ -18,19 +18,19 @@ namespace custom_kernel { template void RmsNormKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& bias, - const paddle::optional& residual, - const phi::DenseTensor& norm_weight, - const paddle::optional& norm_bias, + const DenseTensor& x, + const paddle::optional& bias, + const paddle::optional& residual, + const DenseTensor& norm_weight, + const paddle::optional& norm_bias, const float epsilon, const int begin_norm_axis, const float quant_scale, const int quant_round_type, const float quant_max_bound, const float quant_min_bound, - phi::DenseTensor* out, - phi::DenseTensor* residual_out) { + DenseTensor* out, + DenseTensor* residual_out) { VLOG(0) << "====== GCU kernel stub: rms_norm ====="; dev_ctx.template Alloc(out); dev_ctx.template Alloc(residual_out); @@ -39,47 +39,46 @@ void RmsNormKernel(const Context& dev_ctx, template void MultiHeadAttentionVariableForwardKernel( const Context& dev_ctx, - const phi::DenseTensor& query, - const phi::DenseTensor& key, - const phi::DenseTensor& value, - const phi::DenseTensor& seq_lens, - const phi::DenseTensor& kv_seq_lens, - const paddle::optional& mask, + const DenseTensor& query, + const DenseTensor& key, + const DenseTensor& value, + const DenseTensor& seq_lens, + const DenseTensor& kv_seq_lens, + const paddle::optional& mask, const float scale, const bool causal, const int pre_cache_length, - phi::DenseTensor* out) { + DenseTensor* out) { VLOG(0) << "====== GCU kernel stub: " "variable_length_memory_efficient_attention ====="; dev_ctx.template Alloc(out); } template -void FusedBiasActKernel( - const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& bias, - const paddle::optional& dequant_scales, - const paddle::optional& shift, - const paddle::optional& smooth, - const std::string& act_method, - const std::string& compute_dtype, - float quant_scale, - int quant_round_type, - float quant_max_bound, - float quant_min_bound, - phi::DenseTensor* out) { +void FusedBiasActKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& bias, + const paddle::optional& dequant_scales, + const paddle::optional& shift, + const paddle::optional& smooth, + const std::string& act_method, + const std::string& compute_dtype, + float quant_scale, + int quant_round_type, + float quant_max_bound, + float quant_min_bound, + DenseTensor* out) { VLOG(0) << "====== GCU kernel stub: fused_bias_act ====="; dev_ctx.template Alloc(out); } template void FusedLayerNormKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& bias, - const paddle::optional& residual, - const paddle::optional& norm_weight, - const paddle::optional& norm_bias, + const DenseTensor& x, + const paddle::optional& bias, + const paddle::optional& residual, + const paddle::optional& norm_weight, + const paddle::optional& norm_bias, const float epsilon, const float residual_alpha, const int begin_norm_axis, @@ -87,10 +86,10 @@ void FusedLayerNormKernel(const Context& dev_ctx, const int quant_round_type, const float quant_max_bound, const float quant_min_bound, - phi::DenseTensor* out, - phi::DenseTensor* residual_out, - phi::DenseTensor* mean, - phi::DenseTensor* variance) { + DenseTensor* out, + DenseTensor* residual_out, + DenseTensor* mean, + DenseTensor* variance) { VLOG(0) << "====== GCU kernel stub: fused_bias_residual_layernorm ====="; dev_ctx.template Alloc(out); dev_ctx.template Alloc(residual_out); @@ -100,17 +99,17 @@ void FusedLayerNormKernel(const Context& dev_ctx, template void MMHAKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& cache_kv, - const paddle::optional& bias, - const paddle::optional& src_mask, - const paddle::optional& cum_offsets, - const paddle::optional& sequence_lengths, - const paddle::optional& rotary_tensor, - const paddle::optional& beam_cache_offset, - const paddle::optional& qkv_out_scale, - const paddle::optional& out_shift, - const paddle::optional& out_smooth, + const DenseTensor& x, + const DenseTensor& cache_kv, + const paddle::optional& bias, + const paddle::optional& src_mask, + const paddle::optional& cum_offsets, + const paddle::optional& sequence_lengths, + const paddle::optional& rotary_tensor, + const paddle::optional& beam_cache_offset, + const paddle::optional& qkv_out_scale, + const paddle::optional& out_shift, + const paddle::optional& out_smooth, int seq_len, int rotary_emb_dims, const bool use_neox_rotary_style, @@ -119,9 +118,9 @@ void MMHAKernel(const Context& dev_ctx, const int quant_round_type, const float quant_max_bound, const float quant_min_bound, - phi::DenseTensor* out, - phi::DenseTensor* cache_kv_out, - phi::DenseTensor* beam_cache_offset_out) { + DenseTensor* out, + DenseTensor* cache_kv_out, + DenseTensor* beam_cache_offset_out) { VLOG(0) << "====== GCU kernel stub: masked_multihead_attention ====="; dev_ctx.template Alloc(out); dev_ctx.template Alloc(cache_kv_out); @@ -150,7 +149,7 @@ PD_REGISTER_PLUGIN_KERNEL( int64_t, phi::dtype::float16, phi::dtype::bfloat16) { - kernel->InputAt(3).SetDataType(phi::DataType::INT32); + kernel->InputAt(3).SetDataType(DataType::INT32); } PD_REGISTER_PLUGIN_KERNEL(fused_bias_act, diff --git a/backends/gcu/kernels/log_loss_kernel.cc b/backends/gcu/kernels/log_loss_kernel.cc index ad0dbe41435..be1474f4d21 100644 --- a/backends/gcu/kernels/log_loss_kernel.cc +++ b/backends/gcu/kernels/log_loss_kernel.cc @@ -18,14 +18,14 @@ namespace custom_kernel { -using Tensor = phi::DenseTensor; +using Tensor = DenseTensor; template void LogLossKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& label, + const DenseTensor& input, + const DenseTensor& label, float epsilon, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("log_loss"); dev_ctx.template Alloc(out); @@ -72,11 +72,11 @@ void LogLossKernel(const Context& dev_ctx, template void LogLossGradKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& label, - const phi::DenseTensor& out_grad, + const DenseTensor& input, + const DenseTensor& label, + const DenseTensor& out_grad, float epsilon, - phi::DenseTensor* in_grad) { + DenseTensor* in_grad) { PADDLE_GCU_KERNEL_TRACE("log_loss_grad"); dev_ctx.template Alloc(in_grad); diff --git a/backends/gcu/kernels/log_softmax_kernel.cc b/backends/gcu/kernels/log_softmax_kernel.cc index 706b86be732..cc7282cadc1 100644 --- a/backends/gcu/kernels/log_softmax_kernel.cc +++ b/backends/gcu/kernels/log_softmax_kernel.cc @@ -18,9 +18,9 @@ namespace custom_kernel { template void LogSoftmaxKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("log_softmax"); dev_ctx.template Alloc(out); @@ -55,10 +55,10 @@ void LogSoftmaxKernel(const Context& dev_ctx, template void LogSoftmaxGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& dout, + const DenseTensor& out, + const DenseTensor& dout, int axis, - phi::DenseTensor* dx) { + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("log_softmax_grad"); dev_ctx.template Alloc(dx); diff --git a/backends/gcu/kernels/logcumsumexp_kernel.cc b/backends/gcu/kernels/logcumsumexp_kernel.cc index c45685e28b7..1d2d96c3815 100644 --- a/backends/gcu/kernels/logcumsumexp_kernel.cc +++ b/backends/gcu/kernels/logcumsumexp_kernel.cc @@ -18,15 +18,15 @@ namespace custom_kernel { template void LogcumsumexpKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, bool flatten, bool exclusive, bool reverse, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("logcumsumexp"); if (LaunchAOTKernel()) { - phi::DenseTensor input_tensor(x); + DenseTensor input_tensor(x); if (flatten) { PADDLE_ENFORCE_EQ( axis, diff --git a/backends/gcu/kernels/logical_kernels.cc b/backends/gcu/kernels/logical_kernels.cc index f0213632521..7f916bf56a8 100644 --- a/backends/gcu/kernels/logical_kernels.cc +++ b/backends/gcu/kernels/logical_kernels.cc @@ -18,20 +18,20 @@ namespace custom_kernel { template void LogicalAndKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("logical_and"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { - // phi::DenseTensor real_x = x; - // phi::DenseTensor real_y = y; - // if (x.dtype() != phi::DataType::BOOL) { - // real_x = custom_kernel::Cast(dev_ctx, x, phi::DataType::BOOL); + // DenseTensor real_x = x; + // DenseTensor real_y = y; + // if (x.dtype() != DataType::BOOL) { + // real_x = custom_kernel::Cast(dev_ctx, x, DataType::BOOL); // } - // if (y.dtype() != phi::DataType::BOOL) { - // real_y = custom_kernel::Cast(dev_ctx, y, phi::DataType::BOOL); + // if (y.dtype() != DataType::BOOL) { + // real_y = custom_kernel::Cast(dev_ctx, y, DataType::BOOL); // } // LAUNCH_TOPSATENOP(topsatenBitwiseAnd, dev_ctx, *out, real_x, real_y); LAUNCH_TOPSATENOP(topsatenLogicalAnd, dev_ctx, *out, x, y); @@ -65,15 +65,15 @@ void LogicalAndKernel(const Context& dev_ctx, template void LogicalNotKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("logical_not"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { - // phi::DenseTensor real_x = x; - // if (x.dtype() != phi::DataType::BOOL) { - // real_x = custom_kernel::Cast(dev_ctx, x, phi::DataType::BOOL); + // DenseTensor real_x = x; + // if (x.dtype() != DataType::BOOL) { + // real_x = custom_kernel::Cast(dev_ctx, x, DataType::BOOL); // } // LAUNCH_TOPSATENOP(topsatenBitwiseNot, dev_ctx, *out, real_x); LAUNCH_TOPSATENOP(topsatenLogicalNot, dev_ctx, *out, x); @@ -105,20 +105,20 @@ void LogicalNotKernel(const Context& dev_ctx, template void LogicalOrKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("logical_or"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { - // phi::DenseTensor real_x = x; - // phi::DenseTensor real_y = y; - // if (x.dtype() != phi::DataType::BOOL) { - // real_x = custom_kernel::Cast(dev_ctx, x, phi::DataType::BOOL); + // DenseTensor real_x = x; + // DenseTensor real_y = y; + // if (x.dtype() != DataType::BOOL) { + // real_x = custom_kernel::Cast(dev_ctx, x, DataType::BOOL); // } - // if (y.dtype() != phi::DataType::BOOL) { - // real_y = custom_kernel::Cast(dev_ctx, y, phi::DataType::BOOL); + // if (y.dtype() != DataType::BOOL) { + // real_y = custom_kernel::Cast(dev_ctx, y, DataType::BOOL); // } // LAUNCH_TOPSATENOP(topsatenBitwiseOr, dev_ctx, *out, real_x, real_y); LAUNCH_TOPSATENOP(topsatenLogicalOr, dev_ctx, *out, x, y); @@ -130,20 +130,20 @@ void LogicalOrKernel(const Context& dev_ctx, template void LogicalXorKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("logical_xor"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { - // phi::DenseTensor real_x = x; - // phi::DenseTensor real_y = y; - // if (x.dtype() != phi::DataType::BOOL) { - // real_x = custom_kernel::Cast(dev_ctx, x, phi::DataType::BOOL); + // DenseTensor real_x = x; + // DenseTensor real_y = y; + // if (x.dtype() != DataType::BOOL) { + // real_x = custom_kernel::Cast(dev_ctx, x, DataType::BOOL); // } - // if (y.dtype() != phi::DataType::BOOL) { - // real_y = custom_kernel::Cast(dev_ctx, y, phi::DataType::BOOL); + // if (y.dtype() != DataType::BOOL) { + // real_y = custom_kernel::Cast(dev_ctx, y, DataType::BOOL); // } // LAUNCH_TOPSATENOP(topsatenBitwiseOr, dev_ctx, *out, real_x, real_y); LAUNCH_TOPSATENOP(topsatenLogicalXor, dev_ctx, *out, x, y); @@ -162,7 +162,7 @@ PD_REGISTER_PLUGIN_KERNEL(logical_and, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); + kernel->OutputAt(0).SetDataType(DataType::BOOL); } PD_REGISTER_PLUGIN_KERNEL(logical_not, @@ -173,7 +173,7 @@ PD_REGISTER_PLUGIN_KERNEL(logical_not, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); + kernel->OutputAt(0).SetDataType(DataType::BOOL); } PD_REGISTER_PLUGIN_KERNEL(logical_or, @@ -184,7 +184,7 @@ PD_REGISTER_PLUGIN_KERNEL(logical_or, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); + kernel->OutputAt(0).SetDataType(DataType::BOOL); } PD_REGISTER_PLUGIN_KERNEL(logical_xor, @@ -195,5 +195,5 @@ PD_REGISTER_PLUGIN_KERNEL(logical_xor, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); + kernel->OutputAt(0).SetDataType(DataType::BOOL); } diff --git a/backends/gcu/kernels/logsumexp_kernel.cc b/backends/gcu/kernels/logsumexp_kernel.cc index e2fd6d7f0f0..fad8794e0d6 100644 --- a/backends/gcu/kernels/logsumexp_kernel.cc +++ b/backends/gcu/kernels/logsumexp_kernel.cc @@ -18,11 +18,11 @@ namespace custom_kernel { template void LogsumexpKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axis, bool keepdim, bool reduce_all, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("logsumexp"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); diff --git a/backends/gcu/kernels/masked_select_kernel.cc b/backends/gcu/kernels/masked_select_kernel.cc index 21d84af5d27..c32c0da9df9 100644 --- a/backends/gcu/kernels/masked_select_kernel.cc +++ b/backends/gcu/kernels/masked_select_kernel.cc @@ -19,9 +19,9 @@ namespace custom_kernel { template void MaskedSelectKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& mask, - phi::DenseTensor* out) { + const DenseTensor& x, + const DenseTensor& mask, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("masked_select"); if (LaunchAOTKernel()) { // topsatenMaskedSelect does not refresh the meta information of output. @@ -85,10 +85,10 @@ void MaskedSelectKernel(const Context& dev_ctx, template void MaskedSelectGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& mask, - const phi::DenseTensor& out_grad, - phi::DenseTensor* x_grad) { + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& out_grad, + DenseTensor* x_grad) { PADDLE_GCU_KERNEL_TRACE("masked_select_grad"); if (LaunchAOTKernel()) { @@ -131,7 +131,7 @@ PD_REGISTER_PLUGIN_KERNEL(masked_select, phi::dtype::bfloat16, float, int) { - kernel->InputAt(1).SetDataType(phi::DataType::BOOL); + kernel->InputAt(1).SetDataType(DataType::BOOL); } // PD_REGISTER_PLUGIN_KERNEL(masked_select_grad, @@ -142,5 +142,5 @@ PD_REGISTER_PLUGIN_KERNEL(masked_select, // float, // int, // int64_t) { -// kernel->InputAt(1).SetDataType(phi::DataType::BOOL); +// kernel->InputAt(1).SetDataType(DataType::BOOL); // } diff --git a/backends/gcu/kernels/matmul_kernel.cc b/backends/gcu/kernels/matmul_kernel.cc index bd7567d06a8..f38e1276b1a 100644 --- a/backends/gcu/kernels/matmul_kernel.cc +++ b/backends/gcu/kernels/matmul_kernel.cc @@ -16,7 +16,7 @@ #include "kernels/funcs/gcu_kernel_funcs.h" namespace custom_kernel { -void AdjustStrides(phi::DenseTensor& tensor) { // NOLINT +void AdjustStrides(DenseTensor& tensor) { // NOLINT size_t rank = tensor.dims().size(); if (rank <= 1) { return; @@ -30,17 +30,17 @@ void AdjustStrides(phi::DenseTensor& tensor) { // NOLINT template void MatmulKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, bool trans_x, bool trans_y, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("matmul"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { - phi::DenseTensor input_x = x; - phi::DenseTensor input_y = y; + DenseTensor input_x = x; + DenseTensor input_y = y; if (trans_x) { AdjustStrides(input_x); } @@ -80,13 +80,13 @@ void MatmulKernel(const Context& dev_ctx, template void MatmulGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, bool trans_x, bool trans_y, - phi::DenseTensor* dx, - phi::DenseTensor* dy) { + DenseTensor* dx, + DenseTensor* dy) { PADDLE_GCU_KERNEL_TRACE("matmul_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -130,14 +130,14 @@ void MatmulGradKernel(const Context& dev_ctx, template void MatmulWithFlattenKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& y, + const DenseTensor& x, + const DenseTensor& y, int x_num_col_dims, int y_num_col_dims, - phi::DenseTensor* out) { - const phi::DenseTensor x_matrix = + DenseTensor* out) { + const DenseTensor x_matrix = x.dims().size() > 2 ? phi::ReshapeToMatrix(x, x_num_col_dims) : x; - const phi::DenseTensor y_matrix = + const DenseTensor y_matrix = y.dims().size() > 2 ? phi::ReshapeToMatrix(y, y_num_col_dims) : y; dev_ctx.template Alloc(out); diff --git a/backends/gcu/kernels/mean_all_kernel.cc b/backends/gcu/kernels/mean_all_kernel.cc index ca39ed47d52..5adb843ae44 100644 --- a/backends/gcu/kernels/mean_all_kernel.cc +++ b/backends/gcu/kernels/mean_all_kernel.cc @@ -19,8 +19,8 @@ namespace custom_kernel { template void MeanAllKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("mean_all"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { @@ -51,9 +51,9 @@ void MeanAllKernel(const Context& dev_ctx, template void MeanAllGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& grad, - phi::DenseTensor* x_grad) { + const DenseTensor& x, + const DenseTensor& grad, + DenseTensor* x_grad) { PADDLE_GCU_KERNEL_TRACE("mean_all_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); diff --git a/backends/gcu/kernels/memcpy_kernels.cc b/backends/gcu/kernels/memcpy_kernels.cc index ed5adffb803..1696994d7a5 100644 --- a/backends/gcu/kernels/memcpy_kernels.cc +++ b/backends/gcu/kernels/memcpy_kernels.cc @@ -19,9 +19,9 @@ namespace custom_kernel { template void MemcpyKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int dst_place_type, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("memcpy"); if (!x.initialized()) { return; @@ -37,7 +37,7 @@ void MemcpyKernel(const Context& dev_ctx, // CUSTOM_DEVICE = 6, // }; if (dst_place_type == 0) { // CPU - TensorCopy(dev_ctx, x, false, out, phi::CPUPlace()); + TensorCopy(dev_ctx, x, false, out, CPUPlace()); } else if (dst_place_type == 6) { // custom_device TensorCopy(dev_ctx, x, false, out, dev_ctx.GetPlace()); } else { @@ -49,9 +49,9 @@ void MemcpyKernel(const Context& dev_ctx, template void MemcpyH2DKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int dst_place_type, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("memcpy_h2d"); TensorCopy(dev_ctx, x, false, out, dev_ctx.GetPlace()); dev_ctx.Wait(); @@ -60,15 +60,15 @@ void MemcpyH2DKernel(const Context& dev_ctx, // used in new executor, for memory copy from device to host template void MemcpyD2HKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int dst_place_type, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("memcpy_d2h"); if (x.storage_properties_initialized()) { PADDLE_THROW( phi::errors::Unimplemented("storage_properties is not supported yet.")); } else { - TensorCopy(dev_ctx, x, false, out, phi::CPUPlace()); + TensorCopy(dev_ctx, x, false, out, CPUPlace()); } dev_ctx.Wait(); } diff --git a/backends/gcu/kernels/merged_adam_kernel.cc b/backends/gcu/kernels/merged_adam_kernel.cc index e30a3a55172..faf3726117a 100644 --- a/backends/gcu/kernels/merged_adam_kernel.cc +++ b/backends/gcu/kernels/merged_adam_kernel.cc @@ -16,19 +16,18 @@ #include "kernels/funcs/gcu_kernel_funcs.h" namespace custom_kernel { -static void CheckInputs( - const std::vector& param, - const std::vector& grad, - const std::vector& learning_rate, - const std::vector& moment1, - const std::vector& moment2, - const std::vector& beta1_pow, - const std::vector& beta2_pow, - std::vector param_out, - std::vector moment1_out, - std::vector moment2_out, - std::vector beta1_pow_out, - std::vector beta2_pow_out) { +static void CheckInputs(const std::vector& param, + const std::vector& grad, + const std::vector& learning_rate, + const std::vector& moment1, + const std::vector& moment2, + const std::vector& beta1_pow, + const std::vector& beta2_pow, + std::vector param_out, + std::vector moment1_out, + std::vector moment2_out, + std::vector beta1_pow_out, + std::vector beta2_pow_out) { size_t param_num = param.size(); PADDLE_ENFORCE_GT(param_num, 0); PADDLE_ENFORCE_EQ( @@ -128,25 +127,25 @@ static void CheckInputs( template void MergedAdamKernel( const Context& dev_ctx, - const std::vector& param, - const std::vector& grad, - const std::vector& learning_rate, - const std::vector& moment1, - const std::vector& moment2, - const std::vector& beta1_pow, - const std::vector& beta2_pow, - const paddle::optional>& master_param, + const std::vector& param, + const std::vector& grad, + const std::vector& learning_rate, + const std::vector& moment1, + const std::vector& moment2, + const std::vector& beta1_pow, + const std::vector& beta2_pow, + const paddle::optional>& master_param, const phi::Scalar& beta1, const phi::Scalar& beta2, const phi::Scalar& epsilon, bool multi_precision, bool use_global_beta_pow, - std::vector param_out, - std::vector moment1_out, - std::vector moment2_out, - std::vector beta1_pow_out, - std::vector beta2_pow_out, - std::vector master_param_out) { + std::vector param_out, + std::vector moment1_out, + std::vector moment2_out, + std::vector beta1_pow_out, + std::vector beta2_pow_out, + std::vector master_param_out) { PADDLE_GCU_KERNEL_TRACE("merged_adam"); CheckInputs(param, grad, @@ -167,10 +166,10 @@ void MergedAdamKernel( size_t param_num = param.size(); // beta1_pow and beta2_pow may on CPU and not transform place. - std::vector> beta1_pow_gcu; + std::vector> beta1_pow_gcu; if (beta1_pow[0]->place().GetType() == phi::AllocationType::CPU) { for (size_t i = 0; i < param_num; ++i) { - auto beta1_pow_tmp = std::make_shared(); + auto beta1_pow_tmp = std::make_shared(); T beta1 = *(beta1_pow[i]->data()); beta1_pow_tmp->Resize({1}); dev_ctx.template Alloc(beta1_pow_tmp.get()); @@ -179,10 +178,10 @@ void MergedAdamKernel( } } - std::vector> beta2_pow_gcu; + std::vector> beta2_pow_gcu; if (beta2_pow[0]->place().GetType() == phi::AllocationType::CPU) { for (size_t i = 0; i < param_num; ++i) { - auto beta2_pow_tmp = std::make_shared(); + auto beta2_pow_tmp = std::make_shared(); T beta2 = *(beta2_pow[i]->data()); beta2_pow_tmp->Resize({1}); dev_ctx.template Alloc(beta2_pow_tmp.get()); @@ -221,11 +220,11 @@ void MergedAdamKernel( outputs["Beta1PowOut"].reserve(param_num); outputs["Beta2PowOut"].reserve(param_num); - std::vector> param_outs_tmp; - std::vector> moment1_outs_tmp; - std::vector> moment2_outs_tmp; - std::vector> beta1_pow_outs_tmp; - std::vector> beta2_pow_outs_tmp; + std::vector> param_outs_tmp; + std::vector> moment1_outs_tmp; + std::vector> moment2_outs_tmp; + std::vector> beta1_pow_outs_tmp; + std::vector> beta2_pow_outs_tmp; param_outs_tmp.reserve(param_num); moment1_outs_tmp.reserve(param_num); moment2_outs_tmp.reserve(param_num); @@ -276,27 +275,27 @@ void MergedAdamKernel( output_names["Beta2PowOut"].emplace_back(std::string("beta2_pow_out") + std::to_string(i)); - auto param_out_tmp = std::make_shared(); + auto param_out_tmp = std::make_shared(); param_out_tmp->set_meta(param_out[i]->meta()); dev_ctx.template Alloc(param_out_tmp.get()); param_outs_tmp.emplace_back(param_out_tmp); - auto moment1_out_tmp = std::make_shared(); + auto moment1_out_tmp = std::make_shared(); moment1_out_tmp->set_meta(moment1_out[i]->meta()); dev_ctx.template Alloc(moment1_out_tmp.get()); moment1_outs_tmp.emplace_back(moment1_out_tmp); - auto moment2_out_tmp = std::make_shared(); + auto moment2_out_tmp = std::make_shared(); moment2_out_tmp->set_meta(moment2_out[i]->meta()); dev_ctx.template Alloc(moment2_out_tmp.get()); moment2_outs_tmp.emplace_back(moment2_out_tmp); - auto beta1_pow_out_tmp = std::make_shared(); + auto beta1_pow_out_tmp = std::make_shared(); beta1_pow_out_tmp->set_meta(beta1_pow_out[i]->meta()); dev_ctx.template Alloc(beta1_pow_out_tmp.get()); beta1_pow_outs_tmp.emplace_back(beta1_pow_out_tmp); - auto beta2_pow_out_tmp = std::make_shared(); + auto beta2_pow_out_tmp = std::make_shared(); beta2_pow_out_tmp->set_meta(beta2_pow_out[i]->meta()); dev_ctx.template Alloc(beta2_pow_out_tmp.get()); beta2_pow_outs_tmp.emplace_back(beta2_pow_out_tmp); @@ -374,13 +373,13 @@ PD_REGISTER_PLUGIN_KERNEL(merged_adam, kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); - if (kernel_key.dtype() == phi::DataType::FLOAT16 || - kernel_key.dtype() == phi::DataType::BFLOAT16) { - kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(5).SetDataType(phi::DataType::FLOAT32); + if (kernel_key.dtype() == DataType::FLOAT16 || + kernel_key.dtype() == DataType::BFLOAT16) { + kernel->OutputAt(1).SetDataType(DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(DataType::FLOAT32); + kernel->OutputAt(5).SetDataType(DataType::FLOAT32); } kernel->OutputAt(3).SetBackend(phi::Backend::UNDEFINED); kernel->OutputAt(4).SetBackend(phi::Backend::UNDEFINED); diff --git a/backends/gcu/kernels/merged_momentum_kernel.cc b/backends/gcu/kernels/merged_momentum_kernel.cc index ab621a82c6e..f371dff298a 100644 --- a/backends/gcu/kernels/merged_momentum_kernel.cc +++ b/backends/gcu/kernels/merged_momentum_kernel.cc @@ -16,15 +16,14 @@ #include "kernels/funcs/gcu_kernel_funcs.h" namespace custom_kernel { -static void CheckInputs( - const std::vector& param, - const std::vector& grad, - const std::vector& velocity, - const std::vector& learning_rate, - const std::vector& regularization_method, - const std::vector& regularization_coeff, - std::vector param_out, - std::vector velocity_out) { +static void CheckInputs(const std::vector& param, + const std::vector& grad, + const std::vector& velocity, + const std::vector& learning_rate, + const std::vector& regularization_method, + const std::vector& regularization_coeff, + std::vector param_out, + std::vector velocity_out) { size_t param_num = param.size(); PADDLE_ENFORCE_GT(param_num, 0); PADDLE_ENFORCE_EQ( @@ -98,20 +97,20 @@ static void CheckInputs( template void MergedMomentumKernel( const Context& dev_ctx, - const std::vector& param, - const std::vector& grad, - const std::vector& velocity, - const std::vector& learning_rate, - const paddle::optional>& master_param, + const std::vector& param, + const std::vector& grad, + const std::vector& velocity, + const std::vector& learning_rate, + const paddle::optional>& master_param, float mu, bool use_nesterov, const std::vector& regularization_method, const std::vector& regularization_coeff, bool multi_precision, float rescale_grad, - std::vector param_out, - std::vector velocity_out, - std::vector master_param_out) { + std::vector param_out, + std::vector velocity_out, + std::vector master_param_out) { PADDLE_GCU_KERNEL_TRACE("merged_momentum"); CheckInputs(param, grad, @@ -142,8 +141,8 @@ void MergedMomentumKernel( output_names["ParamOut"].reserve(param_num); outputs["VelocityOut"].reserve(param_num); outputs["ParamOut"].reserve(param_num); - std::vector> param_outs_tmp; - std::vector> velocity_outs_tmp; + std::vector> param_outs_tmp; + std::vector> velocity_outs_tmp; param_outs_tmp.reserve(param_num); velocity_outs_tmp.reserve(param_num); @@ -168,12 +167,12 @@ void MergedMomentumKernel( output_names["VelocityOut"].emplace_back(std::string("velocity_out") + std::to_string(i)); - auto param_out_tmp = std::make_shared(); + auto param_out_tmp = std::make_shared(); param_out_tmp->set_meta(param_out[i]->meta()); dev_ctx.template Alloc(param_out_tmp.get()); param_outs_tmp.emplace_back(param_out_tmp); - auto velocity_out_tmp = std::make_shared(); + auto velocity_out_tmp = std::make_shared(); velocity_out_tmp->set_meta(velocity_out[i]->meta()); dev_ctx.template Alloc(velocity_out_tmp.get()); velocity_outs_tmp.emplace_back(velocity_out_tmp); diff --git a/backends/gcu/kernels/meshgrid_kernel.cc b/backends/gcu/kernels/meshgrid_kernel.cc index 8d29f42fdf3..9879fda0268 100644 --- a/backends/gcu/kernels/meshgrid_kernel.cc +++ b/backends/gcu/kernels/meshgrid_kernel.cc @@ -18,8 +18,8 @@ namespace custom_kernel { template void MeshgridKernel(const Context& dev_ctx, - const std::vector& ins, - std::vector outs) { + const std::vector& ins, + std::vector outs) { PADDLE_GCU_KERNEL_TRACE("meshgrid"); size_t tensor_size = ins.size(); PADDLE_ENFORCE_EQ( @@ -61,12 +61,12 @@ void MeshgridKernel(const Context& dev_ctx, } else { // kernel impl base on JIT std::vector in_names; in_names.reserve(tensor_size); - std::vector in_tensors; + std::vector in_tensors; in_tensors.reserve(tensor_size); std::vector out_names; out_names.reserve(outs.size()); - std::vector out_tensors; + std::vector out_tensors; out_tensors.reserve(outs.size()); for (size_t i = 0; i < tensor_size; ++i) { diff --git a/backends/gcu/kernels/momentum_kernel.cc b/backends/gcu/kernels/momentum_kernel.cc index fe4290eadf0..e370b2192eb 100644 --- a/backends/gcu/kernels/momentum_kernel.cc +++ b/backends/gcu/kernels/momentum_kernel.cc @@ -18,20 +18,20 @@ namespace custom_kernel { template void MomentumKernel(const Context& dev_ctx, - const phi::DenseTensor& param, - const phi::DenseTensor& grad, - const phi::DenseTensor& velocity, - const phi::DenseTensor& learning_rate, - const paddle::optional& master_param, + const DenseTensor& param, + const DenseTensor& grad, + const DenseTensor& velocity, + const DenseTensor& learning_rate, + const paddle::optional& master_param, float mu_f, bool use_nesterov, const std::string& regularization_method, float regularization_coeff, bool multi_precision, float rescale_grad, - phi::DenseTensor* param_out, - phi::DenseTensor* velocity_out, - phi::DenseTensor* master_param_out) { + DenseTensor* param_out, + DenseTensor* velocity_out, + DenseTensor* master_param_out) { PADDLE_GCU_KERNEL_TRACE("momentum"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -51,11 +51,11 @@ void MomentumKernel(const Context& dev_ctx, inputs["Velocity"] = {const_cast(&velocity)}; inputs["LearningRate"] = {const_cast(&learning_rate)}; - phi::DenseTensor param_out_tmp; + DenseTensor param_out_tmp; param_out_tmp.set_meta(param_out->meta()); dev_ctx.template Alloc(¶m_out_tmp); - phi::DenseTensor velocity_out_tmp; + DenseTensor velocity_out_tmp; velocity_out_tmp.set_meta(velocity_out->meta()); dev_ctx.template Alloc(&velocity_out_tmp); diff --git a/backends/gcu/kernels/multiclass_nms3_kernel.cc b/backends/gcu/kernels/multiclass_nms3_kernel.cc index 508394e145b..57979a9f141 100644 --- a/backends/gcu/kernels/multiclass_nms3_kernel.cc +++ b/backends/gcu/kernels/multiclass_nms3_kernel.cc @@ -19,9 +19,9 @@ namespace custom_kernel { template void MultiClassNMSKernel(const Context& dev_ctx, - const phi::DenseTensor& bboxes, - const phi::DenseTensor& scores, - const paddle::optional& rois_num, + const DenseTensor& bboxes, + const DenseTensor& scores, + const paddle::optional& rois_num, float score_threshold, int nms_top_k, int keep_top_k, @@ -29,9 +29,9 @@ void MultiClassNMSKernel(const Context& dev_ctx, bool normalized, float nms_eta, int background_label, - phi::DenseTensor* out, - phi::DenseTensor* index, - phi::DenseTensor* nms_rois_num) { + DenseTensor* out, + DenseTensor* index, + DenseTensor* nms_rois_num) { PADDLE_GCU_KERNEL_TRACE("multiclass_nms3"); if (LaunchAOTKernel()) { ContextPinnedGuard ctx_pinned_guard(dev_ctx); @@ -39,37 +39,35 @@ void MultiClassNMSKernel(const Context& dev_ctx, VLOG(6) << "[CPU_KERNEL] Call CPU kernel for multiclass_nms3"; // Copy bboxes to CPU - phi::DenseTensor bboxes_cpu; - phi::DenseTensor bboxes_gcu; - if (bboxes.dtype() == phi::DataType::FLOAT16) { - custom_kernel::Cast(dev_ctx, bboxes, phi::DataType::FLOAT32, &bboxes_gcu); + DenseTensor bboxes_cpu; + DenseTensor bboxes_gcu; + if (bboxes.dtype() == DataType::FLOAT16) { + custom_kernel::Cast(dev_ctx, bboxes, DataType::FLOAT32, &bboxes_gcu); } else { bboxes_gcu = bboxes; } - TensorCopy(dev_ctx, bboxes_gcu, false, &bboxes_cpu, phi::CPUPlace()); + TensorCopy(dev_ctx, bboxes_gcu, false, &bboxes_cpu, CPUPlace()); // Copy scores to CPU - phi::DenseTensor scores_cpu; - phi::DenseTensor scores_gcu; - if (scores.dtype() == phi::DataType::FLOAT16) { - custom_kernel::Cast(dev_ctx, scores, phi::DataType::FLOAT32, &scores_gcu); + DenseTensor scores_cpu; + DenseTensor scores_gcu; + if (scores.dtype() == DataType::FLOAT16) { + custom_kernel::Cast(dev_ctx, scores, DataType::FLOAT32, &scores_gcu); } else { scores_gcu = scores; } - TensorCopy(dev_ctx, scores_gcu, false, &scores_cpu, phi::CPUPlace()); + TensorCopy(dev_ctx, scores_gcu, false, &scores_cpu, CPUPlace()); // Copy rois_num to CPU if need - paddle::optional rois_num_cpu = - paddle::optional(); + paddle::optional rois_num_cpu = + paddle::optional(); if (rois_num) { - phi::DenseTensor rois_num_tensor = rois_num.get(); - phi::DenseTensor rois_num_tensor_cpu; - phi::DenseTensor rois_num_tensor_gcu; - if (rois_num_tensor.dtype() == phi::DataType::FLOAT16) { - custom_kernel::Cast(dev_ctx, - rois_num_tensor, - phi::DataType::FLOAT32, - &rois_num_tensor_gcu); + DenseTensor rois_num_tensor = rois_num.get(); + DenseTensor rois_num_tensor_cpu; + DenseTensor rois_num_tensor_gcu; + if (rois_num_tensor.dtype() == DataType::FLOAT16) { + custom_kernel::Cast( + dev_ctx, rois_num_tensor, DataType::FLOAT32, &rois_num_tensor_gcu); } else { rois_num_tensor_gcu = rois_num_tensor; } @@ -77,49 +75,48 @@ void MultiClassNMSKernel(const Context& dev_ctx, rois_num_tensor_gcu, false, &rois_num_tensor_cpu, - phi::CPUPlace()); - rois_num_cpu = - paddle::make_optional(rois_num_tensor_cpu); + CPUPlace()); + rois_num_cpu = paddle::make_optional(rois_num_tensor_cpu); } // Wait for data ready dev_ctx.Wait(); - phi::DenseTensor out_cpu = *out; - if (out->dtype() == phi::DataType::FLOAT16) { - phi::DenseTensorMeta cpu_meta(phi::DataType::FLOAT32, out->dims()); + DenseTensor out_cpu = *out; + if (out->dtype() == DataType::FLOAT16) { + DenseTensorMeta cpu_meta(DataType::FLOAT32, out->dims()); out_cpu.set_meta(cpu_meta); } - phi::DenseTensor index_cpu = *index; - phi::DenseTensor nms_rois_num_cpu = *nms_rois_num; + DenseTensor index_cpu = *index; + DenseTensor nms_rois_num_cpu = *nms_rois_num; // Call the CPU implementation - phi::CPUContext dev_ctx_cpu; + CPUContext dev_ctx_cpu; dev_ctx_cpu.SetAllocator(&(dev_ctx.GetHostAllocator())); dev_ctx_cpu.SetHostAllocator(&(dev_ctx.GetHostAllocator())); - phi::MultiClassNMSKernel(dev_ctx_cpu, - bboxes_cpu, - scores_cpu, - rois_num_cpu, - score_threshold, - nms_top_k, - keep_top_k, - nms_threshold, - normalized, - nms_eta, - background_label, - &out_cpu, - &index_cpu, - &nms_rois_num_cpu); + phi::MultiClassNMSKernel(dev_ctx_cpu, + bboxes_cpu, + scores_cpu, + rois_num_cpu, + score_threshold, + nms_top_k, + keep_top_k, + nms_threshold, + normalized, + nms_eta, + background_label, + &out_cpu, + &index_cpu, + &nms_rois_num_cpu); dev_ctx.Wait(); // convert result - phi::DenseTensor out_gcu; + DenseTensor out_gcu; TensorCopy(dev_ctx, out_cpu, false, &out_gcu); - if (out->dtype() == phi::DataType::FLOAT16) { - custom_kernel::Cast(dev_ctx, out_gcu, phi::DataType::FLOAT16, out); + if (out->dtype() == DataType::FLOAT16) { + custom_kernel::Cast(dev_ctx, out_gcu, DataType::FLOAT16, out); } else { *out = out_gcu; } @@ -142,6 +139,6 @@ PD_REGISTER_PLUGIN_KERNEL(multiclass_nms3, phi::dtype::float16, phi::dtype::bfloat16, float) { - kernel->OutputAt(1).SetDataType(phi::DataType::INT32); - kernel->OutputAt(2).SetDataType(phi::DataType::INT32); + kernel->OutputAt(1).SetDataType(DataType::INT32); + kernel->OutputAt(2).SetDataType(DataType::INT32); } diff --git a/backends/gcu/kernels/multinomial_kernel.cc b/backends/gcu/kernels/multinomial_kernel.cc index 6025d983ce4..9f699575294 100644 --- a/backends/gcu/kernels/multinomial_kernel.cc +++ b/backends/gcu/kernels/multinomial_kernel.cc @@ -17,10 +17,10 @@ namespace custom_kernel { template void MultinomialKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& num, bool replacement, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("multinomial"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -30,7 +30,7 @@ void MultinomialKernel(const Context& dev_ctx, // seed_offset.first = gen->GetCurrentSeed(); // seed_offset.second = 0; // auto num_samples = num.to(); - // phi::DenseTensor output = + // DenseTensor output = // MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); // LAUNCH_TOPSATENOP(topsatenMultinomial, // dev_ctx, @@ -46,31 +46,31 @@ void MultinomialKernel(const Context& dev_ctx, VLOG(6) << "[CPU_KERNEL] Call CPU kernel for multinomial(float16)"; PADDLE_ENFORCE_EQ( x.dtype(), - phi::DataType::FLOAT16, + DataType::FLOAT16, phi::errors::InvalidArgument("Only float16 is supported, but got % s.", - phi::DataTypeToString(x.dtype()).c_str())); + DataTypeToString(x.dtype()).c_str())); - phi::DenseTensor x_gcu_f32; - phi::DenseTensor x_cpu_f32; - phi::DenseTensor out_cpu_int64; + DenseTensor x_gcu_f32; + DenseTensor x_cpu_f32; + DenseTensor out_cpu_int64; // convert input - phi::DenseTensorMeta gcu_meta = x.meta(); - gcu_meta.dtype = phi::DataType::FLOAT32; + DenseTensorMeta gcu_meta = x.meta(); + gcu_meta.dtype = DataType::FLOAT32; x_gcu_f32.set_meta(gcu_meta); - custom_kernel::Cast(dev_ctx, x, phi::DataType::FLOAT32, &x_gcu_f32); - TensorCopy(dev_ctx, x_gcu_f32, false, &x_cpu_f32, phi::CPUPlace()); + custom_kernel::Cast(dev_ctx, x, DataType::FLOAT32, &x_gcu_f32); + TensorCopy(dev_ctx, x_gcu_f32, false, &x_cpu_f32, CPUPlace()); // Wait for conversion dev_ctx.Wait(); // call the CPU implementation - phi::CPUContext dev_ctx_cpu; + CPUContext dev_ctx_cpu; dev_ctx_cpu.SetAllocator(&(dev_ctx.GetHostAllocator())); dev_ctx_cpu.SetHostAllocator(&(dev_ctx.GetHostAllocator())); dev_ctx_cpu.SetHostGenerator(dev_ctx.GetHostGenerator()); out_cpu_int64.set_meta(out->meta()); - phi::MultinomialKernel( + phi::MultinomialKernel( dev_ctx_cpu, x_cpu_f32, num, replacement, &out_cpu_int64); dev_ctx.Wait(); @@ -90,5 +90,5 @@ PD_REGISTER_PLUGIN_KERNEL(multinomial, custom_kernel::MultinomialKernel, // float, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(phi::DataType::INT64); + kernel->OutputAt(0).SetDataType(DataType::INT64); } diff --git a/backends/gcu/kernels/nms_kernel.cc b/backends/gcu/kernels/nms_kernel.cc index 642c2d3ef05..b7ff3bccddf 100644 --- a/backends/gcu/kernels/nms_kernel.cc +++ b/backends/gcu/kernels/nms_kernel.cc @@ -28,16 +28,16 @@ void NMSKernel(const Context& dev_ctx, if (LaunchAOTKernel()) { auto boxes_num = boxes.dims().at(0); - phi::DenseTensor cpu_tensor; - phi::DenseTensorMeta scores_meta = {phi::DataType::FLOAT32, - phi::make_ddim({boxes_num})}; + DenseTensor cpu_tensor; + DenseTensorMeta scores_meta = {DataType::FLOAT32, + phi::make_ddim({boxes_num})}; cpu_tensor.set_meta(scores_meta); float* host_mask = dev_ctx.template HostAlloc(&cpu_tensor); for (size_t i = 0; i < boxes_num; i++) { host_mask[i] = boxes_num - i; } - phi::DenseTensor scores_tensor = + DenseTensor scores_tensor = custom_kernel::TensorEmpty(dev_ctx, scores_meta); // copy mask to device @@ -58,9 +58,9 @@ void NMSKernel(const Context& dev_ctx, "should be equal.")); DenseTensor out_imp = *output; - if (output->dtype() != phi::DataType::INT32) { - phi::DenseTensorMeta int32_meta = {phi::DataType::INT32, - phi::make_ddim({boxes_num})}; + if (output->dtype() != DataType::INT32) { + DenseTensorMeta int32_meta = {DataType::INT32, + phi::make_ddim({boxes_num})}; out_imp = custom_kernel::TensorEmpty(dev_ctx, int32_meta); dev_ctx.template Alloc(&out_imp); } @@ -116,5 +116,5 @@ void NMSKernel(const Context& dev_ctx, PD_REGISTER_PLUGIN_KERNEL( nms, gcu, ALL_LAYOUT, custom_kernel::NMSKernel, float, double) { - kernel->OutputAt(0).SetDataType(phi::DataType::INT64); + kernel->OutputAt(0).SetDataType(DataType::INT64); } diff --git a/backends/gcu/kernels/numel_kernel.cc b/backends/gcu/kernels/numel_kernel.cc index d74a2ecd15a..f66591d96b2 100644 --- a/backends/gcu/kernels/numel_kernel.cc +++ b/backends/gcu/kernels/numel_kernel.cc @@ -18,8 +18,8 @@ namespace custom_kernel { template void NumelKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - phi::DenseTensor* out) { + const DenseTensor& input, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("numel"); dev_ctx.template Alloc(out); @@ -61,5 +61,5 @@ PD_REGISTER_PLUGIN_KERNEL(numel, float, double, bool) { - kernel->OutputAt(0).SetDataType(phi::DataType::INT64); + kernel->OutputAt(0).SetDataType(DataType::INT64); } diff --git a/backends/gcu/kernels/one_hot_kernel.cc b/backends/gcu/kernels/one_hot_kernel.cc index 48ff710889c..cd9debe10db 100644 --- a/backends/gcu/kernels/one_hot_kernel.cc +++ b/backends/gcu/kernels/one_hot_kernel.cc @@ -19,9 +19,9 @@ namespace custom_kernel { template void OneHotKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& num_classes_s, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("one_hot"); int64_t depth = num_classes_s.to(); auto out_dims = out->dims(); @@ -32,9 +32,9 @@ void OneHotKernel(const Context& dev_ctx, dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { - phi::DenseTensor aten_out; + DenseTensor aten_out; auto meta = out->meta(); - meta.dtype = phi::DataType::INT32; + meta.dtype = DataType::INT32; aten_out.set_meta(meta); dev_ctx.template Alloc(&aten_out); LAUNCH_TOPSATENOP(topsatenOneHot, dev_ctx, aten_out, x, depth); @@ -55,7 +55,7 @@ void OneHotKernel(const Context& dev_ctx, GcuAttributeMap attrs; attrs["depth"] = depth; - attrs["dtype"] = static_cast(phi::DataType::FLOAT32); + attrs["dtype"] = static_cast(DataType::FLOAT32); GcuRunner(input_names, inputs, @@ -69,20 +69,20 @@ void OneHotKernel(const Context& dev_ctx, template void OneHotV2Kernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& num_classes_s, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("one_hot_v2"); custom_kernel::OneHotKernel(dev_ctx, x, num_classes_s, out); } template void OneHotRawKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& num_classes_s, - phi::DataType dtype, + DataType dtype, bool allow_out_of_range UNUSED, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("one_hot_raw"); custom_kernel::OneHotKernel(dev_ctx, x, num_classes_s, out); } @@ -90,7 +90,7 @@ void OneHotRawKernel(const Context& dev_ctx, PD_REGISTER_PLUGIN_KERNEL( one_hot, gcu, ALL_LAYOUT, custom_kernel::OneHotKernel, int32_t, int64_t) { - kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(0).SetDataType(DataType::FLOAT32); } PD_REGISTER_PLUGIN_KERNEL(one_hot_v2, @@ -99,7 +99,7 @@ PD_REGISTER_PLUGIN_KERNEL(one_hot_v2, custom_kernel::OneHotV2Kernel, int32_t, int64_t) { - kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(0).SetDataType(DataType::FLOAT32); } PD_REGISTER_PLUGIN_KERNEL(one_hot_raw, @@ -108,5 +108,5 @@ PD_REGISTER_PLUGIN_KERNEL(one_hot_raw, custom_kernel::OneHotRawKernel, int32_t, int64_t) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(DataType::UNDEFINED); } diff --git a/backends/gcu/kernels/pool2d_kernel.cc b/backends/gcu/kernels/pool2d_kernel.cc index 4fce48a9e5d..8260ba69b8c 100644 --- a/backends/gcu/kernels/pool2d_kernel.cc +++ b/backends/gcu/kernels/pool2d_kernel.cc @@ -70,7 +70,7 @@ inline void UpdatePadding(std::vector* paddings, template void Pool2dKernel(const Context& dev_ctx, - const phi::DenseTensor& in_x, + const DenseTensor& in_x, const phi::IntArray& kernel_size, const std::vector& strides_t, const std::vector& paddings_t, @@ -81,7 +81,7 @@ void Pool2dKernel(const Context& dev_ctx, bool global_pooling, bool adaptive, const std::string& padding_algorithm, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("pool2d"); dev_ctx.template Alloc(out); std::vector ksize(kernel_size.GetData().begin(), @@ -107,7 +107,7 @@ void Pool2dKernel(const Context& dev_ctx, << ", padding_algorithm:" << padding_algorithm << ", global_pooling:" << global_pooling << ", adaptive:" << adaptive; - phi::DenseTensor in_x_tensor(in_x), out_tensor(*out); + DenseTensor in_x_tensor(in_x), out_tensor(*out); std::vector ksize_vec(4, 1); std::vector strides_vec(4, 1); @@ -118,9 +118,9 @@ void Pool2dKernel(const Context& dev_ctx, ksize_vec[2] = ksize[1]; strides_vec[1] = strides[0]; strides_vec[2] = strides[1]; - phi::DenseTensorMeta in_x_meta = { + DenseTensorMeta in_x_meta = { in_x_tensor.dtype(), in_x_tensor.dims(), phi::DataLayout::kNHWC}; - phi::DenseTensorMeta out_meta = { + DenseTensorMeta out_meta = { out_tensor.dtype(), out_tensor.dims(), phi::DataLayout::kNHWC}; in_x_tensor.set_meta(in_x_meta); out_tensor.set_meta(out_meta); @@ -147,9 +147,8 @@ void Pool2dKernel(const Context& dev_ctx, ksize); if (LaunchAOTKernel()) { - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, in_x); - phi::DenseTensor output = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, in_x); + DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); std::vector strides_v = {strides_t.begin(), strides_t.end()}; std::vector paddings_v = { @@ -283,9 +282,9 @@ void Pool2dKernel(const Context& dev_ctx, template void Pool2dGradKernel(const Context& dev_ctx, - const phi::DenseTensor& in_x, - const phi::DenseTensor& out, - const phi::DenseTensor& out_grad, + const DenseTensor& in_x, + const DenseTensor& out, + const DenseTensor& out_grad, const phi::IntArray& kernel_size, const std::vector& strides_t, const std::vector& paddings_t, @@ -296,7 +295,7 @@ void Pool2dGradKernel(const Context& dev_ctx, bool global_pooling, bool adaptive, const std::string& padding_algorithm, - phi::DenseTensor* in_x_grad) { + DenseTensor* in_x_grad) { PADDLE_GCU_KERNEL_TRACE("pool2d_grad"); dev_ctx.template Alloc(in_x_grad); diff --git a/backends/gcu/kernels/prior_box_kernel.cc b/backends/gcu/kernels/prior_box_kernel.cc index 3a91ff1deda..da199eb6b30 100644 --- a/backends/gcu/kernels/prior_box_kernel.cc +++ b/backends/gcu/kernels/prior_box_kernel.cc @@ -19,8 +19,8 @@ namespace custom_kernel { template void PriorBoxKernel(const Context& dev_ctx, - const phi::DenseTensor& input, - const phi::DenseTensor& image, + const DenseTensor& input, + const DenseTensor& image, const std::vector& min_sizes, const std::vector& max_sizes, const std::vector& aspect_ratios, @@ -31,8 +31,8 @@ void PriorBoxKernel(const Context& dev_ctx, float step_h, float offset, bool min_max_aspect_ratios_order, - phi::DenseTensor* out, - phi::DenseTensor* var) { + DenseTensor* out, + DenseTensor* var) { PADDLE_GCU_KERNEL_TRACE("prior_box"); dev_ctx.template Alloc(out); dev_ctx.template Alloc(var); diff --git a/backends/gcu/kernels/randperm_kernel.cc b/backends/gcu/kernels/randperm_kernel.cc index cb3aee6d70a..f10f815c0ed 100644 --- a/backends/gcu/kernels/randperm_kernel.cc +++ b/backends/gcu/kernels/randperm_kernel.cc @@ -19,14 +19,14 @@ namespace custom_kernel { template void RandpermRawKernel(const Context& dev_ctx, int n, - phi::DataType dtype, + DataType dtype, unsigned int seed, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("randperm_raw"); ContextPinnedGuard ctx_pinned_guard(dev_ctx); VLOG(6) << "[HOST_KERNEL] Impl on host for randperm"; VLOG(6) << "Enter RandpermRawKernel with n:" << n << ", seed:" << seed - << ", dtype:" << phi::DataTypeToString(dtype); + << ", dtype:" << DataTypeToString(dtype); std::shared_ptr engine; @@ -45,7 +45,7 @@ void RandpermRawKernel(const Context& dev_ctx, std::shuffle(out_data, out_data + n, *engine); } else { dev_ctx.template Alloc(out); - phi::DenseTensor tmp_tensor; + DenseTensor tmp_tensor; tmp_tensor.Resize(phi::make_ddim({n})); T* tmp_data = dev_ctx.template HostAlloc(&tmp_tensor); for (int i = 0; i < n; ++i) { @@ -59,8 +59,8 @@ void RandpermRawKernel(const Context& dev_ctx, template void RandpermKernel(const Context& dev_ctx, int n, - phi::DataType dtype, - phi::DenseTensor* out) { + DataType dtype, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("randperm"); custom_kernel::RandpermRawKernel(dev_ctx, n, dtype, 0, out); } diff --git a/backends/gcu/kernels/reduce_kernels.cc b/backends/gcu/kernels/reduce_kernels.cc index d966ea2046e..4d3906a4598 100644 --- a/backends/gcu/kernels/reduce_kernels.cc +++ b/backends/gcu/kernels/reduce_kernels.cc @@ -19,11 +19,11 @@ namespace custom_kernel { template void ReduceBaseKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& dims, bool keep_dim, bool reduce_all, - phi::DenseTensor* out, + DenseTensor* out, const std::string& op_type) { dev_ctx.template Alloc(out); @@ -51,12 +51,12 @@ void ReduceBaseKernel(const Context& dev_ctx, template void ReduceGradBaseKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& out_grad, const phi::IntArray& dims_array, bool keep_dim, bool reduce_all, - phi::DenseTensor* x_grad, + DenseTensor* x_grad, const std::string& op_type) { dev_ctx.template Alloc(x_grad); @@ -86,10 +86,10 @@ void ReduceGradBaseKernel(const Context& dev_ctx, template void AnyKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& dims, bool keep_dim, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("any"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -103,18 +103,18 @@ void AnyKernel(const Context& dev_ctx, ContextPinnedGuard ctx_pinned_guard(dev_ctx); // fallback to CPU // 1. Copy x to CPU - phi::DenseTensor x_cpu; + DenseTensor x_cpu; x_cpu.set_meta(x.meta()); - TensorCopy(dev_ctx, x, false, &x_cpu, phi::CPUPlace()); + TensorCopy(dev_ctx, x, false, &x_cpu, CPUPlace()); dev_ctx.Wait(); // 2. Call the CPU implementation - phi::CPUContext dev_ctx_cpu; + CPUContext dev_ctx_cpu; dev_ctx_cpu.SetAllocator(&(dev_ctx.GetHostAllocator())); dev_ctx_cpu.SetHostAllocator(&(dev_ctx.GetHostAllocator())); - phi::DenseTensor out_cpu; + DenseTensor out_cpu; out_cpu.set_meta(out->meta()); - phi::AnyKernel( + phi::AnyKernel( dev_ctx_cpu, x_cpu, dims, keep_dim, &out_cpu); dev_ctx.Wait(); @@ -130,10 +130,10 @@ void AnyKernel(const Context& dev_ctx, template void AllKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& dims, bool keep_dim, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("all"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -158,10 +158,10 @@ void AllKernel(const Context& dev_ctx, template void MaxKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& dims, bool keep_dim, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("max"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -178,9 +178,8 @@ void MaxKernel(const Context& dev_ctx, } } - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor output = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); LAUNCH_TOPSATENOP( topsatenMax, dev_ctx, output, input_x, reduce_axis, keep_dim); MaybeTransResult(dev_ctx, output, out); @@ -198,10 +197,10 @@ void MaxKernel(const Context& dev_ctx, template void MinKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& dims, bool keep_dim, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("min"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -218,9 +217,8 @@ void MinKernel(const Context& dev_ctx, } } - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor output = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); LAUNCH_TOPSATENOP( topsatenMin, dev_ctx, output, input_x, reduce_axis, keep_dim); MaybeTransResult(dev_ctx, output, out); @@ -238,11 +236,11 @@ void MinKernel(const Context& dev_ctx, template void ProdKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& axes, bool keep_dim, bool reduce_all, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("prod"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -269,11 +267,11 @@ void ProdKernel(const Context& dev_ctx, template void SumKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& dims, - phi::DataType out_dtype, + DataType out_dtype, bool keep_dim, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("sum"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -290,12 +288,12 @@ void SumKernel(const Context& dev_ctx, } } - phi::DenseTensor input_x; - phi::DenseTensor output; - if (x.dtype() == phi::DataType::BOOL) { - custom_kernel::Cast(dev_ctx, x, phi::DataType::INT32, &input_x); + DenseTensor input_x; + DenseTensor output; + if (x.dtype() == DataType::BOOL) { + custom_kernel::Cast(dev_ctx, x, DataType::INT32, &input_x); auto meta = out->meta(); - meta.dtype = phi::DataType::INT32; + meta.dtype = DataType::INT32; output = custom_kernel::TensorEmpty(dev_ctx, meta); } else { input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); @@ -324,12 +322,12 @@ void SumKernel(const Context& dev_ctx, template void SumGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& out_grad, const phi::IntArray& dims_array, bool keep_dim, bool reduce_all, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { PADDLE_GCU_KERNEL_TRACE("sum_grad"); if (x.dims().size() == 0) { TensorCopy(dev_ctx, out_grad, true, x_grad); @@ -353,10 +351,10 @@ void SumGradKernel(const Context& dev_ctx, template void MeanKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& dims, bool keep_dim, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("mean"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -388,12 +386,12 @@ void MeanKernel(const Context& dev_ctx, template void MeanGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& out_grad, const phi::IntArray& axes, bool keep_dim, bool reduce_all, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { PADDLE_GCU_KERNEL_TRACE("mean_grad"); dev_ctx.template Alloc(x_grad); if (x.dims().size() == 0) { @@ -416,10 +414,10 @@ void MeanGradKernel(const Context& dev_ctx, template void AMaxKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& dims, bool keep_dim, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("amax"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -444,10 +442,10 @@ void AMaxKernel(const Context& dev_ctx, template void AMinKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& dims, bool keep_dim, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("amin"); if (LaunchAOTKernel()) { dev_ctx.template Alloc(out); @@ -494,7 +492,7 @@ PD_REGISTER_PLUGIN_KERNEL(any, bool, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); + kernel->OutputAt(0).SetDataType(DataType::BOOL); } PD_REGISTER_PLUGIN_KERNEL(all, @@ -506,7 +504,7 @@ PD_REGISTER_PLUGIN_KERNEL(all, bool, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); + kernel->OutputAt(0).SetDataType(DataType::BOOL); } PD_REGISTER_PLUGIN_KERNEL(max, @@ -548,7 +546,7 @@ PD_REGISTER_PLUGIN_KERNEL(sum, double, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(DataType::UNDEFINED); } // PD_REGISTER_PLUGIN_KERNEL(sum_grad, diff --git a/backends/gcu/kernels/reshape_kernel.cc b/backends/gcu/kernels/reshape_kernel.cc index a322080c2dc..3052d3c7ae1 100644 --- a/backends/gcu/kernels/reshape_kernel.cc +++ b/backends/gcu/kernels/reshape_kernel.cc @@ -128,9 +128,9 @@ static phi::DDim ValidateShape(const std::vector shape, return common::make_ddim(output_shape); } -void InferMetaFromVecValue(const phi::DenseTensor& x, +void InferMetaFromVecValue(const DenseTensor& x, const std::vector& shape, - phi::DenseTensor* out) { + DenseTensor* out) { auto x_dims = x.dims(); auto out_dims = ValidateShape(shape, x_dims); out->Resize(out_dims); @@ -148,9 +148,9 @@ void InferMetaFromVecValue(const phi::DenseTensor& x, template void ReshapeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& shape, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("reshape"); PADDLE_ENFORCE_NE( x.layout(), @@ -182,22 +182,22 @@ void ReshapeKernel(const Context& dev_ctx, template void ReshapeWithXShapeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& shape, - phi::DenseTensor* out, - phi::DenseTensor* xshape) { + DenseTensor* out, + DenseTensor* xshape) { PADDLE_GCU_KERNEL_TRACE("reshape_with_xshape"); ReshapeKernel(dev_ctx, x, shape, out); } template void ReshapeGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out_grad, - phi::DenseTensor* x_grad) { + const DenseTensor& out_grad, + DenseTensor* x_grad) { PADDLE_GCU_KERNEL_TRACE("reshape_grad"); dev_ctx.template Alloc(x_grad); - phi::DenseTensor* tmp_tensor = nullptr; + DenseTensor* tmp_tensor = nullptr; if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); } else { // kernel impl base on JIT @@ -206,7 +206,7 @@ void ReshapeGradKernel(const Context& dev_ctx, for (auto dim : out_shape) { xshape.emplace_back(dim); } - phi::DenseTensor x_shape; + DenseTensor x_shape; x_shape.Resize(phi::make_ddim(xshape)); dev_ctx.template Alloc(&x_shape); diff --git a/backends/gcu/kernels/rmsprop_kernel.cc b/backends/gcu/kernels/rmsprop_kernel.cc index aec2bce300a..840edc9bea5 100644 --- a/backends/gcu/kernels/rmsprop_kernel.cc +++ b/backends/gcu/kernels/rmsprop_kernel.cc @@ -19,23 +19,23 @@ namespace custom_kernel { template void RmspropDenseKernel(const Context& dev_ctx, - const phi::DenseTensor& param, - const phi::DenseTensor& mean_square, - const phi::DenseTensor& grad, - const phi::DenseTensor& moment, - const phi::DenseTensor& learning_rate, - const paddle::optional& mean_grad, - const paddle::optional& master_param, + const DenseTensor& param, + const DenseTensor& mean_square, + const DenseTensor& grad, + const DenseTensor& moment, + const DenseTensor& learning_rate, + const paddle::optional& mean_grad, + const paddle::optional& master_param, float epsilon, float decay, float momentum, bool centered, bool multi_precision, - phi::DenseTensor* param_out, - phi::DenseTensor* moment_out, - phi::DenseTensor* mean_square_out, - phi::DenseTensor* mean_grad_out, - phi::DenseTensor* master_param_outs) { + DenseTensor* param_out, + DenseTensor* moment_out, + DenseTensor* mean_square_out, + DenseTensor* mean_grad_out, + DenseTensor* master_param_outs) { PADDLE_GCU_KERNEL_TRACE("rmsprop"); PADDLE_ENFORCE_EQ( multi_precision, @@ -63,19 +63,19 @@ void RmspropDenseKernel(const Context& dev_ctx, inputs["MeanSquare"] = {const_cast(&mean_square)}; inputs["MeanGrad"] = {const_cast(&(*mean_grad))}; - phi::DenseTensor param_out_tmp; + DenseTensor param_out_tmp; param_out_tmp.set_meta(param_out->meta()); dev_ctx.template Alloc(¶m_out_tmp); - phi::DenseTensor moment_out_tmp; + DenseTensor moment_out_tmp; moment_out_tmp.set_meta(moment_out->meta()); dev_ctx.template Alloc(&moment_out_tmp); - phi::DenseTensor mean_square_out_tmp; + DenseTensor mean_square_out_tmp; mean_square_out_tmp.set_meta(mean_square_out->meta()); dev_ctx.template Alloc(&mean_square_out_tmp); - phi::DenseTensor mean_grad_out_tmp; + DenseTensor mean_grad_out_tmp; mean_grad_out_tmp.set_meta(mean_grad_out->meta()); dev_ctx.template Alloc(&mean_grad_out_tmp); diff --git a/backends/gcu/kernels/rnn_kernel.cc b/backends/gcu/kernels/rnn_kernel.cc index 5bbe2cdbc8d..8687af85880 100644 --- a/backends/gcu/kernels/rnn_kernel.cc +++ b/backends/gcu/kernels/rnn_kernel.cc @@ -33,11 +33,10 @@ DEFINE_MODE_DETECTOR(gru, GRU); DEFINE_MODE_DETECTOR(rnn_relu, RNN_RELU); DEFINE_MODE_DETECTOR(rnn_tanh, RNN_TANH); -void ResetParameterVector( - const std::vector& raw_params_vec, - int num_layers, - bool is_bidirec, - std::vector>* params_vec) { +void ResetParameterVector(const std::vector& raw_params_vec, + int num_layers, + bool is_bidirec, + std::vector>* params_vec) { // the parameter raw seuquence is [FWhi, FWhh, BWhi, BWhh] * num_layers // + [FBhi, FBhh, BBhi, BBhh] * num_layers, we will reset the parameter to // ([FWhi, FWhh, FBhi, FBhh] + [BWhi, BWhh, BBhi, BBhh]) * num_layers @@ -46,10 +45,10 @@ void ResetParameterVector( const int& all_weight_size = num_layers * layer_weight_size; const int& bias_start_idx = all_weight_size / 2; for (int i = 0; i < num_layers; i++) { - std::vector tensor_list; + std::vector tensor_list; tensor_list.reserve(layer_weight_size); for (int j = 0; j < layer_weight_size; j++) { - phi::DenseTensor tensor_holder; + DenseTensor tensor_holder; tensor_list.emplace_back(tensor_holder); } for (int j = 0; j < layer_weight_size; j++) { @@ -67,17 +66,17 @@ void ResetParameterVector( template void rnn_slice(const Context& dev_ctx, - const phi::DenseTensor& input, - std::vector& out) { // NOLINT + const DenseTensor& input, + std::vector& out) { // NOLINT // Warn : This function only slices the index 0 dimension. std::vector axes_t = {0}; - auto meta = phi::DenseTensorMeta( + auto meta = DenseTensorMeta( input.dtype(), phi::make_ddim({1, input.dims().at(1), input.dims().at(2)})); for (int i = 0; i < input.dims().at(0); ++i) { std::vector starts = {i}; - phi::DenseTensor output_tmp = TensorEmpty(dev_ctx, meta); + DenseTensor output_tmp = TensorEmpty(dev_ctx, meta); custom_kernel::SliceBase(dev_ctx, input, axes_t, starts, &output_tmp); out.push_back(output_tmp); } @@ -85,19 +84,19 @@ void rnn_slice(const Context& dev_ctx, template void SlicePreState(const Context& dev_ctx, - const phi::DenseTensor& pre_state, + const DenseTensor& pre_state, int num_layers, bool is_bidirec, - std::vector>* pre_state_vec) { + std::vector>* pre_state_vec) { auto stream = dev_ctx.stream(); const int& direction_num = is_bidirec ? 2 : 1; - std::vector out; + std::vector out; rnn_slice(dev_ctx, pre_state, out); int k = 0; for (int i = 0; i < num_layers; ++i) { - std::vector tensor_list; + std::vector tensor_list; for (int j = 0; j < direction_num; ++j) { tensor_list.emplace_back(out[k]); ++k; @@ -108,8 +107,8 @@ void SlicePreState(const Context& dev_ctx, template void rnn_concat(const Context& dev_ctx, - std::vector input, - phi::DenseTensor& out, // NOLINT + std::vector input, + DenseTensor& out, // NOLINT int64_t dim) { auto out_tensor = CreateTopsatenTensor(out); std::vector in_tensors; @@ -124,29 +123,29 @@ void rnn_concat(const Context& dev_ctx, template void LSTMKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& init_h, - const phi::DenseTensor& init_c, - const phi::DenseTensor& wi, - const phi::DenseTensor& wh, - const phi::DenseTensor& bi, - const phi::DenseTensor& bh, + const DenseTensor& x, + const DenseTensor& init_h, + const DenseTensor& init_c, + const DenseTensor& wi, + const DenseTensor& wh, + const DenseTensor& bi, + const DenseTensor& bh, const std::vector& SequenceLength, float dropout_prob, - phi::DenseTensor* out, - phi::DenseTensor* last_h, - phi::DenseTensor* last_c, + DenseTensor* out, + DenseTensor* last_h, + DenseTensor* last_c, bool is_b) { - std::vector input_list; + std::vector input_list; rnn_slice(dev_ctx, x, input_list); if (is_b) { std::reverse(input_list.begin(), input_list.end()); } - std::vector last_h_list; - std::vector last_c_list; + std::vector last_h_list; + std::vector last_c_list; for (int32_t i = 0; i < x.dims().at(0); ++i) { - phi::DenseTensor tmp_h, tmp_c; + DenseTensor tmp_h, tmp_c; tmp_h.Resize( phi::make_ddim({1, last_h->dims().at(1), last_h->dims().at(2)})); tmp_c.Resize( @@ -157,8 +156,8 @@ void LSTMKernel(const Context& dev_ctx, last_c_list.push_back(tmp_c); } - std::vector out_list; - std::vector init_list; + std::vector out_list; + std::vector init_list; for (int32_t i = 0; i < x.dims().at(0); ++i) { out_list.clear(); @@ -202,7 +201,7 @@ void DropoutHelper(const Context& dev_ctx, const DenseTensor* mask, float dropout_prob) { if (dropout_prob == 1.0f) { - auto meta = phi::DenseTensorMeta(x->dtype(), x->dims()); + auto meta = DenseTensorMeta(x->dtype(), x->dims()); *y = TensorZeros(dev_ctx, meta); } else { LAUNCH_TOPSATENOP(topsatenMul, dev_ctx, *y, *x, *mask); @@ -226,7 +225,7 @@ void DropoutGcuFunctionInplace(const Context& dev_ctx, if (is_test) { return; } - phi::DenseTensor mask_temp; + DenseTensor mask_temp; mask_temp.Resize(mask->dims()); dev_ctx.template HostAlloc(&mask_temp); size_t size = common::product(x->dims()); @@ -259,10 +258,10 @@ void DropoutGcuFunctionInplace(const Context& dev_ctx, template void RnnKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const std::vector& pre_state, - const std::vector& weight_list, - const paddle::optional& sequence_length, + const DenseTensor& x, + const std::vector& pre_state, + const std::vector& weight_list, + const paddle::optional& sequence_length, float dropout_prob, bool is_bidirec, int input_size, @@ -271,10 +270,10 @@ void RnnKernel(const Context& dev_ctx, const std::string& mode, int seed, bool is_test, - phi::DenseTensor* out, - phi::DenseTensor* dropout_state, - std::vector state, - phi::DenseTensor* reserve) { + DenseTensor* out, + DenseTensor* dropout_state, + std::vector state, + DenseTensor* reserve) { auto init_h = pre_state[0]; auto init_c = pre_state[1]; @@ -326,12 +325,12 @@ void RnnKernel(const Context& dev_ctx, if (LaunchAOTKernel()) { // reset parameter, init_h and init_c - std::vector> parameter_lists; + std::vector> parameter_lists; parameter_lists.reserve(num_layers); custom_kernel::ResetParameterVector( weight_list, num_layers, is_bidirec, ¶meter_lists); - std::vector> init_h_list, init_c_list; + std::vector> init_h_list, init_c_list; init_h_list.reserve(num_layers); init_c_list.reserve(num_layers); custom_kernel::SlicePreState( @@ -340,15 +339,15 @@ void RnnKernel(const Context& dev_ctx, dev_ctx, *init_c, num_layers, is_bidirec, &init_c_list); if (is_lstm(mode)) { - std::vector out_vec, last_h_vec, last_c_vec; - phi::DenseTensor input = x; + std::vector out_vec, last_h_vec, last_c_vec; + DenseTensor input = x; bool has_dropout_reset = false; for (int i = 0; i < num_layers; ++i) { int32_t seq_length = x.dims().at(0); int32_t batch_size = x.dims().at(1); int32_t hidden_size = init_h->dims().at(2); - phi::DenseTensor out_tmp_f, last_h_tmp_f, last_c_tmp_f; + DenseTensor out_tmp_f, last_h_tmp_f, last_c_tmp_f; out_tmp_f.Resize(phi::make_ddim({seq_length, batch_size, hidden_size})); last_h_tmp_f.Resize(phi::make_ddim({1, batch_size, hidden_size})); last_c_tmp_f.Resize(phi::make_ddim({1, batch_size, hidden_size})); @@ -379,7 +378,7 @@ void RnnKernel(const Context& dev_ctx, last_c_vec.push_back(last_c_tmp_f); } else { out_vec.clear(); - phi::DenseTensor out_tmp_b, last_h_tmp_b, last_c_tmp_b; + DenseTensor out_tmp_b, last_h_tmp_b, last_c_tmp_b; out_tmp_b.Resize( phi::make_ddim({seq_length, batch_size, hidden_size})); last_h_tmp_b.Resize(phi::make_ddim({1, batch_size, hidden_size})); @@ -424,8 +423,8 @@ void RnnKernel(const Context& dev_ctx, last_h_vec.push_back(last_h_tmp_b); last_c_vec.push_back(last_c_tmp_b); - std::vector new_output{out_tmp_f, out_tmp_b}; - phi::DenseTensor concat_out; + std::vector new_output{out_tmp_f, out_tmp_b}; + DenseTensor concat_out; concat_out.Resize(out->dims()); dev_ctx.template Alloc(&concat_out); rnn_concat(dev_ctx, new_output, concat_out, 2); @@ -474,5 +473,5 @@ void RnnKernel(const Context& dev_ctx, // phi::dtype::float16, // phi::dtype::bfloat16, // float) { -// kernel->OutputAt(1).SetDataType(phi::DataType::UINT8); +// kernel->OutputAt(1).SetDataType(DataType::UINT8); // } diff --git a/backends/gcu/kernels/roi_align_kernel.cc b/backends/gcu/kernels/roi_align_kernel.cc index 8f115a9d6f8..32a9523ff9d 100644 --- a/backends/gcu/kernels/roi_align_kernel.cc +++ b/backends/gcu/kernels/roi_align_kernel.cc @@ -18,15 +18,15 @@ namespace custom_kernel { template void RoiAlignKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& boxes, - const paddle::optional& boxes_num, + const DenseTensor& x, + const DenseTensor& boxes, + const paddle::optional& boxes_num, int pooled_height, int pooled_width, float spatial_scale, int sampling_ratio, bool aligned, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("roi_align"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -74,16 +74,16 @@ void RoiAlignKernel(const Context& dev_ctx, template void RoiAlignGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& boxes, - const paddle::optional& boxes_num, - const phi::DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& boxes, + const paddle::optional& boxes_num, + const DenseTensor& out_grad, int pooled_height, int pooled_width, float spatial_scale, int sampling_ratio, bool aligned, - phi::DenseTensor* dx) { + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("roi_align_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); diff --git a/backends/gcu/kernels/roll_kernel.cc b/backends/gcu/kernels/roll_kernel.cc index 2d1f3188c61..01ff3cec6d0 100644 --- a/backends/gcu/kernels/roll_kernel.cc +++ b/backends/gcu/kernels/roll_kernel.cc @@ -18,10 +18,10 @@ namespace custom_kernel { template void RollKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& shifts, const std::vector& axis, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("roll"); if (LaunchAOTKernel()) { diff --git a/backends/gcu/kernels/scale_kernel.cc b/backends/gcu/kernels/scale_kernel.cc index 9723e640cbb..205f48cf7b5 100644 --- a/backends/gcu/kernels/scale_kernel.cc +++ b/backends/gcu/kernels/scale_kernel.cc @@ -18,34 +18,33 @@ namespace custom_kernel { namespace { // topsaten binary not support int tensor and float scale/bais now -const std::unordered_map kBinaryDtypeTrans = { - {phi::DataType::INT64, phi::DataType::FLOAT32}, - {phi::DataType::INT32, phi::DataType::FLOAT32}, - {phi::DataType::FLOAT64, phi::DataType::FLOAT32}, +const std::unordered_map kBinaryDtypeTrans = { + {DataType::INT64, DataType::FLOAT32}, + {DataType::INT32, DataType::FLOAT32}, + {DataType::FLOAT64, DataType::FLOAT32}, }; } // namespace template void ScaleKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& in_scale, const phi::Scalar& in_bias, bool bias_after_scale, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("scale"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { auto input_scale = in_scale; - if (in_scale.dtype() == phi::DataType::FLOAT64) { + if (in_scale.dtype() == DataType::FLOAT64) { input_scale = phi::Scalar(static_cast(in_scale.to())); } auto input_bias = in_bias; - if (in_scale.dtype() == phi::DataType::FLOAT64) { + if (in_scale.dtype() == DataType::FLOAT64) { input_bias = phi::Scalar(static_cast(in_bias.to())); } - phi::DenseTensor input_x = - MaybeCreateOrTrans(dev_ctx, x, kBinaryDtypeTrans); - phi::DenseTensor output = + DenseTensor input_x = MaybeCreateOrTrans(dev_ctx, x, kBinaryDtypeTrans); + DenseTensor output = MaybeCreateOrTrans(dev_ctx, *out, kBinaryDtypeTrans, false); if (bias_after_scale) { // Out = scale ∗ X + bias @@ -54,7 +53,7 @@ void ScaleKernel(const Context& dev_ctx, } else { // Out = scale ∗ (X + bias) auto tmp_scalar = phi::Scalar(1.0f); - phi::DenseTensor tmp_out = TensorEmpty(dev_ctx, output.meta()); + DenseTensor tmp_out = TensorEmpty(dev_ctx, output.meta()); LAUNCH_TOPSATENOP( topsatenAdd, dev_ctx, tmp_out, input_x, input_bias, tmp_scalar); LAUNCH_TOPSATENOP(topsatenMul, dev_ctx, output, input_scale, tmp_out); @@ -63,12 +62,12 @@ void ScaleKernel(const Context& dev_ctx, } else { // kernel impl base on JIT dev_ctx.template Alloc(out); - phi::DenseTensor scale_tensor; + DenseTensor scale_tensor; scale_tensor.Resize({1}); FillGcuTensorWithConstant( &scale_tensor, dev_ctx, in_scale.to()); - phi::DenseTensor bias_tensor; + DenseTensor bias_tensor; bias_tensor.Resize({1}); FillGcuTensorWithConstant( &bias_tensor, dev_ctx, in_bias.to()); diff --git a/backends/gcu/kernels/scatter_kernel.cc b/backends/gcu/kernels/scatter_kernel.cc index 2a70e536b63..43b7b275762 100644 --- a/backends/gcu/kernels/scatter_kernel.cc +++ b/backends/gcu/kernels/scatter_kernel.cc @@ -19,31 +19,31 @@ namespace custom_kernel { template extern void IndexPutKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const std::vector& indices, - const phi::DenseTensor& value, + const DenseTensor& x, + const std::vector& indices, + const DenseTensor& value, bool accumulate, - phi::DenseTensor* out); + DenseTensor* out); template void ScatterKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& index, - const phi::DenseTensor& updates, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& updates, bool overwrite, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("scatter"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { bool accumulate = !overwrite; - phi::DenseTensor intermediate_res = x; + DenseTensor intermediate_res = x; if (accumulate) { auto meta = updates.meta(); - if (meta.dtype == phi::DataType::INT64) { - meta.dtype = phi::DataType::INT32; - } else if (meta.dtype == phi::DataType::FLOAT64) { - meta.dtype = phi::DataType::FLOAT32; + if (meta.dtype == DataType::INT64) { + meta.dtype = DataType::INT32; + } else if (meta.dtype == DataType::FLOAT64) { + meta.dtype = DataType::FLOAT32; } intermediate_res = *out; auto updates_tmp = custom_kernel::TensorZeros(dev_ctx, meta); @@ -73,11 +73,11 @@ void ScatterKernel(const Context& dev_ctx, // phi::errors::InvalidArgument("Scatter dimension ", i, " is // zero.")); // } - // phi::DenseTensor input_index = MaybeCreateOrTrans64To32bits(dev_ctx, - // index); phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, - // x); phi::DenseTensor input_updates = + // DenseTensor input_index = MaybeCreateOrTrans64To32bits(dev_ctx, + // index); DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, + // x); DenseTensor input_updates = // MaybeCreateOrTrans64To32bits(dev_ctx, updates); - // phi::DenseTensor output = + // DenseTensor output = // MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); // LAUNCH_TOPSATENOP(topspaddleScatter, diff --git a/backends/gcu/kernels/set_value_kernel.cc b/backends/gcu/kernels/set_value_kernel.cc index 1474d002d23..037ee9d41f9 100644 --- a/backends/gcu/kernels/set_value_kernel.cc +++ b/backends/gcu/kernels/set_value_kernel.cc @@ -19,23 +19,23 @@ namespace custom_kernel { template extern void IndexPutKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const std::vector& indices, - const phi::DenseTensor& value, + const DenseTensor& x, + const std::vector& indices, + const DenseTensor& value, bool accumulate, - phi::DenseTensor* out); + DenseTensor* out); template void SetTensorValueKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& value, + const DenseTensor& x, + const DenseTensor& value, const phi::IntArray& starts, const phi::IntArray& ends, const phi::IntArray& steps, const std::vector& axes, const std::vector& decrease_axes, const std::vector& none_axes, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("set_value_with_tensor"); dev_ctx.template Alloc(out); @@ -122,7 +122,7 @@ void SetTensorValueKernel(const Context& dev_ctx, // create index tensor. std::vector index_shape = { static_cast(index_indices.size())}; - phi::DenseTensor index_tensor; + DenseTensor index_tensor; index_tensor.Resize(phi::make_ddim(index_shape)); TensorFromVector(dev_ctx, index_indices, dev_ctx, &index_tensor); @@ -147,7 +147,7 @@ void SetTensorValueKernel(const Context& dev_ctx, "larger than or equal the rank of value shape. ")); // Processing value_tensor data. - phi::DenseTensor value_tensor(value); + DenseTensor value_tensor(value); if (slice_dims_for_assign_v != phi::vectorize(value_tensor.dims()) && value_tensor.numel() == 1) { std::vector broadcast_shape = { @@ -156,10 +156,10 @@ void SetTensorValueKernel(const Context& dev_ctx, } std::vector reshape_shape = {static_cast(x.numel())}; - phi::DenseTensor reshape_x = ReshapeWithoutCopy(x, reshape_shape); - phi::DenseTensor reshape_updates = + DenseTensor reshape_x = ReshapeWithoutCopy(x, reshape_shape); + DenseTensor reshape_updates = ReshapeWithoutCopy(value_tensor, {value_tensor.numel()}); - phi::DenseTensor reshape_out = ReshapeWithoutCopy(*out, reshape_shape); + DenseTensor reshape_out = ReshapeWithoutCopy(*out, reshape_shape); custom_kernel::IndexPutKernel(dev_ctx, reshape_x, @@ -176,7 +176,7 @@ void SetTensorValueKernel(const Context& dev_ctx, template void SetValueKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& starts, const phi::IntArray& ends, const phi::IntArray& steps, @@ -185,14 +185,14 @@ void SetValueKernel(const Context& dev_ctx, const std::vector& none_axes, const std::vector& shape, const std::vector& values, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("set_value"); std::vector> assgin_values; assgin_values.reserve(values.size()); for (const auto& val : values) { assgin_values.push_back(val.to()); } - phi::DenseTensor value_tensor; + DenseTensor value_tensor; value_tensor.Resize(phi::make_ddim(shape)); TensorFromVector(dev_ctx, assgin_values, dev_ctx, &value_tensor); value_tensor.Resize(phi::make_ddim(shape)); diff --git a/backends/gcu/kernels/sigmoid_cross_entropy_with_logits_kernel.cc b/backends/gcu/kernels/sigmoid_cross_entropy_with_logits_kernel.cc index 3247dddec98..6fff5d2a6c3 100644 --- a/backends/gcu/kernels/sigmoid_cross_entropy_with_logits_kernel.cc +++ b/backends/gcu/kernels/sigmoid_cross_entropy_with_logits_kernel.cc @@ -18,11 +18,11 @@ namespace custom_kernel { template void SigmoidCrossEntropyWithLogitsKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& label, + const DenseTensor& x, + const DenseTensor& label, bool normalize, int ignore_index, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("sigmoid_cross_entropy_with_logits"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { @@ -58,12 +58,12 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context& dev_ctx, template void SigmoidCrossEntropyWithLogitsGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& label, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& label, + const DenseTensor& dout, bool normalize, int ignore_index, - phi::DenseTensor* dx) { + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("sigmoid_cross_entropy_with_logits_grad"); dev_ctx.template Alloc(dx); diff --git a/backends/gcu/kernels/sign_kernel.cc b/backends/gcu/kernels/sign_kernel.cc index 4b066c6cf13..51529cbf222 100644 --- a/backends/gcu/kernels/sign_kernel.cc +++ b/backends/gcu/kernels/sign_kernel.cc @@ -18,8 +18,8 @@ namespace custom_kernel { template void SignKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("sign"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { diff --git a/backends/gcu/kernels/slice_kernel.cc b/backends/gcu/kernels/slice_kernel.cc index a09126bfedd..8bf8aecb1c6 100644 --- a/backends/gcu/kernels/slice_kernel.cc +++ b/backends/gcu/kernels/slice_kernel.cc @@ -20,13 +20,13 @@ namespace custom_kernel { template void SliceKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axes_t, const phi::IntArray& starts_array, const phi::IntArray& ends_array, const std::vector& infer_flags, const std::vector& decrease_axis, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("slice"); if (LaunchAOTKernel()) { auto axes = axes_t; @@ -70,7 +70,7 @@ void SliceKernel(const Context& dev_ctx, if (out->data() == x.data()) { *out = TensorEmpty(dev_ctx, out->meta()); } - phi::DenseTensor input_x; + DenseTensor input_x; if (x.place().GetType() == phi::AllocationType::CPU) { TensorCopy(dev_ctx, x, false, &input_x); } else { @@ -128,14 +128,14 @@ void SliceKernel(const Context& dev_ctx, template void SliceGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& out_grad, const std::vector& axes_t, const phi::IntArray& starts_array, const phi::IntArray& ends_array, const std::vector& infer_flags, const std::vector& decrease_axis, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { PADDLE_GCU_KERNEL_TRACE("slice_grad"); std::vector axes(axes_t.begin(), axes_t.end()); auto starts_int = starts_array.GetData(); diff --git a/backends/gcu/kernels/softmax_kernel.cc b/backends/gcu/kernels/softmax_kernel.cc index 454a12c3a16..38eb32021bd 100644 --- a/backends/gcu/kernels/softmax_kernel.cc +++ b/backends/gcu/kernels/softmax_kernel.cc @@ -19,9 +19,9 @@ namespace custom_kernel { template void SoftmaxKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("softmax"); const int rank = x.dims().size(); if (rank == 0) { @@ -64,10 +64,10 @@ void SoftmaxKernel(const Context& dev_ctx, template void SoftmaxGradKernel(const Context& dev_ctx, - const phi::DenseTensor& out, - const phi::DenseTensor& out_grad, + const DenseTensor& out, + const DenseTensor& out_grad, int axis, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { PADDLE_GCU_KERNEL_TRACE("softmax_grad"); auto dims = x_grad->dims(); const int rank = dims.size(); diff --git a/backends/gcu/kernels/split_kernel.cc b/backends/gcu/kernels/split_kernel.cc index 32df6b21817..73950b3487c 100644 --- a/backends/gcu/kernels/split_kernel.cc +++ b/backends/gcu/kernels/split_kernel.cc @@ -52,10 +52,10 @@ std::vector CalSections(const std::vector& input_shape, template void SplitKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& num_or_sections, const phi::Scalar& axis_scalar, - std::vector outs) { + std::vector outs) { PADDLE_GCU_KERNEL_TRACE("split"); auto origin_sections = num_or_sections.GetData(); PADDLE_ENFORCE_GT( @@ -77,8 +77,8 @@ void SplitKernel(const Context& dev_ctx, outs.size(), sections.size())); - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - std::vector outputs; + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + std::vector outputs; for (size_t i = 0; i < outs.size(); ++i) { dev_ctx.template Alloc(outs[i]); outputs.emplace_back( @@ -104,7 +104,7 @@ void SplitKernel(const Context& dev_ctx, TensorValueMap outputs; std::vector names; names.reserve(outs.size()); - std::vector values; + std::vector values; values.reserve(outs.size()); for (size_t i = 0; i < outs.size(); ++i) { dev_ctx.template Alloc(outs[i]); @@ -128,10 +128,10 @@ void SplitKernel(const Context& dev_ctx, template void SplitWithNumKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int num, const phi::Scalar& axis_scalar, - std::vector outs) { + std::vector outs) { PADDLE_GCU_KERNEL_TRACE("split_with_num"); int axis_value = axis_scalar.to(); auto input_axis_dim = x.dims().at(axis_value); diff --git a/backends/gcu/kernels/squared_l2_norm_kernel.cc b/backends/gcu/kernels/squared_l2_norm_kernel.cc index a15cf59b884..8a9baad4576 100644 --- a/backends/gcu/kernels/squared_l2_norm_kernel.cc +++ b/backends/gcu/kernels/squared_l2_norm_kernel.cc @@ -18,8 +18,8 @@ namespace custom_kernel { template void SquaredL2NormKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("squared_l2_norm"); dev_ctx.template Alloc(out); diff --git a/backends/gcu/kernels/squeeze_kernel.cc b/backends/gcu/kernels/squeeze_kernel.cc index 54405e8233c..02867e155e4 100644 --- a/backends/gcu/kernels/squeeze_kernel.cc +++ b/backends/gcu/kernels/squeeze_kernel.cc @@ -19,9 +19,9 @@ namespace custom_kernel { template void SqueezeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& axes_int_array, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("squeeze"); VLOG(6) << "[HOST_KERNEL] Impl on host for squeeze"; auto out_dims = out->dims(); @@ -40,10 +40,10 @@ void SqueezeKernel(const Context& dev_ctx, template void SqueezeWithXShapeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& axes_int_array, - phi::DenseTensor* out, - phi::DenseTensor* xshape) { + DenseTensor* out, + DenseTensor* xshape) { PADDLE_GCU_KERNEL_TRACE("squeeze_with_xshape"); VLOG(6) << "[HOST_KERNEL] Impl on host for squeeze_with_xshape"; custom_kernel::SqueezeKernel(dev_ctx, x, axes_int_array, out); @@ -51,10 +51,10 @@ void SqueezeWithXShapeKernel(const Context& dev_ctx, template void SqueezeGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& dout, const phi::IntArray& axes_int_array, - phi::DenseTensor* dx) { + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("squeeze_grad"); VLOG(6) << "[HOST_KERNEL] Impl on host for squeeze_grad"; auto x_dims = dx->dims(); diff --git a/backends/gcu/kernels/stack_kernel.cc b/backends/gcu/kernels/stack_kernel.cc index fc7ece49da9..f74e6c0085b 100644 --- a/backends/gcu/kernels/stack_kernel.cc +++ b/backends/gcu/kernels/stack_kernel.cc @@ -19,16 +19,16 @@ namespace custom_kernel { template void StackKernel(const Context& dev_ctx, - const std::vector& x, + const std::vector& x, int axis, - phi::DenseTensor* y) { + DenseTensor* y) { PADDLE_GCU_KERNEL_TRACE("stack"); dev_ctx.template Alloc(y); if (LaunchAOTKernel()) { - phi::DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *y, false); + DenseTensor output = MaybeCreateOrTrans64To32bits(dev_ctx, *y, false); auto out_tensor = CreateTopsatenTensor(output); - std::vector input_tensors; + std::vector input_tensors; for (const auto& in : x) { input_tensors.emplace_back(MaybeCreateOrTrans64To32bits(dev_ctx, *in)); } @@ -62,7 +62,7 @@ void StackKernel(const Context& dev_ctx, TensorValueMap inputs; std::vector names; names.reserve(x.size()); - std::vector values; + std::vector values; values.reserve(x.size()); for (size_t i = 0; i < x.size(); ++i) { names.emplace_back(std::string("x_") + std::to_string(i)); @@ -87,9 +87,9 @@ void StackKernel(const Context& dev_ctx, template void StackGradKernel(const Context& dev_ctx, - const phi::DenseTensor& dy, + const DenseTensor& dy, int axis, - std::vector dx) { + std::vector dx) { PADDLE_GCU_KERNEL_TRACE("stack_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -105,7 +105,7 @@ void StackGradKernel(const Context& dev_ctx, std::vector names; names.reserve(dx.size()); - std::vector values; + std::vector values; values.reserve(dx.size()); for (size_t i = 0; i < dx.size(); ++i) { dev_ctx.template Alloc(dx[i]); @@ -130,10 +130,10 @@ void StackGradKernel(const Context& dev_ctx, template void UnStackKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, int num, - std::vector outs) { + std::vector outs) { PADDLE_GCU_KERNEL_TRACE("unstack"); if (LaunchAOTKernel()) { for (auto y : outs) { @@ -163,7 +163,7 @@ void UnStackKernel(const Context& dev_ctx, axis, x_dims.at(axis))); - phi::DenseTensor as_strides_out; + DenseTensor as_strides_out; auto x_tensor = CreateTopsatenTensor(x); std::vector split_outs; std::string abstract_info = @@ -179,7 +179,7 @@ void UnStackKernel(const Context& dev_ctx, // because of aten ask rank must be same when call atencopy for (int i = 0; i < split_outs.size(); i++) { - phi::DenseTensor& output = *(outs.at(i)); + DenseTensor& output = *(outs.at(i)); int32_t output_dims_size = output.dims().size() + 1; int64_t new_dim = axis >= 0 ? axis : axis + output_dims_size; auto dims_org = output.dims(); @@ -205,9 +205,9 @@ void UnStackKernel(const Context& dev_ctx, template void UnbindKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int axis, - std::vector outs) { + std::vector outs) { PADDLE_GCU_KERNEL_TRACE("unbind"); if (LaunchAOTKernel()) { for (auto y : outs) { @@ -237,7 +237,7 @@ void UnbindKernel(const Context& dev_ctx, axis, x_dims.at(axis))); - phi::DenseTensor as_strides_out; + DenseTensor as_strides_out; auto x_tensor = CreateTopsatenTensor(x); std::vector split_outs; std::string abstract_info = @@ -253,7 +253,7 @@ void UnbindKernel(const Context& dev_ctx, // because of aten ask rank must be same when call atencopy for (int i = 0; i < split_outs.size(); i++) { - phi::DenseTensor& output = *(outs.at(i)); + DenseTensor& output = *(outs.at(i)); int32_t output_dims_size = output.dims().size() + 1; int64_t new_dim = axis >= 0 ? axis : axis + output_dims_size; auto dims_org = output.dims(); diff --git a/backends/gcu/kernels/strided_copy_kernel.cc b/backends/gcu/kernels/strided_copy_kernel.cc index a8adcd467ec..fe896fb6b11 100644 --- a/backends/gcu/kernels/strided_copy_kernel.cc +++ b/backends/gcu/kernels/strided_copy_kernel.cc @@ -19,13 +19,13 @@ namespace custom_kernel { template void StridedCopyKernel(const Context& dev_ctx, - const phi::DenseTensor& input, + const DenseTensor& input, const std::vector& dims, const std::vector& out_stride, int64_t offset, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("contiguous"); - phi::DenseTensorMeta meta = input.meta(); + DenseTensorMeta meta = input.meta(); meta.strides = common::make_ddim(out_stride); meta.dims = common::make_ddim(dims); meta.offset = offset; @@ -95,11 +95,11 @@ void StridedCopyKernel(const Context& dev_ctx, template void AsStridedKernel(const Context& dev_ctx, - const phi::DenseTensor& input, + const DenseTensor& input, const std::vector& dims, const std::vector& stride, int64_t offset, - phi::DenseTensor* out) { + DenseTensor* out) { *out = input; auto meta = out->meta(); meta.dims = common::make_ddim(dims); diff --git a/backends/gcu/kernels/strided_slice_kernel.cc b/backends/gcu/kernels/strided_slice_kernel.cc index 8b70bb8ca11..6e7d6c422a0 100644 --- a/backends/gcu/kernels/strided_slice_kernel.cc +++ b/backends/gcu/kernels/strided_slice_kernel.cc @@ -246,12 +246,12 @@ static void StridedSliceFunctor(int64_t* starts, template void StridedSliceKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axes, const phi::IntArray& starts, const phi::IntArray& ends, const phi::IntArray& strides, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("strided_slice"); dev_ctx.template Alloc(out); @@ -315,7 +315,7 @@ void StridedSliceKernel(const Context& dev_ctx, offset += slice_info.starts_[i] * x_strides[i]; } - phi::DenseTensor as_strides_out; + DenseTensor as_strides_out; auto x_tensor = CreateTopsatenTensor(x); auto out_tensor = CreateTopsatenTensor(*out); auto view_out_tensor = CreateTopsatenTensor(as_strides_out); diff --git a/backends/gcu/kernels/swiglu_kernel.cc b/backends/gcu/kernels/swiglu_kernel.cc index 2bf16ab3526..c754a578589 100644 --- a/backends/gcu/kernels/swiglu_kernel.cc +++ b/backends/gcu/kernels/swiglu_kernel.cc @@ -18,9 +18,9 @@ namespace custom_kernel { template void SwiGLUKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const paddle::optional& y, - phi::DenseTensor* out) { + const DenseTensor& x, + const paddle::optional& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("swiglu"); dev_ctx.template Alloc(out); @@ -38,12 +38,12 @@ void SwiGLUKernel(const Context& dev_ctx, auto rank = meta.dims.size(); meta.dims[rank - 1] *= 2; meta.strides = meta.calc_strides(meta.dims); - phi::DenseTensor concat_output = TensorEmpty(dev_ctx, meta); + DenseTensor concat_output = TensorEmpty(dev_ctx, meta); std::vector in_tensors = {CreateTopsatenTensor(x), CreateTopsatenTensor(y.get())}; auto out_tensor = CreateTopsatenTensor(concat_output); int64_t axis = rank - 1; - std::vector concat_ins = {x, y.get()}; + std::vector concat_ins = {x, y.get()}; std::string abstract_info = custom_kernel::GetAbstractInfo( "topsatenCat", concat_output, concat_ins, axis); LAUNCH_TOPSATENOP_WITH_RAW_ATEN_DEF( diff --git a/backends/gcu/kernels/take_along_axis.cc b/backends/gcu/kernels/take_along_axis.cc index 00eae206ad4..2a6f094439f 100644 --- a/backends/gcu/kernels/take_along_axis.cc +++ b/backends/gcu/kernels/take_along_axis.cc @@ -40,7 +40,7 @@ void TakeAlongAxisKernel(const Context& dev_ctx, // "axis should be in [-%zu, %zu)!", x_rank, x_rank)); // int64_t axis_64 = axis; - // phi::DenseTensor out_tmp = custom_kernel::TensorEmpty(dev_ctx, x.meta()); + // DenseTensor out_tmp = custom_kernel::TensorEmpty(dev_ctx, x.meta()); // LAUNCH_TOPSATENOP( // topsatenGather, dev_ctx, out_tmp, x, index, axis_64, false); diff --git a/backends/gcu/kernels/temporal_shift_kernel.cc b/backends/gcu/kernels/temporal_shift_kernel.cc index 6367fbfc91a..8e4ab129056 100644 --- a/backends/gcu/kernels/temporal_shift_kernel.cc +++ b/backends/gcu/kernels/temporal_shift_kernel.cc @@ -19,18 +19,18 @@ namespace custom_kernel { template extern void StridedSliceKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axes, const phi::IntArray& starts, const phi::IntArray& ends, const phi::IntArray& strides, - phi::DenseTensor* out); + DenseTensor* out); template extern void ConcatKernel(const Context& dev_ctx, - const std::vector& ins, + const std::vector& ins, const phi::Scalar& axis_scalar, - phi::DenseTensor* out); + DenseTensor* out); template void TemporalShiftKernel(const Context& dev_ctx, @@ -52,7 +52,7 @@ void TemporalShiftKernel(const Context& dev_ctx, auto x5 = x; x5.Resize(common::make_ddim({n, t, h, w, c})); - phi::DenseTensor pad_x; + DenseTensor pad_x; auto pad_x_meta = pad_x.meta(); pad_x_meta.dtype = x.dtype(); pad_x.set_meta(pad_x_meta); @@ -70,22 +70,22 @@ void TemporalShiftKernel(const Context& dev_ctx, topsatenPadMode_t(0), pad_value_scalar); - phi::DenseTensorMeta meta; + DenseTensorMeta meta; meta.dtype = x.dtype(); - phi::DenseTensor slice1; - phi::DenseTensor slice2; - phi::DenseTensor slice3; + DenseTensor slice1; + DenseTensor slice2; + DenseTensor slice3; auto c_slice1 = static_cast(c * shift_ratio); auto c_slice2 = c / 2 - static_cast(c * shift_ratio); auto c_slice3 = c - c / 2; meta.dims = common::make_ddim({n, t, h, w, c_slice1}); - meta.strides = phi::DenseTensorMeta::calc_strides(meta.dims); + meta.strides = DenseTensorMeta::calc_strides(meta.dims); slice1.set_meta(meta); meta.dims = common::make_ddim({n, t, h, w, c_slice2}); - meta.strides = phi::DenseTensorMeta::calc_strides(meta.dims); + meta.strides = DenseTensorMeta::calc_strides(meta.dims); slice2.set_meta(meta); meta.dims = common::make_ddim({n, t, h, w, c_slice3}); - meta.strides = phi::DenseTensorMeta::calc_strides(meta.dims); + meta.strides = DenseTensorMeta::calc_strides(meta.dims); slice3.set_meta(meta); dev_ctx.template Alloc(&slice1); dev_ctx.template Alloc(&slice2); @@ -120,7 +120,7 @@ void TemporalShiftKernel(const Context& dev_ctx, auto x5 = x; x5.Resize(common::make_ddim({n, t, c, h, w})); - phi::DenseTensor pad_x; + DenseTensor pad_x; auto pad_x_meta = pad_x.meta(); pad_x_meta.dtype = x.dtype(); pad_x.set_meta(pad_x_meta); @@ -138,21 +138,21 @@ void TemporalShiftKernel(const Context& dev_ctx, topsatenPadMode_t(0), pad_value_scalar); - phi::DenseTensorMeta meta = x.meta(); - phi::DenseTensor slice1; - phi::DenseTensor slice2; - phi::DenseTensor slice3; + DenseTensorMeta meta = x.meta(); + DenseTensor slice1; + DenseTensor slice2; + DenseTensor slice3; auto c_slice1 = static_cast(c * shift_ratio); auto c_slice2 = c / 2 - static_cast(c * shift_ratio); auto c_slice3 = c - c / 2; meta.dims = common::make_ddim({n, t, c_slice1, h, w}); - meta.strides = phi::DenseTensorMeta::calc_strides(meta.dims); + meta.strides = DenseTensorMeta::calc_strides(meta.dims); slice1.set_meta(meta); meta.dims = common::make_ddim({n, t, c_slice2, h, w}); - meta.strides = phi::DenseTensorMeta::calc_strides(meta.dims); + meta.strides = DenseTensorMeta::calc_strides(meta.dims); slice2.set_meta(meta); meta.dims = common::make_ddim({n, t, c_slice3, h, w}); - meta.strides = phi::DenseTensorMeta::calc_strides(meta.dims); + meta.strides = DenseTensorMeta::calc_strides(meta.dims); slice3.set_meta(meta); dev_ctx.template Alloc(&slice1); dev_ctx.template Alloc(&slice2); diff --git a/backends/gcu/kernels/tile_kernel.cc b/backends/gcu/kernels/tile_kernel.cc index 463ee50b16c..a9c31216443 100644 --- a/backends/gcu/kernels/tile_kernel.cc +++ b/backends/gcu/kernels/tile_kernel.cc @@ -19,16 +19,15 @@ namespace custom_kernel { template void TileKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& repeat_times, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("tile"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor output_z = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor output_z = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); auto repeat_times_data = repeat_times.GetData(); LAUNCH_TOPSATENOP( topsatenTile, dev_ctx, output_z, input_x, repeat_times_data); diff --git a/backends/gcu/kernels/top_p_sampling_kernel.cc b/backends/gcu/kernels/top_p_sampling_kernel.cc index 512adee223b..08ecf2bc2a6 100644 --- a/backends/gcu/kernels/top_p_sampling_kernel.cc +++ b/backends/gcu/kernels/top_p_sampling_kernel.cc @@ -19,12 +19,12 @@ namespace custom_kernel { template void TopPSamplingKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& ps, - const paddle::optional& threshold, + const DenseTensor& x, + const DenseTensor& ps, + const paddle::optional& threshold, int random_seed, - phi::DenseTensor* out, - phi::DenseTensor* ids) { + DenseTensor* out, + DenseTensor* ids) { PADDLE_GCU_KERNEL_TRACE("top_p_sampling"); auto probs = custom_op_common::CreateTensorFromDenseTensor(x); auto top_p = custom_op_common::CreateTensorFromDenseTensor(ps); diff --git a/backends/gcu/kernels/topk_kernel.cc b/backends/gcu/kernels/topk_kernel.cc index a727182f776..62cb9549574 100644 --- a/backends/gcu/kernels/topk_kernel.cc +++ b/backends/gcu/kernels/topk_kernel.cc @@ -19,13 +19,13 @@ namespace custom_kernel { template void TopkKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::Scalar& k_scalar, int axis, bool largest, bool sorted, - phi::DenseTensor* out, - phi::DenseTensor* indices) { + DenseTensor* out, + DenseTensor* indices) { PADDLE_GCU_KERNEL_TRACE("topk"); if (axis < 0) { axis += x.dims().size(); @@ -52,12 +52,12 @@ void TopkKernel(const Context& dev_ctx, } if (LaunchAOTKernel()) { - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor output_value = + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor output_value = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); auto meta = indices->meta(); - meta.dtype = phi::DataType::INT32; - phi::DenseTensor output_indices = TensorEmpty(dev_ctx, meta); + meta.dtype = DataType::INT32; + DenseTensor output_indices = TensorEmpty(dev_ctx, meta); LAUNCH_TOPSATENOP(topsatenTopk, dev_ctx, @@ -70,7 +70,7 @@ void TopkKernel(const Context& dev_ctx, largest); MaybeTransResult(dev_ctx, output_value, out); - custom_kernel::Cast(dev_ctx, output_indices, phi::DataType::INT64, indices); + custom_kernel::Cast(dev_ctx, output_indices, DataType::INT64, indices); } else { // kernel impl base on JIT TensorNameMap input_names; @@ -110,5 +110,5 @@ PD_REGISTER_PLUGIN_KERNEL(topk, double, int, int64_t) { - kernel->OutputAt(1).SetDataType(phi::DataType::INT64); + kernel->OutputAt(1).SetDataType(DataType::INT64); } diff --git a/backends/gcu/kernels/transfer_layout_kernel.cc b/backends/gcu/kernels/transfer_layout_kernel.cc index 6dcd161ae81..9f568171b5b 100644 --- a/backends/gcu/kernels/transfer_layout_kernel.cc +++ b/backends/gcu/kernels/transfer_layout_kernel.cc @@ -31,10 +31,10 @@ std::vector GetAxis(const phi::DataLayout& from, template void TransferLayoutKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int src_layout, int dst_layout, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("transfer_layout"); PADDLE_ENFORCE_NE(src_layout, dst_layout, diff --git a/backends/gcu/kernels/transpose_kernel.cc b/backends/gcu/kernels/transpose_kernel.cc index e9f9b40c3ac..d2b9c046fbf 100644 --- a/backends/gcu/kernels/transpose_kernel.cc +++ b/backends/gcu/kernels/transpose_kernel.cc @@ -19,9 +19,9 @@ namespace custom_kernel { template void TransposeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::vector& axis, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("transpose"); if (LaunchAOTKernel()) { std::vector in_axis(axis.begin(), axis.end()); @@ -57,9 +57,9 @@ void TransposeKernel(const Context& dev_ctx, template void TransposeGradKernel(const Context& dev_ctx, - const phi::DenseTensor& dout, + const DenseTensor& dout, const std::vector& axis, - phi::DenseTensor* dx) { + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("transpose_grad"); dev_ctx.template Alloc(dx); if (LaunchAOTKernel()) { diff --git a/backends/gcu/kernels/tril_triu_kernel.cc b/backends/gcu/kernels/tril_triu_kernel.cc index 38c285aba58..9c440a9c138 100644 --- a/backends/gcu/kernels/tril_triu_kernel.cc +++ b/backends/gcu/kernels/tril_triu_kernel.cc @@ -19,10 +19,10 @@ namespace custom_kernel { template void TrilTriuCommon(const std::string& op_type, const Context& ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int diagonal, bool lower, - phi::DenseTensor* out) { + DenseTensor* out) { ctx.template Alloc(out); TensorNameMap input_names; @@ -47,10 +47,10 @@ void TrilTriuCommon(const std::string& op_type, template void TrilTriuGradCommon(const std::string& op_type, const Context& ctx, - const phi::DenseTensor& out_grad, + const DenseTensor& out_grad, int diagonal, bool lower, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { ctx.template Alloc(x_grad); TensorNameMap input_names; @@ -74,10 +74,10 @@ void TrilTriuGradCommon(const std::string& op_type, template void TrilTriuKernel(const Context& ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int diagonal, bool lower, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("tril_triu"); if (LaunchAOTKernel()) { ctx.template Alloc(out); @@ -95,10 +95,10 @@ void TrilTriuKernel(const Context& ctx, template void TrilTriuGradKernel(const Context& ctx, - const phi::DenseTensor& out_grad, + const DenseTensor& out_grad, int diagonal, bool lower, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { PADDLE_GCU_KERNEL_TRACE("tril_triu_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -110,9 +110,9 @@ void TrilTriuGradKernel(const Context& ctx, template void TrilKernel(const Context& ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int diagonal, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("tril"); if (LaunchAOTKernel()) { ctx.template Alloc(out); @@ -126,9 +126,9 @@ void TrilKernel(const Context& ctx, template void TrilGradKernel(const Context& ctx, - const phi::DenseTensor& out_grad, + const DenseTensor& out_grad, int diagonal, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { PADDLE_GCU_KERNEL_TRACE("tril_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); @@ -140,9 +140,9 @@ void TrilGradKernel(const Context& ctx, template void TriuKernel(const Context& ctx, - const phi::DenseTensor& x, + const DenseTensor& x, int diagonal, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("triu"); if (LaunchAOTKernel()) { ctx.template Alloc(out); @@ -156,9 +156,9 @@ void TriuKernel(const Context& ctx, template void TriuGradKernel(const Context& ctx, - const phi::DenseTensor& out_grad, + const DenseTensor& out_grad, int diagonal, - phi::DenseTensor* x_grad) { + DenseTensor* x_grad) { PADDLE_GCU_KERNEL_TRACE("triu_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); diff --git a/backends/gcu/kernels/trunc_kernel.cc b/backends/gcu/kernels/trunc_kernel.cc index 5a475d38da1..d9cc7a4aa2c 100644 --- a/backends/gcu/kernels/trunc_kernel.cc +++ b/backends/gcu/kernels/trunc_kernel.cc @@ -18,8 +18,8 @@ namespace custom_kernel { template void TruncKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out) { + const DenseTensor& x, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("trunc"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { diff --git a/backends/gcu/kernels/truncated_gaussian_random_kernel.cc b/backends/gcu/kernels/truncated_gaussian_random_kernel.cc index af90c219ed3..f2f6eadcd8d 100644 --- a/backends/gcu/kernels/truncated_gaussian_random_kernel.cc +++ b/backends/gcu/kernels/truncated_gaussian_random_kernel.cc @@ -156,19 +156,19 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx, float mean, float std, int seed, - phi::DataType dtype, - phi::DenseTensor* out) { + DataType dtype, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("truncated_gaussian_random"); ContextPinnedGuard ctx_pinned_guard(dev_ctx); VLOG(6) << "[HOST_KERNEL] Impl on host for truncated_gaussian_random"; VLOG(6) << "Enter TruncatedGaussianRandomKernel with mean:" << mean << ", std:" << std << ", seed:" << seed << ", shape:" << VectorToStr(shape) - << ", dtype:" << phi::DataTypeToString(dtype); + << ", dtype:" << DataTypeToString(dtype); dev_ctx.template Alloc(out); - phi::DenseTensor cpu_out; - phi::DenseTensorMeta cpu_meta = {out->dtype(), out->dims()}; + DenseTensor cpu_out; + DenseTensorMeta cpu_meta = {out->dtype(), out->dims()}; cpu_out.set_meta(cpu_meta); T* cpu_data = dev_ctx.template HostAlloc(&cpu_out); diff --git a/backends/gcu/kernels/uniform_kernel.cc b/backends/gcu/kernels/uniform_kernel.cc index 6e9df88b1e1..aec36400ab4 100644 --- a/backends/gcu/kernels/uniform_kernel.cc +++ b/backends/gcu/kernels/uniform_kernel.cc @@ -34,14 +34,14 @@ inline void UniformRealDistribution(T* data, template void UniformRawKernel(const Context& dev_ctx, const phi::IntArray& shape, - phi::DataType dtype, + DataType dtype, const phi::Scalar& min, const phi::Scalar& max, int seed, int diag_num, int diag_step, float diag_val, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("uniform_raw"); ContextPinnedGuard ctx_pinned_guard(dev_ctx); VLOG(6) << "[HOST_KERNEL] Impl on host for uniform_raw"; @@ -49,14 +49,14 @@ void UniformRawKernel(const Context& dev_ctx, << ", max:" << max.ToString() << ", seed:" << seed << ", diag_num:" << diag_num << ", diag_step:" << diag_step << ", diag_val:" << diag_val << ", shape:" << out->dims() - << ", dtype:" << phi::DataTypeToString(dtype); + << ", dtype:" << DataTypeToString(dtype); out->Resize(phi::make_ddim(shape.GetData())); T* data = dev_ctx.template Alloc(out); auto size = out->numel(); // 1. CPU implement - phi::DenseTensor cpu_out; - phi::DenseTensorMeta cpu_out_meta = {out->dtype(), out->dims()}; + DenseTensor cpu_out; + DenseTensorMeta cpu_out_meta = {out->dtype(), out->dims()}; cpu_out.set_meta(cpu_out_meta); T* cpu_data = dev_ctx.template HostAlloc(&cpu_out); @@ -95,11 +95,11 @@ void UniformRawKernel(const Context& dev_ctx, template void UniformKernel(const Context& dev_ctx, const phi::IntArray& shape, - phi::DataType dtype, + DataType dtype, const phi::Scalar& min, const phi::Scalar& max, int seed, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("uniform"); if (LaunchAOTKernel()) { diff --git a/backends/gcu/kernels/unsqueeze_kernel.cc b/backends/gcu/kernels/unsqueeze_kernel.cc index 448eda6c687..c8f35e51236 100644 --- a/backends/gcu/kernels/unsqueeze_kernel.cc +++ b/backends/gcu/kernels/unsqueeze_kernel.cc @@ -68,9 +68,9 @@ inline phi::DDim GetUnsqueezeShape(const std::vector unsqz_dims, template void UnsqueezeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& axes, - phi::DenseTensor* out) { + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("unsqueeze"); VLOG(6) << "[HOST_KERNEL] Impl on host for unsqueeze"; auto x_dims = x.dims(); @@ -87,10 +87,10 @@ void UnsqueezeKernel(const Context& dev_ctx, template void UnsqueezeWithXShapeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const phi::IntArray& axes, - phi::DenseTensor* out, - phi::DenseTensor* xshape) { + DenseTensor* out, + DenseTensor* xshape) { PADDLE_GCU_KERNEL_TRACE("unsqueeze_with_xshape"); VLOG(6) << "[HOST_KERNEL] Impl on host for unsqueeze_with_xshape"; custom_kernel::UnsqueezeKernel(dev_ctx, x, axes, out); @@ -98,9 +98,9 @@ void UnsqueezeWithXShapeKernel(const Context& dev_ctx, template void UnsqueezeGradKernel(const Context& dev_ctx, - const phi::DenseTensor& x, - const phi::DenseTensor& dout, - phi::DenseTensor* dx) { + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { PADDLE_GCU_KERNEL_TRACE("unsqueeze_grad"); VLOG(6) << "[HOST_KERNEL] Impl on host for unsqueeze_grad"; auto x_dims = dx->dims(); diff --git a/backends/gcu/kernels/weight_quantize_kernel.cc b/backends/gcu/kernels/weight_quantize_kernel.cc index ea9b3f76f32..e83e8a857bc 100644 --- a/backends/gcu/kernels/weight_quantize_kernel.cc +++ b/backends/gcu/kernels/weight_quantize_kernel.cc @@ -234,9 +234,9 @@ template void quant_compute(const DeviceContext& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* out, - phi::DenseTensor* scale, + const DenseTensor& x, + DenseTensor* out, + DenseTensor* scale, const std::string& algo, const int32_t arch, const int32_t group_size) { @@ -253,14 +253,14 @@ void quant_compute(const DeviceContext& dev_ctx, const T* x_data = x.data(); ScaleT* scale_data = scale->data(); - phi::DenseTensorMeta out_meta = out->meta(); - phi::DenseTensor x_int; + DenseTensorMeta out_meta = out->meta(); + DenseTensor x_int; x_int.set_meta(out_meta); x_int.Resize({static_cast(m), static_cast(n)}); dev_ctx.template Alloc(&x_int); D* x_int_data = x_int.data(); - phi::DenseTensor x_int_tmp; + DenseTensor x_int_tmp; x_int_tmp.set_meta(out_meta); x_int_tmp.Resize({static_cast(m), static_cast(n / 2)}); dev_ctx.template Alloc(&x_int_tmp); @@ -295,38 +295,38 @@ void quant_compute(const DeviceContext& dev_ctx, template void WeightQuantizeKernel(const Context& dev_ctx, - const phi::DenseTensor& x, + const DenseTensor& x, const std::string& algo, const int32_t arch, const int32_t group_size, - phi::DenseTensor* out, - phi::DenseTensor* scale) { + DenseTensor* out, + DenseTensor* scale) { PADDLE_GCU_KERNEL_TRACE("weight_quantize"); - phi::DenseTensor x_cpu; - phi::DenseTensor out_cpu; - phi::DenseTensor scale_cpu; - phi::CPUContext dev_ctx_cpu; + DenseTensor x_cpu; + DenseTensor out_cpu; + DenseTensor scale_cpu; + CPUContext dev_ctx_cpu; dev_ctx_cpu.SetAllocator(&(dev_ctx.GetHostAllocator())); dev_ctx_cpu.SetHostAllocator(&(dev_ctx.GetHostAllocator())); - TensorCopy(dev_ctx, x, true, &x_cpu, phi::CPUPlace()); + TensorCopy(dev_ctx, x, true, &x_cpu, CPUPlace()); dev_ctx.Wait(); - phi::DenseTensorMeta out_meta = out->meta(); + DenseTensorMeta out_meta = out->meta(); out_cpu.set_meta(out_meta); dev_ctx_cpu.template Alloc(&out_cpu); - phi::DenseTensorMeta scale_meta = scale->meta(); + DenseTensorMeta scale_meta = scale->meta(); scale_cpu.set_meta(scale_meta); if (algo == "weight_only_int8") { dev_ctx_cpu.template Alloc(&scale_cpu); - quant_compute( + quant_compute( dev_ctx_cpu, x_cpu, &out_cpu, &scale_cpu, algo, arch, group_size); } else if (algo == "llm.int8") { dev_ctx_cpu.template Alloc(&scale_cpu); - quant_compute( + quant_compute( dev_ctx_cpu, x_cpu, &out_cpu, &scale_cpu, algo, arch, group_size); } else if (algo == "weight_only_int4") { dev_ctx_cpu.template Alloc(&scale_cpu); - quant_compute( + quant_compute( dev_ctx_cpu, x_cpu, &out_cpu, &scale_cpu, algo, arch, group_size); } else { common::errors::Unimplemented( diff --git a/backends/gcu/kernels/where_kernel.cc b/backends/gcu/kernels/where_kernel.cc index 10c3227caff..2172ee2e193 100644 --- a/backends/gcu/kernels/where_kernel.cc +++ b/backends/gcu/kernels/where_kernel.cc @@ -19,18 +19,17 @@ namespace custom_kernel { template void WhereKernel(const Context& dev_ctx, - const phi::DenseTensor& condition, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - phi::DenseTensor* out) { + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { PADDLE_GCU_KERNEL_TRACE("where"); dev_ctx.template Alloc(out); if (LaunchAOTKernel()) { - phi::DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); - phi::DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); - phi::DenseTensor output_z = - MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); + DenseTensor input_x = MaybeCreateOrTrans64To32bits(dev_ctx, x); + DenseTensor input_y = MaybeCreateOrTrans64To32bits(dev_ctx, y); + DenseTensor output_z = MaybeCreateOrTrans64To32bits(dev_ctx, *out, false); LAUNCH_TOPSATENOP( topsatenWhere, dev_ctx, output_z, condition, input_x, input_y); MaybeTransResult(dev_ctx, output_z, out); @@ -58,12 +57,12 @@ void WhereKernel(const Context& dev_ctx, template void WhereGradKernel(const Context& dev_ctx, - const phi::DenseTensor& condition, - const phi::DenseTensor& x, - const phi::DenseTensor& y, - const phi::DenseTensor& out_grad, - phi::DenseTensor* x_grad, - phi::DenseTensor* y_grad) { + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { PADDLE_GCU_KERNEL_TRACE("where_grad"); if (LaunchAOTKernel()) { THROW_AOT_UNIMPLEMENTED(); From 6455922c108b62bb0830e367c9a13673a96dd557 Mon Sep 17 00:00:00 2001 From: co63oc <4617245+co63oc@users.noreply.github.com> Date: Tue, 21 Apr 2026 08:45:59 +0800 Subject: [PATCH 2/3] fix --- backends/gcu/common/gcu_op_runner.h | 6 ++++- backends/gcu/kernels/accuracy_kernel.cc | 10 +++---- backends/gcu/kernels/adam_kernel.cc | 20 +++++++------- backends/gcu/kernels/arg_min_max_kernels.cc | 4 +-- backends/gcu/kernels/argsort_kernel.cc | 2 +- backends/gcu/kernels/atan2_kernel.cc | 2 +- backends/gcu/kernels/batch_norm_kernel.cc | 26 +++++++++---------- backends/gcu/kernels/cast_kernel.cc | 2 +- backends/gcu/kernels/compare_kernels.cc | 26 +++++++++---------- backends/gcu/kernels/dropout_kernel.cc | 2 +- backends/gcu/kernels/funcs/gcu_kernel_funcs.h | 2 +- backends/gcu/kernels/is_empty_kernel.cc | 2 +- backends/gcu/kernels/isfinite_kernel.cc | 2 +- backends/gcu/kernels/isinf_kernel.cc | 2 +- backends/gcu/kernels/isnan_kernel.cc | 2 +- backends/gcu/kernels/llama_stub_kernels.cc | 2 +- backends/gcu/kernels/logical_kernels.cc | 8 +++--- backends/gcu/kernels/masked_select_kernel.cc | 4 +-- backends/gcu/kernels/merged_adam_kernel.cc | 10 +++---- .../gcu/kernels/multiclass_nms3_kernel.cc | 4 +-- backends/gcu/kernels/multinomial_kernel.cc | 2 +- backends/gcu/kernels/nms_kernel.cc | 2 +- backends/gcu/kernels/numel_kernel.cc | 2 +- backends/gcu/kernels/one_hot_kernel.cc | 6 ++--- backends/gcu/kernels/reduce_kernels.cc | 6 ++--- backends/gcu/kernels/rnn_kernel.cc | 2 +- backends/gcu/kernels/topk_kernel.cc | 2 +- 27 files changed, 82 insertions(+), 78 deletions(-) diff --git a/backends/gcu/common/gcu_op_runner.h b/backends/gcu/common/gcu_op_runner.h index 488f6daee33..9bd0fe2ab46 100644 --- a/backends/gcu/common/gcu_op_runner.h +++ b/backends/gcu/common/gcu_op_runner.h @@ -28,7 +28,11 @@ namespace custom_kernel { constexpr char kEmptyVarName[] = "@EMPTY@"; using DenseTensor = phi::DenseTensor; -using DenseTensor = phi::DenseTensor; +using DenseTensorMeta = phi::DenseTensorMeta; +using DataType = phi::DataType; +using Place = phi::Place; +using CPUPlace = phi::CPUPlace; +using CPUContext = phi::CPUContext; using TensorNameMap = std::map>; using TensorValueMap = std::map>; using TensorNameValuePair = std::pair; diff --git a/backends/gcu/kernels/accuracy_kernel.cc b/backends/gcu/kernels/accuracy_kernel.cc index 387fe5bbb6e..4e5b54ea4fa 100644 --- a/backends/gcu/kernels/accuracy_kernel.cc +++ b/backends/gcu/kernels/accuracy_kernel.cc @@ -116,9 +116,9 @@ PD_REGISTER_PLUGIN_KERNEL(accuracy, phi::dtype::bfloat16, phi::dtype::float16, int) { - kernel->InputAt(1).SetDataType(DataType::INT64); - kernel->InputAt(2).SetDataType(DataType::INT64); - kernel->OutputAt(0).SetDataType(DataType::FLOAT32); - kernel->OutputAt(1).SetDataType(DataType::INT32); - kernel->OutputAt(2).SetDataType(DataType::INT32); + kernel->InputAt(1).SetDataType(phi::DataType::INT64); + kernel->InputAt(2).SetDataType(phi::DataType::INT64); + kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(1).SetDataType(phi::DataType::INT32); + kernel->OutputAt(2).SetDataType(phi::DataType::INT32); } diff --git a/backends/gcu/kernels/adam_kernel.cc b/backends/gcu/kernels/adam_kernel.cc index be40d8e5b09..5c6bc1c7549 100644 --- a/backends/gcu/kernels/adam_kernel.cc +++ b/backends/gcu/kernels/adam_kernel.cc @@ -302,11 +302,11 @@ void AdamwKernel(const Context& dev_ctx, // kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); // kernel->InputAt(8).SetBackend(phi::Backend::ALL_BACKEND); // if (kernel_key.dtype() == DataType::FLOAT16) { -// kernel->OutputAt(1).SetDataType(DataType::FLOAT32); -// kernel->OutputAt(2).SetDataType(DataType::FLOAT32); -// kernel->OutputAt(3).SetDataType(DataType::FLOAT32); -// kernel->OutputAt(4).SetDataType(DataType::FLOAT32); -// kernel->OutputAt(5).SetDataType(DataType::FLOAT32); +// kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); +// kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); +// kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); +// kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); +// kernel->OutputAt(5).SetDataType(phi::DataType::FLOAT32); // } // kernel->OutputAt(3).SetBackend(phi::Backend::UNDEFINED); // kernel->OutputAt(4).SetBackend(phi::Backend::UNDEFINED); @@ -324,11 +324,11 @@ void AdamwKernel(const Context& dev_ctx, // kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); // kernel->InputAt(8).SetBackend(phi::Backend::ALL_BACKEND); // if (kernel_key.dtype() == DataType::FLOAT16) { -// kernel->OutputAt(1).SetDataType(DataType::FLOAT32); -// kernel->OutputAt(2).SetDataType(DataType::FLOAT32); -// kernel->OutputAt(3).SetDataType(DataType::FLOAT32); -// kernel->OutputAt(4).SetDataType(DataType::FLOAT32); -// kernel->OutputAt(5).SetDataType(DataType::FLOAT32); +// kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); +// kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); +// kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); +// kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); +// kernel->OutputAt(5).SetDataType(phi::DataType::FLOAT32); // } // kernel->OutputAt(3).SetBackend(phi::Backend::UNDEFINED); // kernel->OutputAt(4).SetBackend(phi::Backend::UNDEFINED); diff --git a/backends/gcu/kernels/arg_min_max_kernels.cc b/backends/gcu/kernels/arg_min_max_kernels.cc index ada6839fabd..31df7239a83 100644 --- a/backends/gcu/kernels/arg_min_max_kernels.cc +++ b/backends/gcu/kernels/arg_min_max_kernels.cc @@ -142,7 +142,7 @@ PD_REGISTER_PLUGIN_KERNEL(argmin, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); } PD_REGISTER_PLUGIN_KERNEL(argmax, @@ -153,5 +153,5 @@ PD_REGISTER_PLUGIN_KERNEL(argmax, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); } diff --git a/backends/gcu/kernels/argsort_kernel.cc b/backends/gcu/kernels/argsort_kernel.cc index 225b2253d6c..e6193b550e0 100644 --- a/backends/gcu/kernels/argsort_kernel.cc +++ b/backends/gcu/kernels/argsort_kernel.cc @@ -126,7 +126,7 @@ PD_REGISTER_PLUGIN_KERNEL(argsort, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(1).SetDataType(DataType::INT64); + kernel->OutputAt(1).SetDataType(phi::DataType::INT64); } PD_REGISTER_PLUGIN_KERNEL(argsort_grad, diff --git a/backends/gcu/kernels/atan2_kernel.cc b/backends/gcu/kernels/atan2_kernel.cc index c0cd98e7c8e..2f4e0c0cb19 100644 --- a/backends/gcu/kernels/atan2_kernel.cc +++ b/backends/gcu/kernels/atan2_kernel.cc @@ -50,5 +50,5 @@ PD_REGISTER_PLUGIN_KERNEL(atan2, double, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); } diff --git a/backends/gcu/kernels/batch_norm_kernel.cc b/backends/gcu/kernels/batch_norm_kernel.cc index 31cc1b2f9b2..96d896e0b0c 100644 --- a/backends/gcu/kernels/batch_norm_kernel.cc +++ b/backends/gcu/kernels/batch_norm_kernel.cc @@ -409,14 +409,14 @@ PD_REGISTER_PLUGIN_KERNEL(batch_norm, phi::dtype::bfloat16, phi::dtype::float16) { if (kernel_key.dtype() == DataType::FLOAT16) { - kernel->InputAt(1).SetDataType(DataType::FLOAT32); // mean - kernel->InputAt(2).SetDataType(DataType::FLOAT32); // variance - kernel->InputAt(3).SetDataType(DataType::FLOAT32); // scale - kernel->InputAt(4).SetDataType(DataType::FLOAT32); // bias - kernel->OutputAt(1).SetDataType(DataType::FLOAT32); // mean_out - kernel->OutputAt(2).SetDataType(DataType::FLOAT32); // variance_out - kernel->OutputAt(3).SetDataType(DataType::FLOAT32); // saved_mean - kernel->OutputAt(4).SetDataType(DataType::FLOAT32); // saved_variance + kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); // mean + kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); // variance + kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); // scale + kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); // bias + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // mean_out + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // variance_out + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); // saved_mean + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); // saved_variance } } @@ -429,9 +429,9 @@ PD_REGISTER_PLUGIN_KERNEL(batch_norm_grad, phi::dtype::bfloat16, phi::dtype::float16) { if (kernel_key.dtype() == DataType::FLOAT16) { - kernel->OutputAt(0).SetDataType(DataType::FLOAT32); // x_grad - kernel->OutputAt(1).SetDataType(DataType::FLOAT32); // scale_grad - kernel->OutputAt(2).SetDataType(DataType::FLOAT32); // bias_grad + kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad } } @@ -444,7 +444,7 @@ PD_REGISTER_PLUGIN_KERNEL(batch_norm_infer, phi::dtype::bfloat16, phi::dtype::float16) { if (kernel_key.dtype() == DataType::FLOAT16) { - kernel->OutputAt(1).SetDataType(DataType::FLOAT32); // mean_out - kernel->OutputAt(2).SetDataType(DataType::FLOAT32); // variance_out + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // mean_out + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // variance_out } } diff --git a/backends/gcu/kernels/cast_kernel.cc b/backends/gcu/kernels/cast_kernel.cc index d038c4a0278..e30691eee2c 100644 --- a/backends/gcu/kernels/cast_kernel.cc +++ b/backends/gcu/kernels/cast_kernel.cc @@ -88,5 +88,5 @@ PD_REGISTER_PLUGIN_KERNEL(cast, int32_t, int64_t, bool) { - kernel->OutputAt(0).SetDataType(DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); } diff --git a/backends/gcu/kernels/compare_kernels.cc b/backends/gcu/kernels/compare_kernels.cc index fcab639396d..72263a3dd63 100644 --- a/backends/gcu/kernels/compare_kernels.cc +++ b/backends/gcu/kernels/compare_kernels.cc @@ -214,18 +214,18 @@ void GreaterThanKernel(const Context& dev_ctx, } // namespace custom_kernel -#define PD_REGISTER_COMPARE_KERNEL(name, func) \ - PD_REGISTER_PLUGIN_KERNEL(name, \ - gcu, \ - ALL_LAYOUT, \ - custom_kernel::func##Kernel, \ - bool, \ - int, \ - int64_t, \ - float, \ - phi::dtype::bfloat16, \ - phi::dtype::float16) { \ - kernel->OutputAt(0).SetDataType(DataType::BOOL); \ +#define PD_REGISTER_COMPARE_KERNEL(name, func) \ + PD_REGISTER_PLUGIN_KERNEL(name, \ + gcu, \ + ALL_LAYOUT, \ + custom_kernel::func##Kernel, \ + bool, \ + int, \ + int64_t, \ + float, \ + phi::dtype::bfloat16, \ + phi::dtype::float16) { \ + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); \ } #define PD_REGISTER_COMPARE_RAW_KERNEL(name, func) \ @@ -239,7 +239,7 @@ void GreaterThanKernel(const Context& dev_ctx, float, \ phi::dtype::bfloat16, \ phi::dtype::float16) { \ - kernel->OutputAt(0).SetDataType(DataType::BOOL); \ + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); \ } PD_REGISTER_COMPARE_KERNEL(less_than, LessThan) diff --git a/backends/gcu/kernels/dropout_kernel.cc b/backends/gcu/kernels/dropout_kernel.cc index 3221a51845e..07e231873e7 100644 --- a/backends/gcu/kernels/dropout_kernel.cc +++ b/backends/gcu/kernels/dropout_kernel.cc @@ -203,7 +203,7 @@ PD_REGISTER_PLUGIN_KERNEL(dropout, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(1).SetDataType(DataType::UINT8); + kernel->OutputAt(1).SetDataType(phi::DataType::UINT8); } PD_REGISTER_PLUGIN_KERNEL(dropout_grad, diff --git a/backends/gcu/kernels/funcs/gcu_kernel_funcs.h b/backends/gcu/kernels/funcs/gcu_kernel_funcs.h index 3497d3c8290..43771743d83 100644 --- a/backends/gcu/kernels/funcs/gcu_kernel_funcs.h +++ b/backends/gcu/kernels/funcs/gcu_kernel_funcs.h @@ -29,7 +29,7 @@ __FUNCTION__)) namespace custom_kernel { -using DenseTensor = DenseTensor; +using DenseTensor = phi::DenseTensor; using TensorNameMap = std::map>; using TensorValueMap = std::map>; diff --git a/backends/gcu/kernels/is_empty_kernel.cc b/backends/gcu/kernels/is_empty_kernel.cc index c565f60bef6..8f72b4647be 100644 --- a/backends/gcu/kernels/is_empty_kernel.cc +++ b/backends/gcu/kernels/is_empty_kernel.cc @@ -37,5 +37,5 @@ PD_REGISTER_PLUGIN_KERNEL(is_empty, int64_t, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(DataType::BOOL); + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); } diff --git a/backends/gcu/kernels/isfinite_kernel.cc b/backends/gcu/kernels/isfinite_kernel.cc index 47bb2fd700f..392c4f9a908 100644 --- a/backends/gcu/kernels/isfinite_kernel.cc +++ b/backends/gcu/kernels/isfinite_kernel.cc @@ -38,5 +38,5 @@ PD_REGISTER_PLUGIN_KERNEL(isfinite, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(DataType::BOOL); + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); } diff --git a/backends/gcu/kernels/isinf_kernel.cc b/backends/gcu/kernels/isinf_kernel.cc index 470fba64bcf..2e26e89df89 100644 --- a/backends/gcu/kernels/isinf_kernel.cc +++ b/backends/gcu/kernels/isinf_kernel.cc @@ -53,5 +53,5 @@ PD_REGISTER_PLUGIN_KERNEL(isinf, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(DataType::BOOL); + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); } diff --git a/backends/gcu/kernels/isnan_kernel.cc b/backends/gcu/kernels/isnan_kernel.cc index 17084f3ad17..12b4c25cf67 100644 --- a/backends/gcu/kernels/isnan_kernel.cc +++ b/backends/gcu/kernels/isnan_kernel.cc @@ -38,5 +38,5 @@ PD_REGISTER_PLUGIN_KERNEL(isnan, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(DataType::BOOL); + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); } diff --git a/backends/gcu/kernels/llama_stub_kernels.cc b/backends/gcu/kernels/llama_stub_kernels.cc index e52accae477..a39b1e35a77 100644 --- a/backends/gcu/kernels/llama_stub_kernels.cc +++ b/backends/gcu/kernels/llama_stub_kernels.cc @@ -149,7 +149,7 @@ PD_REGISTER_PLUGIN_KERNEL( int64_t, phi::dtype::float16, phi::dtype::bfloat16) { - kernel->InputAt(3).SetDataType(DataType::INT32); + kernel->InputAt(3).SetDataType(phi::DataType::INT32); } PD_REGISTER_PLUGIN_KERNEL(fused_bias_act, diff --git a/backends/gcu/kernels/logical_kernels.cc b/backends/gcu/kernels/logical_kernels.cc index 7f916bf56a8..4d589ab9394 100644 --- a/backends/gcu/kernels/logical_kernels.cc +++ b/backends/gcu/kernels/logical_kernels.cc @@ -162,7 +162,7 @@ PD_REGISTER_PLUGIN_KERNEL(logical_and, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(DataType::BOOL); + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); } PD_REGISTER_PLUGIN_KERNEL(logical_not, @@ -173,7 +173,7 @@ PD_REGISTER_PLUGIN_KERNEL(logical_not, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(DataType::BOOL); + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); } PD_REGISTER_PLUGIN_KERNEL(logical_or, @@ -184,7 +184,7 @@ PD_REGISTER_PLUGIN_KERNEL(logical_or, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(DataType::BOOL); + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); } PD_REGISTER_PLUGIN_KERNEL(logical_xor, @@ -195,5 +195,5 @@ PD_REGISTER_PLUGIN_KERNEL(logical_xor, float, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(DataType::BOOL); + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); } diff --git a/backends/gcu/kernels/masked_select_kernel.cc b/backends/gcu/kernels/masked_select_kernel.cc index c32c0da9df9..46bc2397abe 100644 --- a/backends/gcu/kernels/masked_select_kernel.cc +++ b/backends/gcu/kernels/masked_select_kernel.cc @@ -131,7 +131,7 @@ PD_REGISTER_PLUGIN_KERNEL(masked_select, phi::dtype::bfloat16, float, int) { - kernel->InputAt(1).SetDataType(DataType::BOOL); + kernel->InputAt(1).SetDataType(phi::DataType::BOOL); } // PD_REGISTER_PLUGIN_KERNEL(masked_select_grad, @@ -142,5 +142,5 @@ PD_REGISTER_PLUGIN_KERNEL(masked_select, // float, // int, // int64_t) { -// kernel->InputAt(1).SetDataType(DataType::BOOL); +// kernel->InputAt(1).SetDataType(phi::DataType::BOOL); // } diff --git a/backends/gcu/kernels/merged_adam_kernel.cc b/backends/gcu/kernels/merged_adam_kernel.cc index faf3726117a..963a930fd58 100644 --- a/backends/gcu/kernels/merged_adam_kernel.cc +++ b/backends/gcu/kernels/merged_adam_kernel.cc @@ -375,11 +375,11 @@ PD_REGISTER_PLUGIN_KERNEL(merged_adam, if (kernel_key.dtype() == DataType::FLOAT16 || kernel_key.dtype() == DataType::BFLOAT16) { - kernel->OutputAt(1).SetDataType(DataType::FLOAT32); - kernel->OutputAt(2).SetDataType(DataType::FLOAT32); - kernel->OutputAt(3).SetDataType(DataType::FLOAT32); - kernel->OutputAt(4).SetDataType(DataType::FLOAT32); - kernel->OutputAt(5).SetDataType(DataType::FLOAT32); + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(5).SetDataType(phi::DataType::FLOAT32); } kernel->OutputAt(3).SetBackend(phi::Backend::UNDEFINED); kernel->OutputAt(4).SetBackend(phi::Backend::UNDEFINED); diff --git a/backends/gcu/kernels/multiclass_nms3_kernel.cc b/backends/gcu/kernels/multiclass_nms3_kernel.cc index 57979a9f141..f020cdc6a91 100644 --- a/backends/gcu/kernels/multiclass_nms3_kernel.cc +++ b/backends/gcu/kernels/multiclass_nms3_kernel.cc @@ -139,6 +139,6 @@ PD_REGISTER_PLUGIN_KERNEL(multiclass_nms3, phi::dtype::float16, phi::dtype::bfloat16, float) { - kernel->OutputAt(1).SetDataType(DataType::INT32); - kernel->OutputAt(2).SetDataType(DataType::INT32); + kernel->OutputAt(1).SetDataType(phi::DataType::INT32); + kernel->OutputAt(2).SetDataType(phi::DataType::INT32); } diff --git a/backends/gcu/kernels/multinomial_kernel.cc b/backends/gcu/kernels/multinomial_kernel.cc index 9f699575294..61ca3a336c7 100644 --- a/backends/gcu/kernels/multinomial_kernel.cc +++ b/backends/gcu/kernels/multinomial_kernel.cc @@ -90,5 +90,5 @@ PD_REGISTER_PLUGIN_KERNEL(multinomial, custom_kernel::MultinomialKernel, // float, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(DataType::INT64); + kernel->OutputAt(0).SetDataType(phi::DataType::INT64); } diff --git a/backends/gcu/kernels/nms_kernel.cc b/backends/gcu/kernels/nms_kernel.cc index b7ff3bccddf..200cb9a54c2 100644 --- a/backends/gcu/kernels/nms_kernel.cc +++ b/backends/gcu/kernels/nms_kernel.cc @@ -116,5 +116,5 @@ void NMSKernel(const Context& dev_ctx, PD_REGISTER_PLUGIN_KERNEL( nms, gcu, ALL_LAYOUT, custom_kernel::NMSKernel, float, double) { - kernel->OutputAt(0).SetDataType(DataType::INT64); + kernel->OutputAt(0).SetDataType(phi::DataType::INT64); } diff --git a/backends/gcu/kernels/numel_kernel.cc b/backends/gcu/kernels/numel_kernel.cc index f66591d96b2..a87c1f85cd6 100644 --- a/backends/gcu/kernels/numel_kernel.cc +++ b/backends/gcu/kernels/numel_kernel.cc @@ -61,5 +61,5 @@ PD_REGISTER_PLUGIN_KERNEL(numel, float, double, bool) { - kernel->OutputAt(0).SetDataType(DataType::INT64); + kernel->OutputAt(0).SetDataType(phi::DataType::INT64); } diff --git a/backends/gcu/kernels/one_hot_kernel.cc b/backends/gcu/kernels/one_hot_kernel.cc index cd9debe10db..de5f7ff45d4 100644 --- a/backends/gcu/kernels/one_hot_kernel.cc +++ b/backends/gcu/kernels/one_hot_kernel.cc @@ -90,7 +90,7 @@ void OneHotRawKernel(const Context& dev_ctx, PD_REGISTER_PLUGIN_KERNEL( one_hot, gcu, ALL_LAYOUT, custom_kernel::OneHotKernel, int32_t, int64_t) { - kernel->OutputAt(0).SetDataType(DataType::FLOAT32); + kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); } PD_REGISTER_PLUGIN_KERNEL(one_hot_v2, @@ -99,7 +99,7 @@ PD_REGISTER_PLUGIN_KERNEL(one_hot_v2, custom_kernel::OneHotV2Kernel, int32_t, int64_t) { - kernel->OutputAt(0).SetDataType(DataType::FLOAT32); + kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); } PD_REGISTER_PLUGIN_KERNEL(one_hot_raw, @@ -108,5 +108,5 @@ PD_REGISTER_PLUGIN_KERNEL(one_hot_raw, custom_kernel::OneHotRawKernel, int32_t, int64_t) { - kernel->OutputAt(0).SetDataType(DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); } diff --git a/backends/gcu/kernels/reduce_kernels.cc b/backends/gcu/kernels/reduce_kernels.cc index 4d3906a4598..092c6b8f544 100644 --- a/backends/gcu/kernels/reduce_kernels.cc +++ b/backends/gcu/kernels/reduce_kernels.cc @@ -492,7 +492,7 @@ PD_REGISTER_PLUGIN_KERNEL(any, bool, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(DataType::BOOL); + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); } PD_REGISTER_PLUGIN_KERNEL(all, @@ -504,7 +504,7 @@ PD_REGISTER_PLUGIN_KERNEL(all, bool, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(DataType::BOOL); + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); } PD_REGISTER_PLUGIN_KERNEL(max, @@ -546,7 +546,7 @@ PD_REGISTER_PLUGIN_KERNEL(sum, double, phi::dtype::bfloat16, phi::dtype::float16) { - kernel->OutputAt(0).SetDataType(DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); } // PD_REGISTER_PLUGIN_KERNEL(sum_grad, diff --git a/backends/gcu/kernels/rnn_kernel.cc b/backends/gcu/kernels/rnn_kernel.cc index 8687af85880..1fd154190d9 100644 --- a/backends/gcu/kernels/rnn_kernel.cc +++ b/backends/gcu/kernels/rnn_kernel.cc @@ -473,5 +473,5 @@ void RnnKernel(const Context& dev_ctx, // phi::dtype::float16, // phi::dtype::bfloat16, // float) { -// kernel->OutputAt(1).SetDataType(DataType::UINT8); +// kernel->OutputAt(1).SetDataType(phi::DataType::UINT8); // } diff --git a/backends/gcu/kernels/topk_kernel.cc b/backends/gcu/kernels/topk_kernel.cc index 62cb9549574..1300db631e4 100644 --- a/backends/gcu/kernels/topk_kernel.cc +++ b/backends/gcu/kernels/topk_kernel.cc @@ -110,5 +110,5 @@ PD_REGISTER_PLUGIN_KERNEL(topk, double, int, int64_t) { - kernel->OutputAt(1).SetDataType(DataType::INT64); + kernel->OutputAt(1).SetDataType(phi::DataType::INT64); } From 071e13e51dcf1f0c071ed93c3c87e18d3941e757 Mon Sep 17 00:00:00 2001 From: co63oc <4617245+co63oc@users.noreply.github.com> Date: Tue, 21 Apr 2026 08:53:38 +0800 Subject: [PATCH 3/3] fix --- backends/gcu/kernels/adam_kernel.cc | 4 ++-- backends/gcu/kernels/batch_norm_kernel.cc | 6 +++--- backends/gcu/kernels/merged_adam_kernel.cc | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/backends/gcu/kernels/adam_kernel.cc b/backends/gcu/kernels/adam_kernel.cc index 5c6bc1c7549..97c7e11cc7a 100644 --- a/backends/gcu/kernels/adam_kernel.cc +++ b/backends/gcu/kernels/adam_kernel.cc @@ -301,7 +301,7 @@ void AdamwKernel(const Context& dev_ctx, // kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); // kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); // kernel->InputAt(8).SetBackend(phi::Backend::ALL_BACKEND); -// if (kernel_key.dtype() == DataType::FLOAT16) { +// if (kernel_key.dtype() == phi::DataType::FLOAT16) { // kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); @@ -323,7 +323,7 @@ void AdamwKernel(const Context& dev_ctx, // kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); // kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); // kernel->InputAt(8).SetBackend(phi::Backend::ALL_BACKEND); -// if (kernel_key.dtype() == DataType::FLOAT16) { +// if (kernel_key.dtype() == phi::DataType::FLOAT16) { // kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); diff --git a/backends/gcu/kernels/batch_norm_kernel.cc b/backends/gcu/kernels/batch_norm_kernel.cc index 96d896e0b0c..ce562875653 100644 --- a/backends/gcu/kernels/batch_norm_kernel.cc +++ b/backends/gcu/kernels/batch_norm_kernel.cc @@ -408,7 +408,7 @@ PD_REGISTER_PLUGIN_KERNEL(batch_norm, double, phi::dtype::bfloat16, phi::dtype::float16) { - if (kernel_key.dtype() == DataType::FLOAT16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); // mean kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); // variance kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); // scale @@ -428,7 +428,7 @@ PD_REGISTER_PLUGIN_KERNEL(batch_norm_grad, double, phi::dtype::bfloat16, phi::dtype::float16) { - if (kernel_key.dtype() == DataType::FLOAT16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad @@ -443,7 +443,7 @@ PD_REGISTER_PLUGIN_KERNEL(batch_norm_infer, double, phi::dtype::bfloat16, phi::dtype::float16) { - if (kernel_key.dtype() == DataType::FLOAT16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // mean_out kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // variance_out } diff --git a/backends/gcu/kernels/merged_adam_kernel.cc b/backends/gcu/kernels/merged_adam_kernel.cc index 963a930fd58..477cac85e3c 100644 --- a/backends/gcu/kernels/merged_adam_kernel.cc +++ b/backends/gcu/kernels/merged_adam_kernel.cc @@ -373,8 +373,8 @@ PD_REGISTER_PLUGIN_KERNEL(merged_adam, kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); - if (kernel_key.dtype() == DataType::FLOAT16 || - kernel_key.dtype() == DataType::BFLOAT16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16 || + kernel_key.dtype() == phi::DataType::BFLOAT16) { kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);