@@ -198,19 +198,19 @@ index d970878dc2..fe0382ccad 100644
198198 x = *reinterpret_cast<uint16_t*>(&tmp);
199199
200200diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h
201- index 024a7de73e..66b373d698 100644
201+ index d07575028c..ec262da03a 100644
202202--- a/paddle/phi/core/enforce.h
203203+++ b/paddle/phi/core/enforce.h
204204@@ -97,7 +97,7 @@ inline bool is_error(bool stat) { return !stat; }
205205
206- void ThrowWarnInternal(const std::string& message);
206+ PADDLE_API void ThrowWarnInternal(const std::string& message);
207207
208208- #if defined(__CUDA_ARCH__)
209209+ #if defined(__CUDACC__)
210210 // For cuda, the assertions can affect performance and it is therefore
211211 // recommended to disable them in production code
212212 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion
213- @@ -109,7 +109,7 @@ void ThrowWarnInternal(const std::string& message);
213+ @@ -109,7 +109,7 @@ PADDLE_API void ThrowWarnInternal(const std::string& message);
214214 __LINE__, \
215215 #_IS_NOT_ERROR, \
216216 ##__VA_ARGS__); \
@@ -916,45 +916,6 @@ index 75a8f71d8c..cb21e9e301 100644
916916 #include "paddle/phi/kernels/impl/qr_kernel_impl.h"
917917 #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h"
918918 #include "paddle/phi/kernels/lstsq_kernel.h"
919- diff --git a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h
920- index 4a28600c38..d96495b7aa 100644
921- --- a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h
922- +++ b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h
923- @@ -56,8 +56,8 @@ HOSTDEVICE T igam(const T a, const T x) {
924-
925- template <typename T>
926- HOSTDEVICE T igamc(const T a, const T x) {
927- - static T big = 4.503599627370496e15;
928- - static T biginv = 2.22044604925031308085e-16;
929- + const static T big = 4.503599627370496e15;
930- + const static T biginv = 2.22044604925031308085e-16;
931-
932- if ((x <= T{0}) || (a <= T{0})) return (T{1.0});
933-
934- diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h
935- index c627cc1264..b3941570ee 100644
936- --- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h
937- +++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h
938- @@ -20,8 +20,8 @@
939- namespace phi {
940- template <typename T>
941- HOSTDEVICE T digamma_positive_domain(T x) {
942- - static T c = T{8.5};
943- - static T euler_mascheroni = T{0.57721566490153286060};
944- + const static T c = T{8.5};
945- + const static T euler_mascheroni = T{0.57721566490153286060};
946- T r;
947- T value;
948- T x2;
949- @@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) {
950-
951- template <typename T>
952- HOSTDEVICE T digamma(T x) {
953- - static T pi = T{3.14159265358979323846};
954- + const static T pi = T{3.14159265358979323846};
955-
956- if (x == T{0.0}) {
957- T inf = std::numeric_limits<T>::infinity();
958919
959920diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
960921index be6ee4f854..1f507c99f4 100644
@@ -1056,19 +1017,6 @@ index be6ee4f854..1f507c99f4 100644
10561017 } else {
10571018 LaunchNormalSoftmaxForward<T, IndexType, LogMode>(
10581019
1059- diff --git a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
1060- index 0a415200df..b0732e28f3 100644
1061- --- a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
1062- +++ b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
1063- @@ -147,7 +147,7 @@ void CrossEntropyWithSoftmaxGradGPUKernel(const GPUContext& dev_ctx,
1064- DenseTensor* logits_grad) {
1065- PADDLE_ENFORCE_EQ(
1066- dev_ctx.GetPlace().GetType(),
1067- - AllocationType::GPU,
1068- + AllocationType::CUSTOM,
1069- common::errors::Unavailable("softmax_with_cross_entropy operator's "
1070- "CUDA kernel only runs on GPU device."));
1071- const T* loss_grad_data = loss_grad.data<T>();
10721020diff --git a/paddle/phi/kernels/funcs/cublaslt.h b/paddle/phi/kernels/funcs/cublaslt.h
10731021index d8bc15926b..6071baf340 100644
10741022--- a/paddle/phi/kernels/funcs/cublaslt.h
@@ -1102,3 +1050,70 @@ index d8bc15926b..6071baf340 100644
11021050 PADDLE_ENFORCE_EQ(
11031051 status,
11041052
1053+ diff --git a/paddle/phi/kernels/funcs/top_k_cuda_kernel.h b/paddle/phi/kernels/funcs/top_k_cuda_kernel.h
1054+ index 368cb21c21..f0f99fbd2f 100644
1055+ --- a/paddle/phi/kernels/funcs/top_k_cuda_kernel.h
1056+ +++ b/paddle/phi/kernels/funcs/top_k_cuda_kernel.h
1057+ @@ -167,7 +167,7 @@ struct Bitfield<unsigned int> {
1058+ int pos,
1059+ int len) {
1060+ unsigned int ret;
1061+ - #if defined(__HIPCC__)
1062+ + #if defined(PADDLE_WITH_CUDA)
1063+ ret = (val >> pos) & ((1u << len) - 1u);
1064+ #else
1065+ asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
1066+ @@ -178,7 +178,7 @@ struct Bitfield<unsigned int> {
1067+ static __device__ __forceinline__ unsigned int setBitfield(
1068+ unsigned int val, unsigned int to_insert, int pos, int len) {
1069+ unsigned int ret;
1070+ - #if defined(__HIPCC__)
1071+ + #if defined(PADDLE_WITH_CUDA)
1072+ unsigned int mask = ((1u << len) - 1u) << pos;
1073+ ret = (val & ~mask) | ((to_insert << pos) & mask);
1074+ #else
1075+ @@ -196,7 +196,7 @@ struct Bitfield<uint64_t> {
1076+ int pos,
1077+ int len) {
1078+ uint64_t ret;
1079+ - #if defined(__HIPCC__)
1080+ + #if defined(PADDLE_WITH_CUDA)
1081+ ret = (val >> pos) & ((1ULL << len) - 1ULL);
1082+ #else
1083+ asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
1084+ @@ -209,7 +209,7 @@ struct Bitfield<uint64_t> {
1085+ int pos,
1086+ int len) {
1087+ uint64_t ret;
1088+ - #if defined(__HIPCC__)
1089+ + #if defined(PADDLE_WITH_CUDA)
1090+ uint64_t mask = ((1ULL << len) - 1ULL) << pos;
1091+ ret = (val & ~mask) | ((to_insert << pos) & mask);
1092+ #else
1093+ @@ -223,7 +223,7 @@ struct Bitfield<uint64_t> {
1094+
1095+ // --- getLaneId / getLaneMaskLe ---
1096+ __device__ __forceinline__ int getLaneId() {
1097+ - #if defined(__HIPCC__)
1098+ + #if defined(PADDLE_WITH_CUDA)
1099+ return __lane_id();
1100+ #else
1101+ int laneId;
1102+ @@ -233,7 +233,7 @@ __device__ __forceinline__ int getLaneId() {
1103+ }
1104+
1105+ __device__ __forceinline__ unsigned getLaneMaskLe() {
1106+ - #if defined(__HIPCC__)
1107+ + #if defined(PADDLE_WITH_CUDA)
1108+ // HIP warp size is 64, construct mask for lanes <= current lane
1109+ return (getLaneId() == 63) ? 0xFFFFFFFFFFFFFFFFULL
1110+ : (1ULL << (getLaneId() + 1)) - 1ULL;
1111+ @@ -245,7 +245,7 @@ __device__ __forceinline__ unsigned getLaneMaskLe() {
1112+ }
1113+
1114+ __device__ __forceinline__ unsigned getLaneMaskLt() {
1115+ - #if defined(__HIPCC__)
1116+ + #if defined(PADDLE_WITH_CUDA)
1117+ return (getLaneId() == 0) ? 0ULL : (1ULL << getLaneId()) - 1ULL;
1118+ #else
1119+ unsigned mask;
0 commit comments