Skip to content

Commit c19a580

Browse files
authored
Merge branch 'develop' into fix16
2 parents cede8d4 + 8f3743f commit c19a580

7 files changed

Lines changed: 212 additions & 60 deletions

File tree

Paddle

Submodule Paddle updated 472 files

backends/metax_gpu/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,7 @@ file(
327327
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/mp_allreduce_sum_kernel.cu
328328
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/pow2_decay_with_linear_warmup_kernel.cu
329329
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu
330-
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/top_k_kernel.cu
330+
# ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/top_k_kernel.cu
331331
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/top_k_grad_kernel.cu
332332
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/where_grad_kernel.cu
333333
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/where_kernel.cu

backends/metax_gpu/compile.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ export CUCC_PATH=${MACA_PATH}/tools/cu-bridge
2525
export PATH=${PATH}:${CUCC_PATH}/tools:${CUCC_PATH}/bin
2626
export PATH=${MACA_PATH}/bin:${PATH}
2727
export LD_LIBRARY_PATH=${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH}
28+
# export MXCC_OVERRIDE_OPTIONS="+-mllvm +-metaxgpu-inline-branch-fold-bias=10000"
2829
export PADDLE_VERSION="3.3.0.dev$(date +%Y%m%d)"
2930
export MACA_AI_VERSION=$(cat /opt/maca/Version.txt | cut -d':' -f2)
3031
if [ ! -d build ]; then

backends/metax_gpu/env.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
14+
export MXCC_OVERRIDE_OPTIONS="+-mllvm +-metaxgpu-inline-branch-fold-bias=10000"
1515
DEFAULT_DIR="/opt/maca"
1616
export MACA_PATH=${1:-$DEFAULT_DIR}
1717
export CUDA_PATH=/usr/local/cuda

backends/metax_gpu/kernels/custom_kernel/flash_attn_grad_kernel.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -281,8 +281,8 @@ void FlashAttnGradKernel(const Context& ctx,
281281
// // printf("params.dq dims[2]:%d, params.dk dims[2]:%d, params.dv
282282
// dims[2]:%d\n", params.dq->head_num, params.dk->head_num,
283283
// params.dv->head_num);
284-
print_tensor_info(params.dq);
285-
print_tensor_info(params.dk);
284+
// print_tensor_info(params.dq);
285+
// print_tensor_info(params.dk);
286286
// print_tensor_info(params.dv);
287287
mcflashattnStatus_t succ = phi::dynload::mha_bwd(params.batch_size,
288288
params.seqlen_q,

backends/metax_gpu/patch/paddle.patch

Lines changed: 70 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -198,19 +198,19 @@ index d970878dc2..fe0382ccad 100644
198198
x = *reinterpret_cast<uint16_t*>(&tmp);
199199

200200
diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h
201-
index 024a7de73e..66b373d698 100644
201+
index d07575028c..ec262da03a 100644
202202
--- a/paddle/phi/core/enforce.h
203203
+++ b/paddle/phi/core/enforce.h
204204
@@ -97,7 +97,7 @@ inline bool is_error(bool stat) { return !stat; }
205205

206-
void ThrowWarnInternal(const std::string& message);
206+
PADDLE_API void ThrowWarnInternal(const std::string& message);
207207

208208
-#if defined(__CUDA_ARCH__)
209209
+#if defined(__CUDACC__)
210210
// For cuda, the assertions can affect performance and it is therefore
211211
// recommended to disable them in production code
212212
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion
213-
@@ -109,7 +109,7 @@ void ThrowWarnInternal(const std::string& message);
213+
@@ -109,7 +109,7 @@ PADDLE_API void ThrowWarnInternal(const std::string& message);
214214
__LINE__, \
215215
#_IS_NOT_ERROR, \
216216
##__VA_ARGS__); \
@@ -916,45 +916,6 @@ index 75a8f71d8c..cb21e9e301 100644
916916
#include "paddle/phi/kernels/impl/qr_kernel_impl.h"
917917
#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h"
918918
#include "paddle/phi/kernels/lstsq_kernel.h"
919-
diff --git a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h
920-
index 4a28600c38..d96495b7aa 100644
921-
--- a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h
922-
+++ b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h
923-
@@ -56,8 +56,8 @@ HOSTDEVICE T igam(const T a, const T x) {
924-
925-
template <typename T>
926-
HOSTDEVICE T igamc(const T a, const T x) {
927-
- static T big = 4.503599627370496e15;
928-
- static T biginv = 2.22044604925031308085e-16;
929-
+ const static T big = 4.503599627370496e15;
930-
+ const static T biginv = 2.22044604925031308085e-16;
931-
932-
if ((x <= T{0}) || (a <= T{0})) return (T{1.0});
933-
934-
diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h
935-
index c627cc1264..b3941570ee 100644
936-
--- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h
937-
+++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h
938-
@@ -20,8 +20,8 @@
939-
namespace phi {
940-
template <typename T>
941-
HOSTDEVICE T digamma_positive_domain(T x) {
942-
- static T c = T{8.5};
943-
- static T euler_mascheroni = T{0.57721566490153286060};
944-
+ const static T c = T{8.5};
945-
+ const static T euler_mascheroni = T{0.57721566490153286060};
946-
T r;
947-
T value;
948-
T x2;
949-
@@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) {
950-
951-
template <typename T>
952-
HOSTDEVICE T digamma(T x) {
953-
- static T pi = T{3.14159265358979323846};
954-
+ const static T pi = T{3.14159265358979323846};
955-
956-
if (x == T{0.0}) {
957-
T inf = std::numeric_limits<T>::infinity();
958919

959920
diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h
960921
index be6ee4f854..1f507c99f4 100644
@@ -1056,19 +1017,6 @@ index be6ee4f854..1f507c99f4 100644
10561017
} else {
10571018
LaunchNormalSoftmaxForward<T, IndexType, LogMode>(
10581019

1059-
diff --git a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
1060-
index 0a415200df..b0732e28f3 100644
1061-
--- a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
1062-
+++ b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
1063-
@@ -147,7 +147,7 @@ void CrossEntropyWithSoftmaxGradGPUKernel(const GPUContext& dev_ctx,
1064-
DenseTensor* logits_grad) {
1065-
PADDLE_ENFORCE_EQ(
1066-
dev_ctx.GetPlace().GetType(),
1067-
- AllocationType::GPU,
1068-
+ AllocationType::CUSTOM,
1069-
common::errors::Unavailable("softmax_with_cross_entropy operator's "
1070-
"CUDA kernel only runs on GPU device."));
1071-
const T* loss_grad_data = loss_grad.data<T>();
10721020
diff --git a/paddle/phi/kernels/funcs/cublaslt.h b/paddle/phi/kernels/funcs/cublaslt.h
10731021
index d8bc15926b..6071baf340 100644
10741022
--- a/paddle/phi/kernels/funcs/cublaslt.h
@@ -1102,3 +1050,70 @@ index d8bc15926b..6071baf340 100644
11021050
PADDLE_ENFORCE_EQ(
11031051
status,
11041052

1053+
diff --git a/paddle/phi/kernels/funcs/top_k_cuda_kernel.h b/paddle/phi/kernels/funcs/top_k_cuda_kernel.h
1054+
index 368cb21c21..f0f99fbd2f 100644
1055+
--- a/paddle/phi/kernels/funcs/top_k_cuda_kernel.h
1056+
+++ b/paddle/phi/kernels/funcs/top_k_cuda_kernel.h
1057+
@@ -167,7 +167,7 @@ struct Bitfield<unsigned int> {
1058+
int pos,
1059+
int len) {
1060+
unsigned int ret;
1061+
-#if defined(__HIPCC__)
1062+
+#if defined(PADDLE_WITH_CUDA)
1063+
ret = (val >> pos) & ((1u << len) - 1u);
1064+
#else
1065+
asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
1066+
@@ -178,7 +178,7 @@ struct Bitfield<unsigned int> {
1067+
static __device__ __forceinline__ unsigned int setBitfield(
1068+
unsigned int val, unsigned int to_insert, int pos, int len) {
1069+
unsigned int ret;
1070+
-#if defined(__HIPCC__)
1071+
+#if defined(PADDLE_WITH_CUDA)
1072+
unsigned int mask = ((1u << len) - 1u) << pos;
1073+
ret = (val & ~mask) | ((to_insert << pos) & mask);
1074+
#else
1075+
@@ -196,7 +196,7 @@ struct Bitfield<uint64_t> {
1076+
int pos,
1077+
int len) {
1078+
uint64_t ret;
1079+
-#if defined(__HIPCC__)
1080+
+#if defined(PADDLE_WITH_CUDA)
1081+
ret = (val >> pos) & ((1ULL << len) - 1ULL);
1082+
#else
1083+
asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
1084+
@@ -209,7 +209,7 @@ struct Bitfield<uint64_t> {
1085+
int pos,
1086+
int len) {
1087+
uint64_t ret;
1088+
-#if defined(__HIPCC__)
1089+
+#if defined(PADDLE_WITH_CUDA)
1090+
uint64_t mask = ((1ULL << len) - 1ULL) << pos;
1091+
ret = (val & ~mask) | ((to_insert << pos) & mask);
1092+
#else
1093+
@@ -223,7 +223,7 @@ struct Bitfield<uint64_t> {
1094+
1095+
// --- getLaneId / getLaneMaskLe ---
1096+
__device__ __forceinline__ int getLaneId() {
1097+
-#if defined(__HIPCC__)
1098+
+#if defined(PADDLE_WITH_CUDA)
1099+
return __lane_id();
1100+
#else
1101+
int laneId;
1102+
@@ -233,7 +233,7 @@ __device__ __forceinline__ int getLaneId() {
1103+
}
1104+
1105+
__device__ __forceinline__ unsigned getLaneMaskLe() {
1106+
-#if defined(__HIPCC__)
1107+
+#if defined(PADDLE_WITH_CUDA)
1108+
// HIP warp size is 64, construct mask for lanes <= current lane
1109+
return (getLaneId() == 63) ? 0xFFFFFFFFFFFFFFFFULL
1110+
: (1ULL << (getLaneId() + 1)) - 1ULL;
1111+
@@ -245,7 +245,7 @@ __device__ __forceinline__ unsigned getLaneMaskLe() {
1112+
}
1113+
1114+
__device__ __forceinline__ unsigned getLaneMaskLt() {
1115+
-#if defined(__HIPCC__)
1116+
+#if defined(PADDLE_WITH_CUDA)
1117+
return (getLaneId() == 0) ? 0ULL : (1ULL << getLaneId()) - 1ULL;
1118+
#else
1119+
unsigned mask;
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
diff --git a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu
2+
index 73da8a62b4..b7ec9080b1 100644
3+
--- a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu
4+
+++ b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu
5+
@@ -257,64 +257,40 @@ __device__ __forceinline__ void BlockReduce(Pair<T> shared_max[],
6+
Pair<T> topk[],
7+
Pair<T> beam_max[],
8+
int* beam,
9+
- int* k,
10+
int* count,
11+
const int tid,
12+
const int wid,
13+
const int lane) {
14+
- while (true) {
15+
- __syncthreads();
16+
- Pair<T> input_now = topk[0];
17+
- input_now = WarpReduce(input_now);
18+
+ __syncthreads();
19+
+ Pair<T> input_now = topk[0];
20+
+ input_now = WarpReduce(input_now);
21+
22+
- if (lane == 0) {
23+
- shared_max[wid] = input_now;
24+
- }
25+
- __syncthreads();
26+
- input_now = (tid < BlockSize / WARP_SIZE)
27+
- ? shared_max[lane]
28+
- : Pair<T>(std::numeric_limits<T>::min(), -1);
29+
- if (wid == 0) {
30+
- input_now = WarpReduce(input_now);
31+
- if (lane == 0) shared_max[0] = input_now;
32+
- }
33+
- __syncthreads();
34+
- if (tid == 0) {
35+
- beam_max[*count] = shared_max[0];
36+
- (*count)++;
37+
- }
38+
- int tid_max = shared_max[0].id % BlockSize;
39+
- if (tid == tid_max) {
40+
- (*beam)++;
41+
- }
42+
- if (--(*k) == 0) break;
43+
- __syncthreads();
44+
+ if (lane == 0) {
45+
+ shared_max[wid] = input_now;
46+
+ }
47+
+ __syncthreads();
48+
+ input_now = (tid < BlockSize / WARP_SIZE)
49+
+ ? shared_max[lane]
50+
+ : Pair<T>(std::numeric_limits<T>::min(), -1);
51+
+ if (wid == 0) {
52+
+ input_now = WarpReduce(input_now);
53+
+ if (lane == 0) shared_max[0] = input_now;
54+
+ }
55+
+ __syncthreads();
56+
+ if (tid == 0) {
57+
+ beam_max[*count] = shared_max[0];
58+
+ (*count)++;
59+
+ }
60+
+ int tid_max = shared_max[0].id % BlockSize;
61+
+ if (tid == tid_max) {
62+
+ (*beam)++;
63+
+ }
64+
65+
- if (tid == tid_max) {
66+
- if (*beam < MaxLength) {
67+
- topk[0] = topk[*beam];
68+
- }
69+
- }
70+
+ __syncthreads();
71+
72+
- if (MaxLength < 5) {
73+
- if (*beam >= MaxLength) break;
74+
- } else {
75+
-#ifdef PADDLE_WITH_HIP
76+
- uint64_t mask = 0u;
77+
- mask = __ballot(true);
78+
- if (tid_max / WARP_SIZE == wid) {
79+
- if (__shfl_down(*beam, tid_max % WARP_SIZE, WARP_SIZE) == MaxLength)
80+
- break;
81+
- }
82+
-#else
83+
- unsigned mask = 0u;
84+
- mask = __ballot_sync(FINAL_MASK, true);
85+
- if (tid_max / WARP_SIZE == wid) {
86+
- if (__shfl_down_sync(
87+
- FINAL_MASK, *beam, tid_max % WARP_SIZE, WARP_SIZE) == MaxLength)
88+
- break;
89+
- }
90+
-#endif
91+
+ if (tid == tid_max) {
92+
+ if (*beam < MaxLength) {
93+
+ topk[0] = topk[*beam];
94+
}
95+
}
96+
}
97+
@@ -385,7 +361,7 @@ __global__ void KeMatrixTopPBeamTopK(const T* src,
98+
topk[j].set(std::numeric_limits<T>::min(), -1);
99+
}
100+
101+
- while (top_num) {
102+
+ for (int iter = 0; iter < TopPBeamTopK; ++iter) {
103+
ThreadGetTopK<T, MaxLength, BlockSize>(topk,
104+
&beam,
105+
TopPBeamTopK,
106+
@@ -396,7 +372,7 @@ __global__ void KeMatrixTopPBeamTopK(const T* src,
107+
vocab_size,
108+
tid);
109+
BlockReduce<T, MaxLength, BlockSize>(
110+
- shared_max, topk, beam_max, &beam, &top_num, &count, tid, wid, lane);
111+
+ shared_max, topk, beam_max, &beam, &count, tid, wid, lane);
112+
}
113+
if (tid == 0) {
114+
count_iter_begin[bid] = count_iter[bid];
115+
@@ -488,18 +464,18 @@ __global__ void KeMatrixTopPBeamTopKFt(const T* src,
116+
topk[j].set(std::numeric_limits<T>::min(), -1);
117+
}
118+
119+
- while (top_num) {
120+
+ for (int iter = 0; iter < TopPBeamTopK; ++iter) {
121+
ThreadGetTopK<T, MaxLength, BlockSize>(topk,
122+
&beam,
123+
TopPBeamTopK,
124+
- src + bid * vocab_size,
125+
+ src + offset,
126+
&firststep,
127+
&is_empty,
128+
&max,
129+
vocab_size,
130+
tid);
131+
BlockReduce<T, MaxLength, BlockSize>(
132+
- shared_max, topk, beam_max, &beam, &top_num, &count, tid, wid, lane);
133+
+ shared_max, topk, beam_max, &beam, &count, tid, wid, lane);
134+
}
135+
if (tid == 0) {
136+
count_iter_begin[bid] = count_iter[bid];

0 commit comments

Comments
 (0)