Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions paddle/phi/kernels/funcs/fast_ln_v1.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_v1_fwd_kernel(
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
if (col < cols) {
phi::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
phi::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
} else {
gamma[it] = Vec_scale{};
beta[it] = Vec_scale{};
Expand All @@ -80,7 +80,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_v1_fwd_kernel(
#pragma unroll
for (int it = 0, col = c; it < LDGS; it++) {
if (col < cols) {
phi::Load<T, VecSize>(
Load<T, VecSize>(
x_ptr + static_cast<int64_t>(row) * ELTS_PER_ROW + col * VecSize,
&x[it]);
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -280,16 +280,16 @@ __global__ void DequantKernel(T* output,
AlignedVector<T, VecSize> out_vec;

for (; idx < numel; idx += stride) {
phi::Load<int32_t, VecSize>(input + idx, &in_vec);
phi::Load<float, VecSize>(dequant_out_scale_data + col_id, &out_scale_vec);
Load<int32_t, VecSize>(input + idx, &in_vec);
Load<float, VecSize>(dequant_out_scale_data + col_id, &out_scale_vec);

#pragma unroll
for (int i = 0; i < VecSize; ++i) {
out_vec[i] =
static_cast<T>(static_cast<float>(in_vec[i]) * out_scale_vec[i]);
}

phi::Store<T, VecSize>(out_vec, output + idx);
Store<T, VecSize>(out_vec, output + idx);
}
}

Expand Down
50 changes: 25 additions & 25 deletions paddle/phi/kernels/fusion/gpu/fused_attention_grad_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -167,13 +167,13 @@ void FusedAttentionGradKernel(

const bool is_upscale_in_train =
(dropout_implementation == "upscale_in_train");
phi::fusion::DropoutParam dropout_param2(dropout_fix_seed,
0,
is_test,
is_upscale_in_train,
dropout_rate,
nullptr,
dropout_seed);
fusion::DropoutParam dropout_param2(dropout_fix_seed,
0,
is_test,
is_upscale_in_train,
dropout_rate,
nullptr,
dropout_seed);
const bool has_dropout = (dropout_param2.dropout_prob != 0.0f);

bool is_upscale_in_train_1 =
Expand Down Expand Up @@ -324,31 +324,31 @@ void FusedAttentionGradKernel(
bool transB = transpose_qkv_wb ? false : true;
bool compute_qkv_bias = qkv_bias_p ? true : false;
auto layer_norm_compute =
phi::fusion::AttnLayerNorm<T>(dev_ctx, epsilon, bsz_seq, dim_embed);
auto qkv_compute = phi::fusion::AttnMatMul<T>(dev_ctx,
transA,
transB,
bsz_seq,
output_size,
input_size,
compute_qkv_bias);
phi::fusion::AttnDropoutParam attn_dropout_param(is_test,
attn_dropout_implementation,
attn_dropout_rate,
is_upscale_in_train_1,
attn_dropout_fix_seed,
attn_dropout_seed,
seed_1);
auto fmha_ref_compute = phi::fusion::FMHARef<T>(
fusion::AttnLayerNorm<T>(dev_ctx, epsilon, bsz_seq, dim_embed);
auto qkv_compute = fusion::AttnMatMul<T>(dev_ctx,
transA,
transB,
bsz_seq,
output_size,
input_size,
compute_qkv_bias);
fusion::AttnDropoutParam attn_dropout_param(is_test,
attn_dropout_implementation,
attn_dropout_rate,
is_upscale_in_train_1,
attn_dropout_fix_seed,
attn_dropout_seed,
seed_1);
auto fmha_ref_compute = fusion::FMHARef<T>(
dev_ctx, batch_size, max_seq_len, num_head, dim_head, attn_dropout_param);
output_size = hidden_size;
transA = false;
transB = false;
bool compute_bias = false;
// (b*s, num_head * dim_head) * (num_head * dim_head, dim_embed)
auto out_linear_compute = phi::fusion::AttnMatMul<T>(
auto out_linear_compute = fusion::AttnMatMul<T>(
dev_ctx, transA, transB, bsz_seq, input_size, output_size, compute_bias);
phi::fusion::FusedDropoutLayerNormHelper<T, uint8_t>
fusion::FusedDropoutLayerNormHelper<T, uint8_t>
fused_dropout_layernorm_helper(
dev_ctx, bsz_seq, dim_embed, dropout_param2, ln_epsilon);

Expand Down
38 changes: 19 additions & 19 deletions paddle/phi/kernels/fusion/gpu/fused_attention_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -139,13 +139,13 @@ void FusedAttentionKernel(const Context &dev_ctx,

const bool is_upscale_in_train =
(dropout_implementation == "upscale_in_train");
phi::fusion::DropoutParam dropout_param2(dropout_fix_seed,
0,
is_test,
is_upscale_in_train,
dropout_rate,
nullptr,
dropout_seed);
fusion::DropoutParam dropout_param2(dropout_fix_seed,
0,
is_test,
is_upscale_in_train,
dropout_rate,
nullptr,
dropout_seed);

const bool has_dropout = (dropout_param2.dropout_prob != 0.0f);

Expand Down Expand Up @@ -240,25 +240,25 @@ void FusedAttentionKernel(const Context &dev_ctx,
int input_size = dim_embed;

auto layer_norm_compute =
phi::fusion::AttnLayerNorm<T>(dev_ctx, epsilon, bsz_seq, dim_embed);
fusion::AttnLayerNorm<T>(dev_ctx, epsilon, bsz_seq, dim_embed);

bool compute_bias = true;
if (qkv_bias_p == nullptr) {
compute_bias = false;
}
// (transA, transB, compute_bias) = (false, true, true)
bool transB = transpose_qkv_wb ? false : true;
auto qkv_compute = phi::fusion::AttnMatMul<T>(
auto qkv_compute = fusion::AttnMatMul<T>(
dev_ctx, false, transB, bsz_seq, output_size, input_size, compute_bias);

phi::fusion::AttnDropoutParam attn_dropout_param(is_test,
attn_dropout_implementation,
attn_dropout_rate,
is_upscale_in_train_1,
attn_dropout_fix_seed,
attn_dropout_seed,
seed_1);
auto fmha_ref_compute = phi::fusion::FMHARef<T>(
fusion::AttnDropoutParam attn_dropout_param(is_test,
attn_dropout_implementation,
attn_dropout_rate,
is_upscale_in_train_1,
attn_dropout_fix_seed,
attn_dropout_seed,
seed_1);
auto fmha_ref_compute = fusion::FMHARef<T>(
dev_ctx, batch_size, max_seq_len, num_head, dim_head, attn_dropout_param);

output_size = hidden_size;
Expand All @@ -268,9 +268,9 @@ void FusedAttentionKernel(const Context &dev_ctx,
// which is actually the input size. While the input size is hidden size,
// which is actually the output size. So for out linear, switch the
// input size and output size.
auto out_linear_compute = phi::fusion::AttnMatMul<T>(
auto out_linear_compute = fusion::AttnMatMul<T>(
dev_ctx, false, false, bsz_seq, input_size, output_size, false);
phi::fusion::FusedDropoutLayerNormHelper<T, uint8_t>
fusion::FusedDropoutLayerNormHelper<T, uint8_t>
fused_dropout_layernorm_helper(
dev_ctx, bsz_seq, dim_embed, dropout_param2, ln_epsilon);

Expand Down
6 changes: 3 additions & 3 deletions paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ __global__ void ActFFNGlu(const T *bias,
load_func.template load<VecSize>(&src_vec2, index + hid_dim);

if (bias) {
phi::Load<T, VecSize>(&bias[idx], &bias_vec1);
phi::Load<T, VecSize>(&bias[idx + hid_dim], &bias_vec2);
Load<T, VecSize>(&bias[idx], &bias_vec1);
Load<T, VecSize>(&bias[idx + hid_dim], &bias_vec2);
}
#pragma unroll
for (int j = 0; j < VecSize; j++) {
Expand Down Expand Up @@ -134,7 +134,7 @@ __global__ void BiasAct(const T *bias,
int64_t linear_idx = row_idx * cols + col_idx;
load_func.template load<VecSize>(&src_vec, linear_idx);
if (bias) {
phi::Load<T, VecSize>(&bias[col_idx], &bias_vec);
Load<T, VecSize>(&bias[col_idx], &bias_vec);
}
#pragma unroll
for (int j = 0; j < VecSize; j++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,15 +102,15 @@ void FusedBiasDropoutResidualLnGradKernel(
bsz_seq *= input_x_dims[i];
}
int64_t dim_embed = input_x_dims[input_x_dims.size() - 1];
phi::fusion::DropoutParam dropout_param(
fusion::DropoutParam dropout_param(
dropout_fix_seed,
0,
is_test,
dropout_implementation == "upscale_in_train",
dropout_rate,
nullptr,
dropout_seed);
phi::fusion::FusedDropoutLayerNormHelper<T, uint8_t>
fusion::FusedDropoutLayerNormHelper<T, uint8_t>
fused_dropout_layernorm_helper(
dev_ctx, bsz_seq, dim_embed, dropout_param, ln_epsilon);
fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,15 +68,15 @@ void FusedBiasDropoutResidualLnKernel(const Context& dev_ctx,
bsz_seq *= input_x_dims[i];
}
int dim_embed = input_x_dims[input_x_dims.size() - 1];
phi::fusion::DropoutParam dropout_param(
fusion::DropoutParam dropout_param(
dropout_fix_seed,
0,
is_test,
dropout_implementation == "upscale_in_train",
dropout_rate,
nullptr,
dropout_seed);
phi::fusion::FusedDropoutLayerNormHelper<T, uint8_t>
fusion::FusedDropoutLayerNormHelper<T, uint8_t>
fused_dropout_layernorm_helper(
dev_ctx, bsz_seq, dim_embed, dropout_param, ln_epsilon);
// output = layernorm(residual + dropout(input + bias))
Expand Down
28 changes: 14 additions & 14 deletions paddle/phi/kernels/fusion/gpu/fused_dropout_act_bias.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,11 +162,11 @@ __global__ void FusedActBias(Functor act,
idx < elem_cnt;
idx += step) {
const int32_t col_idx = idx % cols;
phi::Load<InType, VecSize>(&src[idx], &src_vec);
phi::Load<float, VecSize>(&dequant_out_scale_data[col_idx],
&dequant_out_scale_vec);
Load<InType, VecSize>(&src[idx], &src_vec);
Load<float, VecSize>(&dequant_out_scale_data[col_idx],
&dequant_out_scale_vec);
if (bias) {
phi::Load<T, VecSize>(&bias[col_idx], &bias_vec);
Load<T, VecSize>(&bias[col_idx], &bias_vec);
}
#pragma unroll
for (int32_t unroll_idx = 0; unroll_idx < VecSize; unroll_idx++) {
Expand Down Expand Up @@ -194,7 +194,7 @@ __global__ void FusedActBias(Functor act,
}
}
}
phi::Store<OutType, VecSize>(out_vec, &dst[idx]);
Store<OutType, VecSize>(out_vec, &dst[idx]);
}
}

Expand Down Expand Up @@ -322,17 +322,17 @@ __global__ void FusedDropoutActGrad(Functor act_grad,
LoadT src_vec;
MaskLoadT mask_vec;

phi::Load<T, VecSize>(&dout[i], &dout_vec);
phi::Load<MaskType, VecSize>(&mask[i], &mask_vec);
phi::Load<T, VecSize>(&src[i], &src_vec);
Load<T, VecSize>(&dout[i], &dout_vec);
Load<MaskType, VecSize>(&mask[i], &mask_vec);
Load<T, VecSize>(&src[i], &src_vec);

StoreT dx_vec;
#pragma unroll
for (int ii = 0; ii < VecSize; ii++) {
T tmp = dout_vec[ii] * static_cast<T>(mask_vec[ii]) * factor;
dx_vec[ii] = tmp * act_grad.UseOut(src_vec[ii]);
}
phi::Store<T, VecSize>(dx_vec, &dx[i]);
Store<T, VecSize>(dx_vec, &dx[i]);
}
}

Expand Down Expand Up @@ -376,10 +376,10 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void FusedDropoutActBiasGrad(
LoadT bias_vec;
MaskLoadT mask_vec;

phi::Load<T, VecSize>(&dout[index], &dout_vec);
phi::Load<T, VecSize>(&src[index], &src_vec);
phi::Load<MaskType, VecSize>(&mask[index], &mask_vec);
phi::Load<T, VecSize>(&bias[col_id * VecSize], &bias_vec);
Load<T, VecSize>(&dout[index], &dout_vec);
Load<T, VecSize>(&src[index], &src_vec);
Load<MaskType, VecSize>(&mask[index], &mask_vec);
Load<T, VecSize>(&bias[col_id * VecSize], &bias_vec);

StoreT dx_vec;
#pragma unroll
Expand All @@ -390,7 +390,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void FusedDropoutActBiasGrad(
dx_vec[i] = val;
tmp_sum[i] += val;
}
phi::Store<T, VecSize>(dx_vec, &dx[index]);
Store<T, VecSize>(dx_vec, &dx[index]);
}
}

Expand Down
38 changes: 19 additions & 19 deletions paddle/phi/kernels/fusion/gpu/fused_feedforward_grad_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -77,19 +77,19 @@ void FFNGrad(const GPUContext& dev_ctx,
const int bsz_seq,
const int d_model,
const int dim_feedforward,
const phi::fusion::DropoutParam& dropout_param1,
const phi::fusion::DropoutParam& dropout_param2,
const fusion::DropoutParam& dropout_param1,
const fusion::DropoutParam& dropout_param2,
const std::string& act_method,
const bool pre_layer_norm,
const float epsilon1,
const float epsilon2,
const bool add_residual,
const int ring_id) {
phi::fusion::FusedDropoutLayerNormHelper<T, uint8_t> pre_layernorm_helper(
fusion::FusedDropoutLayerNormHelper<T, uint8_t> pre_layernorm_helper(
bsz_seq, d_model, epsilon1);
phi::fusion::FusedDropoutHelper<T, uint8_t> fused_act_dropout_helper(
fusion::FusedDropoutHelper<T, uint8_t> fused_act_dropout_helper(
dev_ctx, bsz_seq, dim_feedforward, dropout_param1);
phi::fusion::FusedDropoutLayerNormHelper<T, uint8_t>
fusion::FusedDropoutLayerNormHelper<T, uint8_t>
fused_dropout_layernorm_helper(
dev_ctx, bsz_seq, d_model, dropout_param2, epsilon2);

Expand Down Expand Up @@ -283,20 +283,20 @@ void FusedFeedForwardGradKernel(const Context& dev_ctx,
bool is_upscale_in_train1 = dropout1_implementation == "upscale_in_train";
bool is_upscale_in_train2 = dropout2_implementation == "upscale_in_train";

phi::fusion::DropoutParam dropout_param1(dropout1_fix_seed,
0,
is_test,
is_upscale_in_train1,
dropout1_prob,
nullptr,
dropout1_seed_val);
phi::fusion::DropoutParam dropout_param2(dropout2_fix_seed,
0,
is_test,
is_upscale_in_train2,
dropout2_prob,
nullptr,
dropout2_seed_val);
fusion::DropoutParam dropout_param1(dropout1_fix_seed,
0,
is_test,
is_upscale_in_train1,
dropout1_prob,
nullptr,
dropout1_seed_val);
fusion::DropoutParam dropout_param2(dropout2_fix_seed,
0,
is_test,
is_upscale_in_train2,
dropout2_prob,
nullptr,
dropout2_seed_val);

dev_ctx.template Alloc<T>(d_x, d_x->numel() * sizeof(T));
if (d_ln1_scale) {
Expand Down
Loading
Loading