Skip to content

Commit 9e58d4d

Browse files
aendkORippler
andauthored
Avoid PDL race conditions by disabling __restrict__ when PDL is used (ggml-org#24030)
* Removes __restrict__ from PDL kernel headers due to incompatibility with PDL. Adds preprocessor directives based on arch in kernel body to add __restrict__ to retain performance on older architectures. * Simplifies new __restrict__ usage via macro * Add hopper to PDL __restrict__ fix. Co-authored-by: Oliver Simons <osimons@nvidia.com> --------- Co-authored-by: Oliver Simons <osimons@nvidia.com>
1 parent 3571fa5 commit 9e58d4d

14 files changed

Lines changed: 145 additions & 62 deletions

ggml/src/ggml-cuda/common.cuh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1611,6 +1611,12 @@ static bool ggml_cuda_kernel_can_use_pdl(const void * kernel) {
16111611

16121612
#endif //defined(GGML_CUDA_USE_PDL)
16131613

1614+
// PDL and __restrict__ need to be mutually exclusive, see https://github.com/ggml-org/llama.cpp/pull/24030
1615+
# if (defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER)
1616+
# define GGML_CUDA_RESTRICT
1617+
# else
1618+
# define GGML_CUDA_RESTRICT __restrict__
1619+
# endif // defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
16141620

16151621
template<typename Kernel, typename... Args>
16161622
static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_kernel_launch_params & launch_params, Args&&... args) {

ggml/src/ggml-cuda/fattn-common.cuh

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -718,8 +718,8 @@ static __global__ void flash_attn_mask_to_KV_max(
718718
template<int D, int ncols1, int ncols2> // D == head size
719719
__launch_bounds__(D, 1)
720720
static __global__ void flash_attn_stream_k_fixup_uniform(
721-
float * __restrict__ dst,
722-
const float2 * __restrict__ dst_fixup,
721+
float * dst_ptr,
722+
const float2 * dst_fixup_ptr,
723723
const int ne01, const int ne02,
724724
const int ne12, const int nblocks_stream_k,
725725
const int gqa_ratio,
@@ -729,6 +729,8 @@ static __global__ void flash_attn_stream_k_fixup_uniform(
729729
const uint3 fd_iter_j) {
730730
constexpr int ncols = ncols1*ncols2;
731731
ggml_cuda_pdl_lc();
732+
float * GGML_CUDA_RESTRICT dst = dst_ptr;
733+
const float2 * GGML_CUDA_RESTRICT dst_fixup = dst_fixup_ptr;
732734

733735
const int tile_idx = blockIdx.x; // One block per output tile.
734736
const int j = blockIdx.y;
@@ -800,15 +802,17 @@ static __global__ void flash_attn_stream_k_fixup_uniform(
800802
template <int D, int ncols1, int ncols2> // D == head size
801803
__launch_bounds__(D, 1)
802804
static __global__ void flash_attn_stream_k_fixup_general(
803-
float * __restrict__ dst,
804-
const float2 * __restrict__ dst_fixup,
805+
float * dst_ptr,
806+
const float2 * dst_fixup_ptr,
805807
const int ne01, const int ne02,
806808
const int gqa_ratio,
807809
const int total_work,
808810
const uint3 fd_iter_k_j_z_ne12,
809811
const uint3 fd_iter_k_j_z,
810812
const uint3 fd_iter_k_j,
811813
const uint3 fd_iter_k) {
814+
float * GGML_CUDA_RESTRICT dst = dst_ptr;
815+
const float2 * GGML_CUDA_RESTRICT dst_fixup = dst_fixup_ptr;
812816
constexpr int ncols = ncols1*ncols2;
813817

814818
const int bidx0 = blockIdx.x;
@@ -907,11 +911,14 @@ static __global__ void flash_attn_stream_k_fixup_general(
907911
template<int D> // D == head size
908912
__launch_bounds__(D, 1)
909913
static __global__ void flash_attn_combine_results(
910-
const float * __restrict__ VKQ_parts,
911-
const float2 * __restrict__ VKQ_meta,
912-
float * __restrict__ dst,
914+
const float * VKQ_parts_ptr,
915+
const float2 * VKQ_meta_ptr,
916+
float * dst_ptr,
913917
const int parallel_blocks) {
914918
ggml_cuda_pdl_lc();
919+
const float * GGML_CUDA_RESTRICT VKQ_parts = VKQ_parts_ptr;
920+
const float2 * GGML_CUDA_RESTRICT VKQ_meta = VKQ_meta_ptr;
921+
float * GGML_CUDA_RESTRICT dst = dst_ptr;
915922
// Dimension 0: threadIdx.x
916923
// Dimension 1: blockIdx.x
917924
// Dimension 2: blockIdx.y
@@ -1196,8 +1203,8 @@ void launch_fattn(
11961203

11971204
GGML_ASSERT(block_dim.x % warp_size == 0);
11981205

1199-
// disabled PDL enrollment for now due to a compiler bug.
1200-
fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>(
1206+
ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(blocks_num, block_dim, nbytes_shared, main_stream);
1207+
ggml_cuda_kernel_launch(fattn_kernel, launch_params,
12011208
(const char *) Q->data,
12021209
K_data,
12031210
V_data,

ggml/src/ggml-cuda/fattn-mma-f16.cuh

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1703,14 +1703,14 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
17031703
template<int DKQ, int DV, int ncols1, int ncols2, bool use_logit_softcap, bool V_is_K_view>
17041704
__launch_bounds__(ggml_cuda_fattn_mma_get_nthreads(DKQ, DV, ncols1*ncols2), ggml_cuda_fattn_mma_get_occupancy(DKQ, DV, ncols1*ncols2))
17051705
static __global__ void flash_attn_ext_f16(
1706-
const char * __restrict__ Q,
1707-
const char * __restrict__ K,
1708-
const char * __restrict__ V,
1709-
const char * __restrict__ mask,
1710-
const char * __restrict__ sinks,
1711-
const int * __restrict__ KV_max,
1712-
float * __restrict__ dst,
1713-
float2 * __restrict__ dst_meta,
1706+
const char * Q_ptr,
1707+
const char * K_ptr,
1708+
const char * V_ptr,
1709+
const char * mask_ptr,
1710+
const char * sinks_ptr,
1711+
const int * KV_max_ptr,
1712+
float * dst_ptr,
1713+
float2 * dst_meta_ptr,
17141714
const float scale,
17151715
const float max_bias,
17161716
const float m0,
@@ -1726,6 +1726,14 @@ static __global__ void flash_attn_ext_f16(
17261726
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
17271727
ggml_cuda_pdl_sync(); // TODO optimize placement
17281728
#if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE))
1729+
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
1730+
const char * GGML_CUDA_RESTRICT K = K_ptr;
1731+
const char * GGML_CUDA_RESTRICT V = V_ptr;
1732+
const char * GGML_CUDA_RESTRICT mask = mask_ptr;
1733+
const char * GGML_CUDA_RESTRICT sinks = sinks_ptr;
1734+
const int * GGML_CUDA_RESTRICT KV_max = KV_max_ptr;
1735+
float * GGML_CUDA_RESTRICT dst = dst_ptr;
1736+
float2 * GGML_CUDA_RESTRICT dst_meta = dst_meta_ptr;
17291737

17301738
// Skip unused kernel variants for faster compilation:
17311739
if (use_logit_softcap && !(DKQ == 128 || DKQ == 256 || DKQ == 512)) {
@@ -1871,7 +1879,7 @@ static __global__ void flash_attn_ext_f16(
18711879
(Q_f2, K_h2, V_h2, mask_h, sinks_f, dstk, dst_meta, scale, slope, logit_softcap,
18721880
ne01, ne02, gqa_ratio, ne11, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, zt_gqa, kb0_start, kb0_stop);
18731881
#else
1874-
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
1882+
GGML_UNUSED_VARS(Q_ptr, K_ptr, V_ptr, mask_ptr, sinks_ptr, KV_max_ptr, dst_ptr, dst_meta_ptr, scale,
18751883
max_bias, m0, m1, n_head_log2, logit_softcap,
18761884
ne00, ne01, ne02, ne03,
18771885
nb01, nb02, nb03,

ggml/src/ggml-cuda/fattn-tile.cuh

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -788,14 +788,14 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
788788
template<int DKQ, int DV, int ncols1, int ncols2, bool use_logit_softcap> // D == head size
789789
__launch_bounds__(ggml_cuda_fattn_tile_get_nthreads(DKQ, DV, ncols1*ncols2), ggml_cuda_fattn_tile_get_occupancy(DKQ, DV, ncols1*ncols2))
790790
static __global__ void flash_attn_tile(
791-
const char * __restrict__ Q,
792-
const char * __restrict__ K,
793-
const char * __restrict__ V,
794-
const char * __restrict__ mask,
795-
const char * __restrict__ sinks,
796-
const int * __restrict__ KV_max,
797-
float * __restrict__ dst,
798-
float2 * __restrict__ dst_meta,
791+
const char * Q_ptr,
792+
const char * K_ptr,
793+
const char * V_ptr,
794+
const char * mask_ptr,
795+
const char * sinks_ptr,
796+
const int * KV_max_ptr,
797+
float * dst_ptr,
798+
float2 * dst_meta_ptr,
799799
const float scale,
800800
const float max_bias,
801801
const float m0,
@@ -810,6 +810,14 @@ static __global__ void flash_attn_tile(
810810
const int32_t ne31, const int32_t ne32, const int32_t ne33,
811811
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
812812
#ifdef FLASH_ATTN_AVAILABLE
813+
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
814+
const char * GGML_CUDA_RESTRICT K = K_ptr;
815+
const char * GGML_CUDA_RESTRICT V = V_ptr;
816+
const char * GGML_CUDA_RESTRICT mask = mask_ptr;
817+
const char * GGML_CUDA_RESTRICT sinks = sinks_ptr;
818+
const int * GGML_CUDA_RESTRICT KV_max = KV_max_ptr;
819+
float * GGML_CUDA_RESTRICT dst = dst_ptr;
820+
float2 * GGML_CUDA_RESTRICT dst_meta = dst_meta_ptr;
813821

814822
// Skip unused kernel variants for faster compilation:
815823

@@ -1126,7 +1134,7 @@ static __global__ void flash_attn_tile(
11261134
}
11271135
}
11281136
#else
1129-
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
1137+
GGML_UNUSED_VARS(Q_ptr, K_ptr, V_ptr, mask_ptr, sinks_ptr, KV_max_ptr, dst_ptr, dst_meta_ptr, scale,
11301138
max_bias, m0, m1, n_head_log2, logit_softcap,
11311139
ne00, ne01, ne02, ne03,
11321140
nb01, nb02, nb03,

ggml/src/ggml-cuda/fattn-vec.cuh

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,14 @@ static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() {
1919
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
2020
__launch_bounds__(ggml_cuda_fattn_vec_get_nthreads_device(), 1)
2121
static __global__ void flash_attn_ext_vec(
22-
const char * __restrict__ Q,
23-
const char * __restrict__ K,
24-
const char * __restrict__ V,
25-
const char * __restrict__ mask,
26-
const char * __restrict__ sinks,
27-
const int * __restrict__ KV_max,
28-
float * __restrict__ dst,
29-
float2 * __restrict__ dst_meta,
22+
const char * Q_ptr,
23+
const char * K_ptr,
24+
const char * V_ptr,
25+
const char * mask_ptr,
26+
const char * sinks_ptr,
27+
const int * KV_max_ptr,
28+
float * dst_ptr,
29+
float2 * dst_meta_ptr,
3030
const float scale,
3131
const float max_bias,
3232
const float m0,
@@ -42,6 +42,14 @@ static __global__ void flash_attn_ext_vec(
4242
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
4343
ggml_cuda_pdl_lc();
4444
#ifdef FLASH_ATTN_AVAILABLE
45+
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
46+
const char * GGML_CUDA_RESTRICT K = K_ptr;
47+
const char * GGML_CUDA_RESTRICT V = V_ptr;
48+
const char * GGML_CUDA_RESTRICT mask = mask_ptr;
49+
const char * GGML_CUDA_RESTRICT sinks = sinks_ptr;
50+
const int * GGML_CUDA_RESTRICT KV_max = KV_max_ptr;
51+
float * GGML_CUDA_RESTRICT dst = dst_ptr;
52+
float2 * GGML_CUDA_RESTRICT dst_meta = dst_meta_ptr;
4553

4654
// Skip unused kernel variants for faster compilation:
4755
if (use_logit_softcap && !(D == 128 || D == 256)) {
@@ -506,7 +514,7 @@ static __global__ void flash_attn_ext_vec(
506514
dst_meta[((sequence*int(ne01.z) + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(KQ_max[tid], KQ_sum[tid]);
507515
}
508516
#else
509-
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
517+
GGML_UNUSED_VARS(Q_ptr, K_ptr, V_ptr, mask_ptr, sinks_ptr, KV_max_ptr, dst_ptr, dst_meta_ptr, scale,
510518
max_bias, m0, m1, n_head_log2, logit_softcap,
511519
ne00, ne01, ne02, ne03,
512520
nb01, nb02, nb03,

ggml/src/ggml-cuda/fattn-wmma-f16.cu

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,14 @@ namespace wmma = rocwmma;
2424
template<int D, int ncols, int nwarps, int VKQ_stride, typename KQ_acc_t, bool use_logit_softcap>
2525
__launch_bounds__(nwarps*ggml_cuda_get_physical_warp_size(), 1)
2626
static __global__ void flash_attn_ext_f16(
27-
const char * __restrict__ Q,
28-
const char * __restrict__ K,
29-
const char * __restrict__ V,
30-
const char * __restrict__ mask,
31-
const char * __restrict__ sinks,
32-
const int * __restrict__ KV_max,
33-
float * __restrict__ dst,
34-
float2 * __restrict__ dst_meta,
27+
const char * Q_ptr,
28+
const char * K_ptr,
29+
const char * V_ptr,
30+
const char * mask_ptr,
31+
const char * sinks_ptr,
32+
const int * KV_max_ptr,
33+
float * dst_ptr,
34+
float2 * dst_meta_ptr,
3535
const float scale,
3636
const float max_bias,
3737
const float m0,
@@ -46,6 +46,14 @@ static __global__ void flash_attn_ext_f16(
4646
const int32_t ne31, const int32_t ne32, const int32_t ne33,
4747
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
4848
#if defined(FLASH_ATTN_AVAILABLE) && (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN))
49+
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
50+
const char * GGML_CUDA_RESTRICT K = K_ptr;
51+
const char * GGML_CUDA_RESTRICT V = V_ptr;
52+
const char * GGML_CUDA_RESTRICT mask = mask_ptr;
53+
const char * GGML_CUDA_RESTRICT sinks = sinks_ptr;
54+
const int * GGML_CUDA_RESTRICT KV_max = KV_max_ptr;
55+
float * GGML_CUDA_RESTRICT dst = dst_ptr;
56+
float2 * GGML_CUDA_RESTRICT dst_meta = dst_meta_ptr;
4957
// Skip unused kernel variants for faster compilation:
5058
if (use_logit_softcap && !(D == 128 || D == 256)) {
5159
NO_DEVICE_CODE;
@@ -494,7 +502,7 @@ static __global__ void flash_attn_ext_f16(
494502
dst_meta[j_dst_unrolled] = dst_meta_val;
495503
}
496504
#else
497-
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
505+
GGML_UNUSED_VARS(Q_ptr, K_ptr, V_ptr, mask_ptr, sinks_ptr, KV_max_ptr, dst_ptr, dst_meta_ptr, scale,
498506
max_bias, m0, m1, n_head_log2, logit_softcap,
499507
ne00, ne01, ne02, ne03,
500508
nb01, nb02, nb03,

ggml/src/ggml-cuda/getrows.cu

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,14 +42,17 @@ static __global__ void k_get_rows(
4242

4343
template<typename src0_t, typename dst_t>
4444
static __global__ void k_get_rows_float(
45-
const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
45+
const src0_t * src0_ptr, const int32_t * src1_ptr, dst_t * dst_ptr,
4646
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
4747
/*const int64_t ne10,*/ const int64_t ne11, const uint3 ne12_fdv, /*const int64_t ne13,*/
4848
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
4949
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
5050
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
5151

5252
ggml_cuda_pdl_lc();
53+
const src0_t * GGML_CUDA_RESTRICT src0 = src0_ptr;
54+
const int32_t * GGML_CUDA_RESTRICT src1 = src1_ptr;
55+
dst_t * GGML_CUDA_RESTRICT dst = dst_ptr;
5356
ggml_cuda_pdl_sync();
5457
for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
5558
for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) {

ggml/src/ggml-cuda/mmvf.cu

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,15 @@
66

77
template <typename T, typename type_acc, int ncols_dst, int block_size, bool has_fusion = false, bool is_multi_token_id = false>
88
static __global__ void mul_mat_vec_f(
9-
const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
9+
const T * x_ptr, const float * y_ptr, const int32_t * ids_ptr, const ggml_cuda_mm_fusion_args_device fusion, float * dst_ptr,
1010
const int ncols2, const uint3 nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst,
1111
const uint3 channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
1212
const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
1313
const int ids_stride) {
14+
const T * GGML_CUDA_RESTRICT x = x_ptr;
15+
const float * GGML_CUDA_RESTRICT y = y_ptr;
16+
const int32_t * GGML_CUDA_RESTRICT ids = ids_ptr;
17+
float * GGML_CUDA_RESTRICT dst = dst_ptr;
1418
const int row = blockIdx.x;
1519
// for MUL_MAT_ID - blockIdx.y = n_expert_used, blockIdx.z = ncols_dst (tokens)
1620
const int channel_dst = blockIdx.y;

ggml/src/ggml-cuda/mmvq.cu

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -476,12 +476,16 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int
476476
template <ggml_type type, int ncols_dst, bool has_fusion, bool small_k = false>
477477
__launch_bounds__(calc_nwarps(type, ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
478478
static __global__ void mul_mat_vec_q(
479-
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
479+
const void * vx_ptr, const void * vy_ptr, const int32_t * ids_ptr, const ggml_cuda_mm_fusion_args_device fusion, float * dst_ptr,
480480
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
481481
const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x,
482482
const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio,
483483
const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst,
484484
const uint32_t ids_stride) {
485+
const void * GGML_CUDA_RESTRICT vx = vx_ptr;
486+
const void * GGML_CUDA_RESTRICT vy = vy_ptr;
487+
const int32_t * GGML_CUDA_RESTRICT ids = ids_ptr;
488+
float * GGML_CUDA_RESTRICT dst = dst_ptr;
485489

486490
constexpr int qk = ggml_cuda_type_traits<type>::qk;
487491
constexpr int qi = ggml_cuda_type_traits<type>::qi;

ggml/src/ggml-cuda/quantize.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@
33

44
__launch_bounds__(CUDA_QUANTIZE_BLOCK_SIZE, 1)
55
static __global__ void quantize_q8_1(
6-
const float * __restrict__ x, void * __restrict__ vy,
6+
const float * x_ptr, void * vy_ptr,
77
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
88
const int64_t ne0, const uint32_t ne1, const uint3 ne2) {
99
ggml_cuda_pdl_lc();
10+
const float * GGML_CUDA_RESTRICT x = x_ptr;
11+
void * GGML_CUDA_RESTRICT vy = vy_ptr;
1012
const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
1113

1214
if (i0 >= ne0) {

0 commit comments

Comments
 (0)