Skip to content

Commit 2e16dda

Browse files
committed
rm warning
1 parent 42f8358 commit 2e16dda

7 files changed

Lines changed: 19 additions & 18 deletions

File tree

docs/backend/SYCL.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -724,6 +724,10 @@ use 1 SYCL GPUs: [0] with Max compute units:512
724724

725725
- `Split-mode:[row]` is not supported.
726726

727+
- AOT (Ahead-of-Time) compilation is skipped during the build.
728+
- Good: the build is faster and the binary file is smaller.
729+
- Bad: startup is slow the first time (kernels are compiled just-in-time), but subsequent performance is unaffected.
730+
727731
## Q&A
728732

729733
- Error: `error while loading shared libraries: libsycl.so: cannot open shared object file: No such file or directory`.

ggml/src/ggml-sycl/common.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -870,6 +870,7 @@ static __dpct_inline__ uint32_t fastmodulo(uint32_t n, const sycl::uint3 fastdiv
870870
}
871871

872872
// Whether fast FP16 arithmetic is available on the device with compute
// capability `cc`. In this SYCL backend the answer is unconditionally true,
// so `cc` is intentionally unused (GGML_UNUSED silences the -Wunused-parameter
// warning this commit removes).
static bool fast_fp16_available(const int cc) {
873+
    GGML_UNUSED(cc);
873874
    return true; // Intel GPUs always support FP16.
874875
}
875876

ggml/src/ggml-sycl/convert.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,7 @@ static void dequantize_block_nc_sycl(const void * vx,
535535
stream->parallel_for(sycl::nd_range<3>(num_blocks * sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
536536
sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
537537
[=](sycl::nd_item<3> item_ct1) {
538+
GGML_UNUSED(item_ct1);
538539
dequantize_block_nc<qk, qr, dequantize_kernel>(vx, y, ne00, ne01, ne02, s01, s02, s03);
539540
});
540541
}

ggml/src/ggml-sycl/fattn-common.hpp

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,6 @@ static __dpct_inline__ float vec_dot_fattn_vec_KQ_q4_0(const char * __restrict__
100100
const void * __restrict__ Q_ds_v) {
101101
auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
102102

103-
int idx = Q_q8[0];
104-
105103
const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
106104
GGML_UNUSED(Q_v);
107105

@@ -875,12 +873,14 @@ static void lauch_kernel(
875873
const int32_t nb31,
876874
const int32_t nb32,
877875
const int64_t nb33) {
876+
GGML_UNUSED(local_mem_size);
878877
q->submit([&](sycl::handler &cgh) {
879878
cgh.parallel_for(
880879
sycl::nd_range<3>(
881880
static_cast<sycl::range<3>>(group_range * local_range),
882881
static_cast<sycl::range<3>>(local_range)),
883882
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(warp_size)]] {
883+
GGML_UNUSED(item_ct1);
884884
fattn_kernel(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
885885
max_bias, m0, m1, n_head_log2, logit_softcap, ne00,
886886
ne01, ne02, ne03, nb01, nb02, nb03, ne10, ne11,
@@ -920,7 +920,6 @@ void launch_fattn(
920920
ggml_sycl_pool & pool = ctx.pool();
921921
dpct::queue_ptr main_stream = ctx.stream();
922922
const int id = ggml_sycl_get_device();
923-
const int cc = ggml_sycl_info().devices[id].cc;
924923
const int nsm = ggml_sycl_info().devices[id].nsm;
925924

926925
ggml_sycl_pool_alloc<sycl::half> K_f16(pool);
@@ -1031,6 +1030,7 @@ void launch_fattn(
10311030

10321031
cgh.parallel_for(sycl::nd_range<3>(blocks_num_KV_max * block_dim_KV_max, block_dim_KV_max),
10331032
[=](sycl::nd_item<3> item_ct1) {
1033+
GGML_UNUSED(item_ct1);
10341034
flash_attn_mask_to_KV_max<ncols1, warp_size>(
10351035
mask_data_ct0, KV_max_ptr_ct1, iter_k, s31, s33,
10361036
buf_iw_acc_ct1.get_multi_ptr<sycl::access::decorated::no>().get());
@@ -1049,10 +1049,7 @@ void launch_fattn(
10491049
if (stream_k) {
10501050
// For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
10511051
const int max_blocks = max_blocks_per_sm*nsm;
1052-
const int tiles_nwaves = (ntiles_total + max_blocks - 1) / max_blocks;
1053-
10541052
const int nblocks_stream_k = max_blocks;
1055-
10561053
const bool use_stream_k = true;
10571054

10581055
blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total;
@@ -1151,6 +1148,7 @@ void launch_fattn(
11511148

11521149
cgh.parallel_for(sycl::nd_range<3>(blocks_num_combine * block_dim_combine, block_dim_combine),
11531150
[=](sycl::nd_item<3> item_ct1) {
1151+
GGML_UNUSED(item_ct1);
11541152
flash_attn_stream_k_fixup<DV, ncols1, ncols2>(KQV_data_ct0, dst_tmp_meta_ptr_ct1,
11551153
Q_ne_ct2, Q_ne_ct3, Q_ne_ct4,
11561154
K_ne_ct5, K_ne_ct6, nbatch_fa);
@@ -1170,6 +1168,7 @@ void launch_fattn(
11701168

11711169
cgh.parallel_for(sycl::nd_range<3>(blocks_num_combine * block_dim_combine, block_dim_combine),
11721170
[=](sycl::nd_item<3> item_ct1) {
1171+
GGML_UNUSED(item_ct1);
11731172
flash_attn_combine_results<DV>(
11741173
dst_tmp_ptr_ct0, dst_tmp_meta_ptr_ct1, KQV_data_ct2, parallel_blocks,
11751174
dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>().get());

ggml/src/ggml-sycl/fattn-tile.hpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1185,9 +1185,7 @@ static void flash_attn_tile(const char * Q,
11851185

11861186
template <int DKQ, int DV, int ncols2, bool use_logit_softcap>
11871187
static void launch_fattn_tile_switch_ncols1(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1188-
11891188
const ggml_tensor * Q = dst->src[0];
1190-
const ggml_tensor * K = dst->src[1];
11911189

11921190
const int id = ggml_sycl_get_device();
11931191
const int cc = ggml_sycl_info().devices[id].cc;

ggml/src/ggml-sycl/fattn-vec.hpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ static void flash_attn_ext_vec(const char* __restrict__ Q,
201201
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += warp_size) {
202202
const int i = i0 + item_ct1.get_local_id(2);
203203

204-
if (i0 + warp_size <= D/sizeof(int) || i < D/sizeof(int)) {
204+
if (i0 + warp_size <= int(D/sizeof(int)) || i < int(D/sizeof(int))) {
205205
tmp_q_i32[i] = 0;
206206
}
207207
}
@@ -328,9 +328,10 @@ static void flash_attn_ext_vec(const char* __restrict__ Q,
328328

329329
KQ_max_new[j] = sycl::fmax((float) KQ_max_new[j], sum);
330330

331-
if ((nthreads_KQ == warp_size ? item_ct1.get_local_id(2) : item_ct1.get_local_id(2) % nthreads_KQ) ==
332-
i_KQ_0) {
333-
KQ_reg[j] = sum;
331+
if (int(nthreads_KQ == warp_size ? item_ct1.get_local_id(2)
332+
: item_ct1.get_local_id(2) %
333+
nthreads_KQ) == i_KQ_0) {
334+
KQ_reg[j] = sum;
334335
}
335336
}
336337
}

ggml/src/ggml-sycl/fattn.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -94,15 +94,14 @@ static void ggml_sycl_flash_attn_ext_vec(ggml_backend_sycl_context & ctx, ggml_t
9494
// Best FlashAttention kernel for a specific GPU:
9595
enum best_fattn_kernel {
9696
BEST_FATTN_KERNEL_NONE = 0,
97-
BEST_FATTN_KERNEL_TILE = 200,
9897
BEST_FATTN_KERNEL_VEC = 100,
99-
BEST_FATTN_KERNEL_WMMA_F16 = 300,
100-
BEST_FATTN_KERNEL_MMA_F16 = 400,
98+
BEST_FATTN_KERNEL_TILE = 200,
10199
};
102100

103101
static best_fattn_kernel ggml_sycl_get_best_fattn_kernel(const int device, const ggml_tensor * dst) {
102+
GGML_UNUSED(device);
104103
#ifndef SYCL_FLASH_ATTN
105-
GGML_UNUSED(device); GGML_UNUSED(dst);
104+
GGML_UNUSED(dst);
106105
return BEST_FATTN_KERNEL_NONE;
107106
#endif// SYCL_FLASH_ATTN
108107

@@ -133,8 +132,6 @@ static best_fattn_kernel ggml_sycl_get_best_fattn_kernel(const int device, const
133132
}
134133
}
135134

136-
const int cc = ggml_sycl_info().devices[device].cc;
137-
138135
switch (K->ne[0]) {
139136
case 40:
140137
case 64:

0 commit comments

Comments
 (0)