@@ -100,8 +100,6 @@ static __dpct_inline__ float vec_dot_fattn_vec_KQ_q4_0(const char * __restrict__
100100 const void * __restrict__ Q_ds_v) {
101101 auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3 >();
102102
103- int idx = Q_q8[0 ];
104-
105103 const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
106104 GGML_UNUSED (Q_v);
107105
@@ -875,12 +873,14 @@ static void lauch_kernel(
875873 const int32_t nb31,
876874 const int32_t nb32,
877875 const int64_t nb33) {
876+ GGML_UNUSED (local_mem_size);
878877 q->submit ([&](sycl::handler &cgh) {
879878 cgh.parallel_for (
880879 sycl::nd_range<3 >(
881880 static_cast <sycl::range<3 >>(group_range * local_range),
882881 static_cast <sycl::range<3 >>(local_range)),
883882 [=](sycl::nd_item<3 > item_ct1) [[sycl::reqd_sub_group_size (warp_size)]] {
883+ GGML_UNUSED (item_ct1);
884884 fattn_kernel (Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
885885 max_bias, m0, m1, n_head_log2, logit_softcap, ne00,
886886 ne01, ne02, ne03, nb01, nb02, nb03, ne10, ne11,
@@ -920,7 +920,6 @@ void launch_fattn(
920920 ggml_sycl_pool & pool = ctx.pool ();
921921 dpct::queue_ptr main_stream = ctx.stream ();
922922 const int id = ggml_sycl_get_device ();
923- const int cc = ggml_sycl_info ().devices [id].cc ;
924923 const int nsm = ggml_sycl_info ().devices [id].nsm ;
925924
926925 ggml_sycl_pool_alloc<sycl::half> K_f16 (pool);
@@ -1031,6 +1030,7 @@ void launch_fattn(
10311030
10321031 cgh.parallel_for (sycl::nd_range<3 >(blocks_num_KV_max * block_dim_KV_max, block_dim_KV_max),
10331032 [=](sycl::nd_item<3 > item_ct1) {
1033+ GGML_UNUSED (item_ct1);
10341034 flash_attn_mask_to_KV_max<ncols1, warp_size>(
10351035 mask_data_ct0, KV_max_ptr_ct1, iter_k, s31, s33,
10361036 buf_iw_acc_ct1.get_multi_ptr <sycl::access::decorated::no>().get ());
@@ -1049,10 +1049,7 @@ void launch_fattn(
10491049 if (stream_k) {
10501050 // For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
10511051 const int max_blocks = max_blocks_per_sm*nsm;
1052- const int tiles_nwaves = (ntiles_total + max_blocks - 1 ) / max_blocks;
1053-
10541052 const int nblocks_stream_k = max_blocks;
1055-
10561053 const bool use_stream_k = true ;
10571054
10581055 blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total;
@@ -1151,6 +1148,7 @@ void launch_fattn(
11511148
11521149 cgh.parallel_for (sycl::nd_range<3 >(blocks_num_combine * block_dim_combine, block_dim_combine),
11531150 [=](sycl::nd_item<3 > item_ct1) {
1151+ GGML_UNUSED (item_ct1);
11541152 flash_attn_stream_k_fixup<DV, ncols1, ncols2>(KQV_data_ct0, dst_tmp_meta_ptr_ct1,
11551153 Q_ne_ct2, Q_ne_ct3, Q_ne_ct4,
11561154 K_ne_ct5, K_ne_ct6, nbatch_fa);
@@ -1170,6 +1168,7 @@ void launch_fattn(
11701168
11711169 cgh.parallel_for (sycl::nd_range<3 >(blocks_num_combine * block_dim_combine, block_dim_combine),
11721170 [=](sycl::nd_item<3 > item_ct1) {
1171+ GGML_UNUSED (item_ct1);
11731172 flash_attn_combine_results<DV>(
11741173 dst_tmp_ptr_ct0, dst_tmp_meta_ptr_ct1, KQV_data_ct2, parallel_blocks,
11751174 dpct_local_acc_ct1.get_multi_ptr <sycl::access::decorated::no>().get ());
0 commit comments