@@ -220,6 +220,10 @@ kernel void kernel_convert_block_q4_0_trans4_ns(
220220 uint i01 = get_global_id (0 );
221221 uint i02 = get_global_id (2 );
222222
223+ if (i01 >= ne01 ) {
224+ return ;
225+ }
226+
223227 uint ne00_blk = ne00 / QK4_0 ;
224228 uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01 ;
225229 uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01 ;
@@ -263,6 +267,10 @@ kernel void kernel_restore_block_q4_0_trans4_ns(
263267 uint i01 = get_global_id (0 );
264268 uint i02 = get_global_id (2 );
265269
270+ if (i01 >= ne01 ) {
271+ return ;
272+ }
273+
266274 uint ne00_blk = ne00 / QK4_0 ;
267275 uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01 ;
268276 uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01 ;
@@ -401,6 +409,10 @@ kernel void kernel_convert_block_q4_1_trans4_ns(
401409 uint i01 = get_global_id (0 );
402410 uint i02 = get_global_id (2 );
403411
412+ if (i01 >= ne01 ) {
413+ return ;
414+ }
415+
404416 uint ne00_blk = ne00 / QK4_1 ;
405417 uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01 ;
406418 uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01 ;
@@ -446,6 +458,10 @@ kernel void kernel_restore_block_q4_1_trans4_ns(
446458 uint i01 = get_global_id (0 );
447459 uint i02 = get_global_id (2 );
448460
461+ if (i01 >= ne01 ) {
462+ return ;
463+ }
464+
449465 uint ne00_blk = ne00 / QK4_1 ;
450466 uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01 ;
451467 uint src_dm_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01 ;
@@ -491,6 +507,10 @@ kernel void kernel_convert_block_q5_0_trans4_ns(
491507 uint i01 = get_global_id (0 );
492508 uint i02 = get_global_id (2 );
493509
510+ if (i01 >= ne01 ) {
511+ return ;
512+ }
513+
494514 uint ne00_blk = ne00 / QK5_0 ;
495515 uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01 ;
496516 uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01 ;
@@ -536,6 +556,10 @@ kernel void kernel_restore_block_q5_0_trans4_ns(
536556 uint i01 = get_global_id (0 );
537557 uint i02 = get_global_id (2 );
538558
559+ if (i01 >= ne01 ) {
560+ return ;
561+ }
562+
539563 uint ne00_blk = ne00 / QK5_0 ;
540564 uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01 ;
541565 uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01 ;
@@ -583,6 +607,10 @@ kernel void kernel_convert_block_q5_1_trans4_ns(
583607 uint i01 = get_global_id (0 );
584608 uint i02 = get_global_id (2 );
585609
610+ if (i01 >= ne01 ) {
611+ return ;
612+ }
613+
586614 uint ne00_blk = ne00 / QK5_1 ;
587615 uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01 ;
588616 uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01 ;
@@ -630,6 +658,10 @@ kernel void kernel_restore_block_q5_1_trans4_ns(
630658 uint i01 = get_global_id (0 );
631659 uint i02 = get_global_id (2 );
632660
661+ if (i01 >= ne01 ) {
662+ return ;
663+ }
664+
633665 uint ne00_blk = ne00 / QK5_1 ;
634666 uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01 ;
635667 uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01 ;
@@ -679,6 +711,10 @@ kernel void kernel_convert_block_q4_k_trans4_ns(
679711 uint i01 = get_global_id (0 );
680712 uint i02 = get_global_id (2 );
681713
714+ if (i01 >= ne01 ) {
715+ return ;
716+ }
717+
682718 uint ne00_blk = ne00 / QK_K ;
683719 uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01 ;
684720 uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01 ;
@@ -732,6 +768,10 @@ kernel void kernel_restore_block_q4_k_trans4_ns(
732768 uint i01 = get_global_id (0 ); // row index
733769 uint i02 = get_global_id (2 ); // batch index
734770
771+ if (i01 >= ne01 ) {
772+ return ;
773+ }
774+
735775 uint ne00_blk = ne00 / QK_K ;
736776
737777 uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01 ;
@@ -784,6 +824,10 @@ kernel void kernel_convert_block_q5_k_trans4_ns(
784824 uint i01 = get_global_id (0 );
785825 uint i02 = get_global_id (2 );
786826
827+ if (i01 >= ne01 ) {
828+ return ;
829+ }
830+
787831 uint ne00_blk = ne00 / QK_K ;
788832 uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01 ;
789833 uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01 ;
@@ -850,6 +894,10 @@ kernel void kernel_restore_block_q5_k_trans4_ns(
850894 uint i01 = get_global_id (0 ); // row index
851895 uint i02 = get_global_id (2 ); // batch index
852896
897+ if (i01 >= ne01 ) {
898+ return ;
899+ }
900+
853901 uint ne00_blk = ne00 / QK_K ;
854902
855903 uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01 ;
@@ -916,6 +964,10 @@ kernel void kernel_convert_block_q6_k_trans4_ns(
916964 uint i01 = get_global_id (0 );
917965 uint i02 = get_global_id (2 );
918966
967+ if (i01 >= ne01 ) {
968+ return ;
969+ }
970+
919971 uint ne00_blk = ne00 / QK_K ;
920972
921973 uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01 ;
@@ -993,6 +1045,10 @@ kernel void kernel_restore_block_q6_k_trans4_ns(
9931045 uint i01 = get_global_id (0 ); // row index
9941046 uint i02 = get_global_id (2 ); // batch index
9951047
1048+ if (i01 >= ne01 ) {
1049+ return ;
1050+ }
1051+
9961052 uint ne00_blk = ne00 / QK_K ;
9971053
9981054 uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01 ;
@@ -1147,6 +1203,10 @@ kernel void kernel_convert_block_mxfp4_trans4_ns(
11471203 uint i01 = get_global_id (0 );
11481204 uint i02 = get_global_id (2 );
11491205
1206+ if (i01 >= ne01 ) {
1207+ return ;
1208+ }
1209+
11501210 uint ne00_blk = ne00 / QK_MXFP4 ;
11511211 uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01 ;
11521212 uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01 ;
@@ -1190,6 +1250,10 @@ kernel void kernel_restore_block_mxfp4_trans4_ns(
11901250 uint i01 = get_global_id (0 );
11911251 uint i02 = get_global_id (2 );
11921252
1253+ if (i01 >= ne01 ) {
1254+ return ;
1255+ }
1256+
11931257 uint ne00_blk = ne00 / QK_MXFP4 ;
11941258 uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01 ;
11951259 uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01 ;
0 commit comments