@@ -610,16 +610,17 @@ kernel void i8_dpas_blockread_rowmajor_TN_m4_n16(global int* C, global char* A,
610610 intel_sub_group_2d_block_read_transpose_32b_32r1x1c (A , M * sizeof (char ), K , M * sizeof (char ), (int2 )(m / 4 , k ), (uint * )& readData );
611611
612612 // Note: after the transpose block read:
613- // readData.s0 contains row 0-15
614- // readData.s1 contains row 16-31
613+ // readData.s0 contains rows 0-15
614+ // readData.s1 contains rows 16-31
615615 // So, WI0 has rows 0 and 16, WI1 has rows 1 and 17, etc.
616616 // We want WI0 to have rows 0 and 1, WI1 to have rows 2 and 3, etc.
617- int shuffledData0 = (sglid < 8 ) ?
618- sub_group_shuffle (readData .s0 , (sglid * 2 )) :
619- sub_group_shuffle (readData .s1 , (sglid * 2 ) % 16 );
620- int shuffledData1 = (sglid < 8 ) ?
621- sub_group_shuffle (readData .s0 , (sglid * 2 ) + 1 ) :
622- sub_group_shuffle (readData .s1 , (sglid * 2 ) % 16 + 1 );
617+ int shuffleIndex = sglid * 2 % 16 ;
618+ int loData0 = sub_group_shuffle (readData .s0 , shuffleIndex );
619+ int hiData0 = sub_group_shuffle (readData .s1 , shuffleIndex );
620+ int shuffledData0 = (sglid < 8 ) ? loData0 : hiData0 ;
621+ int loData1 = sub_group_shuffle (readData .s0 , shuffleIndex + 1 );
622+ int hiData1 = sub_group_shuffle (readData .s1 , shuffleIndex + 1 );
623+ int shuffledData1 = (sglid < 8 ) ? loData1 : hiData1 ;
623624
624625 short4 aData ;
625626 aData .s0 = as_short ((char2 )(as_char4 (shuffledData0 ).s0 , as_char4 (shuffledData1 ).s0 ));
0 commit comments