Skip to content

Commit 0734097

Browse files
committed
Switch to a more efficient sequence with conditional moves
1 parent fbb652f commit 0734097

1 file changed

Lines changed: 9 additions & 8 deletions

File tree

samples/99_matrixexperimentsi8/matrix_kernels_i8.cl

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -610,16 +610,17 @@ kernel void i8_dpas_blockread_rowmajor_TN_m4_n16(global int* C, global char* A,
610610
intel_sub_group_2d_block_read_transpose_32b_32r1x1c(A, M * sizeof(char), K, M * sizeof(char), (int2)(m / 4, k), (uint*)&readData);
611611

612612
// Note: after the transpose block read:
613-
// readData.s0 contains row 0-15
614-
// readData.s1 contains row 16-31
613+
// readData.s0 contains rows 0-15
614+
// readData.s1 contains rows 16-31
615615
// So, WI0 has rows 0 and 16, WI1 has rows 1 and 17, etc.
616616
// We want WI0 to have rows 0 and 1, WI1 to have rows 2 and 3, etc.
617-
int shuffledData0 = (sglid < 8) ?
618-
sub_group_shuffle(readData.s0, (sglid * 2)) :
619-
sub_group_shuffle(readData.s1, (sglid * 2) % 16);
620-
int shuffledData1 = (sglid < 8) ?
621-
sub_group_shuffle(readData.s0, (sglid * 2) + 1) :
622-
sub_group_shuffle(readData.s1, (sglid * 2) % 16 + 1);
617+
int shuffleIndex = sglid * 2 % 16;
618+
int loData0 = sub_group_shuffle(readData.s0, shuffleIndex);
619+
int hiData0 = sub_group_shuffle(readData.s1, shuffleIndex);
620+
int shuffledData0 = (sglid < 8) ? loData0 : hiData0;
621+
int loData1 = sub_group_shuffle(readData.s0, shuffleIndex + 1);
622+
int hiData1 = sub_group_shuffle(readData.s1, shuffleIndex + 1);
623+
int shuffledData1 = (sglid < 8) ? loData1 : hiData1;
623624

624625
short4 aData;
625626
aData.s0 = as_short((char2)(as_char4(shuffledData0).s0, as_char4(shuffledData1).s0));

0 commit comments

Comments
 (0)