diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_bf16.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_bf16.cpp index 0217eb37416a..25779a82a65b 100644 --- a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_bf16.cpp +++ b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_bf16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_half.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_half.cpp index 08c7251d0b1a..8a526b281225 100644 --- a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_half.cpp +++ b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_half.cpp @@ -5,12 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 +// REQUIRES: matrix-xmx8,gpu // Only runs on DPAS because AMX implementation does not support half data type // yet // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8.cpp index 8462dff81568..2fa63de99a8f 100644 --- a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8.cpp +++ b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8_packed.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8_packed.cpp index 32f5e0138cd3..464d946ef143 100644 --- a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8_packed.cpp +++ b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8_packed.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: gpu diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_irreg_sum_rows.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_irreg_sum_rows.cpp index df7a479e06fa..7332417f8655 100644 --- a/SYCL/Matrix/Legacy/XMX8/element_wise_irreg_sum_rows.cpp +++ b/SYCL/Matrix/Legacy/XMX8/element_wise_irreg_sum_rows.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // this code calculates the sum of rows into a global array of number of rows // elements. First, partial reduction is computed inside each SG, then atomic diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_ops.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_ops.cpp index c64a99fee59c..baf2963867f0 100644 --- a/SYCL/Matrix/Legacy/XMX8/element_wise_ops.cpp +++ b/SYCL/Matrix/Legacy/XMX8/element_wise_ops.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bf16.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bf16.cpp index ee084fd400ce..170154dc1021 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bf16.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bf16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16.cpp index a616e9a0b0d6..1acaa3befff5 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16_32x64.cpp index 8e7b70194125..d55974705360 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16_32x64.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16_32x64.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: * diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_half.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_half.cpp index 3c2736ddd149..556ac61a5622 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_half.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_half.cpp @@ -5,11 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 +// REQUIRES: matrix-xmx8,gpu // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // Only run on the GPU because half is not supported on AMX hardware // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_int8_vnni.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_int8_vnni.cpp index 272f22e554e9..8f96fdb3fd19 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_int8_vnni.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_int8_vnni.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: * diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_ss_int8.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_ss_int8.cpp index fc9536ff7868..0d10afea9b03 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_ss_int8.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_ss_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_su_int8.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_su_int8.cpp index d1ab5e003cd1..476606b1dd68 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_su_int8.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_su_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_us_int8.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_us_int8.cpp index b07ad6ec286f..b3bc8d7c68e4 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_us_int8.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_us_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_uu_int8.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_uu_int8.cpp index a14b259b4ce7..1a9e6bdcdab1 100644 --- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_uu_int8.cpp +++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_uu_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/element_wise_all_ops_bf16.cpp b/SYCL/Matrix/Legacy/element_wise_all_ops_bf16.cpp index a81f0e255ffe..b67c04d519d5 100644 --- a/SYCL/Matrix/Legacy/element_wise_all_ops_bf16.cpp +++ b/SYCL/Matrix/Legacy/element_wise_all_ops_bf16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/element_wise_all_ops_half.cpp b/SYCL/Matrix/Legacy/element_wise_all_ops_half.cpp index bd8becff4c05..79f2a72341d9 100644 --- a/SYCL/Matrix/Legacy/element_wise_all_ops_half.cpp +++ b/SYCL/Matrix/Legacy/element_wise_all_ops_half.cpp @@ -5,12 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: matrix,gpu // Only runs on DPAS because AMX implementation does not support half data type // yet // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/element_wise_all_ops_int8.cpp b/SYCL/Matrix/Legacy/element_wise_all_ops_int8.cpp index 49a16d3964c3..e7f82611fd4d 100644 --- a/SYCL/Matrix/Legacy/element_wise_all_ops_int8.cpp +++ b/SYCL/Matrix/Legacy/element_wise_all_ops_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/element_wise_all_ops_int8_packed.cpp b/SYCL/Matrix/Legacy/element_wise_all_ops_int8_packed.cpp index d3e38f638b06..ba97bdcb60b0 100644 --- a/SYCL/Matrix/Legacy/element_wise_all_ops_int8_packed.cpp +++ b/SYCL/Matrix/Legacy/element_wise_all_ops_int8_packed.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: gpu diff --git a/SYCL/Matrix/Legacy/element_wise_irreg_sum_rows.cpp b/SYCL/Matrix/Legacy/element_wise_irreg_sum_rows.cpp index 2b7895ea5b59..269d1f75920e 100644 --- a/SYCL/Matrix/Legacy/element_wise_irreg_sum_rows.cpp +++ b/SYCL/Matrix/Legacy/element_wise_irreg_sum_rows.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // this code calculates the sum of rows into a global array of number of rows // elements. First, partial reduction is computed inside each SG, then atomic diff --git a/SYCL/Matrix/Legacy/element_wise_ops.cpp b/SYCL/Matrix/Legacy/element_wise_ops.cpp index d9a407e131c2..a432215beeb5 100644 --- a/SYCL/Matrix/Legacy/element_wise_ops.cpp +++ b/SYCL/Matrix/Legacy/element_wise_ops.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/joint_matrix_bf16.cpp b/SYCL/Matrix/Legacy/joint_matrix_bf16.cpp index b02e8cfc0722..97d6fd22b407 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_bf16.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_bf16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/joint_matrix_bfloat16.cpp b/SYCL/Matrix/Legacy/joint_matrix_bfloat16.cpp index fd926ca1c6da..6402fb72fa7e 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_bfloat16.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_bfloat16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_32x64.cpp index ea94bfcae16b..65f8b84b629d 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_32x64.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_32x64.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: * diff --git a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 627406446363..aa7d80dba0b6 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. This is currently only available on AMX diff --git a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp index ec64f32cfa6c..65a8fe1e234b 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This tests support of row major layout for matrix B which does automatic VNNI // transform. This is currently only available on AMX diff --git a/SYCL/Matrix/Legacy/joint_matrix_half.cpp b/SYCL/Matrix/Legacy/joint_matrix_half.cpp index d88a9f0b1b46..19f6a021cec9 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_half.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_half.cpp @@ -5,11 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: matrix,gpu // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // Only run on the GPU because half is not supported on AMX hardware // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/joint_matrix_int8_colmajorA_colmajorB.cpp b/SYCL/Matrix/Legacy/joint_matrix_int8_colmajorA_colmajorB.cpp index bccb0d23d97d..9b0db278dabb 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. This is currently only available on AMX diff --git a/SYCL/Matrix/Legacy/joint_matrix_int8_vnni.cpp b/SYCL/Matrix/Legacy/joint_matrix_int8_vnni.cpp index c16d3ad726be..525ef4fe51a6 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_int8_vnni.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_int8_vnni.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: gpu diff --git a/SYCL/Matrix/Legacy/joint_matrix_ss_int8.cpp b/SYCL/Matrix/Legacy/joint_matrix_ss_int8.cpp index afdfd28feb90..4ddb42871357 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_ss_int8.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_ss_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/joint_matrix_su_int8.cpp b/SYCL/Matrix/Legacy/joint_matrix_su_int8.cpp index 7c12200762ef..b7edf91804be 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_su_int8.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_su_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/joint_matrix_us_int8.cpp b/SYCL/Matrix/Legacy/joint_matrix_us_int8.cpp index 935606cbe672..6e999a6b6233 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_us_int8.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_us_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/Legacy/joint_matrix_uu_int8.cpp b/SYCL/Matrix/Legacy/joint_matrix_uu_int8.cpp index 054f3aaae564..2a7f289df036 100644 --- a/SYCL/Matrix/Legacy/joint_matrix_uu_int8.cpp +++ b/SYCL/Matrix/Legacy/joint_matrix_uu_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/element_wise_all_ops_bf16.cpp b/SYCL/Matrix/XMX8/element_wise_all_ops_bf16.cpp index b12923536a03..a65e36554f93 100644 --- a/SYCL/Matrix/XMX8/element_wise_all_ops_bf16.cpp +++ b/SYCL/Matrix/XMX8/element_wise_all_ops_bf16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/element_wise_all_ops_half.cpp b/SYCL/Matrix/XMX8/element_wise_all_ops_half.cpp index 62fd63cc88ad..bbb1b8fab1c8 100644 --- a/SYCL/Matrix/XMX8/element_wise_all_ops_half.cpp +++ b/SYCL/Matrix/XMX8/element_wise_all_ops_half.cpp @@ -5,12 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 +// REQUIRES: matrix-xmx8,gpu // Only runs on DPAS because AMX implementation does not support half data type // yet // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/element_wise_all_ops_int8.cpp b/SYCL/Matrix/XMX8/element_wise_all_ops_int8.cpp index 2605e89e30cd..ad85e6abad63 100644 --- a/SYCL/Matrix/XMX8/element_wise_all_ops_int8.cpp +++ b/SYCL/Matrix/XMX8/element_wise_all_ops_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp b/SYCL/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp index 7124332923a3..286884bf4387 100644 --- a/SYCL/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp +++ b/SYCL/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This test stores the matrix B that is VNNIed (packed) in a row major fashion. // This is expected to fail on the GPU because the implementation does not diff --git a/SYCL/Matrix/XMX8/element_wise_irreg_sum_rows.cpp b/SYCL/Matrix/XMX8/element_wise_irreg_sum_rows.cpp index cc3dd78a63b5..d89af2d7acc2 100644 --- a/SYCL/Matrix/XMX8/element_wise_irreg_sum_rows.cpp +++ b/SYCL/Matrix/XMX8/element_wise_irreg_sum_rows.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // this code calculates the sum of rows into a global array of number of rows // elements. First, partial reduction is computed inside each SG, then atomic diff --git a/SYCL/Matrix/XMX8/element_wise_ops.cpp b/SYCL/Matrix/XMX8/element_wise_ops.cpp index 1d7b64e406ba..b1acf17705d2 100644 --- a/SYCL/Matrix/XMX8/element_wise_ops.cpp +++ b/SYCL/Matrix/XMX8/element_wise_ops.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/joint_matrix_bfloat16.cpp b/SYCL/Matrix/XMX8/joint_matrix_bfloat16.cpp index e1f67c435ec2..e2d4f82a6a67 100644 --- a/SYCL/Matrix/XMX8/joint_matrix_bfloat16.cpp +++ b/SYCL/Matrix/XMX8/joint_matrix_bfloat16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp index e7c1b42dd7ab..7dfcd2efc43b 100644 --- a/SYCL/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp +++ b/SYCL/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: * diff --git a/SYCL/Matrix/XMX8/joint_matrix_half.cpp b/SYCL/Matrix/XMX8/joint_matrix_half.cpp index 355fef88e23f..c84fc0d2030e 100644 --- a/SYCL/Matrix/XMX8/joint_matrix_half.cpp +++ b/SYCL/Matrix/XMX8/joint_matrix_half.cpp @@ -5,11 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 +// REQUIRES: matrix-xmx8,gpu // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // Only run on the GPU because half is not supported on AMX hardware // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/joint_matrix_int8_vnni.cpp b/SYCL/Matrix/XMX8/joint_matrix_int8_vnni.cpp index 0af6a21b85c5..2b69e13785b6 100644 --- a/SYCL/Matrix/XMX8/joint_matrix_int8_vnni.cpp +++ b/SYCL/Matrix/XMX8/joint_matrix_int8_vnni.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: * diff --git a/SYCL/Matrix/XMX8/joint_matrix_ss_int8.cpp b/SYCL/Matrix/XMX8/joint_matrix_ss_int8.cpp index 86d7f753080a..212799fa0713 100644 --- a/SYCL/Matrix/XMX8/joint_matrix_ss_int8.cpp +++ b/SYCL/Matrix/XMX8/joint_matrix_ss_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/joint_matrix_su_int8.cpp b/SYCL/Matrix/XMX8/joint_matrix_su_int8.cpp index 252a647f5d9a..8c6a15445fef 100644 --- a/SYCL/Matrix/XMX8/joint_matrix_su_int8.cpp +++ b/SYCL/Matrix/XMX8/joint_matrix_su_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/joint_matrix_us_int8.cpp b/SYCL/Matrix/XMX8/joint_matrix_us_int8.cpp index e74e7ad46bd4..124cc97f530c 100644 --- a/SYCL/Matrix/XMX8/joint_matrix_us_int8.cpp +++ b/SYCL/Matrix/XMX8/joint_matrix_us_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/XMX8/joint_matrix_uu_int8.cpp b/SYCL/Matrix/XMX8/joint_matrix_uu_int8.cpp index 06934de225ad..84dc77f931a6 100644 --- a/SYCL/Matrix/XMX8/joint_matrix_uu_int8.cpp +++ b/SYCL/Matrix/XMX8/joint_matrix_uu_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/element_wise_all_ops_bf16.cpp b/SYCL/Matrix/element_wise_all_ops_bf16.cpp index b871f13e567b..439c069652ed 100644 --- a/SYCL/Matrix/element_wise_all_ops_bf16.cpp +++ b/SYCL/Matrix/element_wise_all_ops_bf16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/element_wise_all_ops_half.cpp b/SYCL/Matrix/element_wise_all_ops_half.cpp index e860180c03f4..331920e9ea1b 100644 --- a/SYCL/Matrix/element_wise_all_ops_half.cpp +++ b/SYCL/Matrix/element_wise_all_ops_half.cpp @@ -5,12 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: matrix,gpu // Only runs on DPAS because AMX implementation does not support half data type // yet // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/element_wise_all_ops_int8.cpp b/SYCL/Matrix/element_wise_all_ops_int8.cpp index adcee2a750ef..45f2ef4bda97 100644 --- a/SYCL/Matrix/element_wise_all_ops_int8.cpp +++ b/SYCL/Matrix/element_wise_all_ops_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/element_wise_all_ops_int8_packed.cpp b/SYCL/Matrix/element_wise_all_ops_int8_packed.cpp index 6008079449c2..f33cc2cc125f 100644 --- a/SYCL/Matrix/element_wise_all_ops_int8_packed.cpp +++ b/SYCL/Matrix/element_wise_all_ops_int8_packed.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This test stores the matrix B that is VNNIed (packed) in a row major fashion. // This is expected to fail on the GPU because the implementation does not diff --git a/SYCL/Matrix/element_wise_irreg_sum_rows.cpp b/SYCL/Matrix/element_wise_irreg_sum_rows.cpp index 76e24de5c6ba..7aebe7e031bc 100644 --- a/SYCL/Matrix/element_wise_irreg_sum_rows.cpp +++ b/SYCL/Matrix/element_wise_irreg_sum_rows.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This code calculates the sum of rows into a global array of number of rows // elements. First, partial reduction is computed inside each SG, then atomic diff --git a/SYCL/Matrix/element_wise_ops.cpp b/SYCL/Matrix/element_wise_ops.cpp index c3b949fd9f30..d686ff4ec6d9 100644 --- a/SYCL/Matrix/element_wise_ops.cpp +++ b/SYCL/Matrix/element_wise_ops.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/joint_matrix_bfloat16.cpp b/SYCL/Matrix/joint_matrix_bfloat16.cpp index 77f1be403d73..b8008a8c9848 100644 --- a/SYCL/Matrix/joint_matrix_bfloat16.cpp +++ b/SYCL/Matrix/joint_matrix_bfloat16.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp index 5c955ec42259..4bc00841720a 100644 --- a/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp +++ b/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: * diff --git a/SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 7f6b5c9e4f97..97cf4c97d27d 100644 --- a/SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. This is currently only available on AMX diff --git a/SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp b/SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp index ae2bd6b99b58..064fbfd6cf20 100644 --- a/SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp +++ b/SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This tests support of row major layout for matrix B which does automatic VNNI // transform. This is currently only available on AMX diff --git a/SYCL/Matrix/joint_matrix_half.cpp b/SYCL/Matrix/joint_matrix_half.cpp index 1d131a64a881..720920e1cb82 100644 --- a/SYCL/Matrix/joint_matrix_half.cpp +++ b/SYCL/Matrix/joint_matrix_half.cpp @@ -5,11 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: matrix,gpu // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // Only run on the GPU because half is not supported on AMX hardware // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/SYCL/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp index c572e53c3452..1859b57f8bc1 100644 --- a/SYCL/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/SYCL/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // This tests support of col major layout for matrix B which does transpose and // then VNNI transform. This is currently only available on AMX diff --git a/SYCL/Matrix/joint_matrix_int8_vnni.cpp b/SYCL/Matrix/joint_matrix_int8_vnni.cpp index f8ae1a8cf776..7c77be898444 100644 --- a/SYCL/Matrix/joint_matrix_int8_vnni.cpp +++ b/SYCL/Matrix/joint_matrix_int8_vnni.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out // XFAIL: gpu diff --git a/SYCL/Matrix/joint_matrix_ss_int8.cpp b/SYCL/Matrix/joint_matrix_ss_int8.cpp index 860e590357dd..dac9ea719b68 100644 --- a/SYCL/Matrix/joint_matrix_ss_int8.cpp +++ b/SYCL/Matrix/joint_matrix_ss_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/joint_matrix_su_int8.cpp b/SYCL/Matrix/joint_matrix_su_int8.cpp index bd89977fc345..83de8f45d097 100644 --- a/SYCL/Matrix/joint_matrix_su_int8.cpp +++ b/SYCL/Matrix/joint_matrix_su_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/joint_matrix_us_int8.cpp b/SYCL/Matrix/joint_matrix_us_int8.cpp index 0690636c5914..c4cd631fdaad 100644 --- a/SYCL/Matrix/joint_matrix_us_int8.cpp +++ b/SYCL/Matrix/joint_matrix_us_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include diff --git a/SYCL/Matrix/joint_matrix_uu_int8.cpp b/SYCL/Matrix/joint_matrix_uu_int8.cpp index 42f2ff8fe674..33a5372c33f9 100644 --- a/SYCL/Matrix/joint_matrix_uu_int8.cpp +++ b/SYCL/Matrix/joint_matrix_uu_int8.cpp @@ -10,6 +10,8 @@ // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out #include #include