diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_bf16.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_bf16.cpp
index 0217eb37416a..25779a82a65b 100644
--- a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_bf16.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_bf16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_half.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_half.cpp
index 08c7251d0b1a..8a526b281225 100644
--- a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_half.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_half.cpp
@@ -5,12 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
+// REQUIRES: matrix-xmx8,gpu
 
 // Only runs on DPAS because AMX implementation does not support half data type
 // yet
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8.cpp
index 8462dff81568..2fa63de99a8f 100644
--- a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8_packed.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8_packed.cpp
index 32f5e0138cd3..464d946ef143 100644
--- a/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8_packed.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/element_wise_all_ops_int8_packed.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: gpu
 
diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_irreg_sum_rows.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_irreg_sum_rows.cpp
index df7a479e06fa..7332417f8655 100644
--- a/SYCL/Matrix/Legacy/XMX8/element_wise_irreg_sum_rows.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/element_wise_irreg_sum_rows.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // this code calculates the sum of rows into a global array of number of rows
 // elements. First, partial reduction is computed inside each SG, then atomic
diff --git a/SYCL/Matrix/Legacy/XMX8/element_wise_ops.cpp b/SYCL/Matrix/Legacy/XMX8/element_wise_ops.cpp
index c64a99fee59c..baf2963867f0 100644
--- a/SYCL/Matrix/Legacy/XMX8/element_wise_ops.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/element_wise_ops.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bf16.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bf16.cpp
index ee084fd400ce..170154dc1021 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bf16.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bf16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16.cpp
index a616e9a0b0d6..1acaa3befff5 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16_32x64.cpp
index 8e7b70194125..d55974705360 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16_32x64.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_bfloat16_32x64.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: *
 
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_half.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_half.cpp
index 3c2736ddd149..556ac61a5622 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_half.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_half.cpp
@@ -5,11 +5,13 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
+// REQUIRES: matrix-xmx8,gpu
 
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // Only run on the GPU because half is not supported on AMX hardware
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_int8_vnni.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_int8_vnni.cpp
index 272f22e554e9..8f96fdb3fd19 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_int8_vnni.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_int8_vnni.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: *
 
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_ss_int8.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_ss_int8.cpp
index fc9536ff7868..0d10afea9b03 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_ss_int8.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_ss_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_su_int8.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_su_int8.cpp
index d1ab5e003cd1..476606b1dd68 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_su_int8.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_su_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_us_int8.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_us_int8.cpp
index b07ad6ec286f..b3bc8d7c68e4 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_us_int8.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_us_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/XMX8/joint_matrix_uu_int8.cpp b/SYCL/Matrix/Legacy/XMX8/joint_matrix_uu_int8.cpp
index a14b259b4ce7..1a9e6bdcdab1 100644
--- a/SYCL/Matrix/Legacy/XMX8/joint_matrix_uu_int8.cpp
+++ b/SYCL/Matrix/Legacy/XMX8/joint_matrix_uu_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/element_wise_all_ops_bf16.cpp b/SYCL/Matrix/Legacy/element_wise_all_ops_bf16.cpp
index a81f0e255ffe..b67c04d519d5 100644
--- a/SYCL/Matrix/Legacy/element_wise_all_ops_bf16.cpp
+++ b/SYCL/Matrix/Legacy/element_wise_all_ops_bf16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/Legacy/element_wise_all_ops_half.cpp b/SYCL/Matrix/Legacy/element_wise_all_ops_half.cpp
index bd8becff4c05..79f2a72341d9 100644
--- a/SYCL/Matrix/Legacy/element_wise_all_ops_half.cpp
+++ b/SYCL/Matrix/Legacy/element_wise_all_ops_half.cpp
@@ -5,12 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: matrix,gpu
 
 // Only runs on DPAS because AMX implementation does not support half data type
 // yet
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/Legacy/element_wise_all_ops_int8.cpp b/SYCL/Matrix/Legacy/element_wise_all_ops_int8.cpp
index 49a16d3964c3..e7f82611fd4d 100644
--- a/SYCL/Matrix/Legacy/element_wise_all_ops_int8.cpp
+++ b/SYCL/Matrix/Legacy/element_wise_all_ops_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/Legacy/element_wise_all_ops_int8_packed.cpp b/SYCL/Matrix/Legacy/element_wise_all_ops_int8_packed.cpp
index d3e38f638b06..ba97bdcb60b0 100644
--- a/SYCL/Matrix/Legacy/element_wise_all_ops_int8_packed.cpp
+++ b/SYCL/Matrix/Legacy/element_wise_all_ops_int8_packed.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: gpu
 
diff --git a/SYCL/Matrix/Legacy/element_wise_irreg_sum_rows.cpp b/SYCL/Matrix/Legacy/element_wise_irreg_sum_rows.cpp
index 2b7895ea5b59..269d1f75920e 100644
--- a/SYCL/Matrix/Legacy/element_wise_irreg_sum_rows.cpp
+++ b/SYCL/Matrix/Legacy/element_wise_irreg_sum_rows.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // this code calculates the sum of rows into a global array of number of rows
 // elements. First, partial reduction is computed inside each SG, then atomic
diff --git a/SYCL/Matrix/Legacy/element_wise_ops.cpp b/SYCL/Matrix/Legacy/element_wise_ops.cpp
index d9a407e131c2..a432215beeb5 100644
--- a/SYCL/Matrix/Legacy/element_wise_ops.cpp
+++ b/SYCL/Matrix/Legacy/element_wise_ops.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/joint_matrix_bf16.cpp b/SYCL/Matrix/Legacy/joint_matrix_bf16.cpp
index b02e8cfc0722..97d6fd22b407 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_bf16.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_bf16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/joint_matrix_bfloat16.cpp b/SYCL/Matrix/Legacy/joint_matrix_bfloat16.cpp
index fd926ca1c6da..6402fb72fa7e 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_bfloat16.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_bfloat16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_32x64.cpp
index ea94bfcae16b..65f8b84b629d 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_32x64.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_32x64.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: *
 
diff --git a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
index 627406446363..aa7d80dba0b6 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This tests support of col major layout for matrix B which does transpose and
 // then VNNI transform. This is currently only available on AMX
diff --git a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp
index ec64f32cfa6c..65a8fe1e234b 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This tests support of row major layout for matrix B which does automatic VNNI
 // transform. This is currently only available on AMX
diff --git a/SYCL/Matrix/Legacy/joint_matrix_half.cpp b/SYCL/Matrix/Legacy/joint_matrix_half.cpp
index d88a9f0b1b46..19f6a021cec9 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_half.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_half.cpp
@@ -5,11 +5,13 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: matrix,gpu
 
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // Only run on the GPU because half is not supported on AMX hardware
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/joint_matrix_int8_colmajorA_colmajorB.cpp b/SYCL/Matrix/Legacy/joint_matrix_int8_colmajorA_colmajorB.cpp
index bccb0d23d97d..9b0db278dabb 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_int8_colmajorA_colmajorB.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_int8_colmajorA_colmajorB.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This tests support of col major layout for matrix B which does transpose and
 // then VNNI transform. This is currently only available on AMX
diff --git a/SYCL/Matrix/Legacy/joint_matrix_int8_vnni.cpp b/SYCL/Matrix/Legacy/joint_matrix_int8_vnni.cpp
index c16d3ad726be..525ef4fe51a6 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_int8_vnni.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_int8_vnni.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: gpu
 
diff --git a/SYCL/Matrix/Legacy/joint_matrix_ss_int8.cpp b/SYCL/Matrix/Legacy/joint_matrix_ss_int8.cpp
index afdfd28feb90..4ddb42871357 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_ss_int8.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_ss_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/joint_matrix_su_int8.cpp b/SYCL/Matrix/Legacy/joint_matrix_su_int8.cpp
index 7c12200762ef..b7edf91804be 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_su_int8.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_su_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/joint_matrix_us_int8.cpp b/SYCL/Matrix/Legacy/joint_matrix_us_int8.cpp
index 935606cbe672..6e999a6b6233 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_us_int8.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_us_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/Legacy/joint_matrix_uu_int8.cpp b/SYCL/Matrix/Legacy/joint_matrix_uu_int8.cpp
index 054f3aaae564..2a7f289df036 100644
--- a/SYCL/Matrix/Legacy/joint_matrix_uu_int8.cpp
+++ b/SYCL/Matrix/Legacy/joint_matrix_uu_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/XMX8/element_wise_all_ops_bf16.cpp b/SYCL/Matrix/XMX8/element_wise_all_ops_bf16.cpp
index b12923536a03..a65e36554f93 100644
--- a/SYCL/Matrix/XMX8/element_wise_all_ops_bf16.cpp
+++ b/SYCL/Matrix/XMX8/element_wise_all_ops_bf16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/XMX8/element_wise_all_ops_half.cpp b/SYCL/Matrix/XMX8/element_wise_all_ops_half.cpp
index 62fd63cc88ad..bbb1b8fab1c8 100644
--- a/SYCL/Matrix/XMX8/element_wise_all_ops_half.cpp
+++ b/SYCL/Matrix/XMX8/element_wise_all_ops_half.cpp
@@ -5,12 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
+// REQUIRES: matrix-xmx8,gpu
 
 // Only runs on DPAS because AMX implementation does not support half data type
 // yet
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/XMX8/element_wise_all_ops_int8.cpp b/SYCL/Matrix/XMX8/element_wise_all_ops_int8.cpp
index 2605e89e30cd..ad85e6abad63 100644
--- a/SYCL/Matrix/XMX8/element_wise_all_ops_int8.cpp
+++ b/SYCL/Matrix/XMX8/element_wise_all_ops_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp b/SYCL/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp
index 7124332923a3..286884bf4387 100644
--- a/SYCL/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp
+++ b/SYCL/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This test stores the matrix B that is VNNIed (packed) in a row major fashion.
 // This is expected to fail on the GPU because the implementation does not
diff --git a/SYCL/Matrix/XMX8/element_wise_irreg_sum_rows.cpp b/SYCL/Matrix/XMX8/element_wise_irreg_sum_rows.cpp
index cc3dd78a63b5..d89af2d7acc2 100644
--- a/SYCL/Matrix/XMX8/element_wise_irreg_sum_rows.cpp
+++ b/SYCL/Matrix/XMX8/element_wise_irreg_sum_rows.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // this code calculates the sum of rows into a global array of number of rows
 // elements. First, partial reduction is computed inside each SG, then atomic
diff --git a/SYCL/Matrix/XMX8/element_wise_ops.cpp b/SYCL/Matrix/XMX8/element_wise_ops.cpp
index 1d7b64e406ba..b1acf17705d2 100644
--- a/SYCL/Matrix/XMX8/element_wise_ops.cpp
+++ b/SYCL/Matrix/XMX8/element_wise_ops.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/XMX8/joint_matrix_bfloat16.cpp b/SYCL/Matrix/XMX8/joint_matrix_bfloat16.cpp
index e1f67c435ec2..e2d4f82a6a67 100644
--- a/SYCL/Matrix/XMX8/joint_matrix_bfloat16.cpp
+++ b/SYCL/Matrix/XMX8/joint_matrix_bfloat16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp
index e7c1b42dd7ab..7dfcd2efc43b 100644
--- a/SYCL/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp
+++ b/SYCL/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: *
 
diff --git a/SYCL/Matrix/XMX8/joint_matrix_half.cpp b/SYCL/Matrix/XMX8/joint_matrix_half.cpp
index 355fef88e23f..c84fc0d2030e 100644
--- a/SYCL/Matrix/XMX8/joint_matrix_half.cpp
+++ b/SYCL/Matrix/XMX8/joint_matrix_half.cpp
@@ -5,11 +5,13 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
+// REQUIRES: matrix-xmx8,gpu
 
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // Only run on the GPU because half is not supported on AMX hardware
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/XMX8/joint_matrix_int8_vnni.cpp b/SYCL/Matrix/XMX8/joint_matrix_int8_vnni.cpp
index 0af6a21b85c5..2b69e13785b6 100644
--- a/SYCL/Matrix/XMX8/joint_matrix_int8_vnni.cpp
+++ b/SYCL/Matrix/XMX8/joint_matrix_int8_vnni.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: *
 
diff --git a/SYCL/Matrix/XMX8/joint_matrix_ss_int8.cpp b/SYCL/Matrix/XMX8/joint_matrix_ss_int8.cpp
index 86d7f753080a..212799fa0713 100644
--- a/SYCL/Matrix/XMX8/joint_matrix_ss_int8.cpp
+++ b/SYCL/Matrix/XMX8/joint_matrix_ss_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/XMX8/joint_matrix_su_int8.cpp b/SYCL/Matrix/XMX8/joint_matrix_su_int8.cpp
index 252a647f5d9a..8c6a15445fef 100644
--- a/SYCL/Matrix/XMX8/joint_matrix_su_int8.cpp
+++ b/SYCL/Matrix/XMX8/joint_matrix_su_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/XMX8/joint_matrix_us_int8.cpp b/SYCL/Matrix/XMX8/joint_matrix_us_int8.cpp
index e74e7ad46bd4..124cc97f530c 100644
--- a/SYCL/Matrix/XMX8/joint_matrix_us_int8.cpp
+++ b/SYCL/Matrix/XMX8/joint_matrix_us_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/XMX8/joint_matrix_uu_int8.cpp b/SYCL/Matrix/XMX8/joint_matrix_uu_int8.cpp
index 06934de225ad..84dc77f931a6 100644
--- a/SYCL/Matrix/XMX8/joint_matrix_uu_int8.cpp
+++ b/SYCL/Matrix/XMX8/joint_matrix_uu_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/element_wise_all_ops_bf16.cpp b/SYCL/Matrix/element_wise_all_ops_bf16.cpp
index b871f13e567b..439c069652ed 100644
--- a/SYCL/Matrix/element_wise_all_ops_bf16.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_bf16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/element_wise_all_ops_half.cpp b/SYCL/Matrix/element_wise_all_ops_half.cpp
index e860180c03f4..331920e9ea1b 100644
--- a/SYCL/Matrix/element_wise_all_ops_half.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_half.cpp
@@ -5,12 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: matrix,gpu
 
 // Only runs on DPAS because AMX implementation does not support half data type
 // yet
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/element_wise_all_ops_int8.cpp b/SYCL/Matrix/element_wise_all_ops_int8.cpp
index adcee2a750ef..45f2ef4bda97 100644
--- a/SYCL/Matrix/element_wise_all_ops_int8.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <random>
diff --git a/SYCL/Matrix/element_wise_all_ops_int8_packed.cpp b/SYCL/Matrix/element_wise_all_ops_int8_packed.cpp
index 6008079449c2..f33cc2cc125f 100644
--- a/SYCL/Matrix/element_wise_all_ops_int8_packed.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_int8_packed.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This test stores the matrix B that is VNNIed (packed) in a row major fashion.
 // This is expected to fail on the GPU because the implementation does not
diff --git a/SYCL/Matrix/element_wise_irreg_sum_rows.cpp b/SYCL/Matrix/element_wise_irreg_sum_rows.cpp
index 76e24de5c6ba..7aebe7e031bc 100644
--- a/SYCL/Matrix/element_wise_irreg_sum_rows.cpp
+++ b/SYCL/Matrix/element_wise_irreg_sum_rows.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This code calculates the sum of rows into a global array of number of rows
 // elements. First, partial reduction is computed inside each SG, then atomic
diff --git a/SYCL/Matrix/element_wise_ops.cpp b/SYCL/Matrix/element_wise_ops.cpp
index c3b949fd9f30..d686ff4ec6d9 100644
--- a/SYCL/Matrix/element_wise_ops.cpp
+++ b/SYCL/Matrix/element_wise_ops.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/joint_matrix_bfloat16.cpp b/SYCL/Matrix/joint_matrix_bfloat16.cpp
index 77f1be403d73..b8008a8c9848 100644
--- a/SYCL/Matrix/joint_matrix_bfloat16.cpp
+++ b/SYCL/Matrix/joint_matrix_bfloat16.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp
index 5c955ec42259..4bc00841720a 100644
--- a/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp
+++ b/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: *
 
diff --git a/SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
index 7f6b5c9e4f97..97cf4c97d27d 100644
--- a/SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
+++ b/SYCL/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This tests support of col major layout for matrix B which does transpose and
 // then VNNI transform. This is currently only available on AMX
diff --git a/SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp b/SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp
index ae2bd6b99b58..064fbfd6cf20 100644
--- a/SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp
+++ b/SYCL/Matrix/joint_matrix_bfloat16_rowmajorA_rowmajorB.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This tests support of row major layout for matrix B which does automatic VNNI
 // transform. This is currently only available on AMX
diff --git a/SYCL/Matrix/joint_matrix_half.cpp b/SYCL/Matrix/joint_matrix_half.cpp
index 1d131a64a881..720920e1cb82 100644
--- a/SYCL/Matrix/joint_matrix_half.cpp
+++ b/SYCL/Matrix/joint_matrix_half.cpp
@@ -5,11 +5,13 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: matrix,gpu
 
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // Only run on the GPU because half is not supported on AMX hardware
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/SYCL/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
index c572e53c3452..1859b57f8bc1 100644
--- a/SYCL/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
+++ b/SYCL/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // This tests support of col major layout for matrix B which does transpose and
 // then VNNI transform. This is currently only available on AMX
diff --git a/SYCL/Matrix/joint_matrix_int8_vnni.cpp b/SYCL/Matrix/joint_matrix_int8_vnni.cpp
index f8ae1a8cf776..7c77be898444 100644
--- a/SYCL/Matrix/joint_matrix_int8_vnni.cpp
+++ b/SYCL/Matrix/joint_matrix_int8_vnni.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 // XFAIL: gpu
 
diff --git a/SYCL/Matrix/joint_matrix_ss_int8.cpp b/SYCL/Matrix/joint_matrix_ss_int8.cpp
index 860e590357dd..dac9ea719b68 100644
--- a/SYCL/Matrix/joint_matrix_ss_int8.cpp
+++ b/SYCL/Matrix/joint_matrix_ss_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/joint_matrix_su_int8.cpp b/SYCL/Matrix/joint_matrix_su_int8.cpp
index bd89977fc345..83de8f45d097 100644
--- a/SYCL/Matrix/joint_matrix_su_int8.cpp
+++ b/SYCL/Matrix/joint_matrix_su_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/joint_matrix_us_int8.cpp b/SYCL/Matrix/joint_matrix_us_int8.cpp
index 0690636c5914..c4cd631fdaad 100644
--- a/SYCL/Matrix/joint_matrix_us_int8.cpp
+++ b/SYCL/Matrix/joint_matrix_us_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>
diff --git a/SYCL/Matrix/joint_matrix_uu_int8.cpp b/SYCL/Matrix/joint_matrix_uu_int8.cpp
index 42f2ff8fe674..33a5372c33f9 100644
--- a/SYCL/Matrix/joint_matrix_uu_int8.cpp
+++ b/SYCL/Matrix/joint_matrix_uu_int8.cpp
@@ -10,6 +10,8 @@
 // RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out
 
 #include <iostream>
 #include <sycl/sycl.hpp>