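Change MMA trait: use the bf16-accumulator atom XE_8x16x16_BF16BF16BF16BF16_TT and bf16 output for the XPU 4-bit GEMM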

xiaolil1 · xiaolil1 · commit 245c354db4b2 · 2025-08-03T16:05:28.000Z
diff --git a/bitsandbytes/backends/xpu/ops.py b/bitsandbytes/backends/xpu/ops.py
@@ -188,7 +188,7 @@ def _(
         shape = (*A.shape[:-1], shapeB[0])
         #import pdb
         #pdb.set_trace()
-        out = torch.zeros(shape, device=A.device, dtype=torch.float32)
+        out = torch.zeros(shape, device=A.device, dtype=torch.bfloat16)
         _gemv_4bit_impl(A, B, shapeB, absmax.bfloat16(), code, blocksize, out=out)
         return out
 
diff --git a/csrc/pythonInterface.cpp b/csrc/pythonInterface.cpp
@@ -381,7 +381,7 @@ void gemv_4bit_inference_fp16(
 
 #if 1
 void gemm_4bit_inference_bf16(
-    int m, int n, int k, sycl::ext::oneapi::bfloat16 * A,  unsigned char* B,   sycl::ext::oneapi::bfloat16 *absmax, float *datatype, float * out,
+    int m, int n, int k, sycl::ext::oneapi::bfloat16 * A,  unsigned char* B,   sycl::ext::oneapi::bfloat16 *absmax, float *datatype, sycl::ext::oneapi::bfloat16 * out,
     int lda, int ldb, int ldc, int blocksize, sycl::queue* stream
 ) {
     gemm_4bit_inference_cutlass_dequant<sycl::ext::oneapi::bfloat16, 16>(m, n, k, A, B, absmax,  datatype, out, lda, ldb, ldc, blocksize, stream);
@@ -827,7 +827,7 @@ void cgemv_4bit_inference_fp16(
 #if 1
 void cgemv_4bit_inference_bf16(
     int m, int n, int k, sycl::ext::oneapi::bfloat16 * A,  unsigned char* B,   sycl::ext::oneapi::bfloat16 *absmax, float *datatype,
-    float * out,  int lda, int ldb, int ldc, int blocksize, sycl::queue* stream
+    sycl::ext::oneapi::bfloat16 * out,  int lda, int ldb, int ldc, int blocksize, sycl::queue* stream // out is a bf16 buffer; it is forwarded unchanged to gemm_4bit_inference_bf16 below
 ) {
     gemm_4bit_inference_bf16(m, n, k, A, B, absmax,  datatype, out, lda, ldb, ldc, blocksize, stream);
 }
diff --git a/csrc/xpu_cutlass.h b/csrc/xpu_cutlass.h
@@ -109,7 +109,7 @@ void gemv_4bit_inference_cutlass_cute(int m, int n, int k, T *A, T *B,
 
 template <typename T, int BITS>
 void gemm_4bit_inference_cutlass_dequant(int m, int n, int k, T *A, unsigned char *B,
-                         T *absmax, float *datatype, float *out, int lda,
+                         T *absmax, float *datatype, T *out, int lda,
                          int ldb, int ldc, int blocksize, sycl::queue *stream);
 
 template <typename T, int BITS>
diff --git a/csrc/xpu_cutlass_fusion.cpp b/csrc/xpu_cutlass_fusion.cpp
@@ -40,7 +40,7 @@ using namespace cutlass::gemm;
 
 // Define Basic information 
 //Weight-only-quant (B)
-using MmaType = cutlass::bfloat16_t;
+using MmaType = sycl::ext::oneapi::bfloat16; //cutlass::bfloat16_t;
 using QuantType = cutlass::uint4_t; //NF4,FP4
 
 using ElementA = MmaType; //bfloat16_t;
@@ -50,18 +50,23 @@ using ElementMMA = ElementA;
 using ElementQuant = QuantType;
 using ElementScale = MmaType; //sycl::ext::oneapi::bfloat16; //MmaType;
 
-using ElementC = float;
-using ElementD = float;
-using ElementAccumulator = float;      // data_type of accumulator
-using ElementComputeEpilogue = float;  // data_type of epilogue operations
-using ElementOutput = float;
+using ElementAccumulator = MmaType;      // data_type of accumulator
+using ElementComputeEpilogue = MmaType;  // data_type of epilogue operations
+using ElementOutput = MmaType;
 
 using ProblemShape = Shape<int, int, int, int>;
 
+#if 1
 using TileShape = Shape<_256, _256, _32>;
 using TiledMma =
-      typename TiledMMAHelper<MMA_Atom<XE_8x16x16_F32BF16BF16F32_TT>, Layout<TileShape>,
+      typename TiledMMAHelper<MMA_Atom<XE_8x16x16_BF16BF16BF16BF16_TT>, Layout<TileShape>,
                                     Layout<Shape<_8, _4, _1>, Stride<_4, _1, _0>>>::TiledMMA;
+#else
+  using TileShape = Shape<_16, _64, _64>;
+  using TiledMma =
+      typename TiledMMAHelper<MMA_Atom<XE_8x16x16_F32F16F16F32_TT>, Layout<TileShape>,
+                                    Layout<Shape<_1, _2, _1>, Stride<_2, _1, _0>>>::TiledMMA;
+#endif
 
 using WorkgroupTileShape = TileShape;
 static constexpr auto BLK_M = get<0>(WorkgroupTileShape{}); //256 //16
@@ -94,7 +99,8 @@ static constexpr int SubgroupSize = DispatchPolicy::SubgroupSize; // 16
 
 // Design Epilogue
 using EpilogueDispatchPolicy = cutlass::epilogue::IntelPVCEpilogue;
-using EpilogueOp = cutlass::epilogue::fusion::LinearCombination<ElementAccumulator, ElementComputeEpilogue, ElementAccumulator, ElementAccumulator, cutlass::FloatRoundStyle::round_to_nearest>;
+//constexpr int kAlignment = 128 / sizeof(ElementOutput);
+using EpilogueOp = cutlass::epilogue::fusion::LinearCombination<ElementOutput, ElementComputeEpilogue, ElementAccumulator, ElementAccumulator, cutlass::FloatRoundStyle::round_to_nearest>;
 using FusionCallBacks = cutlass::epilogue::fusion::FusionCallbacks<EpilogueDispatchPolicy, EpilogueOp, TileShape, decltype(tile_shape(TiledMma()))>;
 using SharedStorage = FusionCallBacks::SharedStorage;
 
@@ -115,9 +121,9 @@ using CollectiveEpilogue = cutlass::epilogue::collective::CollectiveEpilogue<
         ElementOutput,
         cutlass::gemm::TagToStrideC_t<cutlass::layout::RowMajor>, // Convert CUTLASS 2.x to CUTLASS 3.x representation
         FusionCallBacks,
-        XE_2D_U32x8x16_LD_N, // The copy atom used to load matrix C
+        XE_2D_U16x8x16_LD_N, // The copy atom used to load matrix C
         void, void,
-        XE_2D_U32x8x16_ST_N, // The copy atom used to store matrix D
+        XE_2D_U16x8x16_ST_N, // The copy atom used to store matrix D
         void, void>;
 using EpilogueParams = typename CollectiveEpilogue::Params;
 
@@ -166,7 +172,7 @@ class kgemm_4bit_inference_cutlass_dequant {
     int m, n, k;
     T* A;
     uint8_t* B;
-    float* out;
+    T* out;
     float *datatype; //LUT
     int group_size;
 	  
@@ -279,7 +285,7 @@ class kgemm_4bit_inference_cutlass_dequant {
 
     T* A = params.A;
     uint8_t* B = params.B;
-    float* out = params.out;
+    T* out = params.out;
     float* datatype = params.datatype;
 
     auto tiled_copy_a = params.tiled_copy_a;
@@ -544,7 +550,7 @@ printf("\n");
 
 template <typename T, int BITS>
 void gemm_4bit_inference_cutlass_dequant(int m, int n, int k, T *A, unsigned char *B,
-                         T *absmax_, float *datatype, float *out, int lda,
+                         T *absmax_, float *datatype, T *out, int lda,
                          int ldb, int ldc, int blocksize, sycl::queue *stream) {
   //std::cout<<"this is gemm_4bit_inference_cutlass_dequant ......................!!!!!!\n";
 
@@ -593,8 +599,8 @@ void gemm_4bit_inference_cutlass_dequant(int m, int n, int k, T *A, unsigned cha
   cutlass::KernelHardwareInfo hw_info;
   hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
   auto problem_shape_MNKL = problem_size; //append<4>(problem_size, 1);
-  float alpha=1.0f;
-  float beta=0.f;
+  T alpha=1.0f;
+  T beta=0.f;
   StrideC stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, l));
   StrideD stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(m, n, l));
 
@@ -649,6 +655,6 @@ void gemm_4bit_inference_cutlass_dequant(int m, int n, int k, T *A, unsigned cha
 
 template void gemm_4bit_inference_cutlass_dequant<sycl::ext::oneapi::bfloat16, 16>(
     int m, int n, int k, sycl::ext::oneapi::bfloat16 *A, unsigned char *B,
-    sycl::ext::oneapi::bfloat16 *absmax, float *datatype, float *out, int lda,
+    sycl::ext::oneapi::bfloat16 *absmax, float *datatype, sycl::ext::oneapi::bfloat16 *out, int lda,
     int ldb, int ldc, int blocksize, sycl::queue *stream);
 
diff --git a/include/cute/algorithm/copy.hpp b/include/cute/algorithm/copy.hpp
@@ -185,6 +185,12 @@ copy(Copy_Atom<CopyArgs...>       const& copy_atom,
     Tensor dst_v = group_modes<1,R>(dst);
 
     if constexpr (is_static<decltype(shape(src_v))>::value && is_static<decltype(shape(dst_v))>::value) {
+#if 0      
+      if(cute::thread0()){
+        print("src_v : "); print(src_v); print("\n");
+        print("dst_v : "); print(dst_v); print("\n");
+      }
+#endif      
       CUTE_STATIC_ASSERT_V(size<1>(src_v) == size<1>(dst_v));
 
       // AutoFilter on the Rest-mode
diff --git a/include/cute/arch/mma_xe.hpp b/include/cute/arch/mma_xe.hpp
@@ -45,6 +45,11 @@ SYCL_DEVICE_OCL(cute::intel::float8 intel_sub_group_bf16_bf16_matrix_mad_k16(cut
 SYCL_DEVICE_OCL(cute::intel::float4 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::float4 acc));
 SYCL_DEVICE_OCL(cute::intel::float2 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::float2 acc));
 SYCL_DEVICE_OCL(float  intel_sub_group_bf16_bf16_matrix_mad_k16(short a, cute::intel::int8 b, float acc));
+// mma_bfloat16 with bfloat16 accumulator: acc and the return value use the same 16-bit short (vector) type as operand a
+SYCL_EXTERNAL cute::intel::short8 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::short8 acc);
+SYCL_EXTERNAL cute::intel::short4 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::short4 acc);
+SYCL_EXTERNAL cute::intel::short2 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::short2 acc);
+SYCL_EXTERNAL               short intel_sub_group_bf16_bf16_matrix_mad_k16(              short a, cute::intel::int8 b,               short acc);
 // mma_half
 SYCL_DEVICE_OCL(cute::intel::float8 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short8 a, cute::intel::int8 b, cute::intel::float8 acc));
 SYCL_DEVICE_OCL(cute::intel::float4 intel_sub_group_f16_f16_matrix_mad_k16(cute::intel::short4 a, cute::intel::int8 b, cute::intel::float4 acc));
@@ -155,6 +160,26 @@ struct XE_1x16x16_F32BF16BF16F32_TT
   }
 };
 
+struct XE_8x16x16_BF16BF16BF16BF16_TT // MMA operation whose D, A and C operands are short8 and whose B operand is int8
+{
+  using DRegisters = intel::short8[1]; // result: one short8, same type as the C accumulator
+  using ARegisters = intel::short8[1];
+  using BRegisters = intel::int8[1];
+  using CRegisters = intel::short8[1]; // accumulator input: 16-bit lanes rather than the float8 used by the F32 variants
+
+  CUTE_HOST_DEVICE static void
+  fma(intel::short8      & d, // output; overwritten with the result of the builtin
+      intel::short8 const& a,
+      intel::int8   const& b,
+      intel::short8 const& c) // accumulator input; passed to the builtin as acc
+  {
+#if defined(SYCL_INTEL_TARGET) // only builds that define SYCL_INTEL_TARGET call the builtin
+    d = intel_sub_group_bf16_bf16_matrix_mad_k16(a, b, c);
+#else
+    CUTE_INVALID_CONTROL_PATH("Attempting to use XE_8x16x16_BF16BF16BF16BF16_TT on non-PVC hardware");
+#endif
+  }
+};
 //MxNxK_D,A,B,C
 //# of vector component of a x subgroup-size x function name
 //float8 intel_sub_group_f16_f16_matrix_mad_k16(short8 a, int8 b, int8 acc);
diff --git a/include/cute/arch/mma_xe_builtin.hpp b/include/cute/arch/mma_xe_builtin.hpp
@@ -62,6 +62,17 @@ SYCL_EXTERNAL cute::intel::short4 intel_sub_group_bf16_bf16_matrix_mad_k16(cute:
 SYCL_EXTERNAL cute::intel::short2 intel_sub_group_bf16_bf16_matrix_mad_k16(cute::intel::short2 a, cute::intel::int8 b, cute::intel::short2 acc);
 SYCL_EXTERNAL               short intel_sub_group_bf16_bf16_matrix_mad_k16(              short a, cute::intel::int8 b,               short acc);
 
+// Use the SPIR-V functions for the half-accumulator variants declared below, as the builtins do not work
+SYCL_EXTERNAL cute::intel::half8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::half8, int32_t);
+SYCL_EXTERNAL cute::intel::half4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::half4, int32_t);
+SYCL_EXTERNAL cute::intel::half2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::half2, int32_t);
+SYCL_EXTERNAL cute::intel::half  __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t,               short, cute::intel::int8,  cute::intel::half, int32_t);
+
+struct SPIRV_MMAOperands { // flag values OR-ed together and passed as the last argument of __spirv_SubgroupMatrixMultiplyAccumulateINTEL
+  static constexpr int SPIRV_MatrixAFp16 = 0x400; // presumably the SPIR-V MatrixAPackedFloat16INTEL operand bit -- TODO confirm against the extension spec
+  static constexpr int SPIRV_MatrixBFp16 = 0x800; // presumably the SPIR-V MatrixBPackedFloat16INTEL operand bit -- TODO confirm against the extension spec
+};
+
 namespace cute::detail
 {
 
@@ -97,6 +108,16 @@ struct XeSubgroupMatrixMultiplyAccumulate<bfloat16_t, bfloat16_t, bfloat16_t, bf
     }
 };
 
+template<>
+struct XeSubgroupMatrixMultiplyAccumulate<half_t, half_t, half_t, half_t> { // specialization selected when all four element types are half_t
+  template<typename ARegisters, typename BRegisters, typename CRegisters>
+  CUTE_HOST_DEVICE
+  auto operator()(ARegisters a, BRegisters b, CRegisters c) { // returns whatever the matching SPIR-V overload returns for these register types
+    return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, // leading 16 is presumably the K dimension -- TODO confirm against the extension spec
+             SPIRV_MMAOperands::SPIRV_MatrixAFp16 | SPIRV_MMAOperands::SPIRV_MatrixBFp16); // A-fp16 and B-fp16 operand flags combined
+  }
+};
+
 template<>
 struct XeSubgroupMatrixMultiplyAccumulate<int32_t, int8_t, int8_t, int32_t> {
     template<typename ARegisters, typename BRegisters, typename CRegisters>
diff --git a/include/cute/arch/mma_xe_spirv.hpp b/include/cute/arch/mma_xe_spirv.hpp
@@ -56,6 +56,11 @@ SYCL_EXTERNAL cute::intel::short4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(
 SYCL_EXTERNAL cute::intel::short2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::short2, int32_t);
 SYCL_EXTERNAL               short __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t,               short, cute::intel::int8,               short, int32_t);
 
+SYCL_EXTERNAL cute::intel::half8 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short8, cute::intel::int8, cute::intel::half8, int32_t);
+SYCL_EXTERNAL cute::intel::half4 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short4, cute::intel::int8, cute::intel::half4, int32_t);
+SYCL_EXTERNAL cute::intel::half2 __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t, cute::intel::short2, cute::intel::int8, cute::intel::half2, int32_t);
+SYCL_EXTERNAL cute::intel::half  __spirv_SubgroupMatrixMultiplyAccumulateINTEL(int32_t,               short, cute::intel::int8,  cute::intel::half, int32_t);
+
 struct SPIRV_MMAOperands {
   static constexpr int SPIRV_MatrixASigned = 0x1;
   static constexpr int SPIRV_MatrixBSigned = 0x2;
@@ -109,6 +114,16 @@ struct XeSubgroupMatrixMultiplyAccumulate<bfloat16_t, bfloat16_t, bfloat16_t, bf
     }
 };
 
+template<>
+struct XeSubgroupMatrixMultiplyAccumulate<half_t, half_t, half_t, half_t> { // specialization selected when all four element types are half_t
+    template<typename ARegisters, typename BRegisters, typename CRegisters>
+    CUTE_HOST_DEVICE
+    auto operator()(ARegisters a, BRegisters b, CRegisters c) { // returns whatever the matching SPIR-V overload returns for these register types
+      return __spirv_SubgroupMatrixMultiplyAccumulateINTEL(16, a, b, c, // leading 16 is presumably the K dimension -- TODO confirm against the extension spec
+               SPIRV_MMAOperands::SPIRV_MatrixAFp16 | SPIRV_MMAOperands::SPIRV_MatrixBFp16); // A-fp16 and B-fp16 operand flags combined
+    }
+};
+
 template<>
 struct XeSubgroupMatrixMultiplyAccumulate<int32_t, int8_t, int8_t, int32_t> {
     template<typename ARegisters, typename BRegisters, typename CRegisters>
diff --git a/include/cute/atom/mma_traits_xe.hpp b/include/cute/atom/mma_traits_xe.hpp
@@ -38,6 +38,21 @@
 namespace cute
 {
 template <>
+struct MMA_Traits<XE_8x16x16_BF16BF16BF16BF16_TT> // traits for the MMA op whose accumulator and result are bfloat16_t
+{
+  using ValTypeD = bfloat16_t; // result element type; the F32 variant of this trait uses float here
+  using ValTypeA = bfloat16_t;
+  using ValTypeB = bfloat16_t;
+  using ValTypeC = bfloat16_t; // accumulator element type
+
+  using Shape_MNK = Shape<_8,_16,_16>; // M=8, N=16, K=16, matching the 8x16x16 in the op name
+  using ThrID   = Layout<_16>; // 16 thread ids; presumably one per sub-group work-item -- TODO confirm
+
+  using ALayout = Layout<Shape<_16, _8>, Stride<_8, _1>>;
+  using BLayout = Layout<Shape<_16, _16>, Stride<_1, _16>>;
+  using CLayout = Layout<Shape<_16, _8>, Stride<_8, _1>>; // same shape and stride as ALayout
+};
+template <>
 struct MMA_Traits<XE_8x16x16_F32BF16BF16F32_TT>
 {
   using ValTypeD = float;
diff --git a/include/cute/util/sycl_vec.hpp b/include/cute/util/sycl_vec.hpp
@@ -52,11 +52,18 @@ using uchar2 = vector_t<uchar, 2>;
 using uchar4 = vector_t<uchar, 4>;
 using uchar8 = vector_t<uchar, 8>;
 using uchar16 = vector_t<uchar, 16>;
+using uchar32 = vector_t<uchar, 32>;
+using uchar64 = vector_t<uchar, 64>;
 
 using float2 = vector_t<float, 2>;
 using float4 = vector_t<float, 4>;
 using float8 = vector_t<float, 8>;
 
+using half = _Float16;
+using half2 = vector_t<_Float16, 2>;
+using half4 = vector_t<_Float16, 4>;
+using half8 = vector_t<_Float16, 8>;
+
 using short2 = vector_t<short, 2>;
 using short4 = vector_t<short, 4>;
 using short8 = vector_t<short, 8>;
diff --git a/include/cutlass/epilogue/collective/xe_epilogue.hpp b/include/cutlass/epilogue/collective/xe_epilogue.hpp
@@ -102,11 +102,11 @@ class CollectiveEpilogue<
   using CopyOpR2S = CopyOpR2S_;
 
   using ThreadEpilogueOp = typename fusion::FusionCallbacksTraits<FusionCallbacks>::Operation;
-  using GmemTiledCopyC = CopyOpG2R;
+  using GmemTiledCopyC = conditional_t<cute::is_void_v<CopyOpG2R>, XE_2D_U32x8x16_LD_N, CopyOpG2R>;
   using GmemTiledCopyD = cute::conditional_t<not cute::is_void_v<ElementD> && not cute::is_void_v<CopyOpR2G>,
                                              CopyOpR2G, XE_2D_U32x8x16_ST_N>;
-  using ElementOutput = typename FusionCallbacks::ElementOutput;
-  using ElementCompute = typename FusionCallbacks::ElementCompute;
+  using ElementOutput = ElementD;
+  using ElementCompute = ElementAccumulator;
 
   static constexpr int SubgroupSize = DispatchPolicy::SubgroupSize;
 
@@ -311,7 +311,7 @@ class CollectiveEpilogue<
     Tensor tCgD = thread_xe_store_d.partition_D(gD);
 
     Tensor trC = make_tensor<typename TiledMma::ValTypeC>(Shape<Int<FragmentSize>>{});
-    Tensor trD = make_tensor<typename TiledMma::ValTypeD>(Shape<Int<FragmentSize>>{});
+    Tensor trD_compute = make_tensor<ElementCompute>(Shape<Int<FragmentSize>>{});
 
     // Because Sm90 uses shared memory, they are not tied to using the same accumulator values
     // for MMA and Epilogue. But because we are operating directly in the accumulators, we need to be
@@ -349,7 +349,10 @@ class CollectiveEpilogue<
 
     cst_callbacks.begin();
 
-    auto acc_frag = recast<Array<ElementOutput, FragmentSize>>(accumulators);
+    auto acc_frag = recast<Array<ElementCompute, FragmentSize>>(accumulators);
+    auto trD_compute_frag = recast<Array<ElementCompute, FragmentSize>>(trD_compute);
+
+    Tensor trD = make_tensor<ElementOutput>(Shape<Int<FragmentSize>>{});
     auto trD_frag = recast<Array<ElementOutput, FragmentSize>>(trD);
 
     constexpr int ValuesLoaded =
@@ -374,12 +377,16 @@ class CollectiveEpilogue<
         auto acc_frag_mn = acc_frag(_, epi_m, epi_n);
 
         CUTLASS_PRAGMA_UNROLL
-        for (int epi_v = 0; epi_v < size<0>(trD_frag); ++epi_v) {
-          trD_frag(epi_v) = cst_callbacks.visit(acc_frag_mn(epi_v), epi_v, epi_m, epi_n);
+        for (int epi_v = 0; epi_v < size<0>(trD_compute_frag); ++epi_v) {
+          trD_compute_frag(epi_v) = cst_callbacks.visit(acc_frag_mn(epi_v), epi_v, epi_m, epi_n);
         }
-        cst_callbacks.reduce(nullptr, synchronize, epi_m, epi_n, (epi_m == FragsM - 1 && epi_n == FragsN - 1), trD_frag);
+        cst_callbacks.reduce(nullptr, synchronize, epi_m, epi_n, (epi_m == FragsM - 1 && epi_n == FragsN - 1), trD_compute_frag);
         
         if constexpr (is_destination_supported) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int i = 0; i < size(trD_compute_frag); ++i) {
+            trD_frag(i) = cutlass::NumericArrayConverter<ElementOutput, ElementCompute, FragmentSize>{}(trD_compute_frag(i));
+          }
           copy(params.xe_store_d, trD, tCgD(_, epi_m, epi_n));
         }
         
diff --git a/run_case.sh b/run_case.sh
@@ -28,8 +28,8 @@
 
 
 #gdb -args python -m pytest -vs tests/test_xpu.py::TestXPU::test_gemm_4bit
-#pytest -vs tests/test_xpu.py::TestXPU::test_gemm_4bit
-pytest -vs tests/test_xpu.py::TestXPU::test_gemv_4bit
+pytest -vs tests/test_xpu.py::TestXPU::test_gemm_4bit
+#pytest -vs tests/test_xpu.py::TestXPU::test_gemv_4bit
 #python tests/test_xpu_db.py
 #gdb -args python tests/test_xpu_db.py
 #pytest tests/test_functional.py::TestQuantize4BitFunctional::test_gemv_4bit[dim=256-uint8-bf16-fc1-nf4-DQ_True-xpu]
diff --git a/tests/test_xpu.py b/tests/test_xpu.py
@@ -40,7 +40,7 @@ class TestXPU:
     @pytest.mark.parametrize("device", ["xpu"])#get_available_devices())
     @pytest.mark.parametrize("double_quant", [True], ids=lambda double_quant: f"DQ_{double_quant}")
     @pytest.mark.parametrize("storage_type", ["nf4"])
-    @pytest.mark.parametrize("kind", ["fc1"])#, "attn_packed"])
+    @pytest.mark.parametrize("kind", ["fc0"])#, "attn_packed"])
     @pytest.mark.parametrize("dtype", [torch.bfloat16], ids=describe_dtype)
     @pytest.mark.parametrize(
         "quant_storage",