refine code

xiaolil1 · xiaolil1 · commit 9526a541b617 · 2025-07-21T08:31:09.000Z
diff --git a/csrc/xpu_cutlass_fusion.cpp b/csrc/xpu_cutlass_fusion.cpp
@@ -69,7 +69,7 @@ using TiledMma =
                                   Layout<Shape<_8, _4, _1>, Stride<_4, _1, _0>>>::TiledMMA;
 
 // Define Mainloop dispatch policy
-constexpr int PipelineStages = 3;
+constexpr int PipelineStages = 1;
 using DispatchPolicy = cutlass::gemm::MainloopIntelPVCMixedPrecision<PipelineStages>;
 static constexpr int SubgroupSize = DispatchPolicy::SubgroupSize; // sub_group size
 
@@ -231,15 +231,18 @@ class kgemm_4bit_inference_cutlass_dequant {
    /// Utilities to transform A.
   template <class EngineIn,
             class EngineOut, 
+            class EngineRef, 
             class EngineScales, 
             class LayoutIn,
             class LayoutOut,
+            class LayoutRef,
             class LayoutScales,
             class... Ts>
   CUTLASS_DEVICE
   void dequant(
     Tensor<EngineIn, LayoutIn> const& tCrA_load, 
     Tensor<EngineOut, LayoutOut>& tCrA_mma,
+    Tensor<EngineRef, LayoutRef>& A_ref, //mma_A for debug
     Tensor<EngineScales, LayoutScales>& tCrS_input,
     float* quant_map
   ) {
@@ -258,21 +261,27 @@ class kgemm_4bit_inference_cutlass_dequant {
     auto const& dst = tCrA_mma(_, _, _);
     auto pSrc = const_cast<SrcType*>(raw_pointer_cast(src.data()));
     auto pDst = const_cast<DstType*>(raw_pointer_cast(dst.data()));
+    auto pA = const_cast<DstType*>(raw_pointer_cast(A_ref.data()));
     constexpr int num_elements = decltype(size(src))::value / 2;
+    for(int i=0; i<num_elements * 2; i++){
+      if(cute::thread0())
+      printf("ThreadIdxX() = %d, i = %d, *(pSrc + i) = %d, *(pA + i*2) = %f, *(pA + i*2+1) = %f\n", ThreadIdxX(), i, static_cast<int>(*(pSrc + i)), static_cast<int>(*(pA + i*2)), static_cast<int>(*(pA + i*2+1)));
+    }
 
   // TODO(Codeplay): (perf) consider replacing `pack` with `num_elements` here - See xe_flash_attn_mma.hpp
     constexpr int pack = 1; //decltype(select_packing<SrcType, DstType, num_elements>::value())::value;
       int src_size = sizeof_bits_v<SrcType>;
       int dst_size = sizeof_bits_v<DstType>;
-      if(cute::thread0()) printf("Cosize = %d, src_size = %d, dst_size = %d\n", num_elements, src_size, dst_size);
+      //if(cute::thread0()) printf("Cosize = %d, src_size = %d, dst_size = %d\n", num_elements, src_size, dst_size);
     //using Converter = cutlass::NumericArrayConverter<DstType, SrcType, pack, cutlass::FloatRoundStyle::round_to_nearest>;
 #if 1    
     for(int i=0; i<num_elements; i++){
       auto src_value = *(pSrc + i);
-      if(cute::thread0()) printf("*(pSrc + i) = %d, src_value = %d\n",static_cast<int>(*(pSrc + i)), static_cast<int>(src_value));
+      //if(cute::thread0()) printf("*(pSrc + i) = %d, src_value = %d\n",static_cast<int>(*(pSrc + i)), static_cast<int>(src_value));
       *(pDst + (2 * i)) = static_cast<DstType>(quant_map[src_value >> 4]);
       *(pDst + (2 * i + 1)) = static_cast<DstType>(quant_map[src_value & 0x0f]);
-      if(cute::thread0()) printf("num_elements = %d, i = %d, *(pSrc + i) = %d, *(pSrc + i) >> 4= %d, *(pSrc + i) & 0x0f, quant_map[*(pSrc + i) >> 4] = %f, quant_map[src_value & 0x0f] = %f \n", num_elements, i, static_cast<int>(*(pSrc + i)), static_cast<int>(*(pSrc + i) >> 4), static_cast<int>(*(pSrc + i) & 0x0f), static_cast<int>(quant_map[*(pSrc + i) >> 4]), static_cast<int>(quant_map[src_value & 0x0f]), static_cast<int>(*(pDst + (2 * i))), static_cast<int>(*(pDst + (2 * i + 1))));
+      if(cute::thread0())
+      printf("num_elements = %d, i = %d, *(pSrc + i) = %d, *(pSrc + i) >> 4= %d, *(pSrc + i) & 0x0f, quant_map[*(pSrc + i) >> 4] = %f, quant_map[src_value & 0x0f] = %f \n", num_elements, i, static_cast<int>(*(pSrc + i)), static_cast<int>(*(pSrc + i) >> 4), static_cast<int>(*(pSrc + i) & 0x0f), static_cast<int>(quant_map[*(pSrc + i) >> 4]), static_cast<int>(quant_map[src_value & 0x0f]), static_cast<int>(*(pDst + (2 * i))), static_cast<int>(*(pDst + (2 * i + 1))));
     }
 #else
     using SrcArray = cutlass::Array<SrcType, pack>;
@@ -306,7 +315,9 @@ class kgemm_4bit_inference_cutlass_dequant {
     for (int i = 0; i < 4; ++i) {
       CUTLASS_PRAGMA_UNROLL
       for (int j = 0; j < 32; ++j) {
+        if(cute::thread0()) printf("tCrA_mma(_, i, _)[j] = %f, i = %d, j = %d, tCrS_input(i) = %f\n",tCrA_mma(_, i, _)[j], i, j, tCrS_input(i));
         tCrA_mma(_, i, _)[j] *= tCrS_input(i);
+        //if(cute::thread0()) printf("after scaling tCrA_mma(_, i, _)[j] = %f\n", tCrA_mma(_, i, _)[j]);
       }
     }
 #else
@@ -366,7 +377,7 @@ class kgemm_4bit_inference_cutlass_dequant {
 
 //// Get the block level coordinate(indexing) for current block
     auto blk_shape = TileShape{}; //256,256,32
-    auto blk_shape_4bit = Shape<_256, _256, _16>{}; //TileShape{}; //256,256,32
+    //auto blk_shape_4bit = Shape<_256, _256, _16>{}; //TileShape{}; //256,256,32
     int m_coord, n_coord, l_coord; //block index
     if (params.scheduler.raster_order_ == TileScheduler::RasterOrder::AlongN) {
       if(cute::thread0()) printf("AlongN !!\n");
@@ -393,7 +404,7 @@ class kgemm_4bit_inference_cutlass_dequant {
     // gA: 逻辑视图（无实际内存分配）
     Tensor gA = local_tile(mA_mkl, select<0,2>(blk_shape), make_coord(m_coord,_,l_coord));
     Tensor gB = local_tile(mB_nkl, select<1,2>(blk_shape), make_coord(n_coord,_,l_coord));	
-    Tensor gB_4bit = local_tile(mB_nkl_4bit, select<1,2>(blk_shape_4bit), make_coord(n_coord,_,l_coord / 2));	
+    //Tensor gB_4bit = local_tile(mB_nkl_4bit, select<1,2>(blk_shape_4bit), make_coord(n_coord,_,l_coord / 2));	
   
 //// Allocate the tiled_mma and the accumulators for the (M,N) subgroup_tile_shape
     TiledMma tiled_mma;
@@ -412,8 +423,8 @@ class kgemm_4bit_inference_cutlass_dequant {
     // 对于单维度，坐标直接等于索引值。
     // 使用方式：int k = get<0>(coord);  // k = 0
     // cute::make_coord_iterator(A, B): 生成起始坐标A，步长B的迭代器
-    auto k_tile_iter  = cute::make_coord_iterator(idx2crd(0, make_shape(K)), make_shape(K));
-    int k_tile_count = ceil_div(K, get<2>(workgroup_shape));
+    auto k_tile_iter  = cute::make_coord_iterator(idx2crd(0, make_shape(K / 2)), make_shape(K / 2));
+    int k_tile_count = ceil_div(K / 2, get<2>(workgroup_shape));
     if(cute::thread0()) printf("k_tile_count = %d\n", k_tile_count);
          
 //////Run MainLoop//////
@@ -440,7 +451,7 @@ class kgemm_4bit_inference_cutlass_dequant {
     // 虽然每个线程参与多个 Atom 的计算，但 tCgB 的 shape 是针对单个Atom 的线程分片
     Tensor tCgA = thr_mma.partition_A(gA);
     Tensor tCgB = thr_mma.partition_B(gB);
-    Tensor tCgB_4bit = thr_mma.partition_B(gB_4bit);
+    //Tensor tCgB_4bit = thr_mma.partition_B(gB_4bit);
 	
 //// Create fragments：将全局或共享内存中的数据分块转换为适合硬件加速器（如 Tensor Core）计算的寄存器格式
     // make_fragment_layout: 为寄存器片段（Fragment）创建内存布局（Layout），确保数据在寄存器中的排布符合硬件指令（如 Tensor Core）的要求
@@ -452,7 +463,7 @@ class kgemm_4bit_inference_cutlass_dequant {
     Tensor fragment_scale_input = make_tensor<NonVoidElementScale>(FragScaleLayout{}); // 创建scale 寄存器张量
 
     // narrow input fragment
-    Tensor mma_B_4bit = make_tensor<ElementMMA>(make_fragment_layout(tiled_copy_b_4bit, tCgB_4bit(_,_,_,0).shape()));
+    Tensor mma_B_4bit = make_tensor<ElementMMA>(make_fragment_layout(tiled_copy_b_4bit, tCgB(_,_,_,0).shape()));
     Tensor quant_frag = make_tensor<ElementQuant>(decltype(mma_B_4bit.layout()){});
 
     static_assert(std::is_same_v<typename decltype(quant_frag)::value_type, ElementQuant>);
@@ -515,7 +526,7 @@ class kgemm_4bit_inference_cutlass_dequant {
     //      partition_S: 生成逻辑视图（源布局），不实际移动数据
     //      partition_D: 实际复制数据到目标布局（如共享内存→寄存器）
     auto pAgA = thr_prefetch_A.partition_S(gA);
-    auto pBgB = thr_prefetch_B.partition_S(gB_4bit);
+    auto pBgB = thr_prefetch_B.partition_S(gB);
   
 ////
 //// Mainloop
@@ -528,7 +539,7 @@ class kgemm_4bit_inference_cutlass_dequant {
 
     Tensor copy_iter_s = [&](){
         return make_tensor(make_inttuple_iter(make_coord(n_coord, 0, l_coord)), // 初始坐标：(n_coord, 0, l_coord)，表示从 N 维的 n_coord 开始，K 维从 0 开始
-                           make_layout(make_shape(_2{}, _2{}, _1{}, k_tile_count/2), // 迭代器的逻辑形状：[2, 2, 1, k_tile_count]，表示每次迭代生成 2x2x1 的坐标块，共 k_tile_count 次
+                           make_layout(make_shape(_2{}, _2{}, _1{}, k_tile_count), // 迭代器的逻辑形状：[2, 2, 1, k_tile_count]，表示每次迭代生成 2x2x1 的坐标块，共 k_tile_count 次
                                        make_stride(E<0>{} * _16{}, E<0>{} * _32{}, _0{}, E<1>{} * _1{}))); // 步长 [16, 32, 0, 1]：
                                                                                                            //   E<0>{} * _16{}: 第一维度（N）的步长为 16;
                                                                                                            //   E<0>{} * _32{}：第二维度（K）的步长为 32;
@@ -538,7 +549,7 @@ class kgemm_4bit_inference_cutlass_dequant {
                                                                                                            //   E<N>：一个模板类，表示第 N 维的步长或索引，通常用于动态形状或步长的占位符
                                                                                                            //   E<0>{}：表示第 0 维（最内层维度）的动态步长或索引值，具体值在运行时确定
     }();
-
+#if 0
   #define CUTLASS_ENABLE_DEBUG_PRINTS 1
   #if CUTLASS_ENABLE_DEBUG_PRINTS
   #define PRINT(x) print(#x ": "); print(x); print("\n");
@@ -571,9 +582,10 @@ class kgemm_4bit_inference_cutlass_dequant {
       }
   #undef PRINT
   #endif
+#endif  
 
     // crd2idx: 将多维逻辑坐标转换为线性索引
-    const int k_start_idx = crd2idx((*k_tile_iter), make_shape(K));
+    const int k_start_idx = crd2idx((*k_tile_iter), make_shape(K / 2));
     int prefetch_k = 0;
 
     CUTLASS_PRAGMA_UNROLL
@@ -593,7 +605,7 @@ class kgemm_4bit_inference_cutlass_dequant {
 
       copy(tiled_copy_scale, copy_iter_s(_, _, _, k_start_idx + (k_tile / k_reload_factor)), copy_tCrS);
       //dequant(quant_frag, mma_B_expanded, fragment_scale_input, quant_map);
-      dequant(quant_frag, mma_B, fragment_scale_input, quant_map);
+      dequant(quant_frag, mma_B, mma_A, fragment_scale_input, quant_map);
 
       if(prefetch_k < k_tile_count) {
         prefetch(tiled_prefetch_a, pAgA(_,_,_,prefetch_k));
diff --git a/tests/test_xpu.py b/tests/test_xpu.py
@@ -47,7 +47,7 @@ class TestXPU:
         [torch.uint8],
         ids=describe_dtype,
     )
-    @pytest.mark.parametrize("dim", [32], ids=id_formatter("dim"))
+    @pytest.mark.parametrize("dim", [256], ids=id_formatter("dim"))
     def test_gemm_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind):
         errs1 = []
         errs2 = []
@@ -86,7 +86,7 @@ def test_gemm_4bit(self, device, dim, dtype, storage_type, quant_storage, double
         )
         ##pdb.set_trace()
         C3 = torch.matmul(A, B.t())
-        #pdb.set_trace()
+        pdb.set_trace()
         C2 = F.gemv_4bit(A, qB.t(), state=state)
         #pdb.set_trace()
         print(C3[0])