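Make XPU 4-bit GEMV work: swap nibble order, re-index scales

- bitsandbytes/backends/xpu/ops.py: comment out the pdb breakpoints and
  print A and B before the kernel call.
- csrc/xpu_cutlass_fusion.cpp: write each dequantized 4-bit value to its
  neighbouring slot (odd i -> dst[i-1], even i -> dst[i+1]), load scales at
  k_start_idx + k_s / k_reload_factor, and add debug printf output.
- tests/test_xpu.py: add a small "fc0" case (dim 16, blocksize 16) and take
  blocksize from a per-case block_size variable.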

xiaolil1 · xiaolil1 · commit 12a484a981b8 · 2025-07-29T11:46:25.000Z
diff --git a/bitsandbytes/backends/xpu/ops.py b/bitsandbytes/backends/xpu/ops.py
@@ -74,8 +74,8 @@ def _gemv_4bit_impl(
     blocksize: int,
     out: torch.Tensor,
 ) -> None:
-    import pdb
-    pdb.set_trace()
+    #import pdb
+    #pdb.set_trace()
     m = ct.c_int32(*A.shape[:-1])
     n = ct.c_int32(shapeB[0])
     k = ct.c_int32(shapeB[1])
@@ -85,8 +85,9 @@ def _gemv_4bit_impl(
     ldc = m
 
     #absmax = absmax * 10
-    pdb.set_trace()
-
+    #pdb.set_trace()
+    print("A before kernel: ", A)
+    print("B before kernel: ", B)
     stream = _get_tensor_stream(A)
     if A.dtype == torch.float16:
         lib.cgemv_4bit_inference_fp16(
@@ -185,8 +186,8 @@ def _(
         blocksize: int,
     ) -> torch.Tensor:
         shape = (*A.shape[:-1], shapeB[0])
-        import pdb
-        pdb.set_trace()
+        #import pdb
+        #pdb.set_trace()
         out = torch.zeros(shape, device=A.device, dtype=torch.float32)
         _gemv_4bit_impl(A, B, shapeB, absmax.bfloat16(), code, blocksize, out=out)
         return out
diff --git a/csrc/xpu_cutlass_fusion.cpp b/csrc/xpu_cutlass_fusion.cpp
@@ -234,23 +234,29 @@ class kgemm_4bit_inference_cutlass_dequant {
     using DstType = typename EngineOut::value_type;
     using ScaleType = typename EngineScales::value_type;
 #if 0
-    int numbers = decltype(size(in))::value;
-    for(int i=0; i<numbers; i++){
-      //auto in_ptr_8 = (uint8_t*)(raw_pointer_cast(in.data()));
-      //out[i] = static_cast<DstType>(quant_map[in_ptr_8[i].data()]);
-      uint8_t value = in[i].get();
-      out[i] = static_cast<DstType>(quant_map[value]);
-      int thread_idx = int(ThreadIdxX());
-      if(cute::thread0()){
-      //if(syclcompat::global_id::x() == 2 && syclcompat::global_id::y() ==0 && syclcompat::global_id::z() ==0 )
-        //printf("syclcompat::global_id::x() = %d, syclcompat::global_id::y() = %d, syclcompat::global_id::z() = %d, thread_idx = %d, i = %d, in[i].ptr_ = %x, in[i].idx_=%x, value_bit = %x, value = %d, quant_map[value] = %f, out[i] = %f\n",syclcompat::global_id::x(), syclcompat::global_id::y(), syclcompat::global_id::z(), thread_idx, i, in[i].ptr_, in[i].idx_, value, static_cast<int>(value), quant_map[value], static_cast<float>(out[i]));
-      }
-    }
-    int scale_number = decltype(size(tCrS_input))::value;
-    for(int i=0; i<scale_number; i++){
+    static constexpr auto N = decltype(size<1>(in))::value;
+    static constexpr auto loop_cnt = decltype(size(out))::value / N;
+    for (int n = 0; n < N; n++) {
       auto s_value = tCrS_input(i);
-      if(cute::thread0()) printf("scale_number = %d, tCrS_input[%d] = %f\n",scale_number, i, static_cast<float>(s_value));
-    }
+      for (int l = 0; s < loop_cnt; l++) {
+      
+//      int numbers = decltype(size(in))::value;
+//      for(int i=0; i<numbers / N; i++){
+//        //auto in_ptr_8 = (uint8_t*)(raw_pointer_cast(in.data()));
+//        //out[i] = static_cast<DstType>(quant_map[in_ptr_8[i].data()]);
+//        uint8_t value = in[i].get();
+//        out[i] = static_cast<DstType>(quant_map[value]);
+//        int thread_idx = int(ThreadIdxX());
+//        if(cute::thread0()){
+//        //if(syclcompat::global_id::x() == 2 && syclcompat::global_id::y() ==0 && syclcompat::global_id::z() ==0 )
+//          //printf("syclcompat::global_id::x() = %d, syclcompat::global_id::y() = %d, syclcompat::global_id::z() = %d, thread_idx = %d, i = %d, in[i].ptr_ = %x, in[i].idx_=%x, value_bit = %x, value = %d, quant_map[value] = %f, out[i] = %f\n",syclcompat::global_id::x(), syclcompat::global_id::y(), syclcompat::global_id::z(), thread_idx, i, in[i].ptr_, in[i].idx_, value, static_cast<int>(value), quant_map[value], static_cast<float>(out[i]));
+//        }
+//      }
+//    int scale_number = decltype(size(tCrS_input))::value;
+//    for(int i=0; i<scale_number; i++){
+//      auto s_value = tCrS_input(i);
+//      if(cute::thread0()) printf("scale_number = %d, tCrS_input[%d] = %f\n",scale_number, i, static_cast<float>(s_value));
+//    }
 #else    
     static constexpr auto N = decltype(size<1>(in))::value;
 
@@ -269,7 +275,11 @@ class kgemm_4bit_inference_cutlass_dequant {
     auto s_tensor = make_tensor((format_type*)(raw_pointer_cast(in.data())), Shape<Int<loop_cnt / scalar>, Int<N>>{});
     auto d_tensor = make_tensor(out.data(), Shape<Int<vec_size>, Int<splits>, Int<N>>{});
 
-//if(cute::thread0())
+int scale_number = decltype(size(tCrS_input))::value;
+for(int i=0; i<scale_number; i++){
+  auto s_value = tCrS_input(i);
+  if(cute::thread0()) printf("scale_number = %d, tCrS_input[%d] = %f\n",scale_number, i, static_cast<float>(s_value));
+}
 //  printf("thread_idx = %d, decltype(size(in))::value = %d, K = %d, N = %d, L = %d, src_bits = %d, sizeof_bits_v<format_type> = %d, scalar = %d, decltype(size(out))::value = %d, loop_cnt = %d, splits = %d\n",int(ThreadIdxX()), decltype(size(in))::value, decltype(size<0>(in))::value, N, decltype(size<2>(in))::value, src_bits, sizeof_bits_v<format_type>, scalar, decltype(size(out))::value, loop_cnt, splits);
 
     for (int n = 0; n < N; n++) {
@@ -285,8 +295,13 @@ class kgemm_4bit_inference_cutlass_dequant {
 
         for (int i = 0; i < vec_size; i++) {
           uint8_t value = (format_data >> (src_bits * i)) & 0xf;
-          dst[i] = static_cast<DstType>(quant_map[value] * static_cast<float>(ts));          
-          //if(cute::thread0()) printf("n = %d, s = %d, i = %d, src = %d, quant_map[value] = %f, ts = %f, dst = %f\n", n, s, i, static_cast<int>(value), quant_map[value], static_cast<float>(ts), static_cast<float>(dst[i]));
+          if(i % 2 != 0) { //1,3, high_4bit
+            dst[i-1] = static_cast<DstType>(quant_map[value] * static_cast<float>(ts));          
+          } else {
+            dst[i+1] = static_cast<DstType>(quant_map[value] * static_cast<float>(ts));          
+          }
+          if(cute::thread0())
+            printf("tid = %d, n = %d, s = %d, i = %d, format_data = %d, value = %d, quant_map[value] = %f, ts = %f, dst = %f\n",ThreadIdxX(), n, s, i, static_cast<int>(format_data), static_cast<int>(value), quant_map[value], static_cast<float>(ts), static_cast<float>(dst[i]));
         }
       }
     }
@@ -500,29 +515,38 @@ static constexpr auto SG_QNT_WIDTH = Int<SG_N>{};
       }
   #undef PRINT
 #endif  
-	const int k_start_idx = crd2idx((*k_tile_iter), make_shape(K));
+	  const int k_start_idx = crd2idx((*k_tile_iter), make_shape(K));
     int prefetch_k = k_start_idx;
 
+#if 1
+    const int k_reload_factor = ceil_div(params.group_size, BLK_K);
+    if(cute::thread0()) printf("params.group_size = %d, BLK_K = %d, k_reload_factor = %f\n",params.group_size, BLK_K, k_reload_factor);
+#endif
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < DispatchPolicy::Stages; i++, prefetch_k++) {
       prefetch(tiled_prefetch_a, pAgA(_,_,_,prefetch_k));
       prefetch(tiled_prefetch_b, pBgB(_,_,_,prefetch_k));
     }
 
-    for (int k_tile = k_start_idx; k_tile < k_tile_count + k_start_idx; k_tile++, prefetch_k++) {
+    for (int k_tile = k_start_idx, k_s = 0; k_tile < k_tile_count + k_start_idx; k_tile++, prefetch_k++, k_s++) {
       barrier_arrive(2);
 
       // Copy gmem to rmem for the first k_tile
       copy(tiled_copy_a, tAgA(_,_,_,k_tile), frag_copy_A);
       copy(tiled_copy_b, tBgB(_,_,_,k_tile), frag_copy_B);
-
+#if 1
+      const int s_step = k_start_idx + (k_s / k_reload_factor); //1 + k_tile / k_reload_factor;
+      if(cute::thread0()) printf("k_start_idx = %d, k_s = %d, k_reload_factor = %f, s_step = %d\n",k_start_idx, k_s, k_reload_factor, s_step);
+      copy(tiled_copy_scale, copy_iter_s(_, _, _, s_step), frag_copy_Scale);
+#else
       const int k_reload_factor = ceil_div(params.group_size, BLK_K);
       //const int k_reload_factor = params.group_size / BLK_K;
 
-      if(cute::thread0()) printf("params.group_size = %d, BLK_K = %d, k_reload_factor = %d\n",params.group_size, BLK_K, k_reload_factor);
+      //if(cute::thread0())
+        printf("params.group_size = %d, BLK_K = %d, k_reload_factor = %d\n",params.group_size, BLK_K, k_reload_factor);
 
       copy(tiled_copy_scale, copy_iter_s(_, _, _, k_tile / k_reload_factor), frag_copy_Scale);
-
+#endif
       if(prefetch_k < k_tile_count) {
         prefetch(tiled_prefetch_a, pAgA(_,_,_,prefetch_k));
       }
@@ -563,12 +587,10 @@ if (cute::thread0()) {
 // 打印输出
 debug_print("Accumulators (After GEMM)", accumulators);
 
-barrier_wait(2);
 }
 #endif
 #if 0
 cute::gemm(tiled_mma, mma_A, mma_B, accumulators);
-barrier_wait(2);
 
 for (int i = 0; i < accumulators.size(); ++i) {
     printf("Thread (%d, %d): accumulators[%d] =%f\n", syclcompat::global_id::x() , syclcompat::global_id::y(), i, static_cast<float>(accumulators[i]));
diff --git a/tests/test_xpu.py b/tests/test_xpu.py
@@ -40,7 +40,7 @@ class TestXPU:
     @pytest.mark.parametrize("device", ["xpu"])#get_available_devices())
     @pytest.mark.parametrize("double_quant", [True], ids=lambda double_quant: f"DQ_{double_quant}")
     @pytest.mark.parametrize("storage_type", ["nf4"])
-    @pytest.mark.parametrize("kind", ["fc1"])#, "attn_packed"])
+    @pytest.mark.parametrize("kind", ["fc0"])#, "attn_packed"])
     @pytest.mark.parametrize("dtype", [torch.bfloat16], ids=describe_dtype)
     @pytest.mark.parametrize(
         "quant_storage",
@@ -65,10 +65,30 @@ def test_gemm_4bit(self, device, dim, dtype, storage_type, quant_storage, double
 
         #for i in range(iters):
         #pdb.set_trace()
-        if kind == "fc1":
+        if kind == "fc0":
+            dim = 16
+            #A = torch.arange(32, 0, -2).reshape(1, dim).bfloat16().xpu()  * torch.randn(1, dim, dtype=dtype, device=device) * 10
+            #shuffled_indices = torch.randperm(dim)
+            #A = A[:, shuffled_indices]  # index the columns directly
+
+            #B = torch.arange(0, 32, 1).reshape(2, dim).bfloat16().xpu() * torch.randn(2, dim, dtype=dtype, device=device) / 10 
+            #shuffled_indices = torch.randperm(dim)
+            #B = B[:, shuffled_indices].contiguous()  # index the columns directly
+
+            #A = torch.ones(1, dim, dtype=dtype, device=device)
+            #B = torch.ones(2, dim, dtype=dtype, device=device) # / math.sqrt(dim)
+
+            A = torch.randn(1, dim, dtype=dtype, device=device) * 10
+            B = torch.randn(2, dim, dtype=dtype, device=device)  / math.sqrt(dim)
+            double_quant=False
+            block_size = 16
+        elif kind == "fc1":
+            dim=256
             A = torch.randn(32, dim, dtype=dtype, device=device) * 10
             #A = torch.arange(1, 32 * 256 + 1).reshape(32, 256).bfloat16().xpu()
-            B = torch.randn(dim, dim, dtype=dtype, device=device) / math.sqrt(dim)
+            B = torch.randn(dim, dim, dtype=dtype, device=device) # / math.sqrt(dim)
+            double_quant=False
+            block_size = 32
         elif kind == "fc2":
             A = torch.randn(1, 4 * dim, dtype=dtype, device=device)
             B = torch.randn(dim, 4 * dim, dtype=dtype, device=device) / math.sqrt(dim)
@@ -84,24 +104,53 @@ def test_gemm_4bit(self, device, dim, dtype, storage_type, quant_storage, double
             quant_type=storage_type,
             compress_statistics=double_quant,
             quant_storage=quant_storage,
-            blocksize=64,
+            blocksize=block_size,
         )
 
-        ##pdb.set_trace()
-        C3 = torch.matmul(A, B.t())
-        #pdb.set_trace()
-        C2 = F.gemv_4bit(A, qB.t(), state=state)
-        #pdb.set_trace()
-        print("C3.sum() = ", C3.sum())
-        print("C2.sum() = ", C2.sum())
-        diff = abs(C2-C3)
-        print("diff = ", diff.sum())
-        print(C3[0])
-        print(C2[0])
-        #print(C3)
-        #print(C2)
-        #A.requires_grad = True
-        #C1 = bnb.matmul_4bit(A, qB.t(), state)
+        if kind == "fc0":
+          pdb.set_trace()
+          print("")
+          print("absmax = ", state.absmax)
+          print("A = ",A)
+          print("B = ",B)
+          print("qB = ",qB)
+          print("B.t() = ",B.t())
+          print("qB.t() = ",qB.t())
+          C3 = torch.matmul(A, B.t())
+          #pdb.set_trace()
+          C2 = F.gemv_4bit(A, qB.t(), state=state)
+          #pdb.set_trace()
+          print("C3.sum() = ", C3.sum())
+          print("C2.sum() = ", C2.sum())
+          diff = abs(C2-C3)
+          print("diff = ", diff.sum())
+          print(C3)
+          print(C2)
+          #exit()
+          #print(C3)
+          #print(C2)
+          #A.requires_grad = True
+          #C1 = bnb.matmul_4bit(A, qB.t(), state)          
+        else:
+          pdb.set_trace()
+          print("")
+          print("absmax = ", state.absmax)
+          print("A[0] = ",A[0])
+          print("B[0] = ",B[0])
+          C3 = torch.matmul(A, B.t())
+          #pdb.set_trace()
+          C2 = F.gemv_4bit(A, qB.t(), state=state)
+          #pdb.set_trace()
+          print("C3.sum() = ", C3.sum())
+          print("C2.sum() = ", C2.sum())
+          diff = abs(C2-C3)
+          print("diff = ", diff.sum())
+          print(C3[0])
+          print(C2[0])
+          #print(C3)
+          #print(C2)
+          #A.requires_grad = True
+          #C1 = bnb.matmul_4bit(A, qB.t(), state)
 
     @pytest.mark.parametrize("device", ["xpu"]) #get_available_devices())
     @pytest.mark.parametrize("embedding_dim", [64, 65])