new method

xiaolil1 · xiaolil1 · commit 989fbd34fc36 · 2025-08-19T05:55:52.000Z
diff --git a/csrc/xpu_cutlass_fusion.cpp b/csrc/xpu_cutlass_fusion.cpp
@@ -292,11 +292,11 @@ CUTLASS_DEVICE void dequant(
     static constexpr auto N = decltype(size<1>(in))::value;
     static constexpr auto K = decltype(size(out))::value / N;
 
-    using compress_type = ushort; //uint32_t;
+    using compress_type = uint32_t;
     static constexpr auto compress_size = sizeof_bits_v<compress_type> / sizeof_bits_v<SrcType>;
     static_assert((compress_size % N) == 0);
 
-    static constexpr auto vec_size = 4;
+    static constexpr auto vec_size = 8;
     //using VecSrcElemType = cute::array<SrcType, compress_size>;
     using VecSrcType = cute::array<compress_type, vec_size>; //sycl::vec<uint32_t, 4>;
     using VecDstElemType = cute::array<DstType, compress_size>;
@@ -307,16 +307,35 @@ CUTLASS_DEVICE void dequant(
     constexpr uint32_t MASK_LOW[4]  = {0xF, 0xF00, 0xF0000, 0xF000000};
     constexpr int SHIFT_HIGH[4] = {4, 12, 20, 28};
     constexpr int SHIFT_LOW[4]  = {0, 8, 16, 24};
+    constexpr int shifts[8] = {4,0,12,8,20,16,28,24};
 
-    auto s_tensor = make_tensor((VecSrcType*)(raw_pointer_cast(in.data())), Shape<Int<K / (compress_size * vec_size)>, Int<N>>{});
-    auto d_tensor = make_tensor((VecDstType*)(raw_pointer_cast(out.data())), Shape<Int<K / (compress_size * vec_size)>, Int<N>>{});
+    auto s_tensor = make_tensor((VecSrcType*)(raw_pointer_cast(in.data())), Shape<Int<1>, Int<N>>{});
+    auto d_tensor = make_tensor((VecDstType*)(raw_pointer_cast(out.data())), Shape<Int<1>, Int<N>>{});
 
     #pragma unroll
     for (int n = 0; n < N; n++) {
         float ts = tCrS_input(n);
-        auto& src = *(cute::array<VecSrcType, K / (compress_size * vec_size)>*)(s_tensor(_, n).data());
-        auto& dst = *(cute::array<VecDstType, K / (compress_size * vec_size)>*)(d_tensor(_, n).data());
-
+        auto& src = *(cute::array<VecSrcType, 1>*)(s_tensor(_, n).data());
+        auto& dst = *(cute::array<VecDstType, 1>*)(d_tensor(_, n).data());
+#if 0
+       const auto src_val = src[0];
+       VecDstType dst_val;
+       #pragma unroll
+       for (int i = 0; i < vec_size; ++i) {
+           const compress_type val = src_val[i];
+           VecDstElemType dst_elem;
+           dst_elem[0] = static_cast<DstType>(quant_map[(val>>shifts[0])&0xF] * ts);
+           dst_elem[1] = static_cast<DstType>(quant_map[(val>>shifts[1])&0xF] * ts);
+           dst_elem[2] = static_cast<DstType>(quant_map[(val>>shifts[2])&0xF] * ts);
+           dst_elem[3] = static_cast<DstType>(quant_map[(val>>shifts[3])&0xF] * ts);
+           dst_elem[4] = static_cast<DstType>(quant_map[(val>>shifts[4])&0xF] * ts);
+           dst_elem[5] = static_cast<DstType>(quant_map[(val>>shifts[5])&0xF] * ts);
+           dst_elem[6] = static_cast<DstType>(quant_map[(val>>shifts[6])&0xF] * ts);
+           dst_elem[7] = static_cast<DstType>(quant_map[(val>>shifts[7])&0xF] * ts);
+           dst_val[i] = dst_elem;
+       }
+       dst[0] = dst_val;
+#else
         #pragma unroll
         for (int k = 0; k < K / (compress_size * vec_size); k++) {
             VecSrcType src_val = src[k];
@@ -332,17 +351,18 @@ CUTLASS_DEVICE void dequant(
                 #pragma unroll
                 for (int j = 0; j < compress_size / 2; j++) {
                 //for (int j = 0; j < 4; j++) {
-                    uint8_t high = (compressed_val & MASK_HIGH[j]) >> SHIFT_HIGH[j];
-                    uint8_t low  = (compressed_val & MASK_LOW[j]) >> SHIFT_LOW[j];
-                    dst_elem[2 * j]   = static_cast<DstType>(quant_map[high] * ts);
-                    dst_elem[2 * j + 1]   = static_cast<DstType>(quant_map[low] * ts);
-                    //dst_elem[2*j]   = static_cast<DstType>(quant_map[(compressed_val >> (4 * (j * 2 + 1))) & 0xf] * ts);
-                    //dst_elem[2*j+1] = static_cast<DstType>(quant_map[(compressed_val >> (4 * (j * 2))) & 0xf] * ts);
+                    //uint8_t high = (compressed_val & MASK_HIGH[j]) >> SHIFT_HIGH[j];
+                    //uint8_t low  = (compressed_val & MASK_LOW[j]) >> SHIFT_LOW[j];
+                    //dst_elem[2 * j]   = static_cast<DstType>(quant_map[high] * ts);
+                    //dst_elem[2 * j + 1]   = static_cast<DstType>(quant_map[low] * ts);
+                    dst_elem[2*j]   = static_cast<DstType>(quant_map[(compressed_val >> (4 * (j * 2 + 1))) & 0xf] * ts);
+                    dst_elem[2*j+1] = static_cast<DstType>(quant_map[(compressed_val >> (4 * (j * 2))) & 0xf] * ts);
                 }
                 dst_val[i] = dst_elem;
             }
             dst[k] = dst_val;
         }
+#endif        
     }
 }
 #endif