issue/1118: bit=4 error

xgqdut2016 · xgqdut2016 · commit af8cf6d8deb4 · 2026-04-13T16:13:29.000+08:00
diff --git a/src/infiniop/ops/gptq_qyblas_gemm/nvidia/gptq_qyblas_gemm_nvidia.cu b/src/infiniop/ops/gptq_qyblas_gemm/nvidia/gptq_qyblas_gemm_nvidia.cu
@@ -43,17 +43,7 @@ infiniStatus_t Descriptor::calculate(void *workspace,
                                      int64_t quant_type,
                                      int64_t bit,
                                      void *stream) const {
-
-    int64_t M = static_cast<int64_t>(_info.M);
     int64_t K = static_cast<int64_t>(_info.K);
-    int64_t N = static_cast<int64_t>(_info.N);
-    int64_t scales_size_0 = static_cast<int64_t>(_info.scales_size_0);
-    int64_t scales_size_1 = static_cast<int64_t>(_info.scales_size_1);
-    int64_t lda = static_cast<int64_t>(_info.lda);
-    int64_t ldb = static_cast<int64_t>(_info.ldb);
-    int64_t result_ld = static_cast<int64_t>(_info.result_ld);
-    bool transpose_mat_1 = _info.transpose_mat_1;
-    bool transpose_mat_2 = _info.transpose_mat_2;
 
     cudaDataType_t computeType_ = (cudaDataType_t)CUDA_R_32F;
     cudaDataType_t kernel_Atype_, kernel_Btype_, kernel_Ctype_, kernel_Stype_, kernel_Ztype_;
@@ -76,7 +66,6 @@ infiniStatus_t Descriptor::calculate(void *workspace,
 
         if (4 == bit) {
             kernel_Atype_ = (cudaDataType_t)CUDA_R_4U;
-            K = K * 2;
         }
     }
 
@@ -127,11 +116,42 @@ infiniStatus_t Descriptor::calculate(void *workspace,
     float alpha = 1.0f;
     float beta = 0.0f;
 
+    bool transpose_mat_1 = _info.transpose_mat_1;
+    bool transpose_mat_2 = _info.transpose_mat_2;
+    int64_t M;
+    int64_t N;
+    int64_t lda;
+    int64_t ldb;
+    cublasOperation_t transa;
+    cublasOperation_t transb;
+
+    if (transpose_mat_2) {
+        M = static_cast<int64_t>(_info.N);
+        N = static_cast<int64_t>(_info.M);
+        lda = (bit == 4 ? static_cast<int64_t>(_info.ldb) * 2 : static_cast<int64_t>(_info.ldb));
+        ldb = static_cast<int64_t>(_info.lda);
+        std::swap(a, b);
+        std::swap(kernel_Atype_, kernel_Btype_);
+        transa = transpose_mat_2 ? CUBLAS_OP_T : CUBLAS_OP_N;
+        transb = transpose_mat_1 ? CUBLAS_OP_T : CUBLAS_OP_N;
+    } else {
+        M = static_cast<int64_t>(_info.M);
+        N = static_cast<int64_t>(_info.N);
+        lda = static_cast<int64_t>(_info.lda);
+        ldb = static_cast<int64_t>(_info.ldb);
+        transa = transpose_mat_1 ? CUBLAS_OP_T : CUBLAS_OP_N;
+        transb = transpose_mat_2 ? CUBLAS_OP_T : CUBLAS_OP_N;
+    }
+    int64_t scales_size_0 = static_cast<int64_t>(_info.scales_size_0);
+    int64_t scales_size_1 = static_cast<int64_t>(_info.scales_size_1);
+
+    int64_t result_ld = static_cast<int64_t>(_info.result_ld);
+
     dlblasExtQuantParametersV2_t extParameters;
 
     if (quant_type == 0) {
-        extParameters.a_group_size_m = M / scales_size_0;
-        extParameters.a_group_size_k = K / scales_size_1;
+        extParameters.a_group_size_m = M / scales_size_1;
+        extParameters.a_group_size_k = K / scales_size_0;
         extParameters.a_zeropoints_type = kernel_Ztype_;
         extParameters.a_zeropoints = b_zeros;
         extParameters.a_scales_type = kernel_Stype_;
@@ -146,13 +166,13 @@ infiniStatus_t Descriptor::calculate(void *workspace,
     } else if (quant_type == 2 || quant_type == 3) {
         // calculate block_shape according weight/scales shape
         int block_shape = 128;
-        while ((N + block_shape - 1) / block_shape < scales_size_0) {
+        while ((M + block_shape - 1) / block_shape < scales_size_0) {
             block_shape /= 2;
             if (block_shape < 32) {
                 fprintf(stderr,
                         "INTERNAL ASSERT FAILED: block_shape >= 32\n"
                         "Invalid fp blockwise linear arguments. Weight: [%d, %d]. Scales: [%d, %d].\n",
-                        (int)N, (int)K, (int)scales_size_0, (int)scales_size_1);
+                        (int)M, (int)K, (int)scales_size_0, (int)scales_size_1);
                 abort();
             }
         }
@@ -168,9 +188,6 @@ infiniStatus_t Descriptor::calculate(void *workspace,
         extParameters.a_scales = b_scales;
     }
 
-    cublasOperation_t transa = transpose_mat_2 ? CUBLAS_OP_T : CUBLAS_OP_N;
-    cublasOperation_t transb = transpose_mat_1 ? CUBLAS_OP_T : CUBLAS_OP_N;
-
     if (_info.dtype == INFINI_DTYPE_F16 || _info.dtype == INFINI_DTYPE_BF16) {
         CHECK_STATUS(_opaque->internal->useCublas(
             (cudaStream_t)stream,
@@ -179,16 +196,16 @@ infiniStatus_t Descriptor::calculate(void *workspace,
                     dlblasGemmExV2(handle,
                                    transa,
                                    transb,
-                                   N,
                                    M,
+                                   N,
                                    K,
                                    &alpha,
-                                   b,
-                                   kernel_Btype_,
-                                   ldb,
                                    a,
                                    kernel_Atype_,
                                    lda,
+                                   b,
+                                   kernel_Btype_,
+                                   ldb,
                                    &beta,
                                    out,
                                    kernel_Ctype_,
diff --git a/test/infiniop/gptq_qyblas_gemm.py b/test/infiniop/gptq_qyblas_gemm.py
@@ -51,9 +51,14 @@ def to_iter(x):
 )
 
 
+_TEST_CASES_W4 = [(32768, 3584, 4608, [128, 128], InfiniDtype.U8),]
+
+
 # Data types used for testing
 _TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16]
 
+_TENSOR_DTYPES_W4 = [InfiniDtype.F16]
+
 
 DEBUG = False
 PROFILE = False
@@ -129,9 +134,6 @@ def test(
     quant_type = 3
     bit = 8
 
-    int8_info = torch.iinfo(torch.int8)
-    int8_max, int8_min = int8_info.max, int8_info.min
-
     block_n, block_k = block_size[0], block_size[1]
     n_tiles = (N + block_n - 1) // block_n
     k_tiles = (K + block_k - 1) // block_k
@@ -143,23 +145,28 @@ def test(
         device,
     )
     if weight_dtype == InfiniDtype.I8:
-        B_orig = TestTensor(
-            (N, K),
-            None,
-            weight_dtype,
-            device,
-            randint_low=int8_min,
-            randint_high=int8_max,
-        )
-        B_torch = B_orig.torch_tensor().t()
-        B = TestTensor(
-            (K, N),
-            B_torch.stride(),
-            weight_dtype,
-            device,
-            mode="manual",
-            set_tensor=B_torch,
-        )
+        _info = torch.iinfo(torch.int8)
+    elif weight_dtype == InfiniDtype.U8:
+        _info = torch.iinfo(torch.uint8)
+    elif weight_dtype == InfiniDtype.F8:
+        _info = torch.iinfo(float8_e4m3fn)
+    B_orig = TestTensor(
+        (N, K),
+        None,
+        weight_dtype,
+        device,
+        randint_low=_info.min,
+        randint_high=_info.max,
+    )
+    B_torch = B_orig.torch_tensor().t()
+    B = TestTensor(
+        (K, N),
+        B_torch.stride(),
+        weight_dtype,
+        device,
+        mode="manual",
+        set_tensor=B_torch,
+    )
     
     b_scales = TestTensor(
         (n_tiles, k_tiles),
@@ -254,6 +261,165 @@ def lib_gptq_qyblas_gemm():
     check_error(LIBINFINIOP.infiniopDestroyGptqQyblasGemmDescriptor(descriptor))
 
 
+def test_w4(
+    handle,
+    device,
+    M,
+    K,
+    N,
+    block_size,
+    weight_dtype=InfiniDtype.I8,
+    dtype=InfiniDtype.BF16,
+    sync=None,
+):
+    print(
+        f"Testing w4 Gptq Qyblas Gemm on {InfiniDeviceNames[device]} with M-K-N:{M, K, N}, block_size:{block_size}, weight dtype:{InfiniDtypeNames[weight_dtype]}, dtype:{InfiniDtypeNames[dtype]}"
+    )
+    quant_type = 0
+    bit = 4
+
+    block_n, block_k = block_size[0], block_size[1]
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+
+    A = TestTensor(
+        (M, K),
+        None,
+        dtype,
+        device,
+    )
+    if weight_dtype == InfiniDtype.I8:
+        _info = torch.iinfo(torch.int8)
+    elif weight_dtype == InfiniDtype.U8:
+        _info = torch.iinfo(torch.uint8)
+    elif weight_dtype == InfiniDtype.F8:
+        _info = torch.iinfo(float8_e4m3fn)
+    # B_orig = TestTensor(
+    #     (N, K // 2),
+    #     None,
+    #     weight_dtype,
+    #     device,
+    #     randint_low=_info.min,
+    #     randint_high=_info.max,
+    # )
+    # B_torch = B_orig.torch_tensor().t()
+    # B = TestTensor(
+    #     (K // 2, N),
+    #     B_torch.stride(),
+    #     weight_dtype,
+    #     device,
+    #     mode="manual",
+    #     set_tensor=B_torch,
+    # )
+
+    B = TestTensor(
+        (K // 2, N),
+        None,
+        weight_dtype,
+        device,
+        randint_low=_info.min,
+        randint_high=_info.max,
+    )
+    
+    b_scales = TestTensor(
+        (k_tiles, N),
+        None,
+        dtype,
+        device,
+    )
+
+    b_zeros = TestTensor(
+        (k_tiles, N),
+        None,
+        dtype,
+        device,
+        mode="zeros",
+    )
+    
+    out = TestTensor(
+        (M, N),
+        None,
+        dtype,
+        device,
+        mode="zeros",
+    )
+
+    print("A", A.torch_tensor().shape, A.torch_tensor().dtype, A.torch_tensor().stride())
+    print("B", B.torch_tensor().shape, B.torch_tensor().dtype, B.torch_tensor().stride())
+    print("scales", b_scales.torch_tensor().shape, b_scales.torch_tensor().dtype, b_scales.torch_tensor().stride())
+    print("zeros", b_zeros.torch_tensor().shape, b_zeros.torch_tensor().dtype, b_zeros.torch_tensor().stride())
+    print("out", out.torch_tensor().shape, out.torch_tensor().dtype, out.torch_tensor().stride())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateGptqQyblasGemmDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            out.descriptor,
+            A.descriptor,
+            B.descriptor,
+            b_scales.descriptor,
+            b_zeros.descriptor,
+        )
+    )
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+
+    for tensor in [out, A, B, b_scales, b_zeros]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetGptqQyblasGemmWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, A.device)
+
+    def lib_gptq_qyblas_gemm():
+        check_error(
+            LIBINFINIOP.infiniopGptqQyblasGemm(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                out.data(),
+                A.data(),
+                B.data(),
+                b_scales.data(),
+                b_zeros.data(),
+                quant_type,
+                bit,
+                None,
+            )
+        )
+
+    lib_gptq_qyblas_gemm()
+
+    if sync is not None:
+        sync()
+
+    out_dtype = to_torch_dtype(dtype)
+    ans = native_w8a16_block_int8_matmul(A.torch_tensor(), B_orig.torch_tensor(), b_scales.torch_tensor(), block_size, out_dtype)
+    
+    rel_diff = (torch.mean(
+        torch.abs(out.actual_tensor().to(torch.float32) - ans.to(torch.float32))) /
+                torch.mean(torch.abs(ans.to(torch.float32))))
+
+    assert rel_diff < 0.05
+    
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: native_w8a16_block_int8_matmul(A.torch_tensor(), B_orig.torch_tensor(), b_scales.torch_tensor(), block_size, out_dtype), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_gptq_qyblas_gemm(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyGptqQyblasGemmDescriptor(descriptor))
+
+
 if __name__ == "__main__":
     args = get_args()
 
@@ -263,7 +429,9 @@ def lib_gptq_qyblas_gemm():
     NUM_PRERUN = args.num_prerun
     NUM_ITERATIONS = args.num_iterations
 
+    # for device in get_test_devices(args):
+    #     test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
     for device in get_test_devices(args):
-        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+        test_operator(device, test_w4, _TEST_CASES_W4, _TENSOR_DTYPES_W4)
 
     print("\033[92mTest passed!\033[0m")