xiaolil1
diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
Lines changed: 4 additions & 2 deletions
diff --git a/bitsandbytes/backends/xpu/ops.py b/bitsandbytes/backends/xpu/ops.py
Lines changed: 8 additions & 14 deletions
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
Lines changed: 6 additions & 1 deletion
diff --git a/csrc/pythonInterface.cpp b/csrc/pythonInterface.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/csrc/xpu_cutlass.h b/csrc/xpu_cutlass.h
Lines changed: 1 addition & 1 deletion
@@ -436,7 +436,8 @@ def matmul_4bit(
     bias: Optional[torch.Tensor] = None,
 ):
     assert quant_state is not None
-
+    #import pdb
+    #pdb.set_trace()
     if A.device.type == "cpu" and A.requires_grad == False:
         if getattr(quant_state, "ipex", False):
             # IPEX CPU will change weight to 4D so don't need transpose
@@ -447,7 +448,8 @@ def matmul_4bit(
             return out
         else:
             return MatMul4Bit.apply(A, B, out, bias, quant_state)
-    if A.numel() == A.shape[-1] and A.requires_grad == False and A.device.type != "hpu":
+    #if A.numel() == A.shape[-1] and A.requires_grad == False and A.device.type != "hpu":
+    if A.requires_grad == False and A.device.type != "hpu":
         if A.shape[-1] % quant_state.blocksize != 0:
             warn(
                 f"Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}",
 
@@ -74,20 +74,16 @@ def _gemv_4bit_impl(
     blocksize: int,
     out: torch.Tensor,
 ) -> None:
-    import pdb
-    #pdb.set_trace()
-    m = ct.c_int32(*A.shape[:-1]) #A.shape[1])
+    m = ct.c_int32(A.shape[-2])#ct.c_int32(1)
     n = ct.c_int32(shapeB[0])
     k = ct.c_int32(shapeB[1])
-
+    #import pdb
     lda = m
     ldb = ct.c_int32((A.shape[-1] + 1) // 2)
     ldc = m
-
-    #absmax = absmax * 10
     #pdb.set_trace()
-    #print("A before kernel: ", A)
-    #print("B before kernel: ", B)
+    absmax = absmax.view(shapeB[0],int(shapeB[1]/blocksize)).transpose(0,1).contiguous()
+    #pdb.set_trace()
     stream = _get_tensor_stream(A)
     if A.dtype == torch.float16:
         lib.cgemv_4bit_inference_fp16(
@@ -112,7 +108,7 @@ def _gemv_4bit_impl(
             k,
             get_ptr(A),
             get_ptr(B),
-            get_ptr(absmax.bfloat16()),
+            get_ptr(absmax),
             get_ptr(code),
             get_ptr(out),
             lda,
@@ -186,11 +182,9 @@ def _(
         blocksize: int,
     ) -> torch.Tensor:
         shape = (*A.shape[:-1], shapeB[0])
-        #import pdb
-        #pdb.set_trace()
-        out = torch.zeros(shape, device=A.device, dtype=torch.float32)
-        _gemv_4bit_impl(A, B, shapeB, absmax.bfloat16(), code, blocksize, out=out)
-        return out
+        out = torch.empty(shape, device=A.device, dtype=A.dtype).float()
+        _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
+        return out.bfloat16()
 
     @register_kernel("bitsandbytes::gemv_4bit.out", "xpu")
     def _(
 
@@ -937,6 +937,12 @@ def quantize_4bit(
         quant_storage,
     )
 
+    #import pdb
+    #pdb.set_trace()
+    #print("_absmax = ", _absmax)
+    #_absmax = _absmax.view(input_shape[0],int(input_shape[1]/blocksize)).transpose(0,1).contiguous()
+    #pdb.set_trace()
+
     code = get_4bit_type(quant_type, device=A.device)
 
     if compress_statistics:
@@ -969,7 +975,6 @@ def quantize_4bit(
     # TODO(matthewdouglas): Deprecate absmax kwarg
     if absmax is not None:
         state.absmax = absmax.copy_(state.absmax)
-
     return out, state
 
 
 
@@ -381,7 +381,7 @@ void gemv_4bit_inference_fp16(
 
 #if 1
 void gemm_4bit_inference_bf16(
-    int m, int n, int k, sycl::ext::oneapi::bfloat16 * A,  unsigned char* B,   sycl::ext::oneapi::bfloat16 *absmax, float *datatype, float * out,
+    int m, int n, int k, sycl::ext::oneapi::bfloat16 * A,  unsigned char* B,   float *absmax, float *datatype, float * out,
     int lda, int ldb, int ldc, int blocksize, sycl::queue* stream
 ) {
     gemm_4bit_inference_cutlass_dequant<sycl::ext::oneapi::bfloat16, 16>(m, n, k, A, B, absmax,  datatype, out, lda, ldb, ldc, blocksize, stream);
@@ -826,7 +826,7 @@ void cgemv_4bit_inference_fp16(
 
 #if 1
 void cgemv_4bit_inference_bf16(
-    int m, int n, int k, sycl::ext::oneapi::bfloat16 * A,  unsigned char* B,   sycl::ext::oneapi::bfloat16 *absmax, float *datatype,
+    int m, int n, int k, sycl::ext::oneapi::bfloat16 * A,  unsigned char* B,   float *absmax, float *datatype,
     float * out,  int lda, int ldb, int ldc, int blocksize, sycl::queue* stream
 ) {
     gemm_4bit_inference_bf16(m, n, k, A, B, absmax,  datatype, out, lda, ldb, ldc, blocksize, stream);
 
@@ -109,7 +109,7 @@ void gemv_4bit_inference_cutlass_cute(int m, int n, int k, T *A, T *B,
 
 template <typename T, int BITS>
 void gemm_4bit_inference_cutlass_dequant(int m, int n, int k, T *A, unsigned char *B,
-                         T *absmax, float *datatype, float *out, int lda,
+                         float *absmax, float *datatype, float *out, int lda,
                          int ldb, int ldc, int blocksize, sycl::queue *stream);
 
 template <typename T, int BITS>