Skip to content

Commit 6aa9619

Browse files
authored
Cpu fused kernel (#1804)
* add template to support more dtypes Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * update cmake list Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix typo Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix compile cpu Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * make different dtype works Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * use bf16 on CPU Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix state2 dtype Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * remove torch Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * rm torch Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * enable float to bf16 Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * rm dequantizeBlockwise4bitCpu Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix check Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * enable dequant 4bit kernel Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix typo Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix typo Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix dequantize Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * test Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * change input param Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix typo Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix input param Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * spliut 8bit and 4bit Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix typo Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix typo Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix input 
params Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix input params Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix typo Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * enable dequant4bit Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix reverse Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix dequant 4bit fallback path Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix fp4 dequant Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * rm _Float16 Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * tmp codes Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * enable gemv Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * change to 4bit dequant Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix def Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix type Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix absmax dtype Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix type Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix compile and type Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * enable gemv Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix shape Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix lib name Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * debug Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * update Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * enable gemv 4bit bf16 Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * enable avx512 check Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix check Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix endif Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix format Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix format Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix def Signed-off-by: 
jiqing-feng <jiqing.feng@intel.com> * fix position Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix format Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * rm duplicated func Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * rm useless code comments Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix out shape Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix comments Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * add reverse format Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * check avx512bf15 Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix has_avx512bf16 Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix tests Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix absmax shhape Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix compile Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix tests Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix test_gemv Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * disable binsearch Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix lint Signed-off-by: jiqing-feng <jiqing.feng@intel.com> * fix save Signed-off-by: jiqing-feng <jiqing.feng@intel.com> --------- Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
1 parent bd028b8 commit 6aa9619

File tree

10 files changed

+755
-81
lines changed

10 files changed

+755
-81
lines changed

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,9 @@ if (BUILD_CPU)
282282
check_cxx_compiler_flag(-mavx512bf16 HAS_AVX512BF16_FLAG)
283283
if (HAS_AVX512F_FLAG)
284284
target_compile_options(bitsandbytes PRIVATE -mavx512f)
285+
target_compile_options(bitsandbytes PRIVATE -mavx512dq)
286+
target_compile_options(bitsandbytes PRIVATE -mavx512bw)
287+
target_compile_options(bitsandbytes PRIVATE -mavx512vl)
285288
endif()
286289
if (HAS_AVX512BF16_FLAG)
287290
target_compile_options(bitsandbytes PRIVATE -mavx512bf16)

bitsandbytes/autograd/_functions.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -374,10 +374,18 @@ def matmul_4bit(
374374
bias: Optional[torch.Tensor] = None,
375375
):
376376
assert quant_state is not None
377-
# Change dtype to bfloat16 on CPU
377+
# Change dtype to input dtype on CPU
378378
if A.device.type == "cpu":
379379
quant_state.dtype = A.dtype
380380

381+
if getattr(quant_state, "packing_format_for_cpu", False):
382+
out = F.gemv_4bit(A, B, out, state=quant_state)
383+
if bias is not None:
384+
out += bias
385+
return out
386+
else:
387+
return MatMul4Bit.apply(A, B, out, bias, quant_state)
388+
381389
if A.numel() == A.shape[-1] and A.requires_grad == False and A.device.type != "hpu":
382390
if A.shape[-1] % quant_state.blocksize != 0:
383391
warn(

bitsandbytes/backends/cpu/ops.py

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import torch
77

8-
from bitsandbytes.functional import get_ptr
8+
from bitsandbytes.functional import get_ptr, has_avx512bf16
99

1010
from ..._ops import register_kernel
1111
from ...cextension import ErrorHandlerMockBNBNativeLibrary, lib
@@ -217,3 +217,62 @@ def _(
217217
raise ValueError
218218

219219
return out
220+
221+
if has_avx512bf16():

    @register_kernel("bitsandbytes::gemv_4bit", "cpu")
    def _(
        A: torch.Tensor,
        B: torch.Tensor,
        shapeB: Sequence[int],
        absmax: torch.Tensor,
        code: torch.Tensor,
        blocksize: int,
    ) -> torch.Tensor:
        """Fused CPU ``gemv_4bit`` using the native AVX512-BF16 kernel.

        Args:
            A: Activations of shape ``(..., K)``. Any dtype; converted to
                bfloat16 for the native call and converted back on return.
            B: Packed 4-bit quantized weights; must be uint8.
            shapeB: Logical weight shape; ``shapeB[0]`` is the output dim N.
            absmax: Per-block scales, passed through to the native kernel.
            code: Quantization lookup table; ``code[1] > 0`` selects the fp4
                symbol, otherwise nf4.
            blocksize: Quantization block size.

        Returns:
            Tensor of shape ``(..., N)`` in A's original dtype.
        """
        assert B.dtype == torch.uint8, "Only support uint8 qweight"
        dtype = A.dtype
        quant_type = "fp4" if code[1] > 0 else "nf4"
        # The native fused op only supports bf16 activations for now.
        if dtype != torch.bfloat16:
            A = A.to(torch.bfloat16)

        final_out_shape = (*A.shape[:-1], shapeB[0])
        A = A.reshape(-1, A.shape[-1])  # collapse leading dims -> (M, K)
        out = torch.empty((A.shape[0], shapeB[0]), dtype=A.dtype, device=A.device)
        M, K = A.shape
        N = shapeB[0]

        # Both quant types share the same C signature; dispatch on the symbol
        # name instead of duplicating two identical ctypes call sites.
        kernel = getattr(lib, f"gemv_4bit_inference_cpu_{quant_type}_bf16")
        kernel(
            ct.c_int64(M),
            ct.c_int64(N),
            ct.c_int64(K),
            get_ptr(A),
            get_ptr(B),
            get_ptr(absmax),
            get_ptr(out),
            ct.c_int64(blocksize),
            ct.c_int64(A.stride(0)),
            ct.c_int64(out.stride(0)),
        )

        if dtype != torch.bfloat16:
            out = out.to(dtype)

        return out.reshape(final_out_shape)

bitsandbytes/functional.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2103,4 +2103,138 @@ def spmm_coo_very_sparse(cooA, B, dequant_stats=None, out=None):
21032103
return out
21042104

21052105

2106+
def _convert_weight_packed_for_cpu(
    qweight: torch.Tensor, quant_state: QuantState, block_n: int = 32
) -> tuple[torch.Tensor, QuantState]:
    """
    Repack a 4-bit quantized weight into the blocked layout used by the fused CPU kernel.

    qweight: flat uint8 tensor of ``K * N / 2`` bytes, two 4-bit codes per byte
        (high nibble is the earlier element). If stored under another dtype it
        is reinterpreted as uint8 and the original storage dtype is stashed.
    quant_state: mutated in place. Original dtype / nested flag / storage shape
        are saved on it so `_convert_weight_packed_for_cpu_inverse` can undo
        the conversion. On return it holds fp32-flattened, transposed bf16
        absmax, ``dtype == torch.bfloat16`` and ``packing_format_for_cpu = True``.
    block_n: row-block size of the packed layout; N must be divisible by it.
    return: (packed_weight of shape [N, K/2] uint8, the updated quant_state)
    """
    if qweight.dtype != torch.uint8:
        # Remember the non-uint8 storage dtype so the inverse can view it back.
        quant_state.original_storage_type = qweight.dtype
        qweight = qweight.view(torch.uint8)
    # Stash original metadata for the best-effort inverse conversion.
    quant_state.original_dtype = quant_state.dtype
    quant_state.original_nested = quant_state.nested
    quant_state.original_qshape = qweight.shape

    qweight = qweight.reshape(-1)
    # Unpack each byte into two 4-bit codes: even index = high nibble,
    # odd index = low nibble.
    unpacked_w = torch.empty(qweight.shape[0] * 2, dtype=torch.int32, device=qweight.device)
    unpacked_w[1::2] = qweight & 0xF
    unpacked_w[::2] = qweight >> 4
    qweight_final = unpacked_w.reshape(quant_state.shape).to(torch.uint8)  # (*, N, K)
    # pack weight: [*, N, K] -> [*, N, K/2] combine low and high bit
    assert len(qweight_final.shape) == 2
    N, K = qweight_final.shape[0], qweight_final.shape[1]
    assert N % block_n == 0, "N must be divisible by block_n"
    assert K % 2 == 0, "K must be even"
    BLOCK_N = block_n
    # Half the width of a flattened 64-code row: 32 low codes followed by
    # 32 high codes, merged below into 32 packed bytes.
    BIT_COUNT = 32
    new_shape = [N // BLOCK_N, BLOCK_N, K // 2, 2]
    out_shape = [N, K // 2]
    qw = qweight_final.reshape(new_shape)  # (N/B, B, K/2, 2)
    # Swap the row-block and column axes so codes of one block are contiguous.
    qw = qw.transpose(-3, -2).contiguous()  # (N/B, K/2, B, 2)
    qw = qw.reshape(-1, BIT_COUNT * 2)  # [-1, 64]
    high = qw[:, BIT_COUNT:]  # high 32
    low = qw[:, :BIT_COUNT]  # low 32
    packed = ((high << 4) | low).to(torch.uint8)  # combine into one byte each
    final_qweight = packed.reshape(out_shape)
    if quant_state.nested:
        # Flatten nested (double) quantization: dequantize absmax to plain fp32.
        absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
        absmax += quant_state.offset
        if absmax.dtype != torch.float32:
            absmax = absmax.float()

        quant_state.absmax = absmax
        quant_state.nested = False
        delattr(quant_state, "state2")

    # Reshape absmax to [N, K/blocksize], then transpose to [K/blocksize, N]
    # bf16 (the layout consumed by the fused kernel — see gemv_4bit CPU op).
    quant_state.absmax = (
        quant_state.absmax.reshape(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize)
        .T.to(torch.bfloat16)
        .contiguous()
    )

    quant_state.dtype = torch.bfloat16
    quant_state.packing_format_for_cpu = True
    return final_qweight, quant_state
2158+
2159+
2160+
def _convert_weight_packed_for_cpu_inverse(
    packed_weight: torch.Tensor,
    quant_state: QuantState,
    block_n: int = 32,
) -> tuple[torch.Tensor, QuantState]:
    """
    Undo `_convert_weight_packed_for_cpu` (best effort).

    packed_weight: [N, K/2] uint8, output of `_convert_weight_packed_for_cpu` (final_qweight)
    quant_state: QuantState that was modified by `_convert_weight_packed_for_cpu`
    block_n: must match the value used when packing.
    Returns:
        qweight: uint8 in the original stashed storage shape (and storage dtype,
            if one was recorded)
        recovered_state: QuantState with partially restored fields (best-effort
            inverse; re-quantizing a nested absmax is lossy)
    """
    assert quant_state.packing_format_for_cpu, "only for packing format"
    assert packed_weight.dtype == torch.uint8
    assert len(packed_weight.shape) == 2, "packed_weight should be [N, K/2]"
    N, K_half = packed_weight.shape
    K = K_half * 2

    # 1) packed [N, K/2] -> [N//BLOCK_N, BLOCK_N, K/2, 2]
    BLOCK_N = block_n
    # Half the width of the reconstructed 64-code row (32 low + 32 high codes).
    BIT_COUNT = 32

    assert N % BLOCK_N == 0, "N must be divisible by block_n"
    assert K % 2 == 0, "K must be even"

    # [N, K/2] -> [-1, 32]: each row of 32 packed bytes expands to 64 codes.
    packed = packed_weight.reshape(-1, BIT_COUNT)  # [-1, 32]
    # split high/low nibbles
    high = (packed >> 4) & 0xF
    low = packed & 0xF
    # concatenate to [..., 64], first 32 are low, last 32 are high
    # (mirrors the row layout built by the forward conversion)
    qw = torch.cat([low, high], dim=-1).to(torch.uint8)  # [..., 64]

    # -> [N/BLOCK_N, K/2, BLOCK_N, 2] -> [N, K]
    qw = qw.reshape(N // BLOCK_N, K_half, BLOCK_N, 2)  # [N/B, K/2, B, 2]
    qw = qw.transpose(-3, -2).contiguous()  # [N/B, B, K/2, 2]
    qw = qw.reshape(N, K)  # [N, K]

    qweight = qw  # [N, K]

    # Re-pack two 4-bit codes per byte: even index -> high nibble,
    # odd index -> low nibble (inverse of the forward unpack).
    unpacked_w = qweight.reshape(-1).to(torch.int32)  # [K*N]
    high4 = (unpacked_w[::2] & 0xF).to(torch.uint8)
    low4 = (unpacked_w[1::2] & 0xF).to(torch.uint8)
    qweight = (high4 << 4) | low4  # [K*N/2]

    # 2) Best-effort restore of quant_state fields (absmax / dtype / nested flags, etc.)
    recovered_state = quant_state
    qweight = qweight.to(torch.uint8).reshape(recovered_state.original_qshape)

    # Re-apply nested (double) quantization of absmax if it was originally nested.
    if recovered_state.original_nested:
        # Undo the [K/blocksize, N] bf16 transpose done by the forward pass.
        absmax = recovered_state.absmax.T.reshape(-1).to(recovered_state.original_dtype)
        offset = absmax.mean()
        qabsmax, state2 = quantize_blockwise(absmax - offset, blocksize=256)
        recovered_state.absmax = qabsmax
        recovered_state.offset = offset
        recovered_state.state2 = state2
        recovered_state.nested = True

    recovered_state.dtype = recovered_state.original_dtype
    recovered_state.packing_format_for_cpu = False

    if getattr(recovered_state, "original_storage_type", None):
        qweight = qweight.view(recovered_state.original_storage_type)

    return qweight, recovered_state
2226+
2227+
2228+
def has_avx512bf16():
    """Probe the native library for AVX512-BF16 support.

    Calls ``lib.has_avx512bf16_cpu()`` and returns its result; returns
    False when the symbol is missing or the native call fails.
    """
    try:
        return lib.has_avx512bf16_cpu()
    except (AttributeError, RuntimeError, OSError):
        return False
2238+
2239+
21062240
C = 127.0

bitsandbytes/nn/modules.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,12 @@
1212

1313
import bitsandbytes as bnb
1414
from bitsandbytes.cextension import ROCM_WARP_SIZE_64
15-
from bitsandbytes.functional import QuantState
15+
from bitsandbytes.functional import (
16+
QuantState,
17+
_convert_weight_packed_for_cpu,
18+
_convert_weight_packed_for_cpu_inverse,
19+
has_avx512bf16,
20+
)
1621
from bitsandbytes.optim import GlobalOptimManager
1722
from bitsandbytes.utils import INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING, OutlierTracer
1823

@@ -311,9 +316,13 @@ def cpu(self):
311316
return self.to(device="cpu")
312317

313318
def cuda(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
    """Move the parameter to a CUDA device, first undoing the CPU-only
    packed weight layout when one is active."""
    if getattr(self.quant_state, "packing_format_for_cpu", False):
        self.data, self.quant_state = _convert_weight_packed_for_cpu_inverse(self.data, self.quant_state)
    target = "cuda" if device is None else device
    return self.to(device=target, non_blocking=non_blocking)
315322

316323
def xpu(self, device: Optional[int | device | str] = None, non_blocking: bool = False):
    """Move the parameter to an XPU device, first undoing the CPU-only
    packed weight layout when one is active."""
    if getattr(self.quant_state, "packing_format_for_cpu", False):
        self.data, self.quant_state = _convert_weight_packed_for_cpu_inverse(self.data, self.quant_state)
    target = "xpu" if device is None else device
    return self.to(device=target, non_blocking=non_blocking)
318327

319328
@overload
@@ -479,6 +488,7 @@ def __init__(
479488
self.compute_type_is_set = compute_dtype is not None
480489
self.quant_state = None
481490
self.quant_storage = quant_storage
491+
self.support_avx512bf16_for_cpu = has_avx512bf16()
482492

483493
def set_compute_type(self, x):
484494
if x.dtype in [torch.float32, torch.bfloat16]:
@@ -505,14 +515,27 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
505515
save weight and bias,
506516
then fill state_dict with components of quant_state
507517
"""
518+
if getattr(self.weight.quant_state, "packing_format_for_cpu", False):
519+
self.weight.data, self.weight.quant_state = _convert_weight_packed_for_cpu_inverse(
520+
self.weight.data, self.weight.quant_state
521+
)
508522
super()._save_to_state_dict(destination, prefix, keep_vars) # saving weight and bias
509-
510523
if getattr(self.weight, "quant_state", None) is not None:
511524
for k, v in self.weight.quant_state.as_dict(packed=True).items():
512525
destination[prefix + "weight." + k] = v if keep_vars else v.detach()
513526

514527
def forward(self, x: torch.Tensor):
515528
fix_4bit_weight_quant_state_from_module(self)
529+
quant_state = self.weight.quant_state
530+
531+
if (
532+
not getattr(quant_state, "packing_format_for_cpu", False)
533+
and x.device.type == "cpu"
534+
and self.support_avx512bf16_for_cpu
535+
and not self.training
536+
and x.requires_grad == False
537+
):
538+
self.weight.data, quant_state = _convert_weight_packed_for_cpu(self.weight.data, quant_state)
516539

517540
# weights are cast automatically as Int8Params, but the bias has to be cast manually
518541
if self.bias is not None and self.bias.dtype != x.dtype:
@@ -527,9 +550,9 @@ def forward(self, x: torch.Tensor):
527550
x = x.to(self.compute_dtype)
528551

529552
bias = None if self.bias is None else self.bias.to(self.compute_dtype)
530-
weight = self.weight.t()
553+
weight = self.weight if getattr(quant_state, "packing_format_for_cpu", False) else self.weight.t()
531554

532-
return bnb.matmul_4bit(x, weight, bias=bias, quant_state=self.weight.quant_state).to(inp_dtype)
555+
return bnb.matmul_4bit(x, weight, bias=bias, quant_state=quant_state).to(inp_dtype)
533556

534557

535558
class LinearFP4(Linear4bit):

0 commit comments

Comments
 (0)