Skip to content

Commit a47ad9f

Browse files
TimDettmers and claude
committed
feat: Add VQ quantize/dequantize CUDA kernels and Python API
- kQuantize_VQ<P_VAL, scalar_t>: warp-level VQ quantizer with shared memory codebook. Finds nearest codebook entry for each group of P weights via brute-force L2 distance in P-dimensional space.
- kDequantize_VQ<P_VAL, T, ABSMAX_T>: flat-layout VQ dequantizer with codebook lookup and absmax scaling.
- Launch wrappers, extern C bindings, torch op registration, and Python wrappers (quantize_vq, dequantize_vq) for both p=2 and p=4.
- Verified: MSE=0.039 (p=2, 4 bits/wt), MSE=0.107 (p=4, 2 bits/wt). Supports fp16, bf16 input types and E4M4/fp32 absmax formats.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 319c24e commit a47ad9f

File tree

5 files changed

+501
-0
lines changed

5 files changed

+501
-0
lines changed

bitsandbytes/_ops.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,66 @@ def _(
557557
return out
558558

559559

560+
# VQ (Vector Quantization) quantize/dequantize

torch.library.define(
    "bitsandbytes::quantize_vq",
    "(Tensor A, Tensor codebook, int p) -> (Tensor, Tensor)",
)


@register_fake("bitsandbytes::quantize_vq")
def _(A: torch.Tensor, codebook: torch.Tensor, p: int) -> tuple[torch.Tensor, torch.Tensor]:
    # Fake (meta) implementation: validates arguments and allocates outputs
    # with the same shapes/dtypes the CUDA kernel produces.
    torch._check(p in (2, 4), lambda: f"p must be 2 or 4, got {p}")
    torch._check(codebook.shape == (256, p), lambda: f"codebook must be [256, {p}], got {codebook.shape}")
    blocks = (A.numel() + 31) // 32  # ceil(n / 32): one absmax byte per 32-element block
    # Each 32-element block yields 32/p one-byte indices, packed 4 per int32 word
    # (p=2 -> 4 words per block, p=4 -> 2 words per block).
    word_count = blocks * (32 // p // 4)
    packed_out = torch.empty(word_count, device=A.device, dtype=torch.int32)
    absmax_out = torch.empty(blocks, device=A.device, dtype=torch.uint8)
    return packed_out, absmax_out
578+
579+
580+
torch.library.define(
    "bitsandbytes::dequantize_vq",
    "(Tensor packed, Tensor codebook, Tensor absmax, int p, int n, ScalarType dtype) -> Tensor",
)


@register_fake("bitsandbytes::dequantize_vq")
def _(
    packed: torch.Tensor,
    codebook: torch.Tensor,
    absmax: torch.Tensor,
    p: int,
    n: int,
    dtype: torch.dtype,
) -> torch.Tensor:
    # Fake (meta) implementation: the real kernel writes whole 32-element
    # blocks, so the output is padded up to a multiple of 32.
    torch._check(p in (2, 4), lambda: f"p must be 2 or 4, got {p}")
    blocks = (n + 31) // 32
    return torch.empty(blocks * 32, device=packed.device, dtype=dtype)
598+
599+
600+
torch.library.define(
    "bitsandbytes::dequantize_vq_",
    "(Tensor packed, Tensor codebook, Tensor absmax, int p, int n, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)",
)


@register_fake("bitsandbytes::dequantize_vq_")
def _(
    packed: torch.Tensor,
    codebook: torch.Tensor,
    absmax: torch.Tensor,
    p: int,
    n: int,
    dtype: torch.dtype,
    out: torch.Tensor,
) -> torch.Tensor:
    # Fake (meta) implementation of the in-place variant: validates p and
    # returns the caller-supplied `out`, which the schema marks as
    # mutated/aliased via Tensor(a!).
    torch._check(p in (2, 4), lambda: f"p must be 2 or 4, got {p}")
    return out
618+
619+
560620
# K-bit repack: flat bit-plane layout -> GEMM-tiled layout
561621

562622
torch.library.define(

bitsandbytes/backends/cuda/ops.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -956,6 +956,109 @@ def _(
956956
return out
957957

958958

959+
# Maps supported torch dtypes to the suffix used when resolving the native
# kernel symbol names (e.g. cquantize_vq_fp16_p2, cdequantize_vq_bf16_..._p4).
_VQ_DTYPE_SUFFIX = {
    torch.float16: "fp16",
    torch.bfloat16: "bf16",
    torch.float32: "fp32",
}
964+
965+
966+
@register_kernel("bitsandbytes::quantize_vq", "cuda")
def _(A: torch.Tensor, codebook: torch.Tensor, p: int) -> tuple[torch.Tensor, torch.Tensor]:
    # CUDA VQ quantization: validates inputs, allocates zero-initialized
    # outputs, and dispatches to the dtype/p-specialized native kernel.
    torch._check(p in (2, 4), lambda: f"p must be 2 or 4, got {p}")
    torch._check(
        A.dtype in _VQ_DTYPE_SUFFIX,
        lambda: f"quantize_vq only supports float16/bfloat16/float32, got {A.dtype}",
    )
    torch._check(codebook.dtype == torch.float16, lambda: f"codebook must be float16, got {codebook.dtype}")

    n = A.numel()
    blocks = (n + 31) // 32  # one absmax byte per 32-element block
    # 32/p byte indices per block, packed 4 per int32 word.
    packed = torch.zeros(blocks * (32 // p // 4), device=A.device, dtype=torch.int32)
    absmax = torch.zeros(blocks, device=A.device, dtype=torch.uint8)

    with _cuda_device_of(A):
        kernel = getattr(lib, f"cquantize_vq_{_VQ_DTYPE_SUFFIX[A.dtype]}_p{p}")
        kernel(
            get_ptr(codebook),
            get_ptr(A),
            get_ptr(absmax),
            get_ptr(packed),
            ct.c_int(n),
            _get_tensor_stream(A),
        )

    return packed, absmax
994+
995+
996+
def _dequantize_vq_impl(
    packed: torch.Tensor,
    codebook: torch.Tensor,
    absmax: torch.Tensor,
    p: int,
    n: int,
    dtype: torch.dtype,
    out: torch.Tensor,
) -> None:
    """Shared driver for the out-of-place and in-place VQ dequantize kernels.

    Validates arguments, normalizes absmax to the encoding the native kernels
    accept, and launches the dtype/absmax/p-specialized kernel, writing the
    result into `out`.
    """
    torch._check(p in (2, 4), lambda: f"p must be 2 or 4, got {p}")
    torch._check(
        dtype in _VQ_DTYPE_SUFFIX,
        lambda: f"dequantize_vq only supports float16/bfloat16/float32, got {dtype}",
    )
    torch._check(codebook.dtype == torch.float16, lambda: f"codebook must be float16, got {codebook.dtype}")

    if absmax.dtype == torch.float32:
        # fp32 absmax is re-encoded to E4M4 before dispatch.
        from bitsandbytes.functional import encode_absmax_e4m4

        absmax = encode_absmax_e4m4(absmax)

    symbol = f"cdequantize_vq_{_VQ_DTYPE_SUFFIX[dtype]}_{_KBIT_ABSMAX_SUFFIX[absmax.dtype]}_p{p}"

    with _cuda_device_of(packed):
        getattr(lib, symbol)(
            get_ptr(packed),
            get_ptr(codebook),
            get_ptr(absmax),
            get_ptr(out),
            ct.c_int(n),
            _get_tensor_stream(packed),
        )
1031+
1032+
1033+
@register_kernel("bitsandbytes::dequantize_vq", "cuda")
def _(
    packed: torch.Tensor,
    codebook: torch.Tensor,
    absmax: torch.Tensor,
    p: int,
    n: int,
    dtype: torch.dtype,
) -> torch.Tensor:
    # Out-of-place variant: allocate a block-padded output (multiple of 32
    # elements) and delegate to the shared implementation.
    out = torch.empty(32 * ((n + 31) // 32), device=packed.device, dtype=dtype)
    _dequantize_vq_impl(packed, codebook, absmax, p, n, dtype, out)
    return out
1046+
1047+
1048+
@register_kernel("bitsandbytes::dequantize_vq_", "cuda")
def _(
    packed: torch.Tensor,
    codebook: torch.Tensor,
    absmax: torch.Tensor,
    p: int,
    n: int,
    dtype: torch.dtype,
    out: torch.Tensor,
) -> torch.Tensor:
    # In-place variant: fills the caller-provided `out` and returns it.
    # NOTE(review): `out` presumably must hold at least ceil(n/32)*32 elements,
    # matching the padded allocation in the out-of-place variant — confirm
    # against the kernel's write pattern.
    _dequantize_vq_impl(packed, codebook, absmax, p, n, dtype, out)
    return out
1060+
1061+
9591062
@register_kernel("bitsandbytes::repack_kbit", "cuda")
9601063
def _(
9611064
packed_flat: torch.Tensor,

bitsandbytes/functional.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1310,6 +1310,72 @@ def dequantize_kbit(
13101310
return result[:n]
13111311

13121312

1313+
def quantize_vq(
    A: Tensor,
    p: int = 2,
    codebook: Optional[Tensor] = None,
) -> tuple[Tensor, Tensor, Tensor]:
    """Quantize a tensor with VQ codebook quantization (blocksize=32).

    Every run of p consecutive weights is replaced by the index of its
    nearest entry in a 256-entry codebook, giving 8/p bits per weight
    (p=2: 4 bits, p=4: 2 bits).

    Args:
        A: Input tensor. Supports float16, bfloat16, or float32.
        p: VQ dimension (2 or 4). Each byte index maps to p weight values.
        codebook: Optional fp16 codebook tensor of shape [256, p].
            If None, uses precomputed Gaussian codebook.

    Returns:
        Tuple of (packed, absmax, codebook):
            - packed: int32 tensor of packed byte indices.
            - absmax: uint8 tensor of E4M4 per-block absmax values.
            - codebook: The codebook tensor used.
    """
    # Either fall back to the default Gaussian codebook or coerce the
    # user-supplied one onto A's device in fp16, as the kernel requires.
    if codebook is None:
        codebook = create_vq_codebook(p, device=A.device)
    else:
        codebook = codebook.to(device=A.device, dtype=torch.float16)

    flat = A.contiguous().view(-1)
    packed, absmax = torch.ops.bitsandbytes.quantize_vq(flat, codebook, p)
    return packed, absmax, codebook
1343+
1344+
1345+
def dequantize_vq(
    packed: Tensor,
    absmax: Tensor,
    codebook: Tensor,
    p: int,
    n: int,
    dtype: torch.dtype = torch.float16,
    out: Optional[Tensor] = None,
) -> Tensor:
    """Dequantize a VQ codebook quantized tensor.

    Args:
        packed: int32 tensor of packed byte indices (from quantize_vq).
        absmax: Per-block absmax values (uint8 E4M4 or float32).
        codebook: fp16 codebook tensor of shape [256, p].
        p: VQ dimension (2 or 4).
        n: Number of original elements.
        dtype: Output dtype. Defaults to float16.
        out: Optional pre-allocated output tensor.
            NOTE(review): presumably must hold at least ceil(n/32)*32
            elements, since the out-of-place path allocates a block-padded
            buffer — confirm against the kernel.

    Returns:
        Dequantized tensor of shape (n,) with the given dtype.
    """
    # Fix: removed dead locals (`num_blocks`, `padded_n`) that were computed
    # but never used; the ops compute their own padding from `n`.
    if out is not None:
        torch.ops.bitsandbytes.dequantize_vq_(packed, codebook, absmax, p, n, dtype, out)
        return out[:n]

    # The op returns a block-padded result; trim the padding tail.
    result = torch.ops.bitsandbytes.dequantize_vq(packed, codebook, absmax, p, n, dtype)
    return result[:n]
1377+
1378+
13131379
def dequantize_kbit_tiled(
13141380
packed: Tensor,
13151381
absmax: Tensor,

0 commit comments

Comments
 (0)