From 18ead1935557a2d11cac44bb5dfd82f3d63ea682 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 16 May 2025 08:41:34 +0000
Subject: [PATCH 01/85] continuous release: tweaks
---
.github/workflows/python-package.yml | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index fbaa27d56..1b182c4a0 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -171,10 +171,10 @@ jobs:
path: tmp/
pattern: "bdist_wheel_*"
merge-multiple: true
-
+
- name: Inspect tmp directory after downloading artifacts
run: ls -alFR tmp/
-
+
- name: Move and rename wheel files with pattern replacement
run: |
mkdir -p wheels/
@@ -199,10 +199,11 @@ jobs:
- name: Inspect wheels directory after renaming files
run: ls -alFR wheels/
-
+
+ - uses: actions/checkout@v4
- name: Delete old pre-release (if exists)
run: |
- gh release delete continuous-release_main --cleanup-tag -y || true
+ gh release delete continuous-release_main --cleanup-tag -y
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -213,7 +214,7 @@ jobs:
This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch.
- **How to install:**
+ **How to install:**
Pick the correct command for your platform and run it in your terminal:
ENDOFMARKDOWN
@@ -228,7 +229,7 @@ jobs:
done
cat >> body.md << 'ENDOFMARKDOWN'
- > **Note:**
+ > **Note:**
> These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes.
ENDOFMARKDOWN
From 90f38accc98c7a5487b4a239cd3ad38691b4ea85 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 16 May 2025 09:07:06 +0000
Subject: [PATCH 02/85] continuous release: tweaks
---
.github/workflows/python-package.yml | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 1b182c4a0..dabd8c659 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -173,7 +173,14 @@ jobs:
merge-multiple: true
- name: Inspect tmp directory after downloading artifacts
- run: ls -alFR tmp/
+ run: |
+ ls -alFR tmp/
+ WHEEL_COUNT=$(find wheels -type f -name "*.whl" | wc -l)
+ echo "Found $WHEEL_COUNT wheel files"
+ if [ "$WHEEL_COUNT" -eq 0 ]; then
+ echo "::error::No wheel files found in wheels directory! Cannot proceed with release."
+ exit 1
+ fi
- name: Move and rename wheel files with pattern replacement
run: |
@@ -201,9 +208,11 @@ jobs:
run: ls -alFR wheels/
- uses: actions/checkout@v4
+ with:
+ path: repo
- name: Delete old pre-release (if exists)
run: |
- gh release delete continuous-release_main --cleanup-tag -y
+ cd repo && gh release delete continuous-release_main --cleanup-tag -y
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
From 66c0c454b609e33df1fd0e2dbdf47c3bd941681a Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 16 May 2025 09:48:33 +0000
Subject: [PATCH 03/85] continuous release: tweaks
---
.github/workflows/python-package.yml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index dabd8c659..ec8c8d0ef 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -175,10 +175,10 @@ jobs:
- name: Inspect tmp directory after downloading artifacts
run: |
ls -alFR tmp/
- WHEEL_COUNT=$(find wheels -type f -name "*.whl" | wc -l)
+ WHEEL_COUNT=$(find tmp/ -type f -name "*.whl" | wc -l)
echo "Found $WHEEL_COUNT wheel files"
if [ "$WHEEL_COUNT" -eq 0 ]; then
- echo "::error::No wheel files found in wheels directory! Cannot proceed with release."
+ echo "::error::No wheel files found in tmp directory! Cannot proceed with release."
exit 1
fi
From 4011273a86e8eaab435d3e0965df79512b3813ca Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 16 May 2025 10:19:01 +0000
Subject: [PATCH 04/85] continuous release: tweaks
---
.github/workflows/python-package.yml | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index ec8c8d0ef..902c5d78e 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -219,7 +219,7 @@ jobs:
- name: Generate pip install commands for release body
run: |
cat > body.md << 'ENDOFMARKDOWN'
- ## Latest `main` Wheel Pre-release
+ ## Latest `main` pre-release wheel
This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch.
@@ -231,6 +231,17 @@ jobs:
for whl in wheels/*.whl; do
fname=$(basename "$whl")
url="https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/$fname"
+
+ if [[ "$fname" == *"manylinux_2_24_x86_64"* ]]; then
+ echo "### Linux (x86_64)" >> body.md
+ elif [[ "$fname" == *"manylinux_2_24_aarch64"* ]]; then
+ echo "### Linux (ARM/aarch64)" >> body.md
+ elif [[ "$fname" == *"win_amd64"* ]]; then
+ echo "### Windows (x86_64)" >> body.md
+ else
+ echo "### Other platform" >> body.md
+ fi
+
echo "\`\`\`sh" >> body.md
echo "pip install $url" >> body.md
echo "\`\`\`" >> body.md
From 31762776c3eecb066e5a2d8bd9ae678b41f1402f Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 16 May 2025 16:09:38 +0000
Subject: [PATCH 05/85] continuous release: tweaks
---
.github/workflows/python-package.yml | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 902c5d78e..fa15d6d64 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -216,6 +216,14 @@ jobs:
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ - name: Ensure tag exists
+ run: |
+ cd repo
+ git tag -f continuous-release_main
+ git push -f origin continuous-release_main
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
- name: Generate pip install commands for release body
run: |
cat > body.md << 'ENDOFMARKDOWN'
@@ -266,7 +274,6 @@ jobs:
tag_name: continuous-release_main
make_latest: false
draft: false
- target_commitish: ${{ github.sha }}
audit-wheels:
needs: build-wheels
From 3047ab97ef858ef0faeeaf6e9f43f40b87f0e5fc Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Mon, 19 May 2025 08:38:04 +0000
Subject: [PATCH 06/85] continuous release: tweaks
---
.github/workflows/python-package.yml | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index fa15d6d64..f59abbcc5 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -259,6 +259,14 @@ jobs:
cat >> body.md << 'ENDOFMARKDOWN'
> **Note:**
> These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes.
+
+ The version number in the wheel file name is replaced with 1.33.7-preview to keep the download link stable; this does not affect the version that actually gets installed:
+ ```
+ > pip install https://.../bitsandbytes-1.33.7-preview-py3-none-manylinux_2_24_x86_64.whl
+ Collecting bitsandbytes==1.33.7rc0
+ ...
+ Successfully installed bitsandbytes-0.46.0.dev0
+ ```
ENDOFMARKDOWN
# for debugging:
From 513e69be0ab4f70e84e10c1e36d547010ab96dda Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Mon, 19 May 2025 09:42:46 +0000
Subject: [PATCH 07/85] continuous release: tweak + docs
---
.github/workflows/python-package.yml | 2 +-
docs/source/installation.mdx | 9 +++++++--
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index f59abbcc5..d3deb26ee 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -251,7 +251,7 @@ jobs:
fi
echo "\`\`\`sh" >> body.md
- echo "pip install $url" >> body.md
+ echo "pip install --force-reinstall $url" >> body.md
echo "\`\`\`" >> body.md
echo "" >> body.md
done
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index e127b0bda..704d7aacc 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -57,7 +57,12 @@ If you would like to use new feature even before they are officially released an
```
# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
-pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-0.46.0.dev0-py3-none-manylinux_2_24_x86_64.whl'
+
+# x86_64 (most users)
+pip install --force-reinstall https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl
+
+# ARM/aarch64
+pip install --force-reinstall https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_aarch64.whl
```
@@ -65,7 +70,7 @@ pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsan
```
# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
-pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-macosx_13_1_arm64.whl'
+pip install --force-reinstall https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-win_amd64.whl
```
From cdcae8d34dcb7f67772a09331c13f783853b59d3 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 19 May 2025 14:12:44 -0400
Subject: [PATCH 08/85] CI runner updates (#1643)
* Test g5g runner
* Switch L4 to L40S runner; swap GitHub Linux T4 runner for AWS g4dn
* Run tests on last 2 pytorch stable releases
* Run tests on last 2 pytorch stable releases
---
.github/workflows/tests.yml | 54 ++++++++++++++++++++++++++++++-------
tests/test_functional.py | 33 -----------------------
tests/test_modules.py | 12 ++++-----
3 files changed, 50 insertions(+), 49 deletions(-)
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 9431b32f4..5d2a2708b 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -49,7 +49,7 @@ jobs:
build-cuda:
strategy:
matrix:
- cuda_version: ["11.8.0", "12.8.1"]
+ cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025]
include:
- os: ubuntu-22.04
@@ -100,7 +100,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
- torch_version: ["2.7.0"]
+ torch_version: ["2.6.0", "2.7.0"]
include:
- os: ubuntu-22.04
arch: x86_64
@@ -138,9 +138,35 @@ jobs:
- name: Show installed packages
run: pip list
+ - name: Show environment information
+ run: python -m torch.utils.collect_env
+
- name: Run tests
run: pytest --durations=100
+ # cuda-aarch64-tests:
+ # if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
+ # needs: build-cuda
+ # strategy:
+ # fail-fast: false
+ # matrix:
+ # os: [ubuntu-22.04-arm]
+ # arch: [aarch64]
+ # torch_version: ["2.7.0"]
+ # cuda_version: ["11.8.0", "12.8.1"]
+
+ # runs-on: bandb-aws-g5g-4xlarge-plus-use1-public-80
+ # env:
+ # BNB_TEST_DEVICE: cuda
+ # steps:
+ # - name: Show GPU Information
+ # run: nvidia-smi
+
+ # - name: Show pip packages
+ # run: pip list
+
+
+
cuda-tests:
if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
needs: build-cuda
@@ -149,25 +175,28 @@ jobs:
matrix:
os: [ubuntu-22.04, windows-2025]
arch: [x86_64]
- gpu: [T4, L4]
- cuda_version: ["11.8.0", "12.8.1"]
+ gpu: [T4, L40S]
+ cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
include:
- cuda_version: "11.8.0"
torch_version: "2.4.1"
pypi_index: "https://download.pytorch.org/whl/cu118"
+ - cuda_version: "12.6.3"
+ torch_version: "2.6.0"
+ pypi_index: "https://download.pytorch.org/whl/cu126"
- cuda_version: "12.8.1"
torch_version: "2.7.0"
pypi_index: "https://download.pytorch.org/whl/cu128"
- # L4 runners
+ # L40S runners
- os: ubuntu-22.04
- gpu: L4
- runner: bandb-aws-g6-4xlarge-plus-use1-public-80
+ gpu: L40S
+ runner: bandb-aws-g6e-4xlarge-plus-use1-public-80
# T4 runners
- os: ubuntu-22.04
gpu: T4
- runner: CUDA-Linux-x64
+ runner: bandb-aws-g4dn-4xlarge-plus-use1-public-80
- os: windows-2025
gpu: T4
runner: CUDA-Windows-x64
@@ -176,10 +205,12 @@ jobs:
# and cannot support CUDA 12+. Skip for now.
- os: windows-2025
cuda_version: "12.8.1"
+ - os: windows-2025
+ cuda_version: "12.6.3"
- # No Windows L4 runners.
+ # No Windows L40S runners.
- os: windows-2025
- gpu: L4
+ gpu: L40S
runs-on: ${{ matrix.runner }}
env:
BNB_TEST_DEVICE: cuda
@@ -210,5 +241,8 @@ jobs:
- name: Show installed packages
run: pip list
+ - name: Show environment information
+ run: python -m torch.utils.collect_env
+
- name: Run tests
run: pytest --durations=100
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 96e77e4f4..423a92193 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -929,39 +929,6 @@ def test_spmm_coo_very_sparse(self, dim1, dim2, dtype, out_func):
# torch.cuda.synchronize()
# print(time.time() - t0)
- @pytest.mark.parametrize("dim1", [256, 1024], ids=id_formatter("dim1"))
- @pytest.mark.parametrize("dim2", [256, 1024], ids=id_formatter("dim2"))
- @pytest.mark.skip("No longer supported")
- def test_integrated_sparse_decomp(self, dim1, dim2):
- threshold = 3.0
- for _ in range(k):
- A = torch.randn(dim1, dim2).cuda().half()
- w1 = torch.randn(dim1, dim2).cuda().half()
- out1 = torch.matmul(A, w1.t())
-
- Cw1, statsw1, _ = F.int8_vectorwise_quant(w1)
- CA, statsA, _ = F.int8_vectorwise_quant(A)
-
- out1_32 = F.int8_linear_matmul(CA, Cw1)
- out2 = F.int8_mm_dequant(out1_32, statsA, statsw1)
-
- # CA, statsA, outlier_cols = F.int8_vectorwise_quant(A, threshold=threshold)
- CA, _, statsA, _, coo_tensor = F.double_quant(A, threshold=threshold)
-
- out1_32 = F.int8_linear_matmul(CA, Cw1)
- out3 = F.int8_mm_dequant(out1_32, statsA, statsw1)
-
- assert coo_tensor is not None
-
- out4 = F.spmm_coo(coo_tensor, w1.t())
- # idx = torch.unique(coo_tensor._indices()[1]).long()
- # out4 = torch.matmul(A, w1.t())
- out5 = out3 + out4
-
- err1 = torch.abs(out1 - out2).mean().item()
- err2 = torch.abs(out1 - out5).mean().item()
- assert err2 < err1
-
@pytest.mark.parametrize("dim1", [1 * 2048])
@pytest.mark.parametrize("dim2", [2048])
@pytest.mark.parametrize("dtype", [torch.int8])
diff --git a/tests/test_modules.py b/tests/test_modules.py
index dc1d60e6c..c8ec6311a 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -130,7 +130,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
assert l1.weight.dtype == torch.int8
l1.eval()
- for i in range(100):
+ for i in range(4):
b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
o1 = l1(b1)
assert o1.dtype == torch.float16
@@ -139,7 +139,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
assert mlp.fc1.weight.dtype == torch.int8
assert mlp.fc2.weight.dtype == torch.int8
- for i in range(100):
+ for i in range(4):
b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
o1 = mlp(b1)
assert o1.dtype == torch.float16
@@ -152,7 +152,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
assert mlp.fc1.weight.dtype == torch.int8
assert mlp.fc2.weight.dtype == torch.int8
- for i in range(100):
+ for i in range(4):
b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
o1 = mlp(b1)
assert o1.dtype == torch.float16
@@ -163,7 +163,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
mlp = MLP8bit(32, 64, threshold=threshold, has_fp16_weights=False).half().to(device)
- for i in range(100):
+ for i in range(4):
b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
o1 = mlp(b1)
assert o1.dtype == torch.float16
@@ -185,7 +185,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
.to(device)
)
- for i in range(100):
+ for i in range(4):
b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
o1 = mlp(b1)
assert o1.dtype == torch.float16
@@ -207,7 +207,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
w1, w2 = mlp.fc1.weight.clone().to(device), mlp.fc2.weight.clone().to(device) # grab weights before quantization,
mlp = mlp.to(device).half() # and this line triggers quantization
- for i in range(100):
+ for i in range(4):
b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
o1 = mlp(b1)
assert o1.dtype == torch.float16
From d729c188496ce5947f159693fbbb3e2dd281d87e Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Tue, 20 May 2025 21:14:15 +0530
Subject: [PATCH 09/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 363 ++++++++++++++++++------------
1 file changed, 223 insertions(+), 140 deletions(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index efdef2871..fd63c888d 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -1,14 +1,15 @@
from collections.abc import Sequence
import ctypes as ct
from math import prod
-from typing import Optional
+from typing import Optional
import torch
from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
from ..._ops import register_kernel
-from ...cextension import lib
+from ...cextension import lib, HIP_ENVIRONMENT
+
@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
@@ -84,7 +85,6 @@ def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor
return out
-
@register_kernel("bitsandbytes::int8_mm_dequant", "cuda")
def _(
A: torch.Tensor,
@@ -164,7 +164,7 @@ def _(A: torch.Tensor, threshold=0.0):
out_row[:, outlier_cols] = 0
return out_row, row_stats, outlier_cols
-
+
@register_kernel("bitsandbytes::int8_double_quant", "cuda")
def _(
@@ -210,35 +210,67 @@ def _get_col_absmax(
@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
torch._check_is_size(blocksize)
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
-
- n = A.numel()
- blocks = -(n // -blocksize)
- absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
- out = torch.empty_like(A, dtype=torch.uint8)
-
- with _cuda_device_of(A):
- args = (
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int32(blocksize),
- ct.c_int(A.numel()),
- )
-
- if A.dtype == torch.float16:
- lib.cquantize_blockwise_fp16(*args)
- elif A.dtype == torch.bfloat16:
- lib.cquantize_blockwise_bf16(*args)
- elif A.dtype == torch.float32:
- lib.cquantize_blockwise_fp32(*args)
- else:
- raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-
- return out, absmax
-
+
+ device = A.device
+ device_type = device.type
+
+ if device_type == 'cuda':
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ elif device_type == 'hip' and HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+
+ torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
+
+ n = A.numel()
+ blocks = -(n // -blocksize)
+ absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
+ out = torch.empty_like(A, dtype=torch.uint8)
+
+ if device_type == 'cuda' or (device_type == 'hip' and HIP_ENVIRONMENT):
+ with _cuda_device_of(A):
+ args = (
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int32(blocksize),
+ ct.c_int(A.numel()),
+ )
+
+ if A.dtype == torch.float16:
+ lib.cquantize_blockwise_fp16(*args)
+ elif A.dtype == torch.bfloat16:
+ lib.cquantize_blockwise_bf16(*args)
+ elif A.dtype == torch.float32:
+ lib.cquantize_blockwise_fp32(*args)
+ else:
+ raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+ elif device_type == 'cpu':
+ cpu_kernel_func = getattr(lib, 'cquantize_blockwise_cpu_fp32', None)
+ if cpu_kernel_func:
+ A_cpu = A.to(torch.float32) if A.dtype != torch.float32 else A
+ code_cpu = code.to('cpu')
+ absmax_cpu = torch.empty(absmax.shape, device='cpu', dtype=torch.float32)
+ out_cpu = torch.empty(out.shape, device='cpu', dtype=torch.uint8)
+
+ cpu_kernel_func(
+ get_ptr(code_cpu),
+ get_ptr(A_cpu),
+ get_ptr(absmax_cpu),
+ get_ptr(out_cpu),
+ ct.c_longlong(blocksize),
+ ct.c_longlong(A_cpu.numel())
+ )
+
+ out.copy_(out_cpu)
+ absmax.copy_(absmax_cpu)
+ else:
+ raise NotImplementedError("CPU blockwise quantization requires C extension support")
+ else:
+ raise NotImplementedError(f"Blockwise quantization not implemented for {device_type}")
+
+ return out, absmax
+
@register_kernel("bitsandbytes::dequantize_blockwise", "cuda")
def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
@@ -252,7 +284,7 @@ def _(
A: torch.Tensor,
absmax: torch.Tensor,
code: torch.Tensor,
- blocksize: int,
+ blocksize: int,
dtype: torch.dtype,
out: torch.Tensor,
) -> None:
@@ -264,76 +296,116 @@ def _(
def _dequantize_blockwise_impl(
A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
) -> None:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
- torch._check(
- dtype in [torch.float16, torch.bfloat16, torch.float32],
- lambda: f"Blockwise dequantization only supports 16bit/32bit floating types, got {dtype}",
- )
-
- with _cuda_device_of(A):
- args = (
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int(blocksize),
- ct.c_int(A.numel()),
- _get_tensor_stream(A),
- )
-
- if dtype == torch.float16:
- lib.cdequantize_blockwise_fp16(*args)
- elif dtype == torch.bfloat16:
- lib.cdequantize_blockwise_bf16(*args)
- elif dtype == torch.float32:
- lib.cdequantize_blockwise_fp32(*args)
+ device = A.device
+ device_type = device.type
+
+ if device_type == 'cuda':
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ elif device_type == 'hip' and HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+
+ torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
+ torch._check(
+ dtype in [torch.float16, torch.bfloat16, torch.float32],
+ lambda: f"Blockwise dequantization only supports 16bit/32bit floating types, got {dtype}",
+ )
+
+ if device_type == 'cuda' or (device_type == 'hip' and HIP_ENVIRONMENT):
+ with _cuda_device_of(A):
+ args = (
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(blocksize),
+ ct.c_int(A.numel()),
+ _get_tensor_stream(A),
+ )
+
+ if dtype == torch.float16:
+ lib.cdequantize_blockwise_fp16(*args)
+ elif dtype == torch.bfloat16:
+ lib.cdequantize_blockwise_bf16(*args)
+ elif dtype == torch.float32:
+ lib.cdequantize_blockwise_fp32(*args)
+ elif device_type == 'cpu':
+ cpu_kernel_func = getattr(lib, 'cdequantize_blockwise_cpu_fp32', None)
+ if cpu_kernel_func:
+ code_cpu = code.to('cpu')
+ A_cpu = A.to('cpu')
+ absmax_cpu = absmax.to('cpu')
+ out_cpu = torch.empty(out.shape, dtype=torch.float32, device='cpu')
+
+ cpu_kernel_func(
+ get_ptr(code_cpu),
+ get_ptr(A_cpu),
+ get_ptr(absmax_cpu),
+ get_ptr(out_cpu),
+ ct.c_longlong(blocksize),
+ ct.c_longlong(A.numel())
+ )
+
+ out.copy_(out_cpu.to(dtype))
+ else:
+ raise NotImplementedError("CPU blockwise dequantization requires C extension support")
+ else:
+ raise NotImplementedError(f"Blockwise dequantization not implemented for {device_type}")
@register_kernel("bitsandbytes::quantize_4bit", "cuda")
def _(
A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
) -> tuple[torch.Tensor, torch.Tensor]:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- torch._check(quant_type in ["fp4", "nf4"])
- torch._check(
- A.dtype in [torch.bfloat16, torch.float16, torch.float32],
- lambda: f"Blockwise 4bit quantization only supports 16/32-bit floats, but got {A.dtype}",
- )
-
- n = A.numel()
- blocks = -(n // -blocksize)
- absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
- out = torch.empty(((n + 1) // (quant_storage.itemsize * 2), 1), device=A.device, dtype=quant_storage)
-
- with _cuda_device_of(A):
- args = (
- None,
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int32(blocksize),
- ct.c_int(n),
- )
-
- if A.dtype == torch.bfloat16:
- if quant_type == "fp4":
- lib.cquantize_blockwise_bf16_fp4(*args)
- else:
- lib.cquantize_blockwise_bf16_nf4(*args)
- elif A.dtype == torch.float16:
- if quant_type == "fp4":
- lib.cquantize_blockwise_fp16_fp4(*args)
- else:
- lib.cquantize_blockwise_fp16_nf4(*args)
- elif A.dtype == torch.float32:
- if quant_type == "fp4":
- lib.cquantize_blockwise_fp32_fp4(*args)
- else:
- lib.cquantize_blockwise_fp32_nf4(*args)
-
- return out, absmax
+ device = A.device
+ device_type = device.type
+
+ if device_type == 'cuda':
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ elif device_type == 'hip' or HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+
+ torch._check(quant_type in ["fp4", "nf4"])
+ torch._check(
+ A.dtype in [torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"Blockwise 4bit quantization only supports 16/32-bit floats, but got {A.dtype}",
+ )
+
+ n = A.numel()
+ blocks = -(n // -blocksize)
+ absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
+ out = torch.empty(((n + 1) // (quant_storage.itemsize * 2), 1), device=A.device, dtype=quant_storage)
+
+ if device_type == 'cuda' or (device_type == 'hip' and HIP_ENVIRONMENT):
+ with _cuda_device_of(A):
+ args = (
+ None,
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int32(blocksize),
+ ct.c_int(n),
+ )
+
+ if A.dtype == torch.bfloat16:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_bf16_fp4(*args)
+ else:
+ lib.cquantize_blockwise_bf16_nf4(*args)
+ elif A.dtype == torch.float16:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_fp16_fp4(*args)
+ else:
+ lib.cquantize_blockwise_fp16_nf4(*args)
+ elif A.dtype == torch.float32:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_fp32_fp4(*args)
+ else:
+ lib.cquantize_blockwise_fp32_nf4(*args)
+ else:
+ raise NotImplementedError(f"4-bit quantization not implemented for {device_type}")
+
+ return out, absmax
@register_kernel("bitsandbytes::dequantize_4bit", "cuda")
def _(
@@ -347,6 +419,7 @@ def _(
out = torch.empty(shape, dtype=dtype, device=A.device)
_dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
return out
+
@register_kernel("bitsandbytes::dequantize_4bit.out", "cuda")
@@ -359,52 +432,62 @@ def _(
dtype: torch.dtype,
out: torch.Tensor,
) -> None:
+
torch._check(out.shape == shape, lambda: f"Expected out.shape == {shape}, got {out.shape}")
torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
_dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
-
-def _dequantize_4bit_impl(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- torch._check(quant_type in ["fp4", "nf4"])
- torch._check(
- dtype in [torch.bfloat16, torch.float16, torch.float32],
- lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
- )
-
- with _cuda_device_of(A):
- args = (
- None,
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int(blocksize),
- ct.c_int(out.numel()),
- _get_tensor_stream(A),
- )
-
- if out.dtype == torch.bfloat16:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_bf16_fp4(*args)
- else:
- lib.cdequantize_blockwise_bf16_nf4(*args)
- elif out.dtype == torch.float16:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_fp16_fp4(*args)
- else:
- lib.cdequantize_blockwise_fp16_nf4(*args)
- elif out.dtype == torch.float32:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_fp32_fp4(*args)
- else:
- lib.cdequantize_blockwise_fp32_nf4(*args)
+def _dequantize_4bit_impl(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ device = A.device
+ device_type = device.type
+
+ if device_type == 'cuda':
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ elif device_type == 'hip' and HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+
+ torch._check(quant_type in ["fp4", "nf4"])
+ torch._check(
+ dtype in [torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
+ )
+
+ if device_type == 'cuda' or (device_type == 'hip' and HIP_ENVIRONMENT):
+ with _cuda_device_of(A):
+ args = (
+ None,
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(blocksize),
+ ct.c_int(out.numel()),
+ _get_tensor_stream(A),
+ )
+
+ if out.dtype == torch.bfloat16:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_bf16_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_bf16_nf4(*args)
+ elif out.dtype == torch.float16:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_fp16_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_fp16_nf4(*args)
+ elif out.dtype == torch.float32:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_fp32_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_fp32_nf4(*args)
+ else:
+ raise NotImplementedError(f"4-bit dequantization not implemented for {device_type}")
@register_kernel("bitsandbytes::gemv_4bit", "cuda")
@@ -457,7 +540,7 @@ def _gemv_4bit_impl(
B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32],
lambda: f"B must be backed by storage of type uint8, bfloat16, float16, or float32, got {B.dtype}",
)
- torch._check(absmax.dtype == torch.float32, lambda: f"absmax must be float32, got {absmax.dtype}")
+ torch._check(absmax.dtype == torch.float32, lambda: f"absmax must be float32, got {absmax.dtype}")
torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
m = ct.c_int32(shapeB[0])
From 6459c2bd6e4eb68fbe36d3deb200ac3492f96c1a Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Tue, 20 May 2025 21:15:00 +0530
Subject: [PATCH 10/85] Update functional.py
---
bitsandbytes/functional.py | 391 ++++++++++++++++++++++---------------
1 file changed, 238 insertions(+), 153 deletions(-)
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index b0092ffd1..7730f7182 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -15,7 +15,7 @@
from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
-from .cextension import lib
+from .cextension import lib, HIP_ENVIRONMENT
name2qmap = {}
@@ -719,152 +719,222 @@ def __eq__(self, other):
)
-def quantize_blockwise(
- A: torch.Tensor,
- code: Optional[torch.Tensor] = None,
- absmax: Optional[torch.Tensor] = None,
- out: Optional[torch.Tensor] = None,
- blocksize=4096,
- nested=False,
-) -> tuple[torch.Tensor, QuantState]:
- """Quantize a tensor in blocks of values.
-
- The input tensor is quantized by dividing it into blocks of `blocksize` values.
- The the absolute maximum value within these blocks is calculated for scaling
- the non-linear quantization.
-
- Args:
- A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
- code (`torch.Tensor`, *optional*):
- A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
- For more details, see (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561].
- absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
- out (`torch.Tensor`, *optional*): A tensor to use to store the result.
- blocksize (`int`, *optional*):
- The size of the blocks. Defaults to 4096.
- Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
- nested (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.
-
- Raises:
- ValueError: Raised when the input data type is not supported.
-
- Returns:
- `Tuple[torch.Tensor, QuantState]`: A tuple containing the quantization results.
- - `torch.Tensor`: The quantized tensor.
- - [`QuantState`]: The state object used to undo the quantization.
- """
-
- if code is None:
- if "dynamic" not in name2qmap:
- name2qmap["dynamic"] = create_dynamic_map().to(A.device)
- code = name2qmap["dynamic"]
-
- _out, _absmax = torch.ops.bitsandbytes.quantize_blockwise.default(
- A,
- code.to(A.device),
- blocksize,
- )
-
- if nested:
- offset = _absmax.mean()
- _absmax -= offset
- qabsmax, state2 = quantize_blockwise(_absmax, blocksize=blocksize, nested=False)
- quant_state = QuantState(
- absmax=qabsmax,
- code=code,
- blocksize=blocksize,
- dtype=A.dtype,
- offset=offset,
- state2=state2,
- )
- else:
- quant_state = QuantState(absmax=_absmax, code=code.to(A.device), blocksize=blocksize, dtype=A.dtype)
-
- # TODO(matthewdouglas): Deprecate out kwarg
- out = out.copy_(_out) if out is not None else _out
-
- # TODO(matthewdouglas): Deprecate absmax kwarg
- if absmax is not None:
- quant_state.absmax = absmax.copy_(quant_state.absmax)
-
- return out, quant_state
-
-
-def dequantize_blockwise(
- A: torch.Tensor,
- quant_state: Optional[QuantState] = None,
- absmax: Optional[torch.Tensor] = None,
- code: Optional[torch.Tensor] = None,
- out: Optional[torch.Tensor] = None,
- blocksize: int = 4096,
- nested=False,
-) -> torch.Tensor:
- """Dequantize a tensor in blocks of values.
-
- The input tensor is dequantized by dividing it into blocks of `blocksize` values.
- The the absolute maximum value within these blocks is used for scaling
- the non-linear dequantization.
-
- Args:
- A (`torch.Tensor`): The quantized input tensor.
- quant_state ([`QuantState`], *optional*):
- The quantization state as returned by [`quantize_blockwise`].
- Required if `absmax` is not provided.
- absmax (`torch.Tensor`, *optional*):
- A tensor containing the scaling values.
- Required if `quant_state` is not provided and ignored otherwise.
- code (`torch.Tensor`, *optional*):
- A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
- For more details, see (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561].
- Ignored when `quant_state` is provided.
- out (`torch.Tensor`, *optional*): A tensor to use to store the result.
- blocksize (`int`, *optional*):
- The size of the blocks. Defaults to 4096.
- Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
- Ignored when `quant_state` is provided.
-
- Raises:
- ValueError: Raised when the input data type is not supported.
-
- Returns:
- `torch.Tensor`:
- The dequantized tensor. The datatype is indicated by `quant_state.dtype` and defaults to `torch.float32`.
- """
-
- assert quant_state is not None or absmax is not None
- if code is None and quant_state is None:
- if "dynamic" not in name2qmap:
- name2qmap["dynamic"] = create_dynamic_map().to(A.device)
- code = name2qmap["dynamic"]
-
- if quant_state is None:
- quant_state = QuantState(absmax=absmax, code=code, blocksize=blocksize, dtype=torch.float32)
-
- absmax = quant_state.absmax
- if quant_state.nested:
- absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
- absmax += quant_state.offset
- if absmax.dtype != torch.float32:
- absmax = absmax.float()
-
- if out is not None:
- torch.ops.bitsandbytes.dequantize_blockwise.out(
- A,
- absmax,
- code.to(A.device),
- blocksize,
- quant_state.dtype,
- out=out,
- )
- return out
-
- return torch.ops.bitsandbytes.dequantize_blockwise.default(
- A,
- absmax,
- quant_state.code.to(A.device),
- quant_state.blocksize,
- quant_state.dtype,
- )
+def quantize_blockwise(
+ A: torch.Tensor,
+ code: Optional[torch.Tensor] = None,
+ absmax: Optional[torch.Tensor] = None,
+ out: Optional[torch.Tensor] = None,
+ blocksize=4096,
+ nested=False,
+) -> tuple[torch.Tensor, QuantState]:
+ """Quantize a tensor in blocks of values.
+
+ The input tensor is quantized by dividing it into blocks of `blocksize` values.
+ The the absolute maximum value within these blocks is calculated for scaling
+ the non-linear quantization.
+
+ Args:
+ A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
+ code (`torch.Tensor`, *optional*):
+ A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
+ For more details, see [8-Bit Approximations for Parallelism in Deep Learning](https://arxiv.org/abs/1511.04561).
+ absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
+ out (`torch.Tensor`, *optional*): A tensor to use to store the result.
+ blocksize (`int`, *optional*):
+ The size of the blocks. Defaults to 4096.
+ Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
+ nested (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.
+
+ Raises:
+ ValueError: Raised when the input data type is not supported.
+
+ Returns:
+ `Tuple[torch.Tensor, QuantState]`: A tuple containing the quantization results.
+ - `torch.Tensor`: The quantized tensor.
+ - [`QuantState`]: The state object used to undo the quantization.
+ """
+
+ if code is None:
+ if "dynamic" not in name2qmap:
+ name2qmap["dynamic"] = create_dynamic_map().to(A.device)
+ code = name2qmap["dynamic"]
+
+ if absmax is None:
+ n = A.numel()
+ blocks = -(n // -blocksize)
+ absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
+
+ if out is None:
+ out = torch.zeros_like(A, dtype=torch.uint8)
+
+ device_type = A.device.type
+
+ if device_type == "cpu":
+ code = code.cpu()
+ lib.cquantize_blockwise_cpu_fp32(
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_longlong(blocksize),
+ ct.c_longlong(A.numel()),
+ )
+ elif device_type in ["cuda", "hip"]:
+ if not HIP_ENVIRONMENT:
+ assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+ else:
+ assert blocksize in [4096, 2048, 1024, 512, 256, 128]
+
+ code = code.to(A.device)
+
+ is_on_gpu([A, out, absmax])
+
+ with _cuda_device_of(A):
+ args = (
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int32(blocksize),
+ ct.c_int(A.numel()),
+ )
+
+ if A.dtype == torch.float16:
+ lib.cquantize_blockwise_fp16(*args)
+ elif A.dtype == torch.bfloat16:
+ lib.cquantize_blockwise_bf16(*args)
+ elif A.dtype == torch.float32:
+ lib.cquantize_blockwise_fp32(*args)
+ else:
+ raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+ else:
+ raise RuntimeError(f"Device type {device_type} not supported for quantization")
+
+ if nested:
+ offset = absmax.mean()
+ absmax -= offset
+ qabsmax, state2 = quantize_blockwise(absmax, blocksize=blocksize, nested=False)
+ quant_state = QuantState(
+ absmax=qabsmax,
+ code=code,
+ blocksize=blocksize,
+ dtype=A.dtype,
+ offset=offset,
+ state2=state2,
+ )
+ else:
+ quant_state = QuantState(absmax=absmax, code=code, blocksize=blocksize, dtype=A.dtype)
+
+ return out, quant_state
+
+
+def dequantize_blockwise(
+ A: torch.Tensor,
+ quant_state: Optional[QuantState] = None,
+ absmax: Optional[torch.Tensor] = None,
+ code: Optional[torch.Tensor] = None,
+ out: Optional[torch.Tensor] = None,
+ blocksize: int = 4096,
+ nested=False,
+) -> torch.Tensor:
+ """Dequantize a tensor in blocks of values.
+
+ The input tensor is dequantized by dividing it into blocks of `blocksize` values.
+ The the absolute maximum value within these blocks is used for scaling
+ the non-linear dequantization.
+
+ Args:
+ A (`torch.Tensor`): The quantized input tensor.
+ quant_state ([`QuantState`], *optional*):
+ The quantization state as returned by [`quantize_blockwise`].
+ Required if `absmax` is not provided.
+ absmax (`torch.Tensor`, *optional*):
+ A tensor containing the scaling values.
+ Required if `quant_state` is not provided and ignored otherwise.
+ code (`torch.Tensor`, *optional*):
+ A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
+ For more details, see [8-Bit Approximations for Parallelism in Deep Learning](https://arxiv.org/abs/1511.04561).
+ Ignored when `quant_state` is provided.
+ out (`torch.Tensor`, *optional*): A tensor to use to store the result.
+ blocksize (`int`, *optional*):
+ The size of the blocks. Defaults to 4096.
+ Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
+ Ignored when `quant_state` is provided.
+
+ Raises:
+ ValueError: Raised when the input data type is not supported.
+
+ Returns:
+ `torch.Tensor`:
+ The dequantized tensor. The datatype is indicated by `quant_state.dtype` and defaults to `torch.float32`.
+ """
+
+ assert quant_state is not None or absmax is not None
+ if code is None and quant_state is None:
+ if "dynamic" not in name2qmap:
+ name2qmap["dynamic"] = create_dynamic_map().to(A.device)
+ code = name2qmap["dynamic"]
+
+ if quant_state is None:
+ quant_state = QuantState(absmax=absmax, code=code, blocksize=blocksize, dtype=torch.float32)
+
+ absmax = quant_state.absmax
+ if quant_state.nested:
+ absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
+ absmax += quant_state.offset
+ if absmax.dtype != torch.float32:
+ absmax = absmax.float()
+
+ if out is None:
+ out = torch.empty(A.shape, dtype=quant_state.dtype, device=A.device)
+
+ device_type = A.device.type
+
+ if device_type == "cpu":
+ code = quant_state.code.cpu()
+ lib.cdequantize_blockwise_cpu_fp32(
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(quant_state.absmax),
+ get_ptr(out),
+ ct.c_longlong(quant_state.blocksize),
+ ct.c_longlong(A.numel()),
+ )
+ elif device_type in ["cuda", "hip"]:
+ code = quant_state.code.to(A.device)
+ supported_blocksizes = [2048, 4096, 1024, 512, 256, 128, 64]
+ if HIP_ENVIRONMENT:
+ supported_blocksizes = supported_blocksizes[:-1]
+ if quant_state.blocksize not in supported_blocksizes:
+ raise ValueError(
+ f"The blocksize of {quant_state.blocksize} is not supported. Supported values: {supported_blocksizes}",
+ )
+
+ is_on_gpu([A, absmax, out])
+
+ with _cuda_device_of(A):
+ args = (
+ get_ptr(quant_state.code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(quant_state.blocksize),
+ ct.c_int(A.numel()),
+ _get_tensor_stream(A),
+ )
+
+ if out.dtype == torch.float16:
+ lib.cdequantize_blockwise_fp16(*args)
+ elif out.dtype == torch.bfloat16:
+ lib.cdequantize_blockwise_bf16(*args)
+ elif out.dtype == torch.float32:
+ lib.cdequantize_blockwise_fp32(*args)
+ else:
+ raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {out.dtype}")
+ else:
+ raise RuntimeError(f"Device type {device_type} not supported for dequantization")
+
+ return out
def get_4bit_type(typename, device=None, blocksize=64):
@@ -953,10 +1023,12 @@ def quantize_fp4(
A: torch.Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize=64,
+ blocksize=None,
compress_statistics=False,
quant_storage=torch.uint8,
):
+ if blocksize is None:
+ blocksize = 64 if not HIP_ENVIRONMENT else 128
return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "fp4", quant_storage)
@@ -964,10 +1036,12 @@ def quantize_nf4(
A: torch.Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize=64,
+ blocksize=None,
compress_statistics=False,
quant_storage=torch.uint8,
):
+ if blocksize is None:
+ blocksize = 64 if not HIP_ENVIRONMENT else 128
return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "nf4", quant_storage)
@@ -975,7 +1049,7 @@ def quantize_4bit(
A: torch.Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize=64,
+ blocksize=None,
compress_statistics=False,
quant_type="fp4",
quant_storage=torch.uint8,
@@ -1003,6 +1077,9 @@ def quantize_4bit(
- `torch.Tensor`: The quantized tensor with packed 4-bit values.
- [`QuantState`]: The state object used to undo the quantization.
"""
+ if blocksize is None:
+ blocksize = 64 if not HIP_ENVIRONMENT else 128
+
input_shape = A.shape
_out, _absmax = torch.ops.bitsandbytes.quantize_4bit.default(
@@ -1053,8 +1130,10 @@ def dequantize_fp4(
quant_state: Optional[QuantState] = None,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize: int = 64,
+ blocksize: Optional[int] = None,
) -> torch.Tensor:
+ if blocksize is None:
+ blocksize = 64 if not HIP_ENVIRONMENT else 128
return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4")
@@ -1063,8 +1142,10 @@ def dequantize_nf4(
quant_state: Optional[QuantState] = None,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize: int = 64,
+ blocksize: Optional[int] = None,
) -> torch.Tensor:
+ if blocksize is None:
+ blocksize = 64 if not HIP_ENVIRONMENT else 128
return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4")
@@ -1073,7 +1154,7 @@ def dequantize_4bit(
quant_state: Optional[QuantState] = None,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize: int = 64,
+ blocksize: Optional[int] = None,
quant_type="fp4",
) -> torch.Tensor:
"""Dequantizes a packed 4-bit quantized tensor.
@@ -1102,6 +1183,10 @@ def dequantize_4bit(
Returns:
`torch.Tensor`: The dequantized tensor.
"""
+
+ if blocksize is None:
+ blocksize = 64 if not HIP_ENVIRONMENT else 128
+
if quant_state is None:
assert absmax is not None and out is not None
From 09249c897e47708ea9d4e594b8deaea439d74ade Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 21 May 2025 20:12:20 +0530
Subject: [PATCH 11/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 106 +++++++++++++-----------------
1 file changed, 44 insertions(+), 62 deletions(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index fd63c888d..40f25a18f 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -1,3 +1,4 @@
+
from collections.abc import Sequence
import ctypes as ct
from math import prod
@@ -5,7 +6,7 @@
import torch
-from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
+from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr, is_on_gpu
from ..._ops import register_kernel
from ...cextension import lib, HIP_ENVIRONMENT
@@ -43,7 +44,7 @@ def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor
n = prod(shapeB[:-1])
lda = shapeA[-1] # Weights (outputs, inputs)
ldb = shapeB[-1] # Activations (batch, tokens, inputs)
- ldc = shapeC[-1] # Output (batch, tokens, outputs)
+ ldc = shapeC[-1] # Output (batch, tokens, outputs)
torch._check(
lda == ldb,
@@ -53,10 +54,18 @@ def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor
# cuBLASLt does not support int8 matmul with inner dimensions that are not divisible by 4.
# We'll fall back to a slower fp32 calculation in this circumstance.
# Fortunately, this should not be very common.
- if lda % 4 != 0:
- result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
- return out.copy_(result)
+ if lda % 4 != 0:
+ result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
+ if out is not None:
+ result = out.copy_(result)
+ return result
+
+ if out is None:
+ out = torch.empty(shapeC, device=A.device, dtype=dtype)
+
+ is_on_gpu([A, B, out])
+
with _cuda_device_of(A):
ctx = CUBLAS_Context.get_instance().get_context(A.device)
ptrA = get_ptr(A)
@@ -71,8 +80,11 @@ def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor
ldc = ct.c_int32(ldc)
stream = _get_tensor_stream(A)
- has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
-
+ if dtype == torch.int32:
+ has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
+ else:
+ has_error = lib.cigemmlt_8(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
+
if has_error:
if has_error == 100:
# `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
@@ -111,6 +123,8 @@ def _(
# Note: fused bias in the kernel is only supported for fp16
# TODO(matthewdouglas): Consider supporting bf16 fused bias
ptrBias = get_ptr(bias) if bias is not None and bias.dtype == torch.float16 else None
+
+ is_on_gpu([A, row_stats, col_stats, out, bias])
with _cuda_device_of(A):
lib.cdequant_mm_int32_fp16(
@@ -128,6 +142,8 @@ def _(
def _(A: torch.Tensor, threshold=0.0):
torch._check(A.dtype == torch.float16, lambda: f"A must be float16, got {A.dtype}")
torch._check(threshold >= 0.0, lambda: "threshold must be non-negative")
+
+ is_on_gpu([A])
rows = prod(A.shape[:-1])
cols = A.shape[-1]
@@ -216,7 +232,7 @@ def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor
if device_type == 'cuda':
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- elif device_type == 'hip' and HIP_ENVIRONMENT:
+ elif device_type == 'hip' or HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
@@ -225,8 +241,10 @@ def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor
blocks = -(n // -blocksize)
absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
out = torch.empty_like(A, dtype=torch.uint8)
-
- if device_type == 'cuda' or (device_type == 'hip' and HIP_ENVIRONMENT):
+
+ is_on_gpu([A, out, absmax])
+
+ if device_type == 'cuda' or (device_type == 'hip' or HIP_ENVIRONMENT):
with _cuda_device_of(A):
args = (
get_ptr(code),
@@ -245,30 +263,7 @@ def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor
lib.cquantize_blockwise_fp32(*args)
else:
raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
- elif device_type == 'cpu':
- cpu_kernel_func = getattr(lib, 'cquantize_blockwise_cpu_fp32', None)
- if cpu_kernel_func:
- A_cpu = A.to(torch.float32) if A.dtype != torch.float32 else A
- code_cpu = code.to('cpu')
- absmax_cpu = torch.empty(absmax.shape, device='cpu', dtype=torch.float32)
- out_cpu = torch.empty(out.shape, device='cpu', dtype=torch.uint8)
-
- cpu_kernel_func(
- get_ptr(code_cpu),
- get_ptr(A_cpu),
- get_ptr(absmax_cpu),
- get_ptr(out_cpu),
- ct.c_longlong(blocksize),
- ct.c_longlong(A_cpu.numel())
- )
-
- out.copy_(out_cpu)
- absmax.copy_(absmax_cpu)
- else:
- raise NotImplementedError("CPU blockwise quantization requires C extension support")
- else:
- raise NotImplementedError(f"Blockwise quantization not implemented for {device_type}")
-
+
return out, absmax
@@ -302,7 +297,7 @@ def _dequantize_blockwise_impl(
if device_type == 'cuda':
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- elif device_type == 'hip' and HIP_ENVIRONMENT:
+ elif device_type == 'hip' or HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
@@ -310,8 +305,10 @@ def _dequantize_blockwise_impl(
dtype in [torch.float16, torch.bfloat16, torch.float32],
lambda: f"Blockwise dequantization only supports 16bit/32bit floating types, got {dtype}",
)
-
- if device_type == 'cuda' or (device_type == 'hip' and HIP_ENVIRONMENT):
+
+ is_on_gpu([A, absmax, out])
+
+ if device_type == 'cuda' or (device_type == 'hip' or HIP_ENVIRONMENT):
with _cuda_device_of(A):
args = (
get_ptr(code),
@@ -328,29 +325,8 @@ def _dequantize_blockwise_impl(
elif dtype == torch.bfloat16:
lib.cdequantize_blockwise_bf16(*args)
elif dtype == torch.float32:
- lib.cdequantize_blockwise_fp32(*args)
- elif device_type == 'cpu':
- cpu_kernel_func = getattr(lib, 'cdequantize_blockwise_cpu_fp32', None)
- if cpu_kernel_func:
- code_cpu = code.to('cpu')
- A_cpu = A.to('cpu')
- absmax_cpu = absmax.to('cpu')
- out_cpu = torch.empty(out.shape, dtype=torch.float32, device='cpu')
-
- cpu_kernel_func(
- get_ptr(code_cpu),
- get_ptr(A_cpu),
- get_ptr(absmax_cpu),
- get_ptr(out_cpu),
- ct.c_longlong(blocksize),
- ct.c_longlong(A.numel())
- )
-
- out.copy_(out_cpu.to(dtype))
- else:
- raise NotImplementedError("CPU blockwise dequantization requires C extension support")
- else:
- raise NotImplementedError(f"Blockwise dequantization not implemented for {device_type}")
+ lib.cdequantize_blockwise_fp32(*args)
+
@register_kernel("bitsandbytes::quantize_4bit", "cuda")
def _(
@@ -375,7 +351,9 @@ def _(
blocks = -(n // -blocksize)
absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
out = torch.empty(((n + 1) // (quant_storage.itemsize * 2), 1), device=A.device, dtype=quant_storage)
-
+
+ is_on_gpu([A, out, absmax])
+
if device_type == 'cuda' or (device_type == 'hip' and HIP_ENVIRONMENT):
with _cuda_device_of(A):
args = (
@@ -450,7 +428,7 @@ def _dequantize_4bit_impl(
if device_type == 'cuda':
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- elif device_type == 'hip' and HIP_ENVIRONMENT:
+ elif device_type == 'hip' or HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
torch._check(quant_type in ["fp4", "nf4"])
@@ -459,6 +437,8 @@ def _dequantize_4bit_impl(
lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
)
+ is_on_gpu([A, absmax, out])
+
if device_type == 'cuda' or (device_type == 'hip' and HIP_ENVIRONMENT):
with _cuda_device_of(A):
args = (
@@ -550,6 +530,8 @@ def _gemv_4bit_impl(
lda = m
ldb = ct.c_int32((A.shape[-1] + 1) // 2)
ldc = m
+
+ is_on_gpu([B, A, out, absmax])
stream = _get_tensor_stream(A)
From 4afa7741b3b7105ac6a42700dab1fd83b5050fc5 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 21 May 2025 20:12:36 +0530
Subject: [PATCH 12/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index 40f25a18f..ce5401c5f 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -1,4 +1,3 @@
-
from collections.abc import Sequence
import ctypes as ct
from math import prod
From 033d92cef2d41431fd4247c272c9429f7304bf40 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 21 May 2025 20:23:34 +0530
Subject: [PATCH 13/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index ce5401c5f..14f55847c 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -353,7 +353,7 @@ def _(
is_on_gpu([A, out, absmax])
- if device_type == 'cuda' or (device_type == 'hip' and HIP_ENVIRONMENT):
+ if device_type == 'cuda' or (device_type == 'hip' or HIP_ENVIRONMENT):
with _cuda_device_of(A):
args = (
None,
@@ -438,7 +438,7 @@ def _dequantize_4bit_impl(
is_on_gpu([A, absmax, out])
- if device_type == 'cuda' or (device_type == 'hip' and HIP_ENVIRONMENT):
+ if device_type == 'cuda' or (device_type == 'hip' or HIP_ENVIRONMENT):
with _cuda_device_of(A):
args = (
None,
From 70bacda70b6b0316a872319e8da335bd7045ff71 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Wed, 21 May 2025 15:58:15 -0400
Subject: [PATCH 14/85] Update test for L40S
---
tests/test_functional.py | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 423a92193..0b9390aaa 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -525,7 +525,13 @@ def min_max(x):
# print(mean(errs2))
# print(mean(relerrs2))
assert mean(errs) < 0.015
- assert mean(relerrs) < 0.3
+
+ # There's a higher relerr on L40S with torch 2.4+cu118.
+ is_sm89 = torch.cuda.get_device_capability() == (8, 9)
+ if torch.version.cuda == "11.8" and is_sm89 and torch.__version__ < (2, 5):
+ assert mean(relerrs) < 0.41
+ else:
+ assert mean(relerrs) < 0.3
@pytest.mark.parametrize("dim1", [1, 64], ids=id_formatter("dim1"))
@pytest.mark.parametrize("dim2", [32, 128], ids=id_formatter("dim2"))
From d47553356f59c9e3e27059cd5872679e30293ece Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Wed, 21 May 2025 16:49:30 -0400
Subject: [PATCH 15/85] Update README.md
---
README.md | 52 ++++++++++++++++++++++++++++------------------------
1 file changed, 28 insertions(+), 24 deletions(-)
diff --git a/README.md b/README.md
index 668bc5309..1bc87323c 100644
--- a/README.md
+++ b/README.md
@@ -36,44 +36,45 @@ bitsandbytes has the following minimum requirements for all platforms:
- | 🐧 Linux |
+ 🐧 Linux, glibc >= 2.24 |
| x86-64 |
◻️ CPU |
- |
+ AVX2 |
〰️ Partial Support |
|
- 🟩 NVIDIA GPU |
+ 🟩 NVIDIA GPU
cuda |
SM50+ minimum SM75+ recommended |
- ✅ Full Support * |
+ ✅ Full Support |
|
- 🟥 AMD GPU |
- gfx90a, gfx942, gfx1100 |
+ 🟥 AMD GPU
cuda |
+
+ CDNA: gfx90a, gfx942
+ RDNA: gfx1100, gfx1200
+ |
🚧 In Development |
|
- 🟦 Intel XPU |
+ 🟦 Intel GPU
xpu |
- Data Center GPU Max Series (Ponte Vecchio)
- Arc A-Series (Alchemist)
+ Data Center GPU Max Series
+ Arc A-Series (Alchemist)
Arc B-Series (Battlemage)
|
🚧 In Development |
-
| aarch64 |
◻️ CPU |
@@ -82,12 +83,12 @@ bitsandbytes has the following minimum requirements for all platforms:
|
- 🟩 NVIDIA GPU |
+ 🟩 NVIDIA GPU
cuda |
SM75, SM80, SM90, SM100 |
- ✅ Full Support * |
+ ✅ Full Support |
- | 🪟 Windows |
+ 🪟 Windows 11 / Windows Server 2019+ |
| x86-64 |
@@ -97,13 +98,13 @@ bitsandbytes has the following minimum requirements for all platforms:
|
- 🟩 NVIDIA GPU |
+ 🟩 NVIDIA GPU
cuda |
SM50+ minimum SM75+ recommended |
- ✅ Full Support * |
+ ✅ Full Support |
|
- 🟦 Intel XPU |
+ 🟦 Intel GPU
xpu |
Arc A-Series (Alchemist)
Arc B-Series (Battlemage)
@@ -111,19 +112,22 @@ bitsandbytes has the following minimum requirements for all platforms:
| 🚧 In Development |
- | 🍎 macOS |
+ 🍎 macOS 13.1+ |
| arm64 |
- ◻️ CPU / Metal |
+ ◻️ CPU |
Apple M1+ |
- ❌ Under consideration |
+ 🛣️ Future Roadmap |
+
+ |
+ ⬜ Metal
mps |
+ Apple M1+ |
+ 🛣️ Future Roadmap |
-\* Accelerated INT8 requires SM75+.
-
## :book: Documentation
* [Official Documentation](https://huggingface.co/docs/bitsandbytes/main)
* 🤗 [Transformers](https://huggingface.co/docs/transformers/quantization/bitsandbytes)
From 4def9590abb8a3f0ef789fce0b1659af729643e4 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Thu, 22 May 2025 20:51:50 +0530
Subject: [PATCH 16/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 14 +++++---------
1 file changed, 5 insertions(+), 9 deletions(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index 14f55847c..5b94c5349 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -11,7 +11,6 @@
from ...cextension import lib, HIP_ENVIRONMENT
-
@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
def _(A: torch.Tensor, B: torch.Tensor):
out = torch.empty((*A.shape[:-1], B.shape[0]), device=A.device, dtype=torch.int32)
@@ -78,12 +77,9 @@ def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor
ldb = ct.c_int32(ldb)
ldc = ct.c_int32(ldc)
stream = _get_tensor_stream(A)
-
- if dtype == torch.int32:
- has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
- else:
- has_error = lib.cigemmlt_8(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
-
+
+ has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
+
if has_error:
if has_error == 100:
# `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
@@ -96,6 +92,7 @@ def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor
return out
+
@register_kernel("bitsandbytes::int8_mm_dequant", "cuda")
def _(
A: torch.Tensor,
@@ -384,6 +381,7 @@ def _(
return out, absmax
+
@register_kernel("bitsandbytes::dequantize_4bit", "cuda")
def _(
A: torch.Tensor,
@@ -398,7 +396,6 @@ def _(
return out
-
@register_kernel("bitsandbytes::dequantize_4bit.out", "cuda")
def _(
A: torch.Tensor,
@@ -496,7 +493,6 @@ def _(
torch._check(out.dtype == A.dtype, lambda: f"Expected out.dtype == {A.dtype}, got {out.dtype}")
_gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
-
def _gemv_4bit_impl(
A: torch.Tensor,
B: torch.Tensor,
From 0f318667aaf4de15cd29f8063dcaa4fd90d24783 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Thu, 22 May 2025 21:31:55 +0530
Subject: [PATCH 17/85] Update functional.py
---
bitsandbytes/functional.py | 157 ++++++++++++-------------------------
1 file changed, 48 insertions(+), 109 deletions(-)
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 7730f7182..3f0c1ff94 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -728,11 +728,9 @@ def quantize_blockwise(
nested=False,
) -> tuple[torch.Tensor, QuantState]:
"""Quantize a tensor in blocks of values.
-
The input tensor is quantized by dividing it into blocks of `blocksize` values.
The the absolute maximum value within these blocks is calculated for scaling
the non-linear quantization.
-
Args:
A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
code (`torch.Tensor`, *optional*):
@@ -744,10 +742,8 @@ def quantize_blockwise(
The size of the blocks. Defaults to 4096.
Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
nested (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.
-
Raises:
ValueError: Raised when the input data type is not supported.
-
Returns:
`Tuple[torch.Tensor, QuantState]`: A tuple containing the quantization results.
- `torch.Tensor`: The quantized tensor.
@@ -759,61 +755,23 @@ def quantize_blockwise(
name2qmap["dynamic"] = create_dynamic_map().to(A.device)
code = name2qmap["dynamic"]
- if absmax is None:
- n = A.numel()
- blocks = -(n // -blocksize)
- absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
-
- if out is None:
- out = torch.zeros_like(A, dtype=torch.uint8)
-
device_type = A.device.type
-
- if device_type == "cpu":
- code = code.cpu()
- lib.cquantize_blockwise_cpu_fp32(
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_longlong(blocksize),
- ct.c_longlong(A.numel()),
- )
- elif device_type in ["cuda", "hip"]:
+ if device_type in ["cuda", "hip"]:
if not HIP_ENVIRONMENT:
- assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+ assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
else:
- assert blocksize in [4096, 2048, 1024, 512, 256, 128]
-
- code = code.to(A.device)
-
- is_on_gpu([A, out, absmax])
+ assert blocksize in [4096, 2048, 1024, 512, 256, 128]
- with _cuda_device_of(A):
- args = (
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int32(blocksize),
- ct.c_int(A.numel()),
- )
-
- if A.dtype == torch.float16:
- lib.cquantize_blockwise_fp16(*args)
- elif A.dtype == torch.bfloat16:
- lib.cquantize_blockwise_bf16(*args)
- elif A.dtype == torch.float32:
- lib.cquantize_blockwise_fp32(*args)
- else:
- raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
- else:
- raise RuntimeError(f"Device type {device_type} not supported for quantization")
+ _out, _absmax = torch.ops.bitsandbytes.quantize_blockwise.default(
+ A,
+ code.to(A.device),
+ blocksize,
+ )
if nested:
- offset = absmax.mean()
- absmax -= offset
- qabsmax, state2 = quantize_blockwise(absmax, blocksize=blocksize, nested=False)
+ offset = _absmax.mean()
+ _absmax -= offset
+ qabsmax, state2 = quantize_blockwise(_absmax, blocksize=blocksize, nested=False)
quant_state = QuantState(
absmax=qabsmax,
code=code,
@@ -823,11 +781,18 @@ def quantize_blockwise(
state2=state2,
)
else:
- quant_state = QuantState(absmax=absmax, code=code, blocksize=blocksize, dtype=A.dtype)
+ quant_state = QuantState(absmax=_absmax, code=code.to(A.device), blocksize=blocksize, dtype=A.dtype)
+
+ # TODO(matthewdouglas): Deprecate out kwarg
+ out = out.copy_(_out) if out is not None else _out
+
+ # TODO(matthewdouglas): Deprecate absmax kwarg
+ if absmax is not None:
+ quant_state.absmax = absmax.copy_(quant_state.absmax)
+
+ return out, quant_state
+
- return out, quant_state
-
-
def dequantize_blockwise(
A: torch.Tensor,
quant_state: Optional[QuantState] = None,
@@ -838,11 +803,9 @@ def dequantize_blockwise(
nested=False,
) -> torch.Tensor:
"""Dequantize a tensor in blocks of values.
-
The input tensor is dequantized by dividing it into blocks of `blocksize` values.
The the absolute maximum value within these blocks is used for scaling
the non-linear dequantization.
-
Args:
A (`torch.Tensor`): The quantized input tensor.
quant_state ([`QuantState`], *optional*):
@@ -860,10 +823,8 @@ def dequantize_blockwise(
The size of the blocks. Defaults to 4096.
Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
Ignored when `quant_state` is provided.
-
Raises:
ValueError: Raised when the input data type is not supported.
-
Returns:
`torch.Tensor`:
The dequantized tensor. The datatype is indicated by `quant_state.dtype` and defaults to `torch.float32`.
@@ -878,6 +839,16 @@ def dequantize_blockwise(
if quant_state is None:
quant_state = QuantState(absmax=absmax, code=code, blocksize=blocksize, dtype=torch.float32)
+ device_type = A.device.type
+ if device_type in ["cuda", "hip"]:
+ supported_blocksizes = [4096, 2048, 1024, 512, 256, 128, 64]
+ if HIP_ENVIRONMENT:
+ supported_blocksizes = supported_blocksizes[:-1]
+ if quant_state.blocksize not in supported_blocksizes:
+ raise ValueError(
+ f"The blocksize of {quant_state.blocksize} is not supported. Supported values: {supported_blocksizes}"
+ )
+
absmax = quant_state.absmax
if quant_state.nested:
absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
@@ -885,56 +856,24 @@ def dequantize_blockwise(
if absmax.dtype != torch.float32:
absmax = absmax.float()
- if out is None:
- out = torch.empty(A.shape, dtype=quant_state.dtype, device=A.device)
-
- device_type = A.device.type
-
- if device_type == "cpu":
- code = quant_state.code.cpu()
- lib.cdequantize_blockwise_cpu_fp32(
- get_ptr(code),
- get_ptr(A),
- get_ptr(quant_state.absmax),
- get_ptr(out),
- ct.c_longlong(quant_state.blocksize),
- ct.c_longlong(A.numel()),
+ if out is not None:
+ torch.ops.bitsandbytes.dequantize_blockwise.out(
+ A,
+ absmax,
+ quant_state.code.to(A.device),
+ quant_state.blocksize,
+ quant_state.dtype,
+ out=out,
)
- elif device_type in ["cuda", "hip"]:
- code = quant_state.code.to(A.device)
- supported_blocksizes = [2048, 4096, 1024, 512, 256, 128, 64]
- if HIP_ENVIRONMENT:
- supported_blocksizes = supported_blocksizes[:-1]
- if quant_state.blocksize not in supported_blocksizes:
- raise ValueError(
- f"The blocksize of {quant_state.blocksize} is not supported. Supported values: {supported_blocksizes}",
- )
-
- is_on_gpu([A, absmax, out])
-
- with _cuda_device_of(A):
- args = (
- get_ptr(quant_state.code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int(quant_state.blocksize),
- ct.c_int(A.numel()),
- _get_tensor_stream(A),
- )
-
- if out.dtype == torch.float16:
- lib.cdequantize_blockwise_fp16(*args)
- elif out.dtype == torch.bfloat16:
- lib.cdequantize_blockwise_bf16(*args)
- elif out.dtype == torch.float32:
- lib.cdequantize_blockwise_fp32(*args)
- else:
- raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {out.dtype}")
- else:
- raise RuntimeError(f"Device type {device_type} not supported for dequantization")
+ return out
- return out
+ return torch.ops.bitsandbytes.dequantize_blockwise.default(
+ A,
+ absmax,
+ quant_state.code.to(A.device),
+ quant_state.blocksize,
+ quant_state.dtype,
+ )
def get_4bit_type(typename, device=None, blocksize=64):
From 190faed7e96b8b27e033fe3c6ee5e3a6d5a4772a Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Thu, 22 May 2025 23:35:15 +0530
Subject: [PATCH 18/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 12 +++---------
1 file changed, 3 insertions(+), 9 deletions(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index 5b94c5349..ff5e023cc 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -52,15 +52,9 @@ def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor
# cuBLASLt does not support int8 matmul with inner dimensions that are not divisible by 4.
# We'll fall back to a slower fp32 calculation in this circumstance.
# Fortunately, this should not be very common.
-
- if lda % 4 != 0:
- result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
- if out is not None:
- result = out.copy_(result)
- return result
-
- if out is None:
- out = torch.empty(shapeC, device=A.device, dtype=dtype)
+ if lda % 4 != 0:
+ result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
+ return out.copy_(result)
is_on_gpu([A, B, out])
From d7f413b9b367b9b26b87180095ebcc7a561fdc26 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Thu, 22 May 2025 23:52:39 +0530
Subject: [PATCH 19/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 30 +++++-------------------------
1 file changed, 5 insertions(+), 25 deletions(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index ff5e023cc..b75f67d62 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -55,8 +55,6 @@ def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor
if lda % 4 != 0:
result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
return out.copy_(result)
-
- is_on_gpu([A, B, out])
with _cuda_device_of(A):
ctx = CUBLAS_Context.get_instance().get_context(A.device)
@@ -114,8 +112,6 @@ def _(
# TODO(matthewdouglas): Consider supporting bf16 fused bias
ptrBias = get_ptr(bias) if bias is not None and bias.dtype == torch.float16 else None
- is_on_gpu([A, row_stats, col_stats, out, bias])
-
with _cuda_device_of(A):
lib.cdequant_mm_int32_fp16(
ptrA, ptrRowStats, ptrColStats, ptrOut, ptrBias, numRows, numCols, _get_tensor_stream(A)
@@ -133,8 +129,6 @@ def _(A: torch.Tensor, threshold=0.0):
torch._check(A.dtype == torch.float16, lambda: f"A must be float16, got {A.dtype}")
torch._check(threshold >= 0.0, lambda: "threshold must be non-negative")
- is_on_gpu([A])
-
rows = prod(A.shape[:-1])
cols = A.shape[-1]
@@ -231,9 +225,7 @@ def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor
blocks = -(n // -blocksize)
absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
out = torch.empty_like(A, dtype=torch.uint8)
-
- is_on_gpu([A, out, absmax])
-
+
if device_type == 'cuda' or (device_type == 'hip' or HIP_ENVIRONMENT):
with _cuda_device_of(A):
args = (
@@ -295,9 +287,7 @@ def _dequantize_blockwise_impl(
dtype in [torch.float16, torch.bfloat16, torch.float32],
lambda: f"Blockwise dequantization only supports 16bit/32bit floating types, got {dtype}",
)
-
- is_on_gpu([A, absmax, out])
-
+
if device_type == 'cuda' or (device_type == 'hip' or HIP_ENVIRONMENT):
with _cuda_device_of(A):
args = (
@@ -341,9 +331,7 @@ def _(
blocks = -(n // -blocksize)
absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
out = torch.empty(((n + 1) // (quant_storage.itemsize * 2), 1), device=A.device, dtype=quant_storage)
-
- is_on_gpu([A, out, absmax])
-
+
if device_type == 'cuda' or (device_type == 'hip' or HIP_ENVIRONMENT):
with _cuda_device_of(A):
args = (
@@ -370,8 +358,6 @@ def _(
lib.cquantize_blockwise_fp32_fp4(*args)
else:
lib.cquantize_blockwise_fp32_nf4(*args)
- else:
- raise NotImplementedError(f"4-bit quantization not implemented for {device_type}")
return out, absmax
@@ -400,11 +386,11 @@ def _(
dtype: torch.dtype,
out: torch.Tensor,
) -> None:
-
torch._check(out.shape == shape, lambda: f"Expected out.shape == {shape}, got {out.shape}")
torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
_dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+
def _dequantize_4bit_impl(
A: torch.Tensor,
absmax: torch.Tensor,
@@ -426,9 +412,7 @@ def _dequantize_4bit_impl(
dtype in [torch.bfloat16, torch.float16, torch.float32],
lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
)
-
- is_on_gpu([A, absmax, out])
-
+
if device_type == 'cuda' or (device_type == 'hip' or HIP_ENVIRONMENT):
with _cuda_device_of(A):
args = (
@@ -456,8 +440,6 @@ def _dequantize_4bit_impl(
lib.cdequantize_blockwise_fp32_fp4(*args)
else:
lib.cdequantize_blockwise_fp32_nf4(*args)
- else:
- raise NotImplementedError(f"4-bit dequantization not implemented for {device_type}")
@register_kernel("bitsandbytes::gemv_4bit", "cuda")
@@ -520,8 +502,6 @@ def _gemv_4bit_impl(
ldb = ct.c_int32((A.shape[-1] + 1) // 2)
ldc = m
- is_on_gpu([B, A, out, absmax])
-
stream = _get_tensor_stream(A)
with _cuda_device_of(A):
From 3b6e68a001b0dce3b368129599335fcb569ac5cd Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 23 May 2025 00:05:43 +0530
Subject: [PATCH 20/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index b75f67d62..156125c9f 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -5,7 +5,7 @@
import torch
-from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr, is_on_gpu
+from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
from ..._ops import register_kernel
from ...cextension import lib, HIP_ENVIRONMENT
From 06740b1372a9c9751216b76dc4c8cc98514905dd Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 23 May 2025 01:53:30 +0530
Subject: [PATCH 21/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 989 +++++++++++++++---------------
1 file changed, 486 insertions(+), 503 deletions(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index 156125c9f..48dc75135 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -1,325 +1,312 @@
-from collections.abc import Sequence
-import ctypes as ct
-from math import prod
-from typing import Optional
-
-import torch
-
-from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
-
-from ..._ops import register_kernel
-from ...cextension import lib, HIP_ENVIRONMENT
-
-
-@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
-def _(A: torch.Tensor, B: torch.Tensor):
- out = torch.empty((*A.shape[:-1], B.shape[0]), device=A.device, dtype=torch.int32)
- return _int8_linear_matmul_impl(A, B, out)
-
-
-@register_kernel("bitsandbytes::int8_linear_matmul.out", "cuda")
-def _(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
- _int8_linear_matmul_impl(A, B, out)
-
-
-def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
- A, B = B, A
-
- shapeA = A.shape
- shapeB = B.shape
-
- torch._check(A.dtype == torch.int8, lambda: "B must be int8")
- torch._check(B.dtype == torch.int8, lambda: "A must be int8")
- torch._check(A.ndim == 2, lambda: "Only two dimensional matrices are supported for argument B")
- torch._check(B.ndim in [2, 3], lambda: "Only two or three dimensional matrices are supported for argument A")
- torch._check(prod(shapeB) > 0, lambda: f"Input tensor dimensions need to be > 0: {shapeB}")
- torch._check(out.dtype == torch.int32)
-
- shapeC = (*shapeB[:-1], shapeA[0])
- torch._check(out.shape == shapeC, lambda: f"Output shape {out.shape} does not match expected shape {shapeC}")
-
- k, m = shapeA
- n = prod(shapeB[:-1])
- lda = shapeA[-1] # Weights (outputs, inputs)
- ldb = shapeB[-1] # Activations (batch, tokens, inputs)
- ldc = shapeC[-1] # Output (batch, tokens, outputs)
-
- torch._check(
- lda == ldb,
- lambda: f"int8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}",
- )
-
- # cuBLASLt does not support int8 matmul with inner dimensions that are not divisible by 4.
- # We'll fall back to a slower fp32 calculation in this circumstance.
- # Fortunately, this should not be very common.
- if lda % 4 != 0:
- result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
- return out.copy_(result)
-
- with _cuda_device_of(A):
- ctx = CUBLAS_Context.get_instance().get_context(A.device)
- ptrA = get_ptr(A)
- ptrB = get_ptr(B)
- ptrC = get_ptr(out)
- ptrRowScale = None
- m = ct.c_int32(m)
- n = ct.c_int32(n)
- k = ct.c_int32(k)
- lda = ct.c_int32(lda)
- ldb = ct.c_int32(ldb)
- ldc = ct.c_int32(ldc)
- stream = _get_tensor_stream(A)
-
- has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
-
- if has_error:
- if has_error == 100:
- # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
- # TODO: Warn and implement a fallback to fp32 compute?
- raise NotImplementedError("int8_linear_matmul not implemented!")
- else:
- raise RuntimeError(
- f"cublasLt ran into an error!\n\t{shapeA=}, {shapeB=}, {shapeC=}\n\t{(lda, ldb, ldc)=}\n\t{(m, n, k)=}"
- )
-
- return out
-
-
-@register_kernel("bitsandbytes::int8_mm_dequant", "cuda")
-def _(
- A: torch.Tensor,
- row_stats: torch.Tensor,
- col_stats: torch.Tensor,
- dtype: Optional[torch.dtype] = None,
- bias: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
- torch._check(A.dtype == torch.int32, lambda: f"A must be int32, got {A.dtype}")
- torch._check(row_stats.dtype == torch.float32, lambda: f"row_stats must be float32, got {row_stats.dtype}")
- torch._check(col_stats.dtype == torch.float32, lambda: f"col_stats must be float32, got {col_stats.dtype}")
-
- # Note: cuda kernel only currently supports fp16 output.
- # We'll later cast to desired dtype if needed.
- out = torch.empty_like(A, dtype=torch.float16)
-
- ptrA = get_ptr(A)
- ptrOut = get_ptr(out)
- ptrRowStats = get_ptr(row_stats)
- ptrColStats = get_ptr(col_stats)
- numRows = ct.c_int32(prod(A.shape[:-1]))
- numCols = ct.c_int32(A.shape[-1])
-
- # Note: fused bias in the kernel is only supported for fp16
- # TODO(matthewdouglas): Consider supporting bf16 fused bias
- ptrBias = get_ptr(bias) if bias is not None and bias.dtype == torch.float16 else None
-
- with _cuda_device_of(A):
- lib.cdequant_mm_int32_fp16(
- ptrA, ptrRowStats, ptrColStats, ptrOut, ptrBias, numRows, numCols, _get_tensor_stream(A)
- )
-
- # Add bias separately if not fused in kernel
- if bias is not None and bias.dtype != torch.float16:
- out.add_(bias)
-
- return out.to(dtype or torch.float16)
-
-
-@register_kernel("bitsandbytes::int8_vectorwise_quant", "cuda")
-def _(A: torch.Tensor, threshold=0.0):
- torch._check(A.dtype == torch.float16, lambda: f"A must be float16, got {A.dtype}")
- torch._check(threshold >= 0.0, lambda: "threshold must be non-negative")
-
- rows = prod(A.shape[:-1])
- cols = A.shape[-1]
-
- row_stats = torch.empty(rows, device=A.device, dtype=torch.float32)
- out_row = torch.empty(A.shape, device=A.device, dtype=torch.int8)
-
- outlier_cols = None
-
- if threshold > 0.0:
- # TODO we could improve perf of this
- outliers = A.abs() >= threshold
-
- if outliers.any():
- outlier_cols = torch.argwhere(outliers.any(dim=0)).view(-1)
- else:
- # Needed for torch.compile support.
- outlier_cols = torch.empty(0, device=A.device, dtype=torch.int64)
-
- with _cuda_device_of(A):
- lib.cint8_vector_quant(
- get_ptr(A),
- get_ptr(out_row),
- get_ptr(row_stats),
- ct.c_float(threshold),
- ct.c_int32(rows),
- ct.c_int32(cols),
- _get_tensor_stream(A),
- )
-
- # Zero out values from outlier columns across all rows.
- # The kernel will handle this for outliers themselves, so we can optimize for rows=1.
- if rows > 1 and outlier_cols is not None:
- out_row[:, outlier_cols] = 0
-
- return out_row, row_stats, outlier_cols
-
-
-@register_kernel("bitsandbytes::int8_double_quant", "cuda")
-def _(
- A: torch.Tensor,
- threshold=0.0,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
- # Use CUDA kernel for rowwise and COO tensor
- quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant.default(
- A,
- threshold=threshold,
- )
-
- # PyTorch impl for colwise
- col_stats, outlier_mask = _get_col_absmax(A, threshold=threshold)
- if threshold > 0.0 and outlier_mask is not None:
- A = A.masked_fill(outlier_mask, 0.0)
- quant_col = torch.round(A.mul(127.0) / col_stats.unsqueeze(0)).to(torch.int8)
-
- return quant_row, quant_col, row_stats, col_stats.flatten().float(), outlier_cols
-
-
-def _get_col_absmax(
- A: torch.Tensor,
- threshold=0.0,
-) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
- torch._check(A.is_floating_point())
-
- outlier_mask = None
-
- absA = A.abs().view(-1, A.shape[-1])
-
- if threshold > 0.0:
- # Filter outliers from stats when enabled
- outlier_mask = absA >= threshold
- absA.masked_fill_(outlier_mask, 0.0)
-
- # shape [cols]; unsqueeze(0) gives [1,cols]
- col_stats = absA.amax(dim=0, keepdim=False).float()
-
- return col_stats, outlier_mask
-
-
-@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
-def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
- torch._check_is_size(blocksize)
-
- device = A.device
- device_type = device.type
+from collections.abc import Sequence
+import ctypes as ct
+from math import prod
+from typing import Optional
+
+import torch
+
+from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
+
+from ..._ops import register_kernel
+from ...cextension import lib, HIP_ENVIRONMENT
+
+
+@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
+def _(A: torch.Tensor, B: torch.Tensor):
+ out = torch.empty((*A.shape[:-1], B.shape[0]), device=A.device, dtype=torch.int32)
+ return _int8_linear_matmul_impl(A, B, out)
+
+
+@register_kernel("bitsandbytes::int8_linear_matmul.out", "cuda")
+def _(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
+ _int8_linear_matmul_impl(A, B, out)
+
+
+def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
+ A, B = B, A
+
+ shapeA = A.shape
+ shapeB = B.shape
+
+ torch._check(A.dtype == torch.int8, lambda: "B must be int8")
+ torch._check(B.dtype == torch.int8, lambda: "A must be int8")
+ torch._check(A.ndim == 2, lambda: "Only two dimensional matrices are supported for argument B")
+ torch._check(B.ndim in [2, 3], lambda: "Only two or three dimensional matrices are supported for argument A")
+ torch._check(prod(shapeB) > 0, lambda: f"Input tensor dimensions need to be > 0: {shapeB}")
+ torch._check(out.dtype == torch.int32)
+
+ shapeC = (*shapeB[:-1], shapeA[0])
+ torch._check(out.shape == shapeC, lambda: f"Output shape {out.shape} does not match expected shape {shapeC}")
+
+ k, m = shapeA
+ n = prod(shapeB[:-1])
+ lda = shapeA[-1] # Weights (outputs, inputs)
+ ldb = shapeB[-1] # Activations (batch, tokens, inputs)
+ ldc = shapeC[-1] # Output (batch, tokens, outputs)
+
+ torch._check(
+ lda == ldb,
+ lambda: f"int8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}",
+ )
+
+ # cuBLASLt does not support int8 matmul with inner dimensions that are not divisible by 4.
+ # We'll fall back to a slower fp32 calculation in this circumstance.
+ # Fortunately, this should not be very common.
+ if lda % 4 != 0:
+ result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
+ return out.copy_(result)
- if device_type == 'cuda':
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- elif device_type == 'hip' or HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ with _cuda_device_of(A):
+ ctx = CUBLAS_Context.get_instance().get_context(A.device)
+ ptrA = get_ptr(A)
+ ptrB = get_ptr(B)
+ ptrC = get_ptr(out)
+ ptrRowScale = None
+ m = ct.c_int32(m)
+ n = ct.c_int32(n)
+ k = ct.c_int32(k)
+ lda = ct.c_int32(lda)
+ ldb = ct.c_int32(ldb)
+ ldc = ct.c_int32(ldc)
+ stream = _get_tensor_stream(A)
+
+ has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
+
+ if has_error:
+ if has_error == 100:
+ # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
+ # TODO: Warn and implement a fallback to fp32 compute?
+ raise NotImplementedError("int8_linear_matmul not implemented!")
+ else:
+ raise RuntimeError(
+ f"cublasLt ran into an error!\n\t{shapeA=}, {shapeB=}, {shapeC=}\n\t{(lda, ldb, ldc)=}\n\t{(m, n, k)=}"
+ )
+
+ return out
+
+
+@register_kernel("bitsandbytes::int8_mm_dequant", "cuda")
+def _(
+ A: torch.Tensor,
+ row_stats: torch.Tensor,
+ col_stats: torch.Tensor,
+ dtype: Optional[torch.dtype] = None,
+ bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+ torch._check(A.dtype == torch.int32, lambda: f"A must be int32, got {A.dtype}")
+ torch._check(row_stats.dtype == torch.float32, lambda: f"row_stats must be float32, got {row_stats.dtype}")
+ torch._check(col_stats.dtype == torch.float32, lambda: f"col_stats must be float32, got {col_stats.dtype}")
+
+ # Note: cuda kernel only currently supports fp16 output.
+ # We'll later cast to desired dtype if needed.
+ out = torch.empty_like(A, dtype=torch.float16)
+
+ ptrA = get_ptr(A)
+ ptrOut = get_ptr(out)
+ ptrRowStats = get_ptr(row_stats)
+ ptrColStats = get_ptr(col_stats)
+ numRows = ct.c_int32(prod(A.shape[:-1]))
+ numCols = ct.c_int32(A.shape[-1])
+
+ # Note: fused bias in the kernel is only supported for fp16
+ # TODO(matthewdouglas): Consider supporting bf16 fused bias
+ ptrBias = get_ptr(bias) if bias is not None and bias.dtype == torch.float16 else None
+
+ with _cuda_device_of(A):
+ lib.cdequant_mm_int32_fp16(
+ ptrA, ptrRowStats, ptrColStats, ptrOut, ptrBias, numRows, numCols, _get_tensor_stream(A)
+ )
+
+ # Add bias separately if not fused in kernel
+ if bias is not None and bias.dtype != torch.float16:
+ out.add_(bias)
+
+ return out.to(dtype or torch.float16)
+
+
+@register_kernel("bitsandbytes::int8_vectorwise_quant", "cuda")
+def _(A: torch.Tensor, threshold=0.0):
+ torch._check(A.dtype == torch.float16, lambda: f"A must be float16, got {A.dtype}")
+ torch._check(threshold >= 0.0, lambda: "threshold must be non-negative")
+ rows = prod(A.shape[:-1])
+ cols = A.shape[-1]
+
+ row_stats = torch.empty(rows, device=A.device, dtype=torch.float32)
+ out_row = torch.empty(A.shape, device=A.device, dtype=torch.int8)
+
+ outlier_cols = None
+
+ if threshold > 0.0:
+ # TODO we could improve perf of this
+ outliers = A.abs() >= threshold
+
+ if outliers.any():
+ outlier_cols = torch.argwhere(outliers.any(dim=0)).view(-1)
+ else:
+ # Needed for torch.compile support.
+ outlier_cols = torch.empty(0, device=A.device, dtype=torch.int64)
+
+ with _cuda_device_of(A):
+ lib.cint8_vector_quant(
+ get_ptr(A),
+ get_ptr(out_row),
+ get_ptr(row_stats),
+ ct.c_float(threshold),
+ ct.c_int32(rows),
+ ct.c_int32(cols),
+ _get_tensor_stream(A),
+ )
+
+ # Zero out values from outlier columns across all rows.
+ # The kernel will handle this for outliers themselves, so we can optimize for rows=1.
+ if rows > 1 and outlier_cols is not None:
+ out_row[:, outlier_cols] = 0
+
+ return out_row, row_stats, outlier_cols
+
+
+@register_kernel("bitsandbytes::int8_double_quant", "cuda")
+def _(
+ A: torch.Tensor,
+ threshold=0.0,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+ # Use CUDA kernel for rowwise and COO tensor
+ quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant.default(
+ A,
+ threshold=threshold,
+ )
+
+ # PyTorch impl for colwise
+ col_stats, outlier_mask = _get_col_absmax(A, threshold=threshold)
+ if threshold > 0.0 and outlier_mask is not None:
+ A = A.masked_fill(outlier_mask, 0.0)
+ quant_col = torch.round(A.mul(127.0) / col_stats.unsqueeze(0)).to(torch.int8)
+
+ return quant_row, quant_col, row_stats, col_stats.flatten().float(), outlier_cols
+
+
+def _get_col_absmax(
+ A: torch.Tensor,
+ threshold=0.0,
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+ torch._check(A.is_floating_point())
+
+ outlier_mask = None
+
+ absA = A.abs().view(-1, A.shape[-1])
+
+ if threshold > 0.0:
+ # Filter outliers from stats when enabled
+ outlier_mask = absA >= threshold
+ absA.masked_fill_(outlier_mask, 0.0)
+
+ # shape [cols]; unsqueeze(0) gives [1,cols]
+ col_stats = absA.amax(dim=0, keepdim=False).float()
+
+ return col_stats, outlier_mask
+
+
+@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
+def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
+ torch._check_is_size(blocksize)
+
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
n = A.numel()
blocks = -(n // -blocksize)
absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
out = torch.empty_like(A, dtype=torch.uint8)
-
- if device_type == 'cuda' or (device_type == 'hip' or HIP_ENVIRONMENT):
- with _cuda_device_of(A):
- args = (
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int32(blocksize),
- ct.c_int(A.numel()),
- )
- if A.dtype == torch.float16:
- lib.cquantize_blockwise_fp16(*args)
- elif A.dtype == torch.bfloat16:
- lib.cquantize_blockwise_bf16(*args)
- elif A.dtype == torch.float32:
- lib.cquantize_blockwise_fp32(*args)
- else:
- raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-
- return out, absmax
-
-
-@register_kernel("bitsandbytes::dequantize_blockwise", "cuda")
-def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
- out = torch.empty_like(A, dtype=dtype)
- _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::dequantize_blockwise.out", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
- torch._check(out.shape == A.shape, lambda: f"Expected out.shape == {A.shape}, got {out.shape}")
- _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
-
-
-def _dequantize_blockwise_impl(
- A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
-) -> None:
-
- device = A.device
- device_type = device.type
-
- if device_type == 'cuda':
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- elif device_type == 'hip' or HIP_ENVIRONMENT:
+ with _cuda_device_of(A):
+ args = (
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int32(blocksize),
+ ct.c_int(A.numel()),
+ )
+
+ if A.dtype == torch.float16:
+ lib.cquantize_blockwise_fp16(*args)
+ elif A.dtype == torch.bfloat16:
+ lib.cquantize_blockwise_bf16(*args)
+ elif A.dtype == torch.float32:
+ lib.cquantize_blockwise_fp32(*args)
+ else:
+ raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+
+ return out, absmax
+
+
+@register_kernel("bitsandbytes::dequantize_blockwise", "cuda")
+def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
+ out = torch.empty_like(A, dtype=dtype)
+ _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::dequantize_blockwise.out", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
+ torch._check(out.shape == A.shape, lambda: f"Expected out.shape == {A.shape}, got {out.shape}")
+ _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
+
+
+def _dequantize_blockwise_impl(
+ A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
+) -> None:
+ if HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
torch._check(
dtype in [torch.float16, torch.bfloat16, torch.float32],
lambda: f"Blockwise dequantization only supports 16bit/32bit floating types, got {dtype}",
)
-
- if device_type == 'cuda' or (device_type == 'hip' or HIP_ENVIRONMENT):
- with _cuda_device_of(A):
- args = (
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int(blocksize),
- ct.c_int(A.numel()),
- _get_tensor_stream(A),
- )
- if dtype == torch.float16:
- lib.cdequantize_blockwise_fp16(*args)
- elif dtype == torch.bfloat16:
- lib.cdequantize_blockwise_bf16(*args)
- elif dtype == torch.float32:
- lib.cdequantize_blockwise_fp32(*args)
-
-
-@register_kernel("bitsandbytes::quantize_4bit", "cuda")
-def _(
- A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
-) -> tuple[torch.Tensor, torch.Tensor]:
-
- device = A.device
- device_type = device.type
-
- if device_type == 'cuda':
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- elif device_type == 'hip' or HIP_ENVIRONMENT:
+ with _cuda_device_of(A):
+ args = (
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(blocksize),
+ ct.c_int(A.numel()),
+ _get_tensor_stream(A),
+ )
+
+ if dtype == torch.float16:
+ lib.cdequantize_blockwise_fp16(*args)
+ elif dtype == torch.bfloat16:
+ lib.cdequantize_blockwise_bf16(*args)
+ elif dtype == torch.float32:
+ lib.cdequantize_blockwise_fp32(*args)
+
+
+@register_kernel("bitsandbytes::quantize_4bit", "cuda")
+def _(
+ A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
+) -> tuple[torch.Tensor, torch.Tensor]:
+ if HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
torch._check(quant_type in ["fp4", "nf4"])
torch._check(
@@ -331,66 +318,65 @@ def _(
blocks = -(n // -blocksize)
absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
out = torch.empty(((n + 1) // (quant_storage.itemsize * 2), 1), device=A.device, dtype=quant_storage)
-
- if device_type == 'cuda' or (device_type == 'hip' or HIP_ENVIRONMENT):
- with _cuda_device_of(A):
- args = (
- None,
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int32(blocksize),
- ct.c_int(n),
- )
- if A.dtype == torch.bfloat16:
- if quant_type == "fp4":
- lib.cquantize_blockwise_bf16_fp4(*args)
- else:
- lib.cquantize_blockwise_bf16_nf4(*args)
- elif A.dtype == torch.float16:
- if quant_type == "fp4":
- lib.cquantize_blockwise_fp16_fp4(*args)
- else:
- lib.cquantize_blockwise_fp16_nf4(*args)
- elif A.dtype == torch.float32:
- if quant_type == "fp4":
- lib.cquantize_blockwise_fp32_fp4(*args)
- else:
- lib.cquantize_blockwise_fp32_nf4(*args)
+ with _cuda_device_of(A):
+ args = (
+ None,
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int32(blocksize),
+ ct.c_int(n),
+ )
+
+ if A.dtype == torch.bfloat16:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_bf16_fp4(*args)
+ else:
+ lib.cquantize_blockwise_bf16_nf4(*args)
+ elif A.dtype == torch.float16:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_fp16_fp4(*args)
+ else:
+ lib.cquantize_blockwise_fp16_nf4(*args)
+ elif A.dtype == torch.float32:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_fp32_fp4(*args)
+ else:
+ lib.cquantize_blockwise_fp32_nf4(*args)
return out, absmax
-
-
-@register_kernel("bitsandbytes::dequantize_4bit", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- shape: Sequence[int],
- dtype: torch.dtype,
-) -> torch.Tensor:
- out = torch.empty(shape, dtype=dtype, device=A.device)
- _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::dequantize_4bit.out", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- shape: Sequence[int],
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- torch._check(out.shape == shape, lambda: f"Expected out.shape == {shape}, got {out.shape}")
- torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
- _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
-
-
+
+
+@register_kernel("bitsandbytes::dequantize_4bit", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ shape: Sequence[int],
+ dtype: torch.dtype,
+) -> torch.Tensor:
+ out = torch.empty(shape, dtype=dtype, device=A.device)
+ _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::dequantize_4bit.out", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ shape: Sequence[int],
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ torch._check(out.shape == shape, lambda: f"Expected out.shape == {shape}, got {out.shape}")
+ torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
+ _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+
+
def _dequantize_4bit_impl(
A: torch.Tensor,
absmax: torch.Tensor,
@@ -399,157 +385,154 @@ def _dequantize_4bit_impl(
dtype: torch.dtype,
out: torch.Tensor,
) -> None:
- device = A.device
- device_type = device.type
-
- if device_type == 'cuda':
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- elif device_type == 'hip' or HIP_ENVIRONMENT:
+ if HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
torch._check(quant_type in ["fp4", "nf4"])
torch._check(
dtype in [torch.bfloat16, torch.float16, torch.float32],
lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
)
+
+ with _cuda_device_of(A):
+ args = (
+ None,
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(blocksize),
+ ct.c_int(out.numel()),
+ _get_tensor_stream(A),
+ )
+
+ if out.dtype == torch.bfloat16:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_bf16_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_bf16_nf4(*args)
+ elif out.dtype == torch.float16:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_fp16_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_fp16_nf4(*args)
+ elif out.dtype == torch.float32:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_fp32_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_fp32_nf4(*args)
+
+
+@register_kernel("bitsandbytes::gemv_4bit", "cuda")
+def _(
+ A: torch.Tensor, B: torch.Tensor, shapeB: Sequence[int], absmax: torch.Tensor, code: torch.Tensor, blocksize: int
+) -> torch.Tensor:
+ shape = (*A.shape[:-1], shapeB[0])
+ out = torch.empty(shape, device=A.device, dtype=A.dtype)
+ _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::gemv_4bit.out", "cuda")
+def _(
+ A: torch.Tensor,
+ B: torch.Tensor,
+ shapeB: Sequence[int],
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ out: torch.Tensor,
+) -> None:
+ torch._check(
+ out.shape == (*A.shape[:-1], shapeB[0]),
+ lambda: f"Expected out.shape == {(*A.shape[:-1], shapeB[0])}, got {out.shape}",
+ )
+ torch._check(out.dtype == A.dtype, lambda: f"Expected out.dtype == {A.dtype}, got {out.dtype}")
+ _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
+
+
+def _gemv_4bit_impl(
+ A: torch.Tensor,
+ B: torch.Tensor,
+ shapeB: Sequence[int],
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ out: torch.Tensor,
+) -> None:
+ torch._check_is_size(blocksize)
+ torch._check(
+ A.numel() == A.size(-1),
+ lambda: f"A must be a vector with leading dimensions of 1, got {A.shape}",
+ )
+ torch._check(
+ A.dtype in [torch.float16, torch.bfloat16, torch.float32],
+ lambda: f"A must be float16, bfloat16, or float32, got {A.dtype}",
+ )
+ torch._check(
+ B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"B must be backed by storage of type uint8, bfloat16, float16, or float32, got {B.dtype}",
+ )
+ torch._check(absmax.dtype == torch.float32, lambda: f"absmax must be float32, got {absmax.dtype}")
+ torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
+
+ m = ct.c_int32(shapeB[0])
+ n = ct.c_int32(1)
+ k = ct.c_int32(shapeB[1])
+
+ lda = m
+ ldb = ct.c_int32((A.shape[-1] + 1) // 2)
+ ldc = m
- if device_type == 'cuda' or (device_type == 'hip' or HIP_ENVIRONMENT):
- with _cuda_device_of(A):
- args = (
- None,
+ stream = _get_tensor_stream(A)
+
+ with _cuda_device_of(A):
+ if A.dtype == torch.float16:
+ lib.cgemm_4bit_inference_naive_fp16(
+ m,
+ n,
+ k,
get_ptr(A),
+ get_ptr(B),
get_ptr(absmax),
+ get_ptr(code),
get_ptr(out),
- ct.c_int(blocksize),
- ct.c_int(out.numel()),
- _get_tensor_stream(A),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
+ )
+ elif A.dtype == torch.bfloat16:
+ lib.cgemm_4bit_inference_naive_bf16(
+ m,
+ n,
+ k,
+ get_ptr(A),
+ get_ptr(B),
+ get_ptr(absmax),
+ get_ptr(code),
+ get_ptr(out),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
+ )
+ elif A.dtype == torch.float32:
+ lib.cgemm_4bit_inference_naive_fp32(
+ m,
+ n,
+ k,
+ get_ptr(A),
+ get_ptr(B),
+ get_ptr(absmax),
+ get_ptr(code),
+ get_ptr(out),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
)
-
- if out.dtype == torch.bfloat16:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_bf16_fp4(*args)
- else:
- lib.cdequantize_blockwise_bf16_nf4(*args)
- elif out.dtype == torch.float16:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_fp16_fp4(*args)
- else:
- lib.cdequantize_blockwise_fp16_nf4(*args)
- elif out.dtype == torch.float32:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_fp32_fp4(*args)
- else:
- lib.cdequantize_blockwise_fp32_nf4(*args)
-
-
-@register_kernel("bitsandbytes::gemv_4bit", "cuda")
-def _(
- A: torch.Tensor, B: torch.Tensor, shapeB: Sequence[int], absmax: torch.Tensor, code: torch.Tensor, blocksize: int
-) -> torch.Tensor:
- shape = (*A.shape[:-1], shapeB[0])
- out = torch.empty(shape, device=A.device, dtype=A.dtype)
- _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::gemv_4bit.out", "cuda")
-def _(
- A: torch.Tensor,
- B: torch.Tensor,
- shapeB: Sequence[int],
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- out: torch.Tensor,
-) -> None:
- torch._check(
- out.shape == (*A.shape[:-1], shapeB[0]),
- lambda: f"Expected out.shape == {(*A.shape[:-1], shapeB[0])}, got {out.shape}",
- )
- torch._check(out.dtype == A.dtype, lambda: f"Expected out.dtype == {A.dtype}, got {out.dtype}")
- _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
-
-def _gemv_4bit_impl(
- A: torch.Tensor,
- B: torch.Tensor,
- shapeB: Sequence[int],
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- out: torch.Tensor,
-) -> None:
- torch._check_is_size(blocksize)
- torch._check(
- A.numel() == A.size(-1),
- lambda: f"A must be a vector with leading dimensions of 1, got {A.shape}",
- )
- torch._check(
- A.dtype in [torch.float16, torch.bfloat16, torch.float32],
- lambda: f"A must be float16, bfloat16, or float32, got {A.dtype}",
- )
- torch._check(
- B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32],
- lambda: f"B must be backed by storage of type uint8, bfloat16, float16, or float32, got {B.dtype}",
- )
- torch._check(absmax.dtype == torch.float32, lambda: f"absmax must be float32, got {absmax.dtype}")
- torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
-
- m = ct.c_int32(shapeB[0])
- n = ct.c_int32(1)
- k = ct.c_int32(shapeB[1])
-
- lda = m
- ldb = ct.c_int32((A.shape[-1] + 1) // 2)
- ldc = m
-
- stream = _get_tensor_stream(A)
-
- with _cuda_device_of(A):
- if A.dtype == torch.float16:
- lib.cgemm_4bit_inference_naive_fp16(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
- elif A.dtype == torch.bfloat16:
- lib.cgemm_4bit_inference_naive_bf16(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
- elif A.dtype == torch.float32:
- lib.cgemm_4bit_inference_naive_fp32(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
From 9fe67efada457a759d1d8193265243209e784e2c Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 23 May 2025 02:11:31 +0530
Subject: [PATCH 22/85] functional.py: select supported blocksizes by HIP_ENVIRONMENT only
---
bitsandbytes/functional.py | 28 +++++++++++++---------------
1 file changed, 13 insertions(+), 15 deletions(-)
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 3f0c1ff94..237aa3e54 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -754,13 +754,11 @@ def quantize_blockwise(
if "dynamic" not in name2qmap:
name2qmap["dynamic"] = create_dynamic_map().to(A.device)
code = name2qmap["dynamic"]
-
- device_type = A.device.type
- if device_type in ["cuda", "hip"]:
- if not HIP_ENVIRONMENT:
- assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
- else:
- assert blocksize in [4096, 2048, 1024, 512, 256, 128]
+
+ if HIP_ENVIRONMENT:
+ assert blocksize in [4096, 2048, 1024, 512, 256, 128]
+ else:
+ assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
_out, _absmax = torch.ops.bitsandbytes.quantize_blockwise.default(
A,
@@ -839,15 +837,15 @@ def dequantize_blockwise(
if quant_state is None:
quant_state = QuantState(absmax=absmax, code=code, blocksize=blocksize, dtype=torch.float32)
- device_type = A.device.type
- if device_type in ["cuda", "hip"]:
+ if HIP_ENVIRONMENT:
+ supported_blocksizes = [4096, 2048, 1024, 512, 256, 128]
+ else:
supported_blocksizes = [4096, 2048, 1024, 512, 256, 128, 64]
- if HIP_ENVIRONMENT:
- supported_blocksizes = supported_blocksizes[:-1]
- if quant_state.blocksize not in supported_blocksizes:
- raise ValueError(
- f"The blocksize of {quant_state.blocksize} is not supported. Supported values: {supported_blocksizes}"
- )
+
+ if quant_state.blocksize not in supported_blocksizes:
+ raise ValueError(
+ f"The blocksize of {quant_state.blocksize} is not supported. Supported values: {supported_blocksizes}"
+ )
absmax = quant_state.absmax
if quant_state.nested:
From d97fdce654129ca156f0cb47555529d4f4941778 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 23 May 2025 02:18:37 +0530
Subject: [PATCH 23/85] functional.py: pass code and blocksize arguments to dequantize_blockwise.out
---
bitsandbytes/functional.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 237aa3e54..1cee234ea 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -858,8 +858,8 @@ def dequantize_blockwise(
torch.ops.bitsandbytes.dequantize_blockwise.out(
A,
absmax,
- quant_state.code.to(A.device),
- quant_state.blocksize,
+ code.to(A.device),
+ blocksize,
quant_state.dtype,
out=out,
)
From e99ac0a1fb4336a53250c6d74e4908de0883fb52 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Fri, 23 May 2025 16:05:21 -0400
Subject: [PATCH 24/85] Optimizer backwards compatibility (#1647)
---
bitsandbytes/optim/optimizer.py | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py
index 4bed9a7c3..9c20f9376 100644
--- a/bitsandbytes/optim/optimizer.py
+++ b/bitsandbytes/optim/optimizer.py
@@ -303,9 +303,9 @@ def get_config(self, gindex, pindex, group):
config["eps"] = group["eps"]
config["weight_decay"] = group["weight_decay"]
config["lr"] = group["lr"]
- config["alpha"] = group.get("alpha")
- config["t_alpha"] = group.get("t_alpha")
- config["t_beta3"] = group.get("t_beta3")
+ config["alpha"] = group.get("alpha", 0.0)
+ config["t_alpha"] = group.get("t_alpha", 0)
+ config["t_beta3"] = group.get("t_beta3", 0)
config["optim_bits"] = self.args.optim_bits
config["min_8bit_size"] = self.args.min_8bit_size
config["percentile_clipping"] = self.args.percentile_clipping
@@ -530,7 +530,7 @@ def update_step(self, group, p, gindex, pindex):
state["state2"],
config["betas"][1],
config["betas"][2] if len(config["betas"]) >= 3 else 0.0,
- config["alpha"],
+ config.get("alpha", 0.0),
config["weight_decay"],
gnorm_scale,
state["unorm_vec"] if config["max_unorm"] > 0.0 else None,
@@ -575,7 +575,7 @@ def update_step(self, group, p, gindex, pindex):
config["betas"][0],
config["betas"][1],
config["betas"][2] if len(config["betas"]) >= 3 else 0.0,
- config["alpha"],
+ config.get("alpha", 0.0),
config["eps"],
step,
config["lr"],
From 503d243e047d64bae3b0cd3e713cf28530e9af43 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Fri, 23 May 2025 20:19:09 -0400
Subject: [PATCH 25/85] General cleanup & test improvements (#1646)
* General cleanup & test improvements
* Tests: work around NumPy 2 compatibility issue for torch<2.3
* Tests: update aarch64 cpu min torch version
* Tests: update aarch64 cpu min torch version
* Tests: update aarch64 cpu min torch version
---
.github/workflows/tests.yml | 28 ++-
benchmarking/int8/row_scale_benchmark.py | 70 -------
deploy.sh | 237 -----------------------
environment-bnb.yml | 21 --
environment.yml | 46 -----
tests/test_autograd.py | 7 +
tests/test_functional.py | 8 +-
tests/test_ops.py | 33 ++--
tests/test_triton.py | 2 +-
9 files changed, 57 insertions(+), 395 deletions(-)
delete mode 100644 benchmarking/int8/row_scale_benchmark.py
delete mode 100644 deploy.sh
delete mode 100644 environment-bnb.yml
delete mode 100644 environment.yml
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 5d2a2708b..f1a5dca69 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -93,24 +93,32 @@ jobs:
path: output/${{ matrix.os }}/${{ matrix.arch }}/*
retention-days: 7
- cpu-tests:
+ test-cpu:
if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
needs: build-cpu
strategy:
fail-fast: false
matrix:
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
- torch_version: ["2.6.0", "2.7.0"]
+ # Test with the oldest supported torch version and the two newest.
+ torch_version: ["2.2.2", "2.6.0", "2.7.0"]
include:
- os: ubuntu-22.04
arch: x86_64
runner: banb-aws-general-8-plus-use1-public-80
- os: ubuntu-22.04-arm
arch: aarch64
+ - os: ubuntu-22.04-arm
+ arch: aarch64
+ torch_version: "2.5.1"
- os: windows-2025
arch: x86_64
- os: macos-15
arch: arm64
+ exclude:
+ - os: ubuntu-22.04-arm
+ torch_version: "2.2.2"
+
runs-on: ${{ matrix.runner || matrix.os }}
env:
BNB_TEST_DEVICE: cpu
@@ -135,6 +143,11 @@ jobs:
pip install -e ".[test]"
pip install pytest-cov
+ # We need to downgrade to numpy<2 for torch<2.3 compatibility.
+ - name: Downgrade NumPy
+ if: startsWith(matrix.torch_version, '2.2.')
+ run: pip install "numpy<2"
+
- name: Show installed packages
run: pip list
@@ -144,7 +157,7 @@ jobs:
- name: Run tests
run: pytest --durations=100
- # cuda-aarch64-tests:
+ # test-cuda-aarch64:
# if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
# needs: build-cuda
# strategy:
@@ -167,7 +180,7 @@ jobs:
- cuda-tests:
+ test-cuda:
if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
needs: build-cuda
strategy:
@@ -179,7 +192,7 @@ jobs:
cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
include:
- cuda_version: "11.8.0"
- torch_version: "2.4.1"
+ torch_version: "2.2.2"
pypi_index: "https://download.pytorch.org/whl/cu118"
- cuda_version: "12.6.3"
torch_version: "2.6.0"
@@ -238,6 +251,11 @@ jobs:
pip install -e ".[test]"
pip install pytest-cov
+ # We need to downgrade to numpy<2 for torch<2.3 compatibility.
+ - name: Downgrade NumPy
+ if: startsWith(matrix.torch_version, '2.2.')
+ run: pip install "numpy<2"
+
- name: Show installed packages
run: pip list
diff --git a/benchmarking/int8/row_scale_benchmark.py b/benchmarking/int8/row_scale_benchmark.py
deleted file mode 100644
index 98d2496de..000000000
--- a/benchmarking/int8/row_scale_benchmark.py
+++ /dev/null
@@ -1,70 +0,0 @@
-"""
-Extracted from tests/test_functional.py
-
-Note: This feature is currently unused! It is kept here for archival purposes.
-
-Usage: pytest benchmarking/int8/row_scale_benchmark.py
-"""
-
-import time
-
-import pytest
-import torch
-
-from bitsandbytes import functional as F
-
-k = 20
-torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000)
-
-
-@pytest.mark.parametrize(
- ("dim1", "dim4", "inner"),
- [
- pytest.param(1024, 12288 * 4, 12288, id="1024, 12288*4, 12288"),
- pytest.param(2048, 4096 * 4, 4096, id="2048, 4096*4, 4096"),
- ],
-)
-@pytest.mark.skip("Row scale has some bugs for ampere")
-@pytest.mark.benchmark
-def test_row_scale_bench(dim1, dim4, inner):
- formatB = F.get_special_format_str()
- err1, err2, err3 = [], [], []
- relerr1, relerr2 = [], []
- scale = 1
- A = torch.randn(dim1, inner, device="cuda").half()
- B = torch.randn(dim4, inner, device="cuda").half()
- torch.nn.init.xavier_uniform_(B)
- # warmpup
- for i in range(k):
- C1 = torch.matmul(A, B.t())
-
- torch.cuda.synchronize()
- t0 = time.time()
- for i in range(k):
- C1 = torch.matmul(A, B.t())
- torch.cuda.synchronize()
- print("16", time.time() - t0)
-
- C1a, C1b, stats1a, stats1b, coo_tensor = F.int8_double_quant(A)
- CB, absmaxB = F.vectorwise_quant(B, quant_type="linear")
- A2, SA = F.nvidia_transform(C1a, "col32")
- B2, SB = F.nvidia_transform(CB, formatB)
- A1, maxA = F.vectorwise_quant(A, dim=1)
-
- c = 10.0 * inner * scale
- row_scale = maxA / c
- torch.cuda.synchronize()
- t0 = time.time()
- for i in range(k):
- outC32 = F.int8_linear_matmul(A2, B2, dtype=torch.int8, row_scale=row_scale)
- torch.cuda.synchronize()
- print("row-wise", time.time() - t0)
-
- C2a, C2b, stats2a, stats2b, coo_tensor = F.int8_double_quant(B)
- B2, SB = F.nvidia_transform(C2a, formatB)
- torch.cuda.synchronize()
- t0 = time.time()
- for i in range(k):
- outC32 = F.int8_linear_matmul(A2, B2)
- torch.cuda.synchronize()
- print("vector-wise", time.time() - t0)
diff --git a/deploy.sh b/deploy.sh
deleted file mode 100644
index e60373627..000000000
--- a/deploy.sh
+++ /dev/null
@@ -1,237 +0,0 @@
-#!/bin/bash
-BASE_PATH=$1
-
-echo "MAKE SURE LD_LIBRARY_PATH IS EMPTY!"
-echo $LD_LIBRARY_PATH
-
-if [[ ! -z "${LD_LIBRARY_PATH}" ]]; then
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-
-module unload cuda && echo "no module function available. Probably not on a slurm cluster."
-module unload gcc && echo "no module function available. Probably not on a slurm cluster."
-
-rm -rf dist build
-make cleaneggs
-make cleanlibs
-
-rm -rf build/*
-export CUDA_HOME=
-export CUDA_VERSION=
-make cpuonly CUDA_VERSION="CPU"
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cpu.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-11.0
-make cuda110 CUDA_VERSION=110
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-11.1
-make cuda11x CUDA_VERSION=111
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-11.4
-make cuda11x CUDA_VERSION=114
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-11.5
-make cuda11x CUDA_VERSION=115
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-11.7
-make cuda11x CUDA_VERSION=117
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-11.8
-make cuda118 CUDA_VERSION=118
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda118.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-12.0
-make cuda12x CUDA_VERSION=120
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda120.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-12.1
-make cuda12x CUDA_VERSION=121
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-12.2
-make cuda12x CUDA_VERSION=122
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-12.3
-make cuda12x CUDA_VERSION=123
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda123.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-############################# START NO CUBLASLT #############################################
-# binaries without 8-bit matmul support START HERE
-# ###########################################################################################
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-11.0
-make cuda110_nomatmul CUDA_VERSION=110
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110_nocublaslt.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-11.1
-make cuda11x_nomatmul CUDA_VERSION=111
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111_nocublaslt.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-11.4
-make cuda11x_nomatmul CUDA_VERSION=114
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114_nocublaslt.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-11.5
-make cuda11x_nomatmul CUDA_VERSION=115
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115_nocublaslt.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-11.7
-make cuda11x_nomatmul CUDA_VERSION=117
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-11.8
-make cuda118_nomatmul CUDA_VERSION=118
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda118_nocublaslt.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-12.0
-make cuda12x_nomatmul CUDA_VERSION=120
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda120_nocublaslt.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-12.1
-make cuda12x_nomatmul CUDA_VERSION=121
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121_nocublaslt.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-12.2
-make cuda12x_nomatmul CUDA_VERSION=122
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122_nocublaslt.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-rm -rf build/*
-export CUDA_HOME=$BASE_PATH/cuda-12.3
-make cuda12x_nomatmul CUDA_VERSION=123
-
-if [ ! -f "./bitsandbytes/libbitsandbytes_cuda123_nocublaslt.so" ]; then
- # Control will enter here if $DIRECTORY doesn't exist.
- echo "Compilation unsuccessful!" 1>&2
- exit 64
-fi
-
-python -m build
-python -m twine upload dist/* --verbose
diff --git a/environment-bnb.yml b/environment-bnb.yml
deleted file mode 100644
index 1214f7930..000000000
--- a/environment-bnb.yml
+++ /dev/null
@@ -1,21 +0,0 @@
-# for cmake build
-name: bnb
-channels:
- - pytorch
- - nvidia
- - conda-forge
-
-dependencies:
- - python
- #- accelerate
- #- einops
- - scipy
- #- transformers
- - pytest
- - pytest-cases
- - ipython
- - debugpy
- - yapf
- - monkeytype
- - rich
- - pytest-sugar
diff --git a/environment.yml b/environment.yml
deleted file mode 100644
index af421b3c6..000000000
--- a/environment.yml
+++ /dev/null
@@ -1,46 +0,0 @@
-name: bnb
-channels:
- - pytorch
- - nvidia
- - conda-forge
-
-dependencies:
- # Base
- - conda-forge::python=3.8
- - pytorch::pytorch=>2.1
- - pytorch::pytorch-cuda=11.8
- - nvidia::cuda=11.8
- # Libraries
- - conda-forge::accelerate
- - conda-forge::einops
- - conda-forge::scipy
- - conda-forge::transformers
- # Development
- - conda-forge::pytest
- - conda-forge::build # build Python packages
- - conda-forge::twine # upload Python packages
- - conda-forge::pytest-cases # more readable and composable parametrized tests
- - conda-forge::ipython # better interactive shell
- - conda-forge::debugpy # debugger-support for VSCode
- - conda-forge::ruff # linting
- - conda-forge::yapf # code formatting
- - conda-forge::monkeytype # infer type annotations
- - conda-forge::rich # better, colored tracebacks, etc
- - conda-forge::pytest-sugar # better pytest output
- # - conda-forge::nodejs # for `doc-builder preview` (optional)
-
-## ENV CREATION - steps to reproduce:
-# mamba env remove -n bnb
-# mamba create -y -n bnb python=3.8 # creating an empty env bypasses conda
-# # and leads to much faster env resolution in the next step https://github.com/mamba-org/mamba/issues/633#issuecomment-812272143
-# mamba env update -n bnb -f environment.yml
-# mamba activate bnb
-
-## PIP dependencies (install *after* ENV CREATION):
-# pip install --no-cache-dir --no-deps lion_pytorch triton hf-doc-builder watchdog
-## NOTE: conda peft is not up to date, so we install from pip
-# cd pip install -e . ## installs bitsandbytes as editable development install from within repo root dir
-
-## ENV UPDATE:
-# # add new packages to environment.yml, then:
-# mamba env update -n bnb -f environment.yml
diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index b6ba284c9..fc2e7aa6f 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -49,6 +49,10 @@ def test_matmullt(
req_grad = list(req_grad)
req_grad[2] = False
+ if device == "cpu" and dtype != torch.float32 and has_fp16_weights and any(req_grad):
+ if torch.__version__ < (2, 6):
+ pytest.xfail("mse_loss bf16/fp16 on CPU is not supported in torch < 2.6")
+
for i in range(3):
# normal multiply
if funcs[0] in [torch.mm, torch.matmul]:
@@ -185,6 +189,9 @@ def test_matmul_4bit(
req_grad = list(req_grad)
req_grad[2] = False
+ if device == "cpu" and dtype != torch.float32 and any(req_grad) and torch.__version__ < (2, 6):
+ pytest.xfail("mse_loss fp16 on CPU is not supported in torch < 2.6")
+
for i in range(3):
# normal multiply
if funcs[0] in [torch.mm, torch.matmul]:
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 0b9390aaa..8568d45f0 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -1342,8 +1342,12 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
@pytest.mark.parametrize("double_quant", [False], ids=["DQ_True"])
def test_gemv_eye_4bit(self, device, storage_type, dtype, double_quant):
- if device == "cpu" and storage_type != "nf4":
- pytest.xfail("fp4 quantization is not supported on CPU")
+ if device == "cpu":
+ if storage_type != "nf4":
+ pytest.xfail("fp4 quantization is not supported on CPU")
+
+ if dtype == torch.bfloat16 and torch.__version__ < (2, 3):
+ pytest.xfail("eye doe not support bfloat16 on CPU in torch < 2.3")
dims = 10
torch.random.manual_seed(np.random.randint(0, 412424242))
diff --git a/tests/test_ops.py b/tests/test_ops.py
index 4da1663f0..e85bc0ef0 100644
--- a/tests/test_ops.py
+++ b/tests/test_ops.py
@@ -6,6 +6,13 @@
import bitsandbytes
from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter
+# torch.library.opcheck is only available in torch 2.4 and later.
+# When testing with older versions, we will skip it as a no-op.
+if torch.__version__ >= (2, 4):
+ opcheck = torch.library.opcheck
+else:
+ opcheck = lambda *args, **kwargs: None
+
class TestLLMInt8Ops:
@pytest.mark.parametrize("device", get_available_devices())
@@ -18,7 +25,7 @@ def test_int8_linear_matmul(self, device):
assert out.dtype == torch.int32
assert out.device == A.device
- torch.library.opcheck(torch.ops.bitsandbytes.int8_linear_matmul.default, (A, B))
+ opcheck(torch.ops.bitsandbytes.int8_linear_matmul.default, (A, B))
@pytest.mark.parametrize("device", get_available_devices())
def test_int8_linear_matmul_out(self, device):
@@ -32,7 +39,7 @@ def test_int8_linear_matmul_out(self, device):
assert out.dtype == torch.int32
assert out.device == A.device
- torch.library.opcheck(torch.ops.bitsandbytes.int8_linear_matmul.out, (A, B, out))
+ opcheck(torch.ops.bitsandbytes.int8_linear_matmul.out, (A, B, out))
@pytest.mark.parametrize("threshold", [0.0, 6.0])
@pytest.mark.parametrize("device", get_available_devices())
@@ -57,9 +64,8 @@ def test_int8_vectorwise_quant(self, threshold, device):
else:
assert outlier_cols is None
- torch.library.opcheck(torch.ops.bitsandbytes.int8_vectorwise_quant, (A,))
-
- torch.library.opcheck(torch.ops.bitsandbytes.int8_vectorwise_quant, (A, threshold))
+ opcheck(torch.ops.bitsandbytes.int8_vectorwise_quant, (A,))
+ opcheck(torch.ops.bitsandbytes.int8_vectorwise_quant, (A, threshold))
@pytest.mark.parametrize("device", get_available_devices())
def test_int8_mm_dequant(self, device):
@@ -72,7 +78,7 @@ def test_int8_mm_dequant(self, device):
assert out.dtype == torch.float16
assert out.device == A.device
- torch.library.opcheck(torch.ops.bitsandbytes.int8_mm_dequant, (A, row_stats, col_stats))
+ opcheck(torch.ops.bitsandbytes.int8_mm_dequant, (A, row_stats, col_stats))
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
@@ -89,7 +95,7 @@ def test_int8_scaled_mm(self, device, dtype, has_bias):
assert out.dtype == dtype
assert out.device == A.device
- torch.library.opcheck(torch.ops.bitsandbytes.int8_scaled_mm, (A, B, row_stats, col_stats, bias, dtype))
+ opcheck(torch.ops.bitsandbytes.int8_scaled_mm, (A, B, row_stats, col_stats, bias, dtype))
class TestInt8BlockwiseQuantOps:
@@ -115,7 +121,7 @@ def test_quantize_blockwise(self, device, dtype, blocksize):
assert absmax.device == A.device
assert absmax.dtype == torch.float32
- torch.library.opcheck(torch.ops.bitsandbytes.quantize_blockwise, (A, code, blocksize))
+ opcheck(torch.ops.bitsandbytes.quantize_blockwise, (A, code, blocksize))
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
@@ -137,7 +143,7 @@ def test_dequantize_blockwise(self, device, dtype, blocksize):
assert out.dtype == dtype
assert out.device == A.device
- torch.library.opcheck(torch.ops.bitsandbytes.dequantize_blockwise.default, (A, absmax, code, blocksize, dtype))
+ opcheck(torch.ops.bitsandbytes.dequantize_blockwise.default, (A, absmax, code, blocksize, dtype))
class Test4bitBlockwiseQuantOps:
@@ -163,7 +169,7 @@ def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize
assert absmax.device == A.device
assert absmax.dtype == torch.float32
- torch.library.opcheck(torch.ops.bitsandbytes.quantize_4bit, (A, blocksize, quant_type, storage_dtype))
+ opcheck(torch.ops.bitsandbytes.quantize_4bit, (A, blocksize, quant_type, storage_dtype))
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
@@ -198,8 +204,9 @@ def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksi
assert out.device == A.device
assert out.shape == shape
- torch.library.opcheck(
- torch.ops.bitsandbytes.dequantize_4bit.default, (A, absmax, blocksize, quant_type, shape, dtype)
+ opcheck(
+ torch.ops.bitsandbytes.dequantize_4bit.default,
+ (A, absmax, blocksize, quant_type, shape, dtype),
)
@pytest.mark.parametrize("device", get_available_devices())
@@ -226,4 +233,4 @@ def test_gemv_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
assert out.shape == (1, 1, out_features)
assert out.isreal().all()
- torch.library.opcheck(torch.ops.bitsandbytes.gemv_4bit.default, (A, B_q, B.shape, absmax, code, blocksize))
+ opcheck(torch.ops.bitsandbytes.gemv_4bit.default, (A, B_q, B.shape, absmax, code, blocksize))
diff --git a/tests/test_triton.py b/tests/test_triton.py
index 70656a56f..b245e534a 100644
--- a/tests/test_triton.py
+++ b/tests/test_triton.py
@@ -11,7 +11,7 @@
not is_triton_available() or not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8,
reason="This test requires triton and a GPU with compute capability 8.0 or higher.",
)
-@pytest.mark.skip("No longer supported.")
+@pytest.mark.deprecated
@pytest.mark.parametrize("vector_wise_quantization", TRUE_FALSE)
def test_switchback(vector_wise_quantization):
for dim in [83]:
From 9f85829479ac2299dcf692e913a83911a1069ad4 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Fri, 23 May 2025 23:45:49 -0400
Subject: [PATCH 26/85] Add torch.compile tests (#1648)
* Add torch.compile tests
* Tests: work around aarch64 CPU regressions for torch 2.6.0; add Windows torch==2.7.0+cu118 test config
* Tests: skip torch.compile for cuda on windows
---
.github/workflows/tests.yml | 30 +++++++++++-
bitsandbytes/functional.py | 4 +-
bitsandbytes/nn/modules.py | 2 +-
tests/test_linear4bit.py | 92 ++++++++++++++++++++++++++++++++++++-
tests/test_linear8bitlt.py | 66 ++++++++++++++++++++++++++
5 files changed, 188 insertions(+), 6 deletions(-)
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f1a5dca69..b93bff4f0 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -137,6 +137,10 @@ jobs:
with:
python-version: 3.9
+ - name: Setup MSVC
+ if: startsWith(matrix.os, 'windows')
+ uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl for torch.compile
+
- name: Install dependencies
run: |
pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/cpu
@@ -201,18 +205,40 @@ jobs:
torch_version: "2.7.0"
pypi_index: "https://download.pytorch.org/whl/cu128"
- # L40S runners
+
+ # Linux L40S runners
- os: ubuntu-22.04
gpu: L40S
runner: bandb-aws-g6e-4xlarge-plus-use1-public-80
- # T4 runners
+ # Linux T4 runners
- os: ubuntu-22.04
gpu: T4
runner: bandb-aws-g4dn-4xlarge-plus-use1-public-80
+
+ # Specific Windows runners using cu118
+ - os: windows-2025
+ arch: x86_64
+ gpu: T4
+ runner: CUDA-Windows-x64
+ cuda_version: "11.8.0"
+ torch_version: "2.2.0"
+ pypi_index: "https://download.pytorch.org/whl/cu118"
- os: windows-2025
+ arch: x86_64
+ gpu: T4
+ runner: CUDA-Windows-x64
+ cuda_version: "11.8.0"
+ torch_version: "2.6.0"
+ pypi_index: "https://download.pytorch.org/whl/cu118"
+ - os: windows-2025
+ arch: x86_64
gpu: T4
runner: CUDA-Windows-x64
+ cuda_version: "11.8.0"
+ torch_version: "2.7.0"
+ pypi_index: "https://download.pytorch.org/whl/cu118"
+
exclude:
# Our current T4 Windows runner has a driver too old (471.11)
# and cannot support CUDA 12+. Skip for now.
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index b0092ffd1..f84f16c21 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -771,14 +771,14 @@ def quantize_blockwise(
qabsmax, state2 = quantize_blockwise(_absmax, blocksize=blocksize, nested=False)
quant_state = QuantState(
absmax=qabsmax,
- code=code,
+ code=code.to(A.device, copy=True),
blocksize=blocksize,
dtype=A.dtype,
offset=offset,
state2=state2,
)
else:
- quant_state = QuantState(absmax=_absmax, code=code.to(A.device), blocksize=blocksize, dtype=A.dtype)
+ quant_state = QuantState(absmax=_absmax, code=code.to(A.device, copy=True), blocksize=blocksize, dtype=A.dtype)
# TODO(matthewdouglas): Deprecate out kwarg
out = out.copy_(_out) if out is not None else _out
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 937084cf1..500102ab1 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -493,7 +493,7 @@ def forward(self, x: torch.Tensor):
bias = None if self.bias is None else self.bias.to(self.compute_dtype)
- return bnb.matmul_4bit(x, self.weight.data.t(), bias=bias, quant_state=self.weight.quant_state).to(inp_dtype)
+ return bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state).to(inp_dtype)
class LinearFP4(Linear4bit):
diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index 67b61cb05..f3673797c 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -1,13 +1,21 @@
import copy
import os
import pickle
+import platform
from tempfile import TemporaryDirectory
import pytest
import torch
import bitsandbytes as bnb
-from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter, torch_load_from_buffer, torch_save_to_buffer
+from tests.helpers import (
+ TRUE_FALSE,
+ describe_dtype,
+ get_available_devices,
+ id_formatter,
+ torch_load_from_buffer,
+ torch_save_to_buffer,
+)
storage = {
"uint8": torch.uint8,
@@ -275,3 +283,85 @@ def test_params4bit_real_serialization(device, quant_type, blocksize, compress_s
# there was a bug where deepcopy would modify the original object
assert dict_keys_before == dict_keys_after
assert dict_keys_before == dict_keys_deserialized
+
+
+@pytest.mark.parametrize("device", get_available_devices())
+@pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
+@pytest.mark.parametrize("compute_dtype", [torch.bfloat16, torch.float32], ids=describe_dtype)
+@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
+@pytest.mark.parametrize("bias", TRUE_FALSE, ids=id_formatter("bias"))
+@pytest.mark.parametrize("fullgraph", TRUE_FALSE, ids=id_formatter("fullgraph"))
+@pytest.mark.parametrize("mode", ["default", "reduce-overhead"], ids=id_formatter("mode"))
+@pytest.mark.skipif(torch.__version__ < (2, 4), reason="Not supported in torch < 2.4")
+def test_linear4bit_torch_compile(device, quant_type, compute_dtype, compress_statistics, bias, fullgraph, mode):
+ if device == "cpu" and quant_type == "fp4":
+ pytest.skip("FP4 is not supported for CPU")
+
+ if fullgraph and torch.__version__ < (2, 8):
+ pytest.skip("fullgraph mode requires torch 2.8 or higher")
+
+ if device == "cuda" and platform.system() == "Windows":
+ pytest.skip("Triton is not officially supported on Windows")
+
+ # Has a strange regression on Linux aarch64 CPU in torch==2.6.0 when fullgraph=False.
+ if (
+ not fullgraph
+ and device == "cpu"
+ and platform.machine() == "aarch64"
+ and platform.system() == "Linux"
+ and ((2, 7) > torch.__version__ >= (2, 6))
+ ):
+ pytest.xfail("Regression in torch==2.6.0 on Linux aarch64 CPU")
+
+ dim = 256
+ batch_size = 16
+
+ torch.compiler.reset()
+
+ # Create a small network with Linear4bit layers
+ net = torch.nn.Sequential(
+ *[
+ bnb.nn.Linear4bit(
+ dim,
+ dim,
+ bias=bias,
+ compute_dtype=compute_dtype,
+ compress_statistics=compress_statistics,
+ quant_type=quant_type,
+ )
+ for _ in range(4)
+ ]
+ ).to(device)
+
+ # Create input tensor
+ x = torch.randn(batch_size, dim, dtype=compute_dtype, device=device)
+
+ # Get reference output before compilation
+ with torch.no_grad():
+ ref_output = net(x)
+
+ # Compile the model
+ compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode)
+
+ # Get output from compiled model
+ with torch.no_grad():
+ compiled_output = compiled_net(x)
+
+ # Check outputs match
+ assert compiled_output.shape == ref_output.shape
+ assert compiled_output.device == ref_output.device
+ assert compiled_output.dtype == ref_output.dtype
+ torch.testing.assert_close(compiled_output, ref_output)
+
+ # Test with gradients
+ x.requires_grad_(True)
+ y1 = net(x).sum()
+ y1.backward()
+ grad_ref = x.grad.clone()
+
+ x.grad = None
+ y2 = compiled_net(x).sum()
+ y2.backward()
+ grad_compiled = x.grad.clone()
+
+ torch.testing.assert_close(grad_compiled, grad_ref)
diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py
index 8c08cfa2c..a77c693e0 100644
--- a/tests/test_linear8bitlt.py
+++ b/tests/test_linear8bitlt.py
@@ -2,6 +2,7 @@
import copy
import os
import pickle
+import platform
from tempfile import TemporaryDirectory
import pytest
@@ -224,3 +225,68 @@ def test_linear8bit_serialization(linear8bit):
# check for a bug where SCB and CB were not copied
assert (linear8bit.weight.SCB == deserialized.weight.SCB).all()
assert (linear8bit.weight.CB == deserialized.weight.CB).all()
+
+
+@pytest.mark.parametrize("device", get_available_devices())
+@pytest.mark.parametrize("threshold", [0.0, 6.0], ids=id_formatter("threshold"))
+@pytest.mark.parametrize("bias", TRUE_FALSE, ids=id_formatter("bias"))
+@pytest.mark.parametrize("fullgraph", TRUE_FALSE, ids=id_formatter("fullgraph"))
+@pytest.mark.parametrize("mode", ["default", "reduce-overhead"], ids=id_formatter("mode"))
+@pytest.mark.skipif(torch.__version__ < (2, 4), reason="Not supported in torch < 2.4")
+def test_linear8bitlt_torch_compile(device, threshold, bias, fullgraph, mode):
+ if device == "cuda" and platform.system() == "Windows":
+ pytest.skip("Triton is not officially supported on Windows")
+
+ dim = 256
+ batch_size = 16
+
+ torch.compiler.reset()
+
+ # Create a small network with Linear8bitLt layers
+ net = torch.nn.Sequential(
+ *[bnb.nn.Linear8bitLt(dim, dim, bias=bias, has_fp16_weights=False, threshold=threshold) for _ in range(4)]
+ ).to(device)
+
+ dynamic_output_shapes = fullgraph and threshold > 0
+ with torch._dynamo.config.patch("capture_dynamic_output_shape_ops", dynamic_output_shapes):
+ # Create input tensor
+ x = torch.randn(batch_size, dim, dtype=torch.float16, device=device)
+
+ # Get reference output before compilation
+ with torch.no_grad():
+ ref_output = net(x)
+
+ # Compile the model
+ compiled_net = torch.compile(net, fullgraph=fullgraph, mode=mode)
+
+ # Get output from compiled model
+ with torch.no_grad():
+ compiled_output = compiled_net(x)
+
+ # Check outputs match
+ assert compiled_output.shape == ref_output.shape
+ assert compiled_output.device == ref_output.device
+ assert compiled_output.dtype == ref_output.dtype
+ torch.testing.assert_close(compiled_output, ref_output)
+
+ # Test with gradients. Currently only works with threshold=0.
+ # Has a strange regression on Linux aarch64 CPU in torch==2.6.0.
+ is_broken_platform = (
+ device == "cpu"
+ and platform.machine() == "aarch64"
+ and platform.system() == "Linux"
+ and ((2, 7) > torch.__version__ >= (2, 6))
+ )
+
+ if threshold == 0 and not is_broken_platform:
+ x.requires_grad_(True)
+ y1 = net(x).sum()
+ y1.backward()
+ grad_ref = x.grad.clone()
+
+ x.grad = None
+ y2 = compiled_net(x).sum()
+ y2.backward()
+ grad_compiled = x.grad.clone()
+
+ torch.testing.assert_close(grad_compiled, grad_ref)
From 198d08fc475a7ee7653dd5a7f15d532647c455a1 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Fri, 23 May 2025 23:55:45 -0400
Subject: [PATCH 27/85] Documentation Cleanup (#1644)
* Start cleaning up docs
* Remove page
* Minor update
* correction
* Minor doc revisions
* Update installation.mdx
* Update _toctree.yml
---
docs/source/_toctree.yml | 13 +-
docs/source/algorithms.mdx | 12 --
docs/source/contributing.mdx | 3 +-
docs/source/faqs.mdx | 2 -
docs/source/installation.mdx | 229 ++++++++-------------------
docs/source/non_cuda_backends.mdx | 44 -----
docs/source/reference/functional.mdx | 5 -
7 files changed, 73 insertions(+), 235 deletions(-)
delete mode 100644 docs/source/algorithms.mdx
delete mode 100644 docs/source/non_cuda_backends.mdx
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 5fa353d6d..0f46fe6b0 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -2,18 +2,15 @@
sections:
- local: index
title: bitsandbytes
- - local: quickstart
- title: Quickstart
- local: installation
title: Installation
-- title: Guides
+ - local: quickstart
+ title: Quickstart
+
+- title: Usage Guides
sections:
- local: optimizers
title: 8-bit optimizers
- - local: algorithms
- title: Algorithms
- - local: non_cuda_backends
- title: Non-CUDA compute backends
- local: fsdp_qlora
title: FSDP-QLoRA
- local: integrations
@@ -56,7 +53,7 @@
title: RMSprop
- local: reference/optim/sgd
title: SGD
- - title: k-bit quantizers
+ - title: Modules
sections:
- local: reference/nn/linear8bit
title: LLM.int8()
diff --git a/docs/source/algorithms.mdx b/docs/source/algorithms.mdx
deleted file mode 100644
index 65e5567a4..000000000
--- a/docs/source/algorithms.mdx
+++ /dev/null
@@ -1,12 +0,0 @@
-# Other algorithms
-_WIP: Still incomplete... Community contributions would be greatly welcome!_
-
-This is an overview of the `bnb.functional` API in `bitsandbytes` that we think would also be useful as standalone entities.
-
-## Using Int8 Matrix Multiplication
-
-For straight Int8 matrix multiplication without mixed precision decomposition you can use ``bnb.matmul(...)``. To enable mixed precision decomposition, use the threshold parameter:
-
-```py
-bnb.matmul(..., threshold=6.0)
-```
diff --git a/docs/source/contributing.mdx b/docs/source/contributing.mdx
index 5da42961e..464f92164 100644
--- a/docs/source/contributing.mdx
+++ b/docs/source/contributing.mdx
@@ -1,5 +1,4 @@
-# Contributors guidelines
-... still under construction ... (feel free to propose materials, `bitsandbytes` is a community project)
+# Contribution Guide
## Setup
diff --git a/docs/source/faqs.mdx b/docs/source/faqs.mdx
index b95a1d799..c81257451 100644
--- a/docs/source/faqs.mdx
+++ b/docs/source/faqs.mdx
@@ -3,5 +3,3 @@
Please submit your questions in [this Github Discussion thread](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1013) if you feel that they will likely affect a lot of other users and that they haven't been sufficiently covered in the documentation.
We'll pick the most generally applicable ones and post the QAs here or integrate them into the general documentation (also feel free to submit doc PRs, please).
-
-# ... under construction ...
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 704d7aacc..11dfbf5ea 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -1,91 +1,65 @@
# Installation Guide
-Welcome to the installation guide for the `bitsandbytes` library! This document provides step-by-step instructions to install `bitsandbytes` across various platforms and hardware configurations. The library primarily supports CUDA-based GPUs, but the team is actively working on enabling support for additional backends like AMD ROCm, Intel, and Apple Silicon.
-
-> [!TIP]
-> For a high-level overview of backend support and compatibility, see the [Multi-backend Support](#multi-backend) section.
+Welcome to the installation guide for the `bitsandbytes` library! This document provides step-by-step instructions to install `bitsandbytes` across various platforms and hardware configurations. The library primarily supports CUDA-based GPUs, but the team is actively working on enabling support for additional backends like CPU, AMD ROCm, Intel XPU, and Gaudi HPU.
## Table of Contents
- [CUDA](#cuda)
- [Installation via PyPI](#cuda-pip)
- [Compile from Source](#cuda-compile)
-- [Multi-backend Support (Alpha Release)](#multi-backend)
+ - [Preview Wheels from `main`](#cuda-preview)
+- [Multi-Backend Preview](#multi-backend)
- [Supported Backends](#multi-backend-supported-backends)
- [Pre-requisites](#multi-backend-pre-requisites)
- [Installation](#multi-backend-pip)
- [Compile from Source](#multi-backend-compile)
-- [PyTorch CUDA Versions](#pytorch-cuda-versions)
## CUDA[[cuda]]
-`bitsandbytes` is currently only supported on CUDA GPUs for CUDA versions **11.0 - 12.8**. However, there's an ongoing multi-backend effort under development, which is currently in alpha. If you're interested in providing feedback or testing, check out [the multi-backend section below](#multi-backend).
-
-### Supported CUDA Configurations[[cuda-pip]]
-
-The latest version of the distributed `bitsandbytes` package is built with the following configurations:
-
-| **OS** | **CUDA Toolkit** | **Host Compiler** |
-|-------------|------------------|----------------------|
-| **Linux** | 11.8 - 12.3 | GCC 11.4 |
-| | 12.4 - 12.8 | GCC 13.2 |
-| **Windows** | 11.8 - 12.8 | MSVC 19.42+ (VS2022) |
-
-For CUDA systems, ensure your hardware meets the following requirements:
+`bitsandbytes` is currently supported on NVIDIA GPUs with [Compute Capability](https://developer.nvidia.com/cuda-gpus) 5.0+.
+The library can be built using CUDA Toolkit versions as old as **11.6** on Windows and **11.4** on Linux.
-| **Feature** | **Minimum Hardware Requirement** |
-|---------------------------------|---------------------------------------------------------------|
-| LLM.int8() | NVIDIA Turing (RTX 20 series, T4) or newer GPUs |
-| 8-bit optimizers/quantization | NVIDIA Maxwell (GTX 900 series, TITAN X, M40) or newer GPUs * |
-| NF4/FP4 quantization | NVIDIA Maxwell (GTX 900 series, TITAN X, M40) or newer GPUs * |
+| **Feature** | **CC Required** | **Example Hardware Requirement** |
+|---------------------------------|-----------------|---------------------------------------------|
+| LLM.int8() | 7.5+ | Turing (RTX 20 series, T4) or newer GPUs |
+| 8-bit optimizers/quantization | 5.0+ | Maxwell (GTX 900 series, TITAN X, M40) or newer GPUs |
+| NF4/FP4 quantization | 5.0+ | Maxwell (GTX 900 series, TITAN X, M40) or newer GPUs |
> [!WARNING]
-> `bitsandbytes >= 0.45.0` no longer supports Kepler GPUs.
->
> Support for Maxwell GPUs is deprecated and will be removed in a future release. For the best results, a Turing generation device or newer is recommended.
-```bash
-pip install bitsandbytes
-```
-
-### `pip install` pre-built wheel from latest `main` commit
-
-If you would like to use new feature even before they are officially released and help us test them, feel free to install the wheel directly from our CI (*the wheel links will remain stable!*):
+### Installation via PyPI[[cuda-pip]]
-
-
-
-```
-# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+This is the most straightforward and recommended installation option.
-# x86_64 (most users)
-pip install --force-reinstall https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl
+The currently distributed `bitsandbytes` packages are built with the following configurations:
-# ARM/aarch64
-pip install --force-reinstall https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_aarch64.whl
-```
+| **OS** | **CUDA Toolkit** | **Host Compiler** | **Targets**
+|--------------------|------------------|----------------------|--------------
+| **Linux x86-64** | 11.8 - 12.6 | GCC 11.2 | sm50, sm60, sm75, sm80, sm86, sm89, sm90
+| **Linux x86-64** | 12.8 | GCC 11.2 | sm75, sm80, sm86, sm89, sm90, sm100, sm120
+| **Linux aarch64** | 11.8 - 12.6 | GCC 11.2 | sm75, sm80, sm90
+| **Linux aarch64** | 12.8 | GCC 11.2 | sm75, sm80, sm90, sm100
+| **Windows x86-64** | 11.8 - 12.6 | MSVC 19.43+ (VS2022) | sm50, sm60, sm75, sm80, sm86, sm89, sm90
+| **Windows x86-64** | 12.8 | MSVC 19.43+ (VS2022) | sm75, sm80, sm86, sm89, sm90, sm100, sm120
-
-
+Use `pip` or `uv` to install:
+```bash
+pip install bitsandbytes
```
-# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
-pip install --force-reinstall https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-win_amd64.whl
-```
-
-
### Compile from source[[cuda-compile]]
> [!TIP]
-> Don't hesitate to compile from source! The process is pretty straight forward and resilient. This might be needed for older CUDA versions or other less common configurations, which we don't support out of the box due to package size.
+> Don't hesitate to compile from source! The process is pretty straightforward and resilient. This might be needed for older CUDA Toolkit versions or Linux distributions, or other less common configurations.
For Linux and Windows systems, compiling from source allows you to customize the build configurations. See below for detailed platform-specific instructions (see the `CMakeLists.txt` if you want to check the specifics and explore some additional options):
-To compile from source, you need CMake >= **3.22.1** and Python >= **3.9** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.).
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.9** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.). It is recommended to use GCC 9 or newer.
For example, to install a compiler and CMake on Ubuntu:
@@ -93,7 +67,7 @@ For example, to install a compiler and CMake on Ubuntu:
apt-get install -y build-essential cmake
```
-You should also install CUDA Toolkit by following the [NVIDIA CUDA Installation Guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) guide from NVIDIA. The current minimum supported CUDA Toolkit version is **11.8**.
+You should also install CUDA Toolkit by following the [NVIDIA CUDA Installation Guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) guide. The current minimum supported CUDA Toolkit version that we test with is **11.8**.
```bash
git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
@@ -103,14 +77,14 @@ pip install -e . # `-e` for "editable" install, when developing BNB (otherwise
```
> [!TIP]
-> If you have multiple versions of CUDA installed or installed it in a non-standard location, please refer to CMake CUDA documentation for how to configure the CUDA compiler.
+> If you have multiple versions of the CUDA Toolkit installed or it is in a non-standard location, please refer to CMake CUDA documentation for how to configure the CUDA compiler.
-Windows systems require Visual Studio with C++ support as well as an installation of the CUDA SDK.
+Compilation from source on Windows systems requires Visual Studio with C++ support as well as an installation of the CUDA Toolkit.
-To compile from source, you need CMake >= **3.22.1** and Python >= **3.9** installed. You should also install CUDA Toolkit by following the [CUDA Installation Guide for Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) guide from NVIDIA. The current minimum supported CUDA Toolkit version is **11.8**.
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.9** installed. You should also install CUDA Toolkit by following the [CUDA Installation Guide for Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) guide from NVIDIA. The current minimum supported CUDA Toolkit version that we test with is **11.8**.
```bash
git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
@@ -124,78 +98,46 @@ Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com
-### PyTorch CUDA versions[[pytorch-cuda-versions]]
-
-Some bitsandbytes features may need a newer CUDA version than the one currently supported by PyTorch binaries from Conda and pip. In this case, you should follow these instructions to load a precompiled bitsandbytes binary.
+### Preview Wheels from `main`[[cuda-preview]]
-1. Determine the path of the CUDA version you want to use. Common paths include:
+If you would like to use new features even before they are officially released and help us test them, feel free to install the wheel directly from our CI (*the wheel links will remain stable!*):
-* `/usr/local/cuda`
-* `/usr/local/cuda-XX.X` where `XX.X` is the CUDA version number
-
-Then locally install the CUDA version you need with this script from bitsandbytes:
+
+
```bash
-wget https://raw.githubusercontent.com/bitsandbytes-foundation/bitsandbytes/main/install_cuda.sh
-# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
-# CUDA_VERSION in {118, 120, 121, 122, 123, 124, 125, 126, 128}
-# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
-
-# For example, the following installs CUDA 12.6 to ~/local/cuda-12.6 and exports the path to your .bashrc
-
-bash install_cuda.sh 126 ~/local 1
-```
+# Note: if you don't want to reinstall our dependencies, append the `--no-deps` flag!
-2. Set the environment variables `BNB_CUDA_VERSION` and `LD_LIBRARY_PATH` by manually overriding the CUDA version installed by PyTorch.
-
-> [!TIP]
-> It is recommended to add the following lines to the `.bashrc` file to make them permanent.
+# x86_64 (most users)
+pip install --force-reinstall https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl
-```bash
-export BNB_CUDA_VERSION=
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:
+# ARM/aarch64
+pip install --force-reinstall https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_aarch64.whl
```
-For example, to use a local install path:
+
+
```bash
-export BNB_CUDA_VERSION=126
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-12.6
+# Note: if you don't want to reinstall our dependencies, append the `--no-deps` flag!
+pip install --force-reinstall https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-win_amd64.whl
```
-
-3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 12.6) and a different bitsandbytes library is loaded.
-
-## Multi-backend Support (Alpha Release)[[multi-backend]]
-
-> [!TIP]
-> This functionality is currently in preview and not yet production-ready. We very much welcome community feedback, contributions and leadership on topics like Apple Silicon as well as other less common accellerators! For more information, see [this guide on multi-backend support](./non_cuda_backends).
-
-**Link to give us feedback** (bugs, install issues, perf results, requests, etc.)**:**
-
-
-
-
-[**Multi-backend refactor: Alpha release (AMD ROCm ONLY)**](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1339)
-
-
+
-[**Multi-backend refactor: Alpha release (INTEL ONLY)**](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1338)
-
-
+## Multi-Backend Preview[[multi-backend]]
-[**Github Discussion space on coordinating the kickoff of MPS backend development**](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1340)
+> [!WARNING]
+> This functionality existed as an early technical preview and is not recommended for production use. We are in the process of upstreaming improved support for AMD and Intel hardware into the main project.
-
-
+We provide an early preview of support for AMD and Intel hardware as part of a development branch.
### Supported Backends[[multi-backend-supported-backends]]
| **Backend** | **Supported Versions** | **Python versions** | **Architecture Support** | **Status** |
|-------------|------------------------|---------------------------|-------------------------|------------|
| **AMD ROCm** | 6.1+ | 3.10+ | minimum CDNA - `gfx90a`, RDNA - `gfx1100` | Alpha |
-| **Apple Silicon (MPS)** | WIP | 3.10+ | M1/M2 chips | Planned |
| **Intel CPU** | v2.4.0+ (`ipex`) | 3.10+ | Intel CPU | Alpha |
| **Intel GPU** | v2.4.0+ (`ipex`) | 3.10+ | Intel GPU | Experimental |
| **Ascend NPU** | 2.1.0+ (`torch_npu`) | 3.10+ | Ascend NPU | Experimental |
@@ -204,9 +146,9 @@ For each supported backend, follow the respective instructions below:
### Pre-requisites[[multi-backend-pre-requisites]]
-To use bitsandbytes non-CUDA backends, be sure to install:
+To use this preview version of `bitsandbytes` with `transformers`, be sure to install:
-```
+```bash
pip install "transformers>=4.45.1"
```
@@ -218,33 +160,26 @@ pip install "transformers>=4.45.1"
>
> Other supported versions that don't come with pre-compiled binaries [can be compiled for with these instructions](#multi-backend-compile).
>
-> **Windows is not supported for the ROCm backend**; also not WSL2 to our knowledge.
+> **Windows is not supported for the ROCm backend**
> [!TIP]
> If you would like to install ROCm and PyTorch on bare metal, skip the Docker steps and refer to ROCm's official guides at [ROCm installation overview](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/install-overview.html#rocm-install-overview) and [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) (Step 3 of wheels build for quick installation). Special note: please make sure to get the respective ROCm-specific PyTorch wheel for the installed ROCm version, e.g. `https://download.pytorch.org/whl/nightly/rocm6.2/`!
```bash
-# Create a docker container with latest ROCm image, which includes ROCm libraries
-docker pull rocm/dev-ubuntu-22.04:6.1.2-complete
-docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/dev-ubuntu-22.04:6.1.2-complete
+# Create a docker container with the ROCm image, which includes ROCm libraries
+docker pull rocm/dev-ubuntu-22.04:6.3.4-complete
+docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/dev-ubuntu-22.04:6.3.4-complete
apt-get update && apt-get install -y git && cd home
# Install pytorch compatible with above ROCm version
-pip install torch --index-url https://download.pytorch.org/whl/rocm6.1/
+pip install torch --index-url https://download.pytorch.org/whl/rocm6.3/
```
-
-
-Compatible hardware and functioning `import intel_extension_for_pytorch as ipex` capable environment with Python `3.10` as the minimum requirement.
-
-Please refer to [the official Intel installations instructions](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.4.0%2bcpu&os=linux%2fwsl2) for guidance on how to pip install the necessary `intel_extension_for_pytorch` dependency.
-
-
-
+
-> [!TIP]
-> Apple Silicon support is still a WIP. Please visit and write us in [this Github Discussion space on coordinating the kickoff of MPS backend development](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1340) and coordinate a community-led effort to implement this backend.
+* A compatible PyTorch version with Intel XPU support is required. It is recommended to use the latest stable release. See [Getting Started on Intel GPU](https://docs.pytorch.org/docs/stable/notes/get_start_xpu.html) for guidance.
+* The [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/) is recommended for performance improvements.
@@ -257,38 +192,22 @@ You can install the pre-built wheels for each backend, or compile from source fo
+This wheel provides support for ROCm and Intel XPU platforms.
```
-# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+# Note, if you don't want to reinstall our dependencies, append the `--no-deps` flag!
pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-manylinux_2_24_x86_64.whl'
```
+This wheel provides support for the Intel XPU platform.
-```
-# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+```bash
+# Note, if you don't want to reinstall our dependencies, append the `--no-deps` flag!
pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-win_amd64.whl'
```
-
-
-
-Compatible hardware and functioning `import torch_npu` capable environment with Python `3.10` as the minimum requirement.
-
-Please refer to [the official Ascend installations instructions](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/configandinstg/instg/insg_0001.html) for guidance on how to pip install the necessary `torch_npu` dependency.
-
-
-
-
-> [!WARNING]
-> bitsandbytes does not yet support Apple Silicon / Metal with a dedicated backend. However, the build infrastructure is in place and the below pip install will eventually provide Apple Silicon support as it becomes available on the `multi-backend-refactor` branch based on community contributions.
-
-```
-# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
-pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-macosx_13_1_arm64.whl'
-```
-
@@ -299,7 +218,7 @@ pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsan
#### AMD GPU
-bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
+bitsandbytes is supported on ROCm 6.1 through ROCm 6.4.
```bash
# Install bitsandbytes from source
@@ -318,29 +237,24 @@ pip install -e . # `-e` for "editable" install, when developing BNB (otherwise
#### Intel CPU + XPU
-> [!TIP]
-> Intel CPU/XPU backend only supports building from source; for now, please follow the instructions below.
It does not need compile CPP codes, all required ops are in [intel_extension_for_pytorch](https://pytorch-extension.intel.com/), please follow the instruction to install ipex.
The below commands are for Linux. For installing on Windows, please adapt the below commands according to the same pattern as described [the section above on compiling from source under the Windows tab](#cuda-compile).
-```
+```bash
pip install intel_extension_for_pytorch
git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out)
```
-
#### Ascend NPU
-> [!TIP]
-> Ascend NPU backend only supports building from source; for now, please follow the instructions below.
-
+Please refer to [the official Ascend installation instructions](https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/configandinstg/instg/insg_0001.html) for guidance on how to install the necessary `torch_npu` dependency.
-```
+```bash
# Install bitsandbytes from source
# Clone bitsandbytes repo, Ascend NPU backend is currently enabled on multi-backend-refactor branch
git clone -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
@@ -351,14 +265,5 @@ cmake -DCOMPUTE_BACKEND=npu -S .
make
pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out)
```
-
-
-
-
-
-#### Apple Silicon
-
-WIP
-
diff --git a/docs/source/non_cuda_backends.mdx b/docs/source/non_cuda_backends.mdx
deleted file mode 100644
index 728606b7b..000000000
--- a/docs/source/non_cuda_backends.mdx
+++ /dev/null
@@ -1,44 +0,0 @@
-# Multi-backend support (non-CUDA backends)
-
-> [!Tip]
-> If you feel these docs need some additional info, please consider submitting a PR or respectfully request the missing info in one of the below mentioned Github discussion spaces.
-
-As part of a recent refactoring effort, we will soon offer official multi-backend support. Currently, this feature is available in a preview alpha release, allowing us to gather early feedback from users to improve the functionality and identify any bugs.
-
-At present, the Intel CPU and AMD ROCm backends are considered fully functional. The Intel XPU backend has limited functionality and is less mature.
-
-Please refer to the [installation instructions](./installation#multi-backend) for details on installing the backend you intend to test (and hopefully provide feedback on).
-
-> [!Tip]
-> Apple Silicon support is planned for Q4 2024. We are actively seeking contributors to help implement this, develop a concrete plan, and create a detailed list of requirements. Due to limited resources, we rely on community contributions for this implementation effort. To discuss further, please spell out your thoughts and discuss in [this GitHub discussion](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1340) and tag `@Titus-von-Koeller` and `@matthewdouglas`. Thank you!
-
-## Alpha Release
-
-As we are currently in the alpha testing phase, bugs are expected, and performance might not meet expectations. However, this is exactly what we want to discover from **your** perspective as the end user!
-
-Please share and discuss your feedback with us here:
-
-- [Github Discussion: Multi-backend refactor: Alpha release ( AMD ROCm ONLY )](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1339)
-- [Github Discussion: Multi-backend refactor: Alpha release ( Intel ONLY )](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1338)
-
-Thank you for your support!
-
-## Benchmarks
-
-### Intel
-
-The following performance data is collected from Intel 4th Gen Xeon (SPR) platform. The tables show speed-up and memory compared with different data types of [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf).
-
-#### Inference (CPU)
-
-| Data Type | BF16 | INT8 | NF4 | FP4 |
-|---|---|---|---|---|
-| Speed-Up (vs BF16) | 1.0x | 0.6x | 2.3x | 0.03x |
-| Memory (GB) | 13.1 | 7.6 | 5.0 | 4.6 |
-
-#### Fine-Tuning (CPU)
-
-| Data Type | AMP BF16 | INT8 | NF4 | FP4 |
-|---|---|---|---|---|
-| Speed-Up (vs AMP BF16) | 1.0x | 0.38x | 0.07x | 0.07x |
-| Memory (GB) | 40 | 9 | 6.6 | 6.6 |
diff --git a/docs/source/reference/functional.mdx b/docs/source/reference/functional.mdx
index dbbe21794..cc46675c6 100644
--- a/docs/source/reference/functional.mdx
+++ b/docs/source/reference/functional.mdx
@@ -9,8 +9,6 @@ The `bitsandbytes.functional` API provides the low-level building blocks for the
* For experimental or research purposes requiring non-standard quantization or performance optimizations.
## LLM.int8()
-[[autodoc]] functional.int8_double_quant
-
[[autodoc]] functional.int8_linear_matmul
[[autodoc]] functional.int8_mm_dequant
@@ -19,7 +17,6 @@ The `bitsandbytes.functional` API provides the low-level building blocks for the
[[autodoc]] functional.int8_vectorwise_quant
-
## 4-bit
[[autodoc]] functional.dequantize_4bit
@@ -49,5 +46,3 @@ For more details see [8-Bit Approximations for Parallelism in Deep Learning](htt
## Utility
[[autodoc]] functional.get_ptr
-
-[[autodoc]] functional.is_on_gpu
From f1fbe92d2bc2eebc4629ee41a76b163772cd1874 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Sat, 24 May 2025 21:53:44 +0530
Subject: [PATCH 28/85] Update functional.py
---
bitsandbytes/functional.py | 36 ++++++++++++++++++------------------
1 file changed, 18 insertions(+), 18 deletions(-)
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 1cee234ea..b51258420 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -960,12 +960,12 @@ def quantize_fp4(
A: torch.Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize=None,
+ blocksize=64,
compress_statistics=False,
quant_storage=torch.uint8,
):
- if blocksize is None:
- blocksize = 64 if not HIP_ENVIRONMENT else 128
+ if HIP_ENVIRONMENT:
+ blocksize = 128
return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "fp4", quant_storage)
@@ -973,12 +973,12 @@ def quantize_nf4(
A: torch.Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize=None,
+ blocksize=64,
compress_statistics=False,
quant_storage=torch.uint8,
):
- if blocksize is None:
- blocksize = 64 if not HIP_ENVIRONMENT else 128
+ if HIP_ENVIRONMENT:
+ blocksize = 128
return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "nf4", quant_storage)
@@ -986,7 +986,7 @@ def quantize_4bit(
A: torch.Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize=None,
+ blocksize=64,
compress_statistics=False,
quant_type="fp4",
quant_storage=torch.uint8,
@@ -1014,8 +1014,8 @@ def quantize_4bit(
- `torch.Tensor`: The quantized tensor with packed 4-bit values.
- [`QuantState`]: The state object used to undo the quantization.
"""
- if blocksize is None:
- blocksize = 64 if not HIP_ENVIRONMENT else 128
+ if HIP_ENVIRONMENT:
+ blocksize = 128
input_shape = A.shape
@@ -1067,10 +1067,10 @@ def dequantize_fp4(
quant_state: Optional[QuantState] = None,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize: Optional[int] = None,
+ blocksize: int = 64,
) -> torch.Tensor:
- if blocksize is None:
- blocksize = 64 if not HIP_ENVIRONMENT else 128
+ if HIP_ENVIRONMENT:
+ blocksize = 128
return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4")
@@ -1079,10 +1079,10 @@ def dequantize_nf4(
quant_state: Optional[QuantState] = None,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize: Optional[int] = None,
+ blocksize: int = 64,
) -> torch.Tensor:
- if blocksize is None:
- blocksize = 64 if not HIP_ENVIRONMENT else 128
+ if HIP_ENVIRONMENT:
+ blocksize = 128
return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4")
@@ -1091,7 +1091,7 @@ def dequantize_4bit(
quant_state: Optional[QuantState] = None,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize: Optional[int] = None,
+ blocksize: int = 64,
quant_type="fp4",
) -> torch.Tensor:
"""Dequantizes a packed 4-bit quantized tensor.
@@ -1121,8 +1121,8 @@ def dequantize_4bit(
`torch.Tensor`: The dequantized tensor.
"""
- if blocksize is None:
- blocksize = 64 if not HIP_ENVIRONMENT else 128
+ if HIP_ENVIRONMENT:
+ blocksize = 128
if quant_state is None:
assert absmax is not None and out is not None
From 660c25448edcff9f0f56368cc9ef04e91045d52c Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Sat, 24 May 2025 21:57:22 +0530
Subject: [PATCH 29/85] Update functional.py
---
bitsandbytes/functional.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index b51258420..2ae977e7a 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -986,7 +986,7 @@ def quantize_4bit(
A: torch.Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize=64,
+ blocksize=64,
compress_statistics=False,
quant_type="fp4",
quant_storage=torch.uint8,
From 1d4ea6acccd3e86d5e48b9283e2871ed1b35f043 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E0=A4=B5=E0=A5=87=E0=A4=A6=E0=A4=BE=E0=A4=82=E0=A4=A4?=
<146507396+ved1beta@users.noreply.github.com>
Date: Sun, 25 May 2025 00:28:36 +0530
Subject: [PATCH 30/85] simplified non_sign_bits (#1649)
---
bitsandbytes/functional.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index f84f16c21..94e2d845b 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -367,7 +367,7 @@ def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8):
# these are additional items that come from the case
# where all the exponent bits are zero and no
# indicator bit is present
- non_sign_bits = total_bits - (1 if signed else 1)
+ non_sign_bits = total_bits - 1
additional_items = 2 ** (non_sign_bits - max_exponent_bits) - 1
for i in range(max_exponent_bits):
fraction_items = int(
From c692f4bc8f604f50a8a4f4409d373ed70c630364 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Tue, 27 May 2025 21:45:04 +0530
Subject: [PATCH 31/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index 48dc75135..14878123a 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -3,7 +3,7 @@
from math import prod
from typing import Optional
-import torch
+import torch
from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
From 46f9800d9e9a361ecabf1051f99776fbfc73589d Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Tue, 27 May 2025 21:55:36 +0530
Subject: [PATCH 32/85] Update ops.py
From 0d1b3a3200224f34e6fbf172e777eade3a549753 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 27 May 2025 16:40:23 -0400
Subject: [PATCH 33/85] Last minute pre-release changes
---
bitsandbytes/backends/cuda/ops.py | 30 ++++----
bitsandbytes/cextension.py | 13 ++--
bitsandbytes/diagnostics/cuda.py | 23 +-----
bitsandbytes/diagnostics/main.py | 122 ++++++++++++++++++++----------
bitsandbytes/diagnostics/utils.py | 2 +-
bitsandbytes/functional.py | 4 +-
6 files changed, 111 insertions(+), 83 deletions(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index efdef2871..c266f61a0 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -445,20 +445,22 @@ def _gemv_4bit_impl(
out: torch.Tensor,
) -> None:
torch._check_is_size(blocksize)
- torch._check(
- A.numel() == A.size(-1),
- lambda: f"A must be a vector with leading dimensions of 1, got {A.shape}",
- )
- torch._check(
- A.dtype in [torch.float16, torch.bfloat16, torch.float32],
- lambda: f"A must be float16, bfloat16, or float32, got {A.dtype}",
- )
- torch._check(
- B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32],
- lambda: f"B must be backed by storage of type uint8, bfloat16, float16, or float32, got {B.dtype}",
- )
- torch._check(absmax.dtype == torch.float32, lambda: f"absmax must be float32, got {absmax.dtype}")
- torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
+
+ # Note: these checks are not strictly necessary, and cost more than they are worth, so they are commented out for now.
+ # torch._check(
+ # A.numel() == A.size(-1),
+ # lambda: f"A must be a vector with leading dimensions of 1, got {A.shape}",
+ # )
+ # torch._check(
+ # A.dtype in [torch.float16, torch.bfloat16, torch.float32],
+ # lambda: f"A must be float16, bfloat16, or float32, got {A.dtype}",
+ # )
+ # torch._check(
+ # B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32],
+ # lambda: f"B must be backed by storage of type uint8, bfloat16, float16, or float32, got {B.dtype}",
+ # )
+ # torch._check(absmax.dtype == torch.float32, lambda: f"absmax must be float32, got {absmax.dtype}")
+ # torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
m = ct.c_int32(shapeB[0])
n = ct.c_int32(1)
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 3fb8db26f..ebc363991 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -1,4 +1,5 @@
import ctypes as ct
+import functools
import logging
import os
from pathlib import Path
@@ -29,10 +30,8 @@ def get_cuda_bnb_library_path(cuda_specs: CUDASpecs) -> Path:
library_name = re.sub(r"cuda\d+", f"cuda{override_value}", library_name, count=1)
logger.warning(
f"WARNING: BNB_CUDA_VERSION={override_value} environment variable detected; loading {library_name}.\n"
- "This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n"
+ "This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.\n"
"If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n"
- "If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n"
- "For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: None:
if not binary_path.exists():
print_dedented(
f"""
- Library not found: {binary_path}. Maybe you need to compile it from source?
- If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION`,
- for example, `make CUDA_VERSION=113`.
-
- The CUDA version for the compile might depend on your conda install, if using conda.
- Inspect CUDA version via `conda list | grep cuda`.
- """,
- )
-
- cuda_major, cuda_minor = cuda_specs.cuda_version_tuple
- if cuda_major < 11:
- print_dedented(
- """
- WARNING: CUDA versions lower than 11 are currently not supported for LLM.int8().
- You will be only to use 8-bit optimizers and quantization routines!
+ Library not found: {binary_path}. Maybe you need to compile it from source?
""",
)
- print(f"To manually override the PyTorch CUDA version please see: {NONPYTORCH_DOC_URL}")
-
# 7.5 is the minimum CC for int8 tensor cores
if not cuda_specs.has_imma:
print_dedented(
@@ -144,10 +127,6 @@ def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
""",
)
- # TODO:
- # (1) CUDA missing cases (no CUDA installed by CUDA driver (nvidia-smi accessible)
- # (2) Multiple CUDA versions installed
-
def print_cuda_runtime_diagnostics() -> None:
cudart_paths = list(find_cudart_libraries())
diff --git a/bitsandbytes/diagnostics/main.py b/bitsandbytes/diagnostics/main.py
index b6236d668..aa4cb3042 100644
--- a/bitsandbytes/diagnostics/main.py
+++ b/bitsandbytes/diagnostics/main.py
@@ -1,16 +1,30 @@
+import importlib
+import platform
import sys
import traceback
import torch
+from bitsandbytes import __version__ as bnb_version
from bitsandbytes.consts import PACKAGE_GITHUB_URL
from bitsandbytes.cuda_specs import get_cuda_specs
from bitsandbytes.diagnostics.cuda import (
print_cuda_diagnostics,
- print_cuda_runtime_diagnostics,
)
from bitsandbytes.diagnostics.utils import print_dedented, print_header
+_RELATED_PACKAGES = [
+ "accelerate",
+ "diffusers",
+ "numpy",
+ "pip",
+ "peft",
+ "safetensors",
+ "transformers",
+ "triton",
+ "trl",
+]
+
def sanity_check():
from bitsandbytes.optim import Adam
@@ -27,47 +41,77 @@ def sanity_check():
assert p1 != p2
+def get_package_version(name: str) -> str:
+ try:
+ version = importlib.metadata.version(name)
+ except importlib.metadata.PackageNotFoundError:
+ version = "not found"
+ return version
+
+
+def show_environment():
+ """Simple utility to print out environment information."""
+
+ print(f"Platform: {platform.platform()}")
+ if platform.system() == "Linux":
+ print(f" libc: {'-'.join(platform.libc_ver())}")
+
+ print(f"Python: {platform.python_version()}")
+
+ print(f"PyTorch: {torch.__version__}")
+ print(f" CUDA: {torch.version.cuda or 'N/A'}")
+ print(f" HIP: {torch.version.hip or 'N/A'}")
+ print(f" XPU: {getattr(torch.version, 'xpu', 'N/A') or 'N/A'}")
+
+ print("Related packages:")
+ for pkg in _RELATED_PACKAGES:
+ version = get_package_version(pkg)
+ print(f" {pkg}: {version}")
+
+
def main():
- print_header("")
- print_header("BUG REPORT INFORMATION")
+ print_header(f"bitsandbytes v{bnb_version}")
+ show_environment()
print_header("")
- print_header("OTHER")
cuda_specs = get_cuda_specs()
- print("CUDA specs:", cuda_specs)
- if not torch.cuda.is_available():
- print("Torch says CUDA is not available. Possible reasons:")
- print("1. CUDA driver not installed")
- print("2. CUDA not installed")
- print("3. You have multiple conflicting CUDA libraries")
+
if cuda_specs:
print_cuda_diagnostics(cuda_specs)
- print_cuda_runtime_diagnostics()
- print_header("")
- print_header("DEBUG INFO END")
- print_header("")
- print("Checking that the library is importable and CUDA is callable...")
- try:
- sanity_check()
- print("SUCCESS!")
- print("Installation was successful!")
- return
- except RuntimeError as e:
- if "not available in CPU-only" in str(e):
- print(
- f"WARNING: {__package__} is currently running as CPU-only!\n"
- "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n"
- f"If you think that this is so erroneously,\nplease report an issue!",
- )
- else:
- raise e
- except Exception:
- traceback.print_exc()
- print_dedented(
- f"""
- Above we output some debug information.
- Please provide this info when creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose
- WARNING: Please be sure to sanitize sensitive info from the output before posting it.
- """,
- )
- sys.exit(1)
+
+ # TODO: There's a lot of noise in this; needs improvement.
+ # print_cuda_runtime_diagnostics()
+
+ if not torch.cuda.is_available():
+ print("PyTorch says CUDA is not available. Possible reasons:")
+ print("1. CUDA driver not installed")
+ print("2. Using a CPU-only PyTorch build")
+ print("3. No GPU detected")
+
+ else:
+ print("Checking that the library is importable and CUDA is callable...")
+
+ try:
+ sanity_check()
+ print("SUCCESS!")
+ return
+ except RuntimeError as e:
+ if "not available in CPU-only" in str(e):
+ print(
+ f"WARNING: {__package__} is currently running as CPU-only!\n"
+ "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n"
+ f"If you think that this is so erroneously,\nplease report an issue!",
+ )
+ else:
+ raise e
+ except Exception:
+ traceback.print_exc()
+
+ print_dedented(
+ f"""
+ Above we output some debug information.
+ Please provide this info when creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose
+ WARNING: Please be sure to sanitize sensitive info from the output before posting it.
+ """,
+ )
+ sys.exit(1)
diff --git a/bitsandbytes/diagnostics/utils.py b/bitsandbytes/diagnostics/utils.py
index 770209b9d..facc58b30 100644
--- a/bitsandbytes/diagnostics/utils.py
+++ b/bitsandbytes/diagnostics/utils.py
@@ -3,7 +3,7 @@
HEADER_WIDTH = 60
-def print_header(txt: str, width: int = HEADER_WIDTH, filler: str = "+") -> None:
+def print_header(txt: str, width: int = HEADER_WIDTH, filler: str = "=") -> None:
txt = f" {txt} " if txt else ""
print(txt.center(width, filler))
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 94e2d845b..0bd4c8b4e 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -851,8 +851,8 @@ def dequantize_blockwise(
torch.ops.bitsandbytes.dequantize_blockwise.out(
A,
absmax,
- code.to(A.device),
- blocksize,
+ quant_state.code.to(A.device),
+ quant_state.blocksize,
quant_state.dtype,
out=out,
)
From 1e54f912efaa032e4f3f99b3c414d3370ce2032a Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 27 May 2025 17:13:27 -0400
Subject: [PATCH 34/85] Release 0.46.0
---
bitsandbytes/__init__.py | 2 +-
setup.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 917cd0b6a..12088a70c 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -64,4 +64,4 @@ def _import_backends():
"optim.optimizer.MockArgs": False,
}
-__version__ = "0.46.0.dev0"
+__version__ = "0.46.0"
diff --git a/setup.py b/setup.py
index d20300c16..3208bf1f0 100644
--- a/setup.py
+++ b/setup.py
@@ -12,4 +12,4 @@ def has_ext_modules(self):
return True
-setup(version="0.46.0.dev0", packages=find_packages(), distclass=BinaryDistribution)
+setup(version="0.46.0", packages=find_packages(), distclass=BinaryDistribution)
From 7823bac2c0c234c468392c219b29ed51dea8ca96 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 28 May 2025 12:12:42 +0530
Subject: [PATCH 35/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 1059 ++++++++++++++---------------
1 file changed, 521 insertions(+), 538 deletions(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index 14878123a..efdef2871 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -1,538 +1,521 @@
-from collections.abc import Sequence
-import ctypes as ct
-from math import prod
-from typing import Optional
-
-import torch
-
-from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
-
-from ..._ops import register_kernel
-from ...cextension import lib, HIP_ENVIRONMENT
-
-
-@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
-def _(A: torch.Tensor, B: torch.Tensor):
- out = torch.empty((*A.shape[:-1], B.shape[0]), device=A.device, dtype=torch.int32)
- return _int8_linear_matmul_impl(A, B, out)
-
-
-@register_kernel("bitsandbytes::int8_linear_matmul.out", "cuda")
-def _(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
- _int8_linear_matmul_impl(A, B, out)
-
-
-def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
- A, B = B, A
-
- shapeA = A.shape
- shapeB = B.shape
-
- torch._check(A.dtype == torch.int8, lambda: "B must be int8")
- torch._check(B.dtype == torch.int8, lambda: "A must be int8")
- torch._check(A.ndim == 2, lambda: "Only two dimensional matrices are supported for argument B")
- torch._check(B.ndim in [2, 3], lambda: "Only two or three dimensional matrices are supported for argument A")
- torch._check(prod(shapeB) > 0, lambda: f"Input tensor dimensions need to be > 0: {shapeB}")
- torch._check(out.dtype == torch.int32)
-
- shapeC = (*shapeB[:-1], shapeA[0])
- torch._check(out.shape == shapeC, lambda: f"Output shape {out.shape} does not match expected shape {shapeC}")
-
- k, m = shapeA
- n = prod(shapeB[:-1])
- lda = shapeA[-1] # Weights (outputs, inputs)
- ldb = shapeB[-1] # Activations (batch, tokens, inputs)
- ldc = shapeC[-1] # Output (batch, tokens, outputs)
-
- torch._check(
- lda == ldb,
- lambda: f"int8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}",
- )
-
- # cuBLASLt does not support int8 matmul with inner dimensions that are not divisible by 4.
- # We'll fall back to a slower fp32 calculation in this circumstance.
- # Fortunately, this should not be very common.
- if lda % 4 != 0:
- result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
- return out.copy_(result)
-
- with _cuda_device_of(A):
- ctx = CUBLAS_Context.get_instance().get_context(A.device)
- ptrA = get_ptr(A)
- ptrB = get_ptr(B)
- ptrC = get_ptr(out)
- ptrRowScale = None
- m = ct.c_int32(m)
- n = ct.c_int32(n)
- k = ct.c_int32(k)
- lda = ct.c_int32(lda)
- ldb = ct.c_int32(ldb)
- ldc = ct.c_int32(ldc)
- stream = _get_tensor_stream(A)
-
- has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
-
- if has_error:
- if has_error == 100:
- # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
- # TODO: Warn and implement a fallback to fp32 compute?
- raise NotImplementedError("int8_linear_matmul not implemented!")
- else:
- raise RuntimeError(
- f"cublasLt ran into an error!\n\t{shapeA=}, {shapeB=}, {shapeC=}\n\t{(lda, ldb, ldc)=}\n\t{(m, n, k)=}"
- )
-
- return out
-
-
-@register_kernel("bitsandbytes::int8_mm_dequant", "cuda")
-def _(
- A: torch.Tensor,
- row_stats: torch.Tensor,
- col_stats: torch.Tensor,
- dtype: Optional[torch.dtype] = None,
- bias: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
- torch._check(A.dtype == torch.int32, lambda: f"A must be int32, got {A.dtype}")
- torch._check(row_stats.dtype == torch.float32, lambda: f"row_stats must be float32, got {row_stats.dtype}")
- torch._check(col_stats.dtype == torch.float32, lambda: f"col_stats must be float32, got {col_stats.dtype}")
-
- # Note: cuda kernel only currently supports fp16 output.
- # We'll later cast to desired dtype if needed.
- out = torch.empty_like(A, dtype=torch.float16)
-
- ptrA = get_ptr(A)
- ptrOut = get_ptr(out)
- ptrRowStats = get_ptr(row_stats)
- ptrColStats = get_ptr(col_stats)
- numRows = ct.c_int32(prod(A.shape[:-1]))
- numCols = ct.c_int32(A.shape[-1])
-
- # Note: fused bias in the kernel is only supported for fp16
- # TODO(matthewdouglas): Consider supporting bf16 fused bias
- ptrBias = get_ptr(bias) if bias is not None and bias.dtype == torch.float16 else None
-
- with _cuda_device_of(A):
- lib.cdequant_mm_int32_fp16(
- ptrA, ptrRowStats, ptrColStats, ptrOut, ptrBias, numRows, numCols, _get_tensor_stream(A)
- )
-
- # Add bias separately if not fused in kernel
- if bias is not None and bias.dtype != torch.float16:
- out.add_(bias)
-
- return out.to(dtype or torch.float16)
-
-
-@register_kernel("bitsandbytes::int8_vectorwise_quant", "cuda")
-def _(A: torch.Tensor, threshold=0.0):
- torch._check(A.dtype == torch.float16, lambda: f"A must be float16, got {A.dtype}")
- torch._check(threshold >= 0.0, lambda: "threshold must be non-negative")
-
- rows = prod(A.shape[:-1])
- cols = A.shape[-1]
-
- row_stats = torch.empty(rows, device=A.device, dtype=torch.float32)
- out_row = torch.empty(A.shape, device=A.device, dtype=torch.int8)
-
- outlier_cols = None
-
- if threshold > 0.0:
- # TODO we could improve perf of this
- outliers = A.abs() >= threshold
-
- if outliers.any():
- outlier_cols = torch.argwhere(outliers.any(dim=0)).view(-1)
- else:
- # Needed for torch.compile support.
- outlier_cols = torch.empty(0, device=A.device, dtype=torch.int64)
-
- with _cuda_device_of(A):
- lib.cint8_vector_quant(
- get_ptr(A),
- get_ptr(out_row),
- get_ptr(row_stats),
- ct.c_float(threshold),
- ct.c_int32(rows),
- ct.c_int32(cols),
- _get_tensor_stream(A),
- )
-
- # Zero out values from outlier columns across all rows.
- # The kernel will handle this for outliers themselves, so we can optimize for rows=1.
- if rows > 1 and outlier_cols is not None:
- out_row[:, outlier_cols] = 0
-
- return out_row, row_stats, outlier_cols
-
-
-@register_kernel("bitsandbytes::int8_double_quant", "cuda")
-def _(
- A: torch.Tensor,
- threshold=0.0,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
- # Use CUDA kernel for rowwise and COO tensor
- quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant.default(
- A,
- threshold=threshold,
- )
-
- # PyTorch impl for colwise
- col_stats, outlier_mask = _get_col_absmax(A, threshold=threshold)
- if threshold > 0.0 and outlier_mask is not None:
- A = A.masked_fill(outlier_mask, 0.0)
- quant_col = torch.round(A.mul(127.0) / col_stats.unsqueeze(0)).to(torch.int8)
-
- return quant_row, quant_col, row_stats, col_stats.flatten().float(), outlier_cols
-
-
-def _get_col_absmax(
- A: torch.Tensor,
- threshold=0.0,
-) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
- torch._check(A.is_floating_point())
-
- outlier_mask = None
-
- absA = A.abs().view(-1, A.shape[-1])
-
- if threshold > 0.0:
- # Filter outliers from stats when enabled
- outlier_mask = absA >= threshold
- absA.masked_fill_(outlier_mask, 0.0)
-
- # shape [cols]; unsqueeze(0) gives [1,cols]
- col_stats = absA.amax(dim=0, keepdim=False).float()
-
- return col_stats, outlier_mask
-
-
-@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
-def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
- torch._check_is_size(blocksize)
-
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
- torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
-
- n = A.numel()
- blocks = -(n // -blocksize)
- absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
- out = torch.empty_like(A, dtype=torch.uint8)
-
- with _cuda_device_of(A):
- args = (
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int32(blocksize),
- ct.c_int(A.numel()),
- )
-
- if A.dtype == torch.float16:
- lib.cquantize_blockwise_fp16(*args)
- elif A.dtype == torch.bfloat16:
- lib.cquantize_blockwise_bf16(*args)
- elif A.dtype == torch.float32:
- lib.cquantize_blockwise_fp32(*args)
- else:
- raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-
- return out, absmax
-
-
-@register_kernel("bitsandbytes::dequantize_blockwise", "cuda")
-def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
- out = torch.empty_like(A, dtype=dtype)
- _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::dequantize_blockwise.out", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
- torch._check(out.shape == A.shape, lambda: f"Expected out.shape == {A.shape}, got {out.shape}")
- _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
-
-
-def _dequantize_blockwise_impl(
- A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
-) -> None:
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
- torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
- torch._check(
- dtype in [torch.float16, torch.bfloat16, torch.float32],
- lambda: f"Blockwise dequantization only supports 16bit/32bit floating types, got {dtype}",
- )
-
- with _cuda_device_of(A):
- args = (
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int(blocksize),
- ct.c_int(A.numel()),
- _get_tensor_stream(A),
- )
-
- if dtype == torch.float16:
- lib.cdequantize_blockwise_fp16(*args)
- elif dtype == torch.bfloat16:
- lib.cdequantize_blockwise_bf16(*args)
- elif dtype == torch.float32:
- lib.cdequantize_blockwise_fp32(*args)
-
-
-@register_kernel("bitsandbytes::quantize_4bit", "cuda")
-def _(
- A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
-) -> tuple[torch.Tensor, torch.Tensor]:
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
- torch._check(quant_type in ["fp4", "nf4"])
- torch._check(
- A.dtype in [torch.bfloat16, torch.float16, torch.float32],
- lambda: f"Blockwise 4bit quantization only supports 16/32-bit floats, but got {A.dtype}",
- )
-
- n = A.numel()
- blocks = -(n // -blocksize)
- absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
- out = torch.empty(((n + 1) // (quant_storage.itemsize * 2), 1), device=A.device, dtype=quant_storage)
-
- with _cuda_device_of(A):
- args = (
- None,
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int32(blocksize),
- ct.c_int(n),
- )
-
- if A.dtype == torch.bfloat16:
- if quant_type == "fp4":
- lib.cquantize_blockwise_bf16_fp4(*args)
- else:
- lib.cquantize_blockwise_bf16_nf4(*args)
- elif A.dtype == torch.float16:
- if quant_type == "fp4":
- lib.cquantize_blockwise_fp16_fp4(*args)
- else:
- lib.cquantize_blockwise_fp16_nf4(*args)
- elif A.dtype == torch.float32:
- if quant_type == "fp4":
- lib.cquantize_blockwise_fp32_fp4(*args)
- else:
- lib.cquantize_blockwise_fp32_nf4(*args)
-
- return out, absmax
-
-
-@register_kernel("bitsandbytes::dequantize_4bit", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- shape: Sequence[int],
- dtype: torch.dtype,
-) -> torch.Tensor:
- out = torch.empty(shape, dtype=dtype, device=A.device)
- _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::dequantize_4bit.out", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- shape: Sequence[int],
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- torch._check(out.shape == shape, lambda: f"Expected out.shape == {shape}, got {out.shape}")
- torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
- _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
-
-
-def _dequantize_4bit_impl(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
- torch._check(quant_type in ["fp4", "nf4"])
- torch._check(
- dtype in [torch.bfloat16, torch.float16, torch.float32],
- lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
- )
-
- with _cuda_device_of(A):
- args = (
- None,
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int(blocksize),
- ct.c_int(out.numel()),
- _get_tensor_stream(A),
- )
-
- if out.dtype == torch.bfloat16:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_bf16_fp4(*args)
- else:
- lib.cdequantize_blockwise_bf16_nf4(*args)
- elif out.dtype == torch.float16:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_fp16_fp4(*args)
- else:
- lib.cdequantize_blockwise_fp16_nf4(*args)
- elif out.dtype == torch.float32:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_fp32_fp4(*args)
- else:
- lib.cdequantize_blockwise_fp32_nf4(*args)
-
-
-@register_kernel("bitsandbytes::gemv_4bit", "cuda")
-def _(
- A: torch.Tensor, B: torch.Tensor, shapeB: Sequence[int], absmax: torch.Tensor, code: torch.Tensor, blocksize: int
-) -> torch.Tensor:
- shape = (*A.shape[:-1], shapeB[0])
- out = torch.empty(shape, device=A.device, dtype=A.dtype)
- _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::gemv_4bit.out", "cuda")
-def _(
- A: torch.Tensor,
- B: torch.Tensor,
- shapeB: Sequence[int],
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- out: torch.Tensor,
-) -> None:
- torch._check(
- out.shape == (*A.shape[:-1], shapeB[0]),
- lambda: f"Expected out.shape == {(*A.shape[:-1], shapeB[0])}, got {out.shape}",
- )
- torch._check(out.dtype == A.dtype, lambda: f"Expected out.dtype == {A.dtype}, got {out.dtype}")
- _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
-
-
-def _gemv_4bit_impl(
- A: torch.Tensor,
- B: torch.Tensor,
- shapeB: Sequence[int],
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- out: torch.Tensor,
-) -> None:
- torch._check_is_size(blocksize)
- torch._check(
- A.numel() == A.size(-1),
- lambda: f"A must be a vector with leading dimensions of 1, got {A.shape}",
- )
- torch._check(
- A.dtype in [torch.float16, torch.bfloat16, torch.float32],
- lambda: f"A must be float16, bfloat16, or float32, got {A.dtype}",
- )
- torch._check(
- B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32],
- lambda: f"B must be backed by storage of type uint8, bfloat16, float16, or float32, got {B.dtype}",
- )
- torch._check(absmax.dtype == torch.float32, lambda: f"absmax must be float32, got {absmax.dtype}")
- torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
-
- m = ct.c_int32(shapeB[0])
- n = ct.c_int32(1)
- k = ct.c_int32(shapeB[1])
-
- lda = m
- ldb = ct.c_int32((A.shape[-1] + 1) // 2)
- ldc = m
-
- stream = _get_tensor_stream(A)
-
- with _cuda_device_of(A):
- if A.dtype == torch.float16:
- lib.cgemm_4bit_inference_naive_fp16(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
- elif A.dtype == torch.bfloat16:
- lib.cgemm_4bit_inference_naive_bf16(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
- elif A.dtype == torch.float32:
- lib.cgemm_4bit_inference_naive_fp32(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
+from collections.abc import Sequence
+import ctypes as ct
+from math import prod
+from typing import Optional
+
+import torch
+
+from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
+
+from ..._ops import register_kernel
+from ...cextension import lib
+
+
+@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
+def _(A: torch.Tensor, B: torch.Tensor):
+ out = torch.empty((*A.shape[:-1], B.shape[0]), device=A.device, dtype=torch.int32)
+ return _int8_linear_matmul_impl(A, B, out)
+
+
+@register_kernel("bitsandbytes::int8_linear_matmul.out", "cuda")
+def _(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
+ _int8_linear_matmul_impl(A, B, out)
+
+
+def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
+ A, B = B, A
+
+ shapeA = A.shape
+ shapeB = B.shape
+
+ torch._check(A.dtype == torch.int8, lambda: "B must be int8")
+ torch._check(B.dtype == torch.int8, lambda: "A must be int8")
+ torch._check(A.ndim == 2, lambda: "Only two dimensional matrices are supported for argument B")
+ torch._check(B.ndim in [2, 3], lambda: "Only two or three dimensional matrices are supported for argument A")
+ torch._check(prod(shapeB) > 0, lambda: f"Input tensor dimensions need to be > 0: {shapeB}")
+ torch._check(out.dtype == torch.int32)
+
+ shapeC = (*shapeB[:-1], shapeA[0])
+ torch._check(out.shape == shapeC, lambda: f"Output shape {out.shape} does not match expected shape {shapeC}")
+
+ k, m = shapeA
+ n = prod(shapeB[:-1])
+ lda = shapeA[-1] # Weights (outputs, inputs)
+ ldb = shapeB[-1] # Activations (batch, tokens, inputs)
+ ldc = shapeC[-1] # Output (batch, tokens, outputs)
+
+ torch._check(
+ lda == ldb,
+ lambda: f"int8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}",
+ )
+
+ # cuBLASLt does not support int8 matmul with inner dimensions that are not divisible by 4.
+ # We'll fall back to a slower fp32 calculation in this circumstance.
+ # Fortunately, this should not be very common.
+ if lda % 4 != 0:
+ result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
+ return out.copy_(result)
+
+ with _cuda_device_of(A):
+ ctx = CUBLAS_Context.get_instance().get_context(A.device)
+ ptrA = get_ptr(A)
+ ptrB = get_ptr(B)
+ ptrC = get_ptr(out)
+ ptrRowScale = None
+ m = ct.c_int32(m)
+ n = ct.c_int32(n)
+ k = ct.c_int32(k)
+ lda = ct.c_int32(lda)
+ ldb = ct.c_int32(ldb)
+ ldc = ct.c_int32(ldc)
+ stream = _get_tensor_stream(A)
+
+ has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
+
+ if has_error:
+ if has_error == 100:
+ # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
+ # TODO: Warn and implement a fallback to fp32 compute?
+ raise NotImplementedError("int8_linear_matmul not implemented!")
+ else:
+ raise RuntimeError(
+ f"cublasLt ran into an error!\n\t{shapeA=}, {shapeB=}, {shapeC=}\n\t{(lda, ldb, ldc)=}\n\t{(m, n, k)=}"
+ )
+
+ return out
+
+
+@register_kernel("bitsandbytes::int8_mm_dequant", "cuda")
+def _(
+ A: torch.Tensor,
+ row_stats: torch.Tensor,
+ col_stats: torch.Tensor,
+ dtype: Optional[torch.dtype] = None,
+ bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+ torch._check(A.dtype == torch.int32, lambda: f"A must be int32, got {A.dtype}")
+ torch._check(row_stats.dtype == torch.float32, lambda: f"row_stats must be float32, got {row_stats.dtype}")
+ torch._check(col_stats.dtype == torch.float32, lambda: f"col_stats must be float32, got {col_stats.dtype}")
+
+ # Note: cuda kernel only currently supports fp16 output.
+ # We'll later cast to desired dtype if needed.
+ out = torch.empty_like(A, dtype=torch.float16)
+
+ ptrA = get_ptr(A)
+ ptrOut = get_ptr(out)
+ ptrRowStats = get_ptr(row_stats)
+ ptrColStats = get_ptr(col_stats)
+ numRows = ct.c_int32(prod(A.shape[:-1]))
+ numCols = ct.c_int32(A.shape[-1])
+
+ # Note: fused bias in the kernel is only supported for fp16
+ # TODO(matthewdouglas): Consider supporting bf16 fused bias
+ ptrBias = get_ptr(bias) if bias is not None and bias.dtype == torch.float16 else None
+
+ with _cuda_device_of(A):
+ lib.cdequant_mm_int32_fp16(
+ ptrA, ptrRowStats, ptrColStats, ptrOut, ptrBias, numRows, numCols, _get_tensor_stream(A)
+ )
+
+ # Add bias separately if not fused in kernel
+ if bias is not None and bias.dtype != torch.float16:
+ out.add_(bias)
+
+ return out.to(dtype or torch.float16)
+
+
+@register_kernel("bitsandbytes::int8_vectorwise_quant", "cuda")
+def _(A: torch.Tensor, threshold=0.0):
+ torch._check(A.dtype == torch.float16, lambda: f"A must be float16, got {A.dtype}")
+ torch._check(threshold >= 0.0, lambda: "threshold must be non-negative")
+
+ rows = prod(A.shape[:-1])
+ cols = A.shape[-1]
+
+ row_stats = torch.empty(rows, device=A.device, dtype=torch.float32)
+ out_row = torch.empty(A.shape, device=A.device, dtype=torch.int8)
+
+ outlier_cols = None
+
+ if threshold > 0.0:
+ # TODO we could improve perf of this
+ outliers = A.abs() >= threshold
+
+ if outliers.any():
+ outlier_cols = torch.argwhere(outliers.any(dim=0)).view(-1)
+ else:
+ # Needed for torch.compile support.
+ outlier_cols = torch.empty(0, device=A.device, dtype=torch.int64)
+
+ with _cuda_device_of(A):
+ lib.cint8_vector_quant(
+ get_ptr(A),
+ get_ptr(out_row),
+ get_ptr(row_stats),
+ ct.c_float(threshold),
+ ct.c_int32(rows),
+ ct.c_int32(cols),
+ _get_tensor_stream(A),
+ )
+
+ # Zero out values from outlier columns across all rows.
+ # The kernel will handle this for outliers themselves, so we can optimize for rows=1.
+ if rows > 1 and outlier_cols is not None:
+ out_row[:, outlier_cols] = 0
+
+ return out_row, row_stats, outlier_cols
+
+
+@register_kernel("bitsandbytes::int8_double_quant", "cuda")
+def _(
+ A: torch.Tensor,
+ threshold=0.0,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+ # Use CUDA kernel for rowwise and COO tensor
+ quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant.default(
+ A,
+ threshold=threshold,
+ )
+
+ # PyTorch impl for colwise
+ col_stats, outlier_mask = _get_col_absmax(A, threshold=threshold)
+ if threshold > 0.0 and outlier_mask is not None:
+ A = A.masked_fill(outlier_mask, 0.0)
+ quant_col = torch.round(A.mul(127.0) / col_stats.unsqueeze(0)).to(torch.int8)
+
+ return quant_row, quant_col, row_stats, col_stats.flatten().float(), outlier_cols
+
+
+def _get_col_absmax(
+ A: torch.Tensor,
+ threshold=0.0,
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+ torch._check(A.is_floating_point())
+
+ outlier_mask = None
+
+ absA = A.abs().view(-1, A.shape[-1])
+
+ if threshold > 0.0:
+ # Filter outliers from stats when enabled
+ outlier_mask = absA >= threshold
+ absA.masked_fill_(outlier_mask, 0.0)
+
+ # shape [cols]; unsqueeze(0) gives [1,cols]
+ col_stats = absA.amax(dim=0, keepdim=False).float()
+
+ return col_stats, outlier_mask
+
+
+@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
+def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
+ torch._check_is_size(blocksize)
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
+
+ n = A.numel()
+ blocks = -(n // -blocksize)
+ absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
+ out = torch.empty_like(A, dtype=torch.uint8)
+
+ with _cuda_device_of(A):
+ args = (
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int32(blocksize),
+ ct.c_int(A.numel()),
+ )
+
+ if A.dtype == torch.float16:
+ lib.cquantize_blockwise_fp16(*args)
+ elif A.dtype == torch.bfloat16:
+ lib.cquantize_blockwise_bf16(*args)
+ elif A.dtype == torch.float32:
+ lib.cquantize_blockwise_fp32(*args)
+ else:
+ raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+
+ return out, absmax
+
+
+@register_kernel("bitsandbytes::dequantize_blockwise", "cuda")
+def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
+ out = torch.empty_like(A, dtype=dtype)
+ _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::dequantize_blockwise.out", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
+ torch._check(out.shape == A.shape, lambda: f"Expected out.shape == {A.shape}, got {out.shape}")
+ _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
+
+
+def _dequantize_blockwise_impl(
+ A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
+) -> None:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
+ torch._check(
+ dtype in [torch.float16, torch.bfloat16, torch.float32],
+ lambda: f"Blockwise dequantization only supports 16bit/32bit floating types, got {dtype}",
+ )
+
+ with _cuda_device_of(A):
+ args = (
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(blocksize),
+ ct.c_int(A.numel()),
+ _get_tensor_stream(A),
+ )
+
+ if dtype == torch.float16:
+ lib.cdequantize_blockwise_fp16(*args)
+ elif dtype == torch.bfloat16:
+ lib.cdequantize_blockwise_bf16(*args)
+ elif dtype == torch.float32:
+ lib.cdequantize_blockwise_fp32(*args)
+
+
+@register_kernel("bitsandbytes::quantize_4bit", "cuda")
+def _(
+ A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
+) -> tuple[torch.Tensor, torch.Tensor]:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ torch._check(quant_type in ["fp4", "nf4"])
+ torch._check(
+ A.dtype in [torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"Blockwise 4bit quantization only supports 16/32-bit floats, but got {A.dtype}",
+ )
+
+ n = A.numel()
+ blocks = -(n // -blocksize)
+ absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
+ out = torch.empty(((n + 1) // (quant_storage.itemsize * 2), 1), device=A.device, dtype=quant_storage)
+
+ with _cuda_device_of(A):
+ args = (
+ None,
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int32(blocksize),
+ ct.c_int(n),
+ )
+
+ if A.dtype == torch.bfloat16:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_bf16_fp4(*args)
+ else:
+ lib.cquantize_blockwise_bf16_nf4(*args)
+ elif A.dtype == torch.float16:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_fp16_fp4(*args)
+ else:
+ lib.cquantize_blockwise_fp16_nf4(*args)
+ elif A.dtype == torch.float32:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_fp32_fp4(*args)
+ else:
+ lib.cquantize_blockwise_fp32_nf4(*args)
+
+ return out, absmax
+
+
+@register_kernel("bitsandbytes::dequantize_4bit", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ shape: Sequence[int],
+ dtype: torch.dtype,
+) -> torch.Tensor:
+ out = torch.empty(shape, dtype=dtype, device=A.device)
+ _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::dequantize_4bit.out", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ shape: Sequence[int],
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ torch._check(out.shape == shape, lambda: f"Expected out.shape == {shape}, got {out.shape}")
+ torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
+ _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+
+
+def _dequantize_4bit_impl(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ torch._check(quant_type in ["fp4", "nf4"])
+ torch._check(
+ dtype in [torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
+ )
+
+ with _cuda_device_of(A):
+ args = (
+ None,
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(blocksize),
+ ct.c_int(out.numel()),
+ _get_tensor_stream(A),
+ )
+
+ if out.dtype == torch.bfloat16:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_bf16_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_bf16_nf4(*args)
+ elif out.dtype == torch.float16:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_fp16_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_fp16_nf4(*args)
+ elif out.dtype == torch.float32:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_fp32_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_fp32_nf4(*args)
+
+
+@register_kernel("bitsandbytes::gemv_4bit", "cuda")
+def _(
+ A: torch.Tensor, B: torch.Tensor, shapeB: Sequence[int], absmax: torch.Tensor, code: torch.Tensor, blocksize: int
+) -> torch.Tensor:
+ shape = (*A.shape[:-1], shapeB[0])
+ out = torch.empty(shape, device=A.device, dtype=A.dtype)
+ _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::gemv_4bit.out", "cuda")
+def _(
+ A: torch.Tensor,
+ B: torch.Tensor,
+ shapeB: Sequence[int],
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ out: torch.Tensor,
+) -> None:
+ torch._check(
+ out.shape == (*A.shape[:-1], shapeB[0]),
+ lambda: f"Expected out.shape == {(*A.shape[:-1], shapeB[0])}, got {out.shape}",
+ )
+ torch._check(out.dtype == A.dtype, lambda: f"Expected out.dtype == {A.dtype}, got {out.dtype}")
+ _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
+
+
+def _gemv_4bit_impl(
+ A: torch.Tensor,
+ B: torch.Tensor,
+ shapeB: Sequence[int],
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ out: torch.Tensor,
+) -> None:
+ torch._check_is_size(blocksize)
+ torch._check(
+ A.numel() == A.size(-1),
+ lambda: f"A must be a vector with leading dimensions of 1, got {A.shape}",
+ )
+ torch._check(
+ A.dtype in [torch.float16, torch.bfloat16, torch.float32],
+ lambda: f"A must be float16, bfloat16, or float32, got {A.dtype}",
+ )
+ torch._check(
+ B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"B must be backed by storage of type uint8, bfloat16, float16, or float32, got {B.dtype}",
+ )
+ torch._check(absmax.dtype == torch.float32, lambda: f"absmax must be float32, got {absmax.dtype}")
+ torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
+
+ m = ct.c_int32(shapeB[0])
+ n = ct.c_int32(1)
+ k = ct.c_int32(shapeB[1])
+
+ lda = m
+ ldb = ct.c_int32((A.shape[-1] + 1) // 2)
+ ldc = m
+
+ stream = _get_tensor_stream(A)
+
+ with _cuda_device_of(A):
+ if A.dtype == torch.float16:
+ lib.cgemm_4bit_inference_naive_fp16(
+ m,
+ n,
+ k,
+ get_ptr(A),
+ get_ptr(B),
+ get_ptr(absmax),
+ get_ptr(code),
+ get_ptr(out),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
+ )
+ elif A.dtype == torch.bfloat16:
+ lib.cgemm_4bit_inference_naive_bf16(
+ m,
+ n,
+ k,
+ get_ptr(A),
+ get_ptr(B),
+ get_ptr(absmax),
+ get_ptr(code),
+ get_ptr(out),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
+ )
+ elif A.dtype == torch.float32:
+ lib.cgemm_4bit_inference_naive_fp32(
+ m,
+ n,
+ k,
+ get_ptr(A),
+ get_ptr(B),
+ get_ptr(absmax),
+ get_ptr(code),
+ get_ptr(out),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
+ )
From d0ed1077d910acc4cd6f3ec4c57cf597931ff20c Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 28 May 2025 12:14:34 +0530
Subject: [PATCH 36/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 1059 +++++++++++++++--------------
1 file changed, 538 insertions(+), 521 deletions(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index efdef2871..14878123a 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -1,521 +1,538 @@
-from collections.abc import Sequence
-import ctypes as ct
-from math import prod
-from typing import Optional
-
-import torch
-
-from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
-
-from ..._ops import register_kernel
-from ...cextension import lib
-
-
-@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
-def _(A: torch.Tensor, B: torch.Tensor):
- out = torch.empty((*A.shape[:-1], B.shape[0]), device=A.device, dtype=torch.int32)
- return _int8_linear_matmul_impl(A, B, out)
-
-
-@register_kernel("bitsandbytes::int8_linear_matmul.out", "cuda")
-def _(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
- _int8_linear_matmul_impl(A, B, out)
-
-
-def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
- A, B = B, A
-
- shapeA = A.shape
- shapeB = B.shape
-
- torch._check(A.dtype == torch.int8, lambda: "B must be int8")
- torch._check(B.dtype == torch.int8, lambda: "A must be int8")
- torch._check(A.ndim == 2, lambda: "Only two dimensional matrices are supported for argument B")
- torch._check(B.ndim in [2, 3], lambda: "Only two or three dimensional matrices are supported for argument A")
- torch._check(prod(shapeB) > 0, lambda: f"Input tensor dimensions need to be > 0: {shapeB}")
- torch._check(out.dtype == torch.int32)
-
- shapeC = (*shapeB[:-1], shapeA[0])
- torch._check(out.shape == shapeC, lambda: f"Output shape {out.shape} does not match expected shape {shapeC}")
-
- k, m = shapeA
- n = prod(shapeB[:-1])
- lda = shapeA[-1] # Weights (outputs, inputs)
- ldb = shapeB[-1] # Activations (batch, tokens, inputs)
- ldc = shapeC[-1] # Output (batch, tokens, outputs)
-
- torch._check(
- lda == ldb,
- lambda: f"int8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}",
- )
-
- # cuBLASLt does not support int8 matmul with inner dimensions that are not divisible by 4.
- # We'll fall back to a slower fp32 calculation in this circumstance.
- # Fortunately, this should not be very common.
- if lda % 4 != 0:
- result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
- return out.copy_(result)
-
- with _cuda_device_of(A):
- ctx = CUBLAS_Context.get_instance().get_context(A.device)
- ptrA = get_ptr(A)
- ptrB = get_ptr(B)
- ptrC = get_ptr(out)
- ptrRowScale = None
- m = ct.c_int32(m)
- n = ct.c_int32(n)
- k = ct.c_int32(k)
- lda = ct.c_int32(lda)
- ldb = ct.c_int32(ldb)
- ldc = ct.c_int32(ldc)
- stream = _get_tensor_stream(A)
-
- has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
-
- if has_error:
- if has_error == 100:
- # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
- # TODO: Warn and implement a fallback to fp32 compute?
- raise NotImplementedError("int8_linear_matmul not implemented!")
- else:
- raise RuntimeError(
- f"cublasLt ran into an error!\n\t{shapeA=}, {shapeB=}, {shapeC=}\n\t{(lda, ldb, ldc)=}\n\t{(m, n, k)=}"
- )
-
- return out
-
-
-@register_kernel("bitsandbytes::int8_mm_dequant", "cuda")
-def _(
- A: torch.Tensor,
- row_stats: torch.Tensor,
- col_stats: torch.Tensor,
- dtype: Optional[torch.dtype] = None,
- bias: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
- torch._check(A.dtype == torch.int32, lambda: f"A must be int32, got {A.dtype}")
- torch._check(row_stats.dtype == torch.float32, lambda: f"row_stats must be float32, got {row_stats.dtype}")
- torch._check(col_stats.dtype == torch.float32, lambda: f"col_stats must be float32, got {col_stats.dtype}")
-
- # Note: cuda kernel only currently supports fp16 output.
- # We'll later cast to desired dtype if needed.
- out = torch.empty_like(A, dtype=torch.float16)
-
- ptrA = get_ptr(A)
- ptrOut = get_ptr(out)
- ptrRowStats = get_ptr(row_stats)
- ptrColStats = get_ptr(col_stats)
- numRows = ct.c_int32(prod(A.shape[:-1]))
- numCols = ct.c_int32(A.shape[-1])
-
- # Note: fused bias in the kernel is only supported for fp16
- # TODO(matthewdouglas): Consider supporting bf16 fused bias
- ptrBias = get_ptr(bias) if bias is not None and bias.dtype == torch.float16 else None
-
- with _cuda_device_of(A):
- lib.cdequant_mm_int32_fp16(
- ptrA, ptrRowStats, ptrColStats, ptrOut, ptrBias, numRows, numCols, _get_tensor_stream(A)
- )
-
- # Add bias separately if not fused in kernel
- if bias is not None and bias.dtype != torch.float16:
- out.add_(bias)
-
- return out.to(dtype or torch.float16)
-
-
-@register_kernel("bitsandbytes::int8_vectorwise_quant", "cuda")
-def _(A: torch.Tensor, threshold=0.0):
- torch._check(A.dtype == torch.float16, lambda: f"A must be float16, got {A.dtype}")
- torch._check(threshold >= 0.0, lambda: "threshold must be non-negative")
-
- rows = prod(A.shape[:-1])
- cols = A.shape[-1]
-
- row_stats = torch.empty(rows, device=A.device, dtype=torch.float32)
- out_row = torch.empty(A.shape, device=A.device, dtype=torch.int8)
-
- outlier_cols = None
-
- if threshold > 0.0:
- # TODO we could improve perf of this
- outliers = A.abs() >= threshold
-
- if outliers.any():
- outlier_cols = torch.argwhere(outliers.any(dim=0)).view(-1)
- else:
- # Needed for torch.compile support.
- outlier_cols = torch.empty(0, device=A.device, dtype=torch.int64)
-
- with _cuda_device_of(A):
- lib.cint8_vector_quant(
- get_ptr(A),
- get_ptr(out_row),
- get_ptr(row_stats),
- ct.c_float(threshold),
- ct.c_int32(rows),
- ct.c_int32(cols),
- _get_tensor_stream(A),
- )
-
- # Zero out values from outlier columns across all rows.
- # The kernel will handle this for outliers themselves, so we can optimize for rows=1.
- if rows > 1 and outlier_cols is not None:
- out_row[:, outlier_cols] = 0
-
- return out_row, row_stats, outlier_cols
-
-
-@register_kernel("bitsandbytes::int8_double_quant", "cuda")
-def _(
- A: torch.Tensor,
- threshold=0.0,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
- # Use CUDA kernel for rowwise and COO tensor
- quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant.default(
- A,
- threshold=threshold,
- )
-
- # PyTorch impl for colwise
- col_stats, outlier_mask = _get_col_absmax(A, threshold=threshold)
- if threshold > 0.0 and outlier_mask is not None:
- A = A.masked_fill(outlier_mask, 0.0)
- quant_col = torch.round(A.mul(127.0) / col_stats.unsqueeze(0)).to(torch.int8)
-
- return quant_row, quant_col, row_stats, col_stats.flatten().float(), outlier_cols
-
-
-def _get_col_absmax(
- A: torch.Tensor,
- threshold=0.0,
-) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
- torch._check(A.is_floating_point())
-
- outlier_mask = None
-
- absA = A.abs().view(-1, A.shape[-1])
-
- if threshold > 0.0:
- # Filter outliers from stats when enabled
- outlier_mask = absA >= threshold
- absA.masked_fill_(outlier_mask, 0.0)
-
- # shape [cols]; unsqueeze(0) gives [1,cols]
- col_stats = absA.amax(dim=0, keepdim=False).float()
-
- return col_stats, outlier_mask
-
-
-@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
-def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
- torch._check_is_size(blocksize)
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
-
- n = A.numel()
- blocks = -(n // -blocksize)
- absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
- out = torch.empty_like(A, dtype=torch.uint8)
-
- with _cuda_device_of(A):
- args = (
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int32(blocksize),
- ct.c_int(A.numel()),
- )
-
- if A.dtype == torch.float16:
- lib.cquantize_blockwise_fp16(*args)
- elif A.dtype == torch.bfloat16:
- lib.cquantize_blockwise_bf16(*args)
- elif A.dtype == torch.float32:
- lib.cquantize_blockwise_fp32(*args)
- else:
- raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-
- return out, absmax
-
-
-@register_kernel("bitsandbytes::dequantize_blockwise", "cuda")
-def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
- out = torch.empty_like(A, dtype=dtype)
- _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::dequantize_blockwise.out", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
- torch._check(out.shape == A.shape, lambda: f"Expected out.shape == {A.shape}, got {out.shape}")
- _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
-
-
-def _dequantize_blockwise_impl(
- A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
-) -> None:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
- torch._check(
- dtype in [torch.float16, torch.bfloat16, torch.float32],
- lambda: f"Blockwise dequantization only supports 16bit/32bit floating types, got {dtype}",
- )
-
- with _cuda_device_of(A):
- args = (
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int(blocksize),
- ct.c_int(A.numel()),
- _get_tensor_stream(A),
- )
-
- if dtype == torch.float16:
- lib.cdequantize_blockwise_fp16(*args)
- elif dtype == torch.bfloat16:
- lib.cdequantize_blockwise_bf16(*args)
- elif dtype == torch.float32:
- lib.cdequantize_blockwise_fp32(*args)
-
-
-@register_kernel("bitsandbytes::quantize_4bit", "cuda")
-def _(
- A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
-) -> tuple[torch.Tensor, torch.Tensor]:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- torch._check(quant_type in ["fp4", "nf4"])
- torch._check(
- A.dtype in [torch.bfloat16, torch.float16, torch.float32],
- lambda: f"Blockwise 4bit quantization only supports 16/32-bit floats, but got {A.dtype}",
- )
-
- n = A.numel()
- blocks = -(n // -blocksize)
- absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
- out = torch.empty(((n + 1) // (quant_storage.itemsize * 2), 1), device=A.device, dtype=quant_storage)
-
- with _cuda_device_of(A):
- args = (
- None,
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int32(blocksize),
- ct.c_int(n),
- )
-
- if A.dtype == torch.bfloat16:
- if quant_type == "fp4":
- lib.cquantize_blockwise_bf16_fp4(*args)
- else:
- lib.cquantize_blockwise_bf16_nf4(*args)
- elif A.dtype == torch.float16:
- if quant_type == "fp4":
- lib.cquantize_blockwise_fp16_fp4(*args)
- else:
- lib.cquantize_blockwise_fp16_nf4(*args)
- elif A.dtype == torch.float32:
- if quant_type == "fp4":
- lib.cquantize_blockwise_fp32_fp4(*args)
- else:
- lib.cquantize_blockwise_fp32_nf4(*args)
-
- return out, absmax
-
-
-@register_kernel("bitsandbytes::dequantize_4bit", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- shape: Sequence[int],
- dtype: torch.dtype,
-) -> torch.Tensor:
- out = torch.empty(shape, dtype=dtype, device=A.device)
- _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::dequantize_4bit.out", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- shape: Sequence[int],
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- torch._check(out.shape == shape, lambda: f"Expected out.shape == {shape}, got {out.shape}")
- torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
- _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
-
-
-def _dequantize_4bit_impl(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- torch._check(quant_type in ["fp4", "nf4"])
- torch._check(
- dtype in [torch.bfloat16, torch.float16, torch.float32],
- lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
- )
-
- with _cuda_device_of(A):
- args = (
- None,
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int(blocksize),
- ct.c_int(out.numel()),
- _get_tensor_stream(A),
- )
-
- if out.dtype == torch.bfloat16:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_bf16_fp4(*args)
- else:
- lib.cdequantize_blockwise_bf16_nf4(*args)
- elif out.dtype == torch.float16:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_fp16_fp4(*args)
- else:
- lib.cdequantize_blockwise_fp16_nf4(*args)
- elif out.dtype == torch.float32:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_fp32_fp4(*args)
- else:
- lib.cdequantize_blockwise_fp32_nf4(*args)
-
-
-@register_kernel("bitsandbytes::gemv_4bit", "cuda")
-def _(
- A: torch.Tensor, B: torch.Tensor, shapeB: Sequence[int], absmax: torch.Tensor, code: torch.Tensor, blocksize: int
-) -> torch.Tensor:
- shape = (*A.shape[:-1], shapeB[0])
- out = torch.empty(shape, device=A.device, dtype=A.dtype)
- _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::gemv_4bit.out", "cuda")
-def _(
- A: torch.Tensor,
- B: torch.Tensor,
- shapeB: Sequence[int],
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- out: torch.Tensor,
-) -> None:
- torch._check(
- out.shape == (*A.shape[:-1], shapeB[0]),
- lambda: f"Expected out.shape == {(*A.shape[:-1], shapeB[0])}, got {out.shape}",
- )
- torch._check(out.dtype == A.dtype, lambda: f"Expected out.dtype == {A.dtype}, got {out.dtype}")
- _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
-
-
-def _gemv_4bit_impl(
- A: torch.Tensor,
- B: torch.Tensor,
- shapeB: Sequence[int],
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- out: torch.Tensor,
-) -> None:
- torch._check_is_size(blocksize)
- torch._check(
- A.numel() == A.size(-1),
- lambda: f"A must be a vector with leading dimensions of 1, got {A.shape}",
- )
- torch._check(
- A.dtype in [torch.float16, torch.bfloat16, torch.float32],
- lambda: f"A must be float16, bfloat16, or float32, got {A.dtype}",
- )
- torch._check(
- B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32],
- lambda: f"B must be backed by storage of type uint8, bfloat16, float16, or float32, got {B.dtype}",
- )
- torch._check(absmax.dtype == torch.float32, lambda: f"absmax must be float32, got {absmax.dtype}")
- torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
-
- m = ct.c_int32(shapeB[0])
- n = ct.c_int32(1)
- k = ct.c_int32(shapeB[1])
-
- lda = m
- ldb = ct.c_int32((A.shape[-1] + 1) // 2)
- ldc = m
-
- stream = _get_tensor_stream(A)
-
- with _cuda_device_of(A):
- if A.dtype == torch.float16:
- lib.cgemm_4bit_inference_naive_fp16(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
- elif A.dtype == torch.bfloat16:
- lib.cgemm_4bit_inference_naive_bf16(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
- elif A.dtype == torch.float32:
- lib.cgemm_4bit_inference_naive_fp32(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
+from collections.abc import Sequence
+import ctypes as ct
+from math import prod
+from typing import Optional
+
+import torch
+
+from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
+
+from ..._ops import register_kernel
+from ...cextension import lib, HIP_ENVIRONMENT
+
+
+@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
+def _(A: torch.Tensor, B: torch.Tensor):
+ out = torch.empty((*A.shape[:-1], B.shape[0]), device=A.device, dtype=torch.int32)
+ return _int8_linear_matmul_impl(A, B, out)
+
+
+@register_kernel("bitsandbytes::int8_linear_matmul.out", "cuda")
+def _(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
+ _int8_linear_matmul_impl(A, B, out)
+
+
+def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
+ A, B = B, A
+
+ shapeA = A.shape
+ shapeB = B.shape
+
+ torch._check(A.dtype == torch.int8, lambda: "B must be int8")
+ torch._check(B.dtype == torch.int8, lambda: "A must be int8")
+ torch._check(A.ndim == 2, lambda: "Only two dimensional matrices are supported for argument B")
+ torch._check(B.ndim in [2, 3], lambda: "Only two or three dimensional matrices are supported for argument A")
+ torch._check(prod(shapeB) > 0, lambda: f"Input tensor dimensions need to be > 0: {shapeB}")
+ torch._check(out.dtype == torch.int32)
+
+ shapeC = (*shapeB[:-1], shapeA[0])
+ torch._check(out.shape == shapeC, lambda: f"Output shape {out.shape} does not match expected shape {shapeC}")
+
+ k, m = shapeA
+ n = prod(shapeB[:-1])
+ lda = shapeA[-1] # Weights (outputs, inputs)
+ ldb = shapeB[-1] # Activations (batch, tokens, inputs)
+ ldc = shapeC[-1] # Output (batch, tokens, outputs)
+
+ torch._check(
+ lda == ldb,
+ lambda: f"int8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}",
+ )
+
+ # cuBLASLt does not support int8 matmul with inner dimensions that are not divisible by 4.
+ # We'll fall back to a slower fp32 calculation in this circumstance.
+ # Fortunately, this should not be very common.
+ if lda % 4 != 0:
+ result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
+ return out.copy_(result)
+
+ with _cuda_device_of(A):
+ ctx = CUBLAS_Context.get_instance().get_context(A.device)
+ ptrA = get_ptr(A)
+ ptrB = get_ptr(B)
+ ptrC = get_ptr(out)
+ ptrRowScale = None
+ m = ct.c_int32(m)
+ n = ct.c_int32(n)
+ k = ct.c_int32(k)
+ lda = ct.c_int32(lda)
+ ldb = ct.c_int32(ldb)
+ ldc = ct.c_int32(ldc)
+ stream = _get_tensor_stream(A)
+
+ has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
+
+ if has_error:
+ if has_error == 100:
+ # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
+ # TODO: Warn and implement a fallback to fp32 compute?
+ raise NotImplementedError("int8_linear_matmul not implemented!")
+ else:
+ raise RuntimeError(
+ f"cublasLt ran into an error!\n\t{shapeA=}, {shapeB=}, {shapeC=}\n\t{(lda, ldb, ldc)=}\n\t{(m, n, k)=}"
+ )
+
+ return out
+
+
+@register_kernel("bitsandbytes::int8_mm_dequant", "cuda")
+def _(
+ A: torch.Tensor,
+ row_stats: torch.Tensor,
+ col_stats: torch.Tensor,
+ dtype: Optional[torch.dtype] = None,
+ bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+ torch._check(A.dtype == torch.int32, lambda: f"A must be int32, got {A.dtype}")
+ torch._check(row_stats.dtype == torch.float32, lambda: f"row_stats must be float32, got {row_stats.dtype}")
+ torch._check(col_stats.dtype == torch.float32, lambda: f"col_stats must be float32, got {col_stats.dtype}")
+
+ # Note: cuda kernel only currently supports fp16 output.
+ # We'll later cast to desired dtype if needed.
+ out = torch.empty_like(A, dtype=torch.float16)
+
+ ptrA = get_ptr(A)
+ ptrOut = get_ptr(out)
+ ptrRowStats = get_ptr(row_stats)
+ ptrColStats = get_ptr(col_stats)
+ numRows = ct.c_int32(prod(A.shape[:-1]))
+ numCols = ct.c_int32(A.shape[-1])
+
+ # Note: fused bias in the kernel is only supported for fp16
+ # TODO(matthewdouglas): Consider supporting bf16 fused bias
+ ptrBias = get_ptr(bias) if bias is not None and bias.dtype == torch.float16 else None
+
+ with _cuda_device_of(A):
+ lib.cdequant_mm_int32_fp16(
+ ptrA, ptrRowStats, ptrColStats, ptrOut, ptrBias, numRows, numCols, _get_tensor_stream(A)
+ )
+
+ # Add bias separately if not fused in kernel
+ if bias is not None and bias.dtype != torch.float16:
+ out.add_(bias)
+
+ return out.to(dtype or torch.float16)
+
+
+@register_kernel("bitsandbytes::int8_vectorwise_quant", "cuda")
+def _(A: torch.Tensor, threshold=0.0):
+ torch._check(A.dtype == torch.float16, lambda: f"A must be float16, got {A.dtype}")
+ torch._check(threshold >= 0.0, lambda: "threshold must be non-negative")
+
+ rows = prod(A.shape[:-1])
+ cols = A.shape[-1]
+
+ row_stats = torch.empty(rows, device=A.device, dtype=torch.float32)
+ out_row = torch.empty(A.shape, device=A.device, dtype=torch.int8)
+
+ outlier_cols = None
+
+ if threshold > 0.0:
+ # TODO we could improve perf of this
+ outliers = A.abs() >= threshold
+
+ if outliers.any():
+ outlier_cols = torch.argwhere(outliers.any(dim=0)).view(-1)
+ else:
+ # Needed for torch.compile support.
+ outlier_cols = torch.empty(0, device=A.device, dtype=torch.int64)
+
+ with _cuda_device_of(A):
+ lib.cint8_vector_quant(
+ get_ptr(A),
+ get_ptr(out_row),
+ get_ptr(row_stats),
+ ct.c_float(threshold),
+ ct.c_int32(rows),
+ ct.c_int32(cols),
+ _get_tensor_stream(A),
+ )
+
+ # Zero out values from outlier columns across all rows.
+ # The kernel will handle this for outliers themselves, so we can optimize for rows=1.
+ if rows > 1 and outlier_cols is not None:
+ out_row[:, outlier_cols] = 0
+
+ return out_row, row_stats, outlier_cols
+
+
+@register_kernel("bitsandbytes::int8_double_quant", "cuda")
+def _(
+ A: torch.Tensor,
+ threshold=0.0,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+ # Use CUDA kernel for rowwise and COO tensor
+ quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant.default(
+ A,
+ threshold=threshold,
+ )
+
+ # PyTorch impl for colwise
+ col_stats, outlier_mask = _get_col_absmax(A, threshold=threshold)
+ if threshold > 0.0 and outlier_mask is not None:
+ A = A.masked_fill(outlier_mask, 0.0)
+ quant_col = torch.round(A.mul(127.0) / col_stats.unsqueeze(0)).to(torch.int8)
+
+ return quant_row, quant_col, row_stats, col_stats.flatten().float(), outlier_cols
+
+
+def _get_col_absmax(
+ A: torch.Tensor,
+ threshold=0.0,
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+ torch._check(A.is_floating_point())
+
+ outlier_mask = None
+
+ absA = A.abs().view(-1, A.shape[-1])
+
+ if threshold > 0.0:
+ # Filter outliers from stats when enabled
+ outlier_mask = absA >= threshold
+ absA.masked_fill_(outlier_mask, 0.0)
+
+ # shape [cols]; unsqueeze(0) gives [1,cols]
+ col_stats = absA.amax(dim=0, keepdim=False).float()
+
+ return col_stats, outlier_mask
+
+
+@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
+def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
+ torch._check_is_size(blocksize)
+
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
+ torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
+
+ n = A.numel()
+ blocks = -(n // -blocksize)
+ absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
+ out = torch.empty_like(A, dtype=torch.uint8)
+
+ with _cuda_device_of(A):
+ args = (
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int32(blocksize),
+ ct.c_int(A.numel()),
+ )
+
+ if A.dtype == torch.float16:
+ lib.cquantize_blockwise_fp16(*args)
+ elif A.dtype == torch.bfloat16:
+ lib.cquantize_blockwise_bf16(*args)
+ elif A.dtype == torch.float32:
+ lib.cquantize_blockwise_fp32(*args)
+ else:
+ raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+
+ return out, absmax
+
+
+@register_kernel("bitsandbytes::dequantize_blockwise", "cuda")
+def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
+ out = torch.empty_like(A, dtype=dtype)
+ _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::dequantize_blockwise.out", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
+ torch._check(out.shape == A.shape, lambda: f"Expected out.shape == {A.shape}, got {out.shape}")
+ _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
+
+
+def _dequantize_blockwise_impl(
+ A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
+) -> None:
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
+ torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
+ torch._check(
+ dtype in [torch.float16, torch.bfloat16, torch.float32],
+ lambda: f"Blockwise dequantization only supports 16bit/32bit floating types, got {dtype}",
+ )
+
+ with _cuda_device_of(A):
+ args = (
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(blocksize),
+ ct.c_int(A.numel()),
+ _get_tensor_stream(A),
+ )
+
+ if dtype == torch.float16:
+ lib.cdequantize_blockwise_fp16(*args)
+ elif dtype == torch.bfloat16:
+ lib.cdequantize_blockwise_bf16(*args)
+ elif dtype == torch.float32:
+ lib.cdequantize_blockwise_fp32(*args)
+
+
+@register_kernel("bitsandbytes::quantize_4bit", "cuda")
+def _(
+ A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
+) -> tuple[torch.Tensor, torch.Tensor]:
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
+ torch._check(quant_type in ["fp4", "nf4"])
+ torch._check(
+ A.dtype in [torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"Blockwise 4bit quantization only supports 16/32-bit floats, but got {A.dtype}",
+ )
+
+ n = A.numel()
+ blocks = -(n // -blocksize)
+ absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
+ out = torch.empty(((n + 1) // (quant_storage.itemsize * 2), 1), device=A.device, dtype=quant_storage)
+
+ with _cuda_device_of(A):
+ args = (
+ None,
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int32(blocksize),
+ ct.c_int(n),
+ )
+
+ if A.dtype == torch.bfloat16:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_bf16_fp4(*args)
+ else:
+ lib.cquantize_blockwise_bf16_nf4(*args)
+ elif A.dtype == torch.float16:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_fp16_fp4(*args)
+ else:
+ lib.cquantize_blockwise_fp16_nf4(*args)
+ elif A.dtype == torch.float32:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_fp32_fp4(*args)
+ else:
+ lib.cquantize_blockwise_fp32_nf4(*args)
+
+ return out, absmax
+
+
+@register_kernel("bitsandbytes::dequantize_4bit", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ shape: Sequence[int],
+ dtype: torch.dtype,
+) -> torch.Tensor:
+ out = torch.empty(shape, dtype=dtype, device=A.device)
+ _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::dequantize_4bit.out", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ shape: Sequence[int],
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ torch._check(out.shape == shape, lambda: f"Expected out.shape == {shape}, got {out.shape}")
+ torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
+ _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+
+
+def _dequantize_4bit_impl(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
+ torch._check(quant_type in ["fp4", "nf4"])
+ torch._check(
+ dtype in [torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
+ )
+
+ with _cuda_device_of(A):
+ args = (
+ None,
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(blocksize),
+ ct.c_int(out.numel()),
+ _get_tensor_stream(A),
+ )
+
+ if out.dtype == torch.bfloat16:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_bf16_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_bf16_nf4(*args)
+ elif out.dtype == torch.float16:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_fp16_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_fp16_nf4(*args)
+ elif out.dtype == torch.float32:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_fp32_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_fp32_nf4(*args)
+
+
+@register_kernel("bitsandbytes::gemv_4bit", "cuda")
+def _(
+ A: torch.Tensor, B: torch.Tensor, shapeB: Sequence[int], absmax: torch.Tensor, code: torch.Tensor, blocksize: int
+) -> torch.Tensor:
+ shape = (*A.shape[:-1], shapeB[0])
+ out = torch.empty(shape, device=A.device, dtype=A.dtype)
+ _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::gemv_4bit.out", "cuda")
+def _(
+ A: torch.Tensor,
+ B: torch.Tensor,
+ shapeB: Sequence[int],
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ out: torch.Tensor,
+) -> None:
+ torch._check(
+ out.shape == (*A.shape[:-1], shapeB[0]),
+ lambda: f"Expected out.shape == {(*A.shape[:-1], shapeB[0])}, got {out.shape}",
+ )
+ torch._check(out.dtype == A.dtype, lambda: f"Expected out.dtype == {A.dtype}, got {out.dtype}")
+ _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
+
+
+def _gemv_4bit_impl(
+ A: torch.Tensor,
+ B: torch.Tensor,
+ shapeB: Sequence[int],
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ out: torch.Tensor,
+) -> None:
+ torch._check_is_size(blocksize)
+ torch._check(
+ A.numel() == A.size(-1),
+ lambda: f"A must be a vector with leading dimensions of 1, got {A.shape}",
+ )
+ torch._check(
+ A.dtype in [torch.float16, torch.bfloat16, torch.float32],
+ lambda: f"A must be float16, bfloat16, or float32, got {A.dtype}",
+ )
+ torch._check(
+ B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"B must be backed by storage of type uint8, bfloat16, float16, or float32, got {B.dtype}",
+ )
+ torch._check(absmax.dtype == torch.float32, lambda: f"absmax must be float32, got {absmax.dtype}")
+ torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
+
+ m = ct.c_int32(shapeB[0])
+ n = ct.c_int32(1)
+ k = ct.c_int32(shapeB[1])
+
+ lda = m
+ ldb = ct.c_int32((A.shape[-1] + 1) // 2)
+ ldc = m
+
+ stream = _get_tensor_stream(A)
+
+ with _cuda_device_of(A):
+ if A.dtype == torch.float16:
+ lib.cgemm_4bit_inference_naive_fp16(
+ m,
+ n,
+ k,
+ get_ptr(A),
+ get_ptr(B),
+ get_ptr(absmax),
+ get_ptr(code),
+ get_ptr(out),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
+ )
+ elif A.dtype == torch.bfloat16:
+ lib.cgemm_4bit_inference_naive_bf16(
+ m,
+ n,
+ k,
+ get_ptr(A),
+ get_ptr(B),
+ get_ptr(absmax),
+ get_ptr(code),
+ get_ptr(out),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
+ )
+ elif A.dtype == torch.float32:
+ lib.cgemm_4bit_inference_naive_fp32(
+ m,
+ n,
+ k,
+ get_ptr(A),
+ get_ptr(B),
+ get_ptr(absmax),
+ get_ptr(code),
+ get_ptr(out),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
+ )
From af3aaf6a5d5ee90d713fbba875ab3cbd5137c619 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 28 May 2025 12:17:20 +0530
Subject: [PATCH 37/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index 14878123a..aa7c82f09 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -536,3 +536,4 @@ def _gemv_4bit_impl(
ct.c_int32(blocksize),
stream,
)
+
From d1e34a5dfe80aa95c42de7187800468d7a9e1b8a Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 28 May 2025 12:18:53 +0530
Subject: [PATCH 38/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 1058 ++++++++++++++---------------
1 file changed, 520 insertions(+), 538 deletions(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index aa7c82f09..efdef2871 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -1,539 +1,521 @@
-from collections.abc import Sequence
-import ctypes as ct
-from math import prod
-from typing import Optional
-
-import torch
-
-from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
-
-from ..._ops import register_kernel
-from ...cextension import lib, HIP_ENVIRONMENT
-
-
-@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
-def _(A: torch.Tensor, B: torch.Tensor):
- out = torch.empty((*A.shape[:-1], B.shape[0]), device=A.device, dtype=torch.int32)
- return _int8_linear_matmul_impl(A, B, out)
-
-
-@register_kernel("bitsandbytes::int8_linear_matmul.out", "cuda")
-def _(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
- _int8_linear_matmul_impl(A, B, out)
-
-
-def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
- A, B = B, A
-
- shapeA = A.shape
- shapeB = B.shape
-
- torch._check(A.dtype == torch.int8, lambda: "B must be int8")
- torch._check(B.dtype == torch.int8, lambda: "A must be int8")
- torch._check(A.ndim == 2, lambda: "Only two dimensional matrices are supported for argument B")
- torch._check(B.ndim in [2, 3], lambda: "Only two or three dimensional matrices are supported for argument A")
- torch._check(prod(shapeB) > 0, lambda: f"Input tensor dimensions need to be > 0: {shapeB}")
- torch._check(out.dtype == torch.int32)
-
- shapeC = (*shapeB[:-1], shapeA[0])
- torch._check(out.shape == shapeC, lambda: f"Output shape {out.shape} does not match expected shape {shapeC}")
-
- k, m = shapeA
- n = prod(shapeB[:-1])
- lda = shapeA[-1] # Weights (outputs, inputs)
- ldb = shapeB[-1] # Activations (batch, tokens, inputs)
- ldc = shapeC[-1] # Output (batch, tokens, outputs)
-
- torch._check(
- lda == ldb,
- lambda: f"int8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}",
- )
-
- # cuBLASLt does not support int8 matmul with inner dimensions that are not divisible by 4.
- # We'll fall back to a slower fp32 calculation in this circumstance.
- # Fortunately, this should not be very common.
- if lda % 4 != 0:
- result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
- return out.copy_(result)
-
- with _cuda_device_of(A):
- ctx = CUBLAS_Context.get_instance().get_context(A.device)
- ptrA = get_ptr(A)
- ptrB = get_ptr(B)
- ptrC = get_ptr(out)
- ptrRowScale = None
- m = ct.c_int32(m)
- n = ct.c_int32(n)
- k = ct.c_int32(k)
- lda = ct.c_int32(lda)
- ldb = ct.c_int32(ldb)
- ldc = ct.c_int32(ldc)
- stream = _get_tensor_stream(A)
-
- has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
-
- if has_error:
- if has_error == 100:
- # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
- # TODO: Warn and implement a fallback to fp32 compute?
- raise NotImplementedError("int8_linear_matmul not implemented!")
- else:
- raise RuntimeError(
- f"cublasLt ran into an error!\n\t{shapeA=}, {shapeB=}, {shapeC=}\n\t{(lda, ldb, ldc)=}\n\t{(m, n, k)=}"
- )
-
- return out
-
-
-@register_kernel("bitsandbytes::int8_mm_dequant", "cuda")
-def _(
- A: torch.Tensor,
- row_stats: torch.Tensor,
- col_stats: torch.Tensor,
- dtype: Optional[torch.dtype] = None,
- bias: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
- torch._check(A.dtype == torch.int32, lambda: f"A must be int32, got {A.dtype}")
- torch._check(row_stats.dtype == torch.float32, lambda: f"row_stats must be float32, got {row_stats.dtype}")
- torch._check(col_stats.dtype == torch.float32, lambda: f"col_stats must be float32, got {col_stats.dtype}")
-
- # Note: cuda kernel only currently supports fp16 output.
- # We'll later cast to desired dtype if needed.
- out = torch.empty_like(A, dtype=torch.float16)
-
- ptrA = get_ptr(A)
- ptrOut = get_ptr(out)
- ptrRowStats = get_ptr(row_stats)
- ptrColStats = get_ptr(col_stats)
- numRows = ct.c_int32(prod(A.shape[:-1]))
- numCols = ct.c_int32(A.shape[-1])
-
- # Note: fused bias in the kernel is only supported for fp16
- # TODO(matthewdouglas): Consider supporting bf16 fused bias
- ptrBias = get_ptr(bias) if bias is not None and bias.dtype == torch.float16 else None
-
- with _cuda_device_of(A):
- lib.cdequant_mm_int32_fp16(
- ptrA, ptrRowStats, ptrColStats, ptrOut, ptrBias, numRows, numCols, _get_tensor_stream(A)
- )
-
- # Add bias separately if not fused in kernel
- if bias is not None and bias.dtype != torch.float16:
- out.add_(bias)
-
- return out.to(dtype or torch.float16)
-
-
-@register_kernel("bitsandbytes::int8_vectorwise_quant", "cuda")
-def _(A: torch.Tensor, threshold=0.0):
- torch._check(A.dtype == torch.float16, lambda: f"A must be float16, got {A.dtype}")
- torch._check(threshold >= 0.0, lambda: "threshold must be non-negative")
-
- rows = prod(A.shape[:-1])
- cols = A.shape[-1]
-
- row_stats = torch.empty(rows, device=A.device, dtype=torch.float32)
- out_row = torch.empty(A.shape, device=A.device, dtype=torch.int8)
-
- outlier_cols = None
-
- if threshold > 0.0:
- # TODO we could improve perf of this
- outliers = A.abs() >= threshold
-
- if outliers.any():
- outlier_cols = torch.argwhere(outliers.any(dim=0)).view(-1)
- else:
- # Needed for torch.compile support.
- outlier_cols = torch.empty(0, device=A.device, dtype=torch.int64)
-
- with _cuda_device_of(A):
- lib.cint8_vector_quant(
- get_ptr(A),
- get_ptr(out_row),
- get_ptr(row_stats),
- ct.c_float(threshold),
- ct.c_int32(rows),
- ct.c_int32(cols),
- _get_tensor_stream(A),
- )
-
- # Zero out values from outlier columns across all rows.
- # The kernel will handle this for outliers themselves, so we can optimize for rows=1.
- if rows > 1 and outlier_cols is not None:
- out_row[:, outlier_cols] = 0
-
- return out_row, row_stats, outlier_cols
-
-
-@register_kernel("bitsandbytes::int8_double_quant", "cuda")
-def _(
- A: torch.Tensor,
- threshold=0.0,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
- # Use CUDA kernel for rowwise and COO tensor
- quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant.default(
- A,
- threshold=threshold,
- )
-
- # PyTorch impl for colwise
- col_stats, outlier_mask = _get_col_absmax(A, threshold=threshold)
- if threshold > 0.0 and outlier_mask is not None:
- A = A.masked_fill(outlier_mask, 0.0)
- quant_col = torch.round(A.mul(127.0) / col_stats.unsqueeze(0)).to(torch.int8)
-
- return quant_row, quant_col, row_stats, col_stats.flatten().float(), outlier_cols
-
-
-def _get_col_absmax(
- A: torch.Tensor,
- threshold=0.0,
-) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
- torch._check(A.is_floating_point())
-
- outlier_mask = None
-
- absA = A.abs().view(-1, A.shape[-1])
-
- if threshold > 0.0:
- # Filter outliers from stats when enabled
- outlier_mask = absA >= threshold
- absA.masked_fill_(outlier_mask, 0.0)
-
- # shape [cols]; unsqueeze(0) gives [1,cols]
- col_stats = absA.amax(dim=0, keepdim=False).float()
-
- return col_stats, outlier_mask
-
-
-@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
-def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
- torch._check_is_size(blocksize)
-
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
- torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
-
- n = A.numel()
- blocks = -(n // -blocksize)
- absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
- out = torch.empty_like(A, dtype=torch.uint8)
-
- with _cuda_device_of(A):
- args = (
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int32(blocksize),
- ct.c_int(A.numel()),
- )
-
- if A.dtype == torch.float16:
- lib.cquantize_blockwise_fp16(*args)
- elif A.dtype == torch.bfloat16:
- lib.cquantize_blockwise_bf16(*args)
- elif A.dtype == torch.float32:
- lib.cquantize_blockwise_fp32(*args)
- else:
- raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-
- return out, absmax
-
-
-@register_kernel("bitsandbytes::dequantize_blockwise", "cuda")
-def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
- out = torch.empty_like(A, dtype=dtype)
- _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::dequantize_blockwise.out", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
- torch._check(out.shape == A.shape, lambda: f"Expected out.shape == {A.shape}, got {out.shape}")
- _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
-
-
-def _dequantize_blockwise_impl(
- A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
-) -> None:
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
- torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
- torch._check(
- dtype in [torch.float16, torch.bfloat16, torch.float32],
- lambda: f"Blockwise dequantization only supports 16bit/32bit floating types, got {dtype}",
- )
-
- with _cuda_device_of(A):
- args = (
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int(blocksize),
- ct.c_int(A.numel()),
- _get_tensor_stream(A),
- )
-
- if dtype == torch.float16:
- lib.cdequantize_blockwise_fp16(*args)
- elif dtype == torch.bfloat16:
- lib.cdequantize_blockwise_bf16(*args)
- elif dtype == torch.float32:
- lib.cdequantize_blockwise_fp32(*args)
-
-
-@register_kernel("bitsandbytes::quantize_4bit", "cuda")
-def _(
- A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
-) -> tuple[torch.Tensor, torch.Tensor]:
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
- torch._check(quant_type in ["fp4", "nf4"])
- torch._check(
- A.dtype in [torch.bfloat16, torch.float16, torch.float32],
- lambda: f"Blockwise 4bit quantization only supports 16/32-bit floats, but got {A.dtype}",
- )
-
- n = A.numel()
- blocks = -(n // -blocksize)
- absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
- out = torch.empty(((n + 1) // (quant_storage.itemsize * 2), 1), device=A.device, dtype=quant_storage)
-
- with _cuda_device_of(A):
- args = (
- None,
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int32(blocksize),
- ct.c_int(n),
- )
-
- if A.dtype == torch.bfloat16:
- if quant_type == "fp4":
- lib.cquantize_blockwise_bf16_fp4(*args)
- else:
- lib.cquantize_blockwise_bf16_nf4(*args)
- elif A.dtype == torch.float16:
- if quant_type == "fp4":
- lib.cquantize_blockwise_fp16_fp4(*args)
- else:
- lib.cquantize_blockwise_fp16_nf4(*args)
- elif A.dtype == torch.float32:
- if quant_type == "fp4":
- lib.cquantize_blockwise_fp32_fp4(*args)
- else:
- lib.cquantize_blockwise_fp32_nf4(*args)
-
- return out, absmax
-
-
-@register_kernel("bitsandbytes::dequantize_4bit", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- shape: Sequence[int],
- dtype: torch.dtype,
-) -> torch.Tensor:
- out = torch.empty(shape, dtype=dtype, device=A.device)
- _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::dequantize_4bit.out", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- shape: Sequence[int],
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- torch._check(out.shape == shape, lambda: f"Expected out.shape == {shape}, got {out.shape}")
- torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
- _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
-
-
-def _dequantize_4bit_impl(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
- torch._check(quant_type in ["fp4", "nf4"])
- torch._check(
- dtype in [torch.bfloat16, torch.float16, torch.float32],
- lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
- )
-
- with _cuda_device_of(A):
- args = (
- None,
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int(blocksize),
- ct.c_int(out.numel()),
- _get_tensor_stream(A),
- )
-
- if out.dtype == torch.bfloat16:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_bf16_fp4(*args)
- else:
- lib.cdequantize_blockwise_bf16_nf4(*args)
- elif out.dtype == torch.float16:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_fp16_fp4(*args)
- else:
- lib.cdequantize_blockwise_fp16_nf4(*args)
- elif out.dtype == torch.float32:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_fp32_fp4(*args)
- else:
- lib.cdequantize_blockwise_fp32_nf4(*args)
-
-
-@register_kernel("bitsandbytes::gemv_4bit", "cuda")
-def _(
- A: torch.Tensor, B: torch.Tensor, shapeB: Sequence[int], absmax: torch.Tensor, code: torch.Tensor, blocksize: int
-) -> torch.Tensor:
- shape = (*A.shape[:-1], shapeB[0])
- out = torch.empty(shape, device=A.device, dtype=A.dtype)
- _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::gemv_4bit.out", "cuda")
-def _(
- A: torch.Tensor,
- B: torch.Tensor,
- shapeB: Sequence[int],
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- out: torch.Tensor,
-) -> None:
- torch._check(
- out.shape == (*A.shape[:-1], shapeB[0]),
- lambda: f"Expected out.shape == {(*A.shape[:-1], shapeB[0])}, got {out.shape}",
- )
- torch._check(out.dtype == A.dtype, lambda: f"Expected out.dtype == {A.dtype}, got {out.dtype}")
- _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
-
-
-def _gemv_4bit_impl(
- A: torch.Tensor,
- B: torch.Tensor,
- shapeB: Sequence[int],
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- out: torch.Tensor,
-) -> None:
- torch._check_is_size(blocksize)
- torch._check(
- A.numel() == A.size(-1),
- lambda: f"A must be a vector with leading dimensions of 1, got {A.shape}",
- )
- torch._check(
- A.dtype in [torch.float16, torch.bfloat16, torch.float32],
- lambda: f"A must be float16, bfloat16, or float32, got {A.dtype}",
- )
- torch._check(
- B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32],
- lambda: f"B must be backed by storage of type uint8, bfloat16, float16, or float32, got {B.dtype}",
- )
- torch._check(absmax.dtype == torch.float32, lambda: f"absmax must be float32, got {absmax.dtype}")
- torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
-
- m = ct.c_int32(shapeB[0])
- n = ct.c_int32(1)
- k = ct.c_int32(shapeB[1])
-
- lda = m
- ldb = ct.c_int32((A.shape[-1] + 1) // 2)
- ldc = m
-
- stream = _get_tensor_stream(A)
-
- with _cuda_device_of(A):
- if A.dtype == torch.float16:
- lib.cgemm_4bit_inference_naive_fp16(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
- elif A.dtype == torch.bfloat16:
- lib.cgemm_4bit_inference_naive_bf16(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
- elif A.dtype == torch.float32:
- lib.cgemm_4bit_inference_naive_fp32(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
+from collections.abc import Sequence
+import ctypes as ct
+from math import prod
+from typing import Optional
+import torch
+
+from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
+
+from ..._ops import register_kernel
+from ...cextension import lib
+
+
+@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
+def _(A: torch.Tensor, B: torch.Tensor):
+ out = torch.empty((*A.shape[:-1], B.shape[0]), device=A.device, dtype=torch.int32)
+ return _int8_linear_matmul_impl(A, B, out)
+
+
+@register_kernel("bitsandbytes::int8_linear_matmul.out", "cuda")
+def _(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
+ _int8_linear_matmul_impl(A, B, out)
+
+
+def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
+ A, B = B, A
+
+ shapeA = A.shape
+ shapeB = B.shape
+
+ torch._check(A.dtype == torch.int8, lambda: "B must be int8")
+ torch._check(B.dtype == torch.int8, lambda: "A must be int8")
+ torch._check(A.ndim == 2, lambda: "Only two dimensional matrices are supported for argument B")
+ torch._check(B.ndim in [2, 3], lambda: "Only two or three dimensional matrices are supported for argument A")
+ torch._check(prod(shapeB) > 0, lambda: f"Input tensor dimensions need to be > 0: {shapeB}")
+ torch._check(out.dtype == torch.int32)
+
+ shapeC = (*shapeB[:-1], shapeA[0])
+ torch._check(out.shape == shapeC, lambda: f"Output shape {out.shape} does not match expected shape {shapeC}")
+
+ k, m = shapeA
+ n = prod(shapeB[:-1])
+ lda = shapeA[-1] # Weights (outputs, inputs)
+ ldb = shapeB[-1] # Activations (batch, tokens, inputs)
+ ldc = shapeC[-1] # Output (batch, tokens, outputs)
+
+ torch._check(
+ lda == ldb,
+ lambda: f"int8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}",
+ )
+
+ # cuBLASLt does not support int8 matmul with inner dimensions that are not divisible by 4.
+ # We'll fall back to a slower fp32 calculation in this circumstance.
+ # Fortunately, this should not be very common.
+ if lda % 4 != 0:
+ result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
+ return out.copy_(result)
+
+ with _cuda_device_of(A):
+ ctx = CUBLAS_Context.get_instance().get_context(A.device)
+ ptrA = get_ptr(A)
+ ptrB = get_ptr(B)
+ ptrC = get_ptr(out)
+ ptrRowScale = None
+ m = ct.c_int32(m)
+ n = ct.c_int32(n)
+ k = ct.c_int32(k)
+ lda = ct.c_int32(lda)
+ ldb = ct.c_int32(ldb)
+ ldc = ct.c_int32(ldc)
+ stream = _get_tensor_stream(A)
+
+ has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
+
+ if has_error:
+ if has_error == 100:
+ # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
+ # TODO: Warn and implement a fallback to fp32 compute?
+ raise NotImplementedError("int8_linear_matmul not implemented!")
+ else:
+ raise RuntimeError(
+ f"cublasLt ran into an error!\n\t{shapeA=}, {shapeB=}, {shapeC=}\n\t{(lda, ldb, ldc)=}\n\t{(m, n, k)=}"
+ )
+
+ return out
+
+
+@register_kernel("bitsandbytes::int8_mm_dequant", "cuda")
+def _(
+ A: torch.Tensor,
+ row_stats: torch.Tensor,
+ col_stats: torch.Tensor,
+ dtype: Optional[torch.dtype] = None,
+ bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+ torch._check(A.dtype == torch.int32, lambda: f"A must be int32, got {A.dtype}")
+ torch._check(row_stats.dtype == torch.float32, lambda: f"row_stats must be float32, got {row_stats.dtype}")
+ torch._check(col_stats.dtype == torch.float32, lambda: f"col_stats must be float32, got {col_stats.dtype}")
+
+ # Note: cuda kernel only currently supports fp16 output.
+ # We'll later cast to desired dtype if needed.
+ out = torch.empty_like(A, dtype=torch.float16)
+
+ ptrA = get_ptr(A)
+ ptrOut = get_ptr(out)
+ ptrRowStats = get_ptr(row_stats)
+ ptrColStats = get_ptr(col_stats)
+ numRows = ct.c_int32(prod(A.shape[:-1]))
+ numCols = ct.c_int32(A.shape[-1])
+
+ # Note: fused bias in the kernel is only supported for fp16
+ # TODO(matthewdouglas): Consider supporting bf16 fused bias
+ ptrBias = get_ptr(bias) if bias is not None and bias.dtype == torch.float16 else None
+
+ with _cuda_device_of(A):
+ lib.cdequant_mm_int32_fp16(
+ ptrA, ptrRowStats, ptrColStats, ptrOut, ptrBias, numRows, numCols, _get_tensor_stream(A)
+ )
+
+ # Add bias separately if not fused in kernel
+ if bias is not None and bias.dtype != torch.float16:
+ out.add_(bias)
+
+ return out.to(dtype or torch.float16)
+
+
+@register_kernel("bitsandbytes::int8_vectorwise_quant", "cuda")
+def _(A: torch.Tensor, threshold=0.0):
+ torch._check(A.dtype == torch.float16, lambda: f"A must be float16, got {A.dtype}")
+ torch._check(threshold >= 0.0, lambda: "threshold must be non-negative")
+
+ rows = prod(A.shape[:-1])
+ cols = A.shape[-1]
+
+ row_stats = torch.empty(rows, device=A.device, dtype=torch.float32)
+ out_row = torch.empty(A.shape, device=A.device, dtype=torch.int8)
+
+ outlier_cols = None
+
+ if threshold > 0.0:
+ # TODO we could improve perf of this
+ outliers = A.abs() >= threshold
+
+ if outliers.any():
+ outlier_cols = torch.argwhere(outliers.any(dim=0)).view(-1)
+ else:
+ # Needed for torch.compile support.
+ outlier_cols = torch.empty(0, device=A.device, dtype=torch.int64)
+
+ with _cuda_device_of(A):
+ lib.cint8_vector_quant(
+ get_ptr(A),
+ get_ptr(out_row),
+ get_ptr(row_stats),
+ ct.c_float(threshold),
+ ct.c_int32(rows),
+ ct.c_int32(cols),
+ _get_tensor_stream(A),
+ )
+
+ # Zero out values from outlier columns across all rows.
+ # The kernel will handle this for outliers themselves, so we can optimize for rows=1.
+ if rows > 1 and outlier_cols is not None:
+ out_row[:, outlier_cols] = 0
+
+ return out_row, row_stats, outlier_cols
+
+
+@register_kernel("bitsandbytes::int8_double_quant", "cuda")
+def _(
+ A: torch.Tensor,
+ threshold=0.0,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+ # Use CUDA kernel for rowwise and COO tensor
+ quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant.default(
+ A,
+ threshold=threshold,
+ )
+
+ # PyTorch impl for colwise
+ col_stats, outlier_mask = _get_col_absmax(A, threshold=threshold)
+ if threshold > 0.0 and outlier_mask is not None:
+ A = A.masked_fill(outlier_mask, 0.0)
+ quant_col = torch.round(A.mul(127.0) / col_stats.unsqueeze(0)).to(torch.int8)
+
+ return quant_row, quant_col, row_stats, col_stats.flatten().float(), outlier_cols
+
+
+def _get_col_absmax(
+ A: torch.Tensor,
+ threshold=0.0,
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+ torch._check(A.is_floating_point())
+
+ outlier_mask = None
+
+ absA = A.abs().view(-1, A.shape[-1])
+
+ if threshold > 0.0:
+ # Filter outliers from stats when enabled
+ outlier_mask = absA >= threshold
+ absA.masked_fill_(outlier_mask, 0.0)
+
+ # shape [cols]; unsqueeze(0) gives [1,cols]
+ col_stats = absA.amax(dim=0, keepdim=False).float()
+
+ return col_stats, outlier_mask
+
+
+@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
+def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
+ torch._check_is_size(blocksize)
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
+
+ n = A.numel()
+ blocks = -(n // -blocksize)
+ absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
+ out = torch.empty_like(A, dtype=torch.uint8)
+
+ with _cuda_device_of(A):
+ args = (
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int32(blocksize),
+ ct.c_int(A.numel()),
+ )
+
+ if A.dtype == torch.float16:
+ lib.cquantize_blockwise_fp16(*args)
+ elif A.dtype == torch.bfloat16:
+ lib.cquantize_blockwise_bf16(*args)
+ elif A.dtype == torch.float32:
+ lib.cquantize_blockwise_fp32(*args)
+ else:
+ raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+
+ return out, absmax
+
+
+@register_kernel("bitsandbytes::dequantize_blockwise", "cuda")
+def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
+ out = torch.empty_like(A, dtype=dtype)
+ _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::dequantize_blockwise.out", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
+ torch._check(out.shape == A.shape, lambda: f"Expected out.shape == {A.shape}, got {out.shape}")
+ _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
+
+
+def _dequantize_blockwise_impl(
+ A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
+) -> None:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
+ torch._check(
+ dtype in [torch.float16, torch.bfloat16, torch.float32],
+ lambda: f"Blockwise dequantization only supports 16bit/32bit floating types, got {dtype}",
+ )
+
+ with _cuda_device_of(A):
+ args = (
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(blocksize),
+ ct.c_int(A.numel()),
+ _get_tensor_stream(A),
+ )
+
+ if dtype == torch.float16:
+ lib.cdequantize_blockwise_fp16(*args)
+ elif dtype == torch.bfloat16:
+ lib.cdequantize_blockwise_bf16(*args)
+ elif dtype == torch.float32:
+ lib.cdequantize_blockwise_fp32(*args)
+
+
+@register_kernel("bitsandbytes::quantize_4bit", "cuda")
+def _(
+ A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
+) -> tuple[torch.Tensor, torch.Tensor]:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ torch._check(quant_type in ["fp4", "nf4"])
+ torch._check(
+ A.dtype in [torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"Blockwise 4bit quantization only supports 16/32-bit floats, but got {A.dtype}",
+ )
+
+ n = A.numel()
+ blocks = -(n // -blocksize)
+ absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
+ out = torch.empty(((n + 1) // (quant_storage.itemsize * 2), 1), device=A.device, dtype=quant_storage)
+
+ with _cuda_device_of(A):
+ args = (
+ None,
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int32(blocksize),
+ ct.c_int(n),
+ )
+
+ if A.dtype == torch.bfloat16:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_bf16_fp4(*args)
+ else:
+ lib.cquantize_blockwise_bf16_nf4(*args)
+ elif A.dtype == torch.float16:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_fp16_fp4(*args)
+ else:
+ lib.cquantize_blockwise_fp16_nf4(*args)
+ elif A.dtype == torch.float32:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_fp32_fp4(*args)
+ else:
+ lib.cquantize_blockwise_fp32_nf4(*args)
+
+ return out, absmax
+
+
+@register_kernel("bitsandbytes::dequantize_4bit", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ shape: Sequence[int],
+ dtype: torch.dtype,
+) -> torch.Tensor:
+ out = torch.empty(shape, dtype=dtype, device=A.device)
+ _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::dequantize_4bit.out", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ shape: Sequence[int],
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ torch._check(out.shape == shape, lambda: f"Expected out.shape == {shape}, got {out.shape}")
+ torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
+ _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+
+
+def _dequantize_4bit_impl(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ torch._check(quant_type in ["fp4", "nf4"])
+ torch._check(
+ dtype in [torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
+ )
+
+ with _cuda_device_of(A):
+ args = (
+ None,
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(blocksize),
+ ct.c_int(out.numel()),
+ _get_tensor_stream(A),
+ )
+
+ if out.dtype == torch.bfloat16:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_bf16_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_bf16_nf4(*args)
+ elif out.dtype == torch.float16:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_fp16_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_fp16_nf4(*args)
+ elif out.dtype == torch.float32:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_fp32_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_fp32_nf4(*args)
+
+
+@register_kernel("bitsandbytes::gemv_4bit", "cuda")
+def _(
+ A: torch.Tensor, B: torch.Tensor, shapeB: Sequence[int], absmax: torch.Tensor, code: torch.Tensor, blocksize: int
+) -> torch.Tensor:
+ shape = (*A.shape[:-1], shapeB[0])
+ out = torch.empty(shape, device=A.device, dtype=A.dtype)
+ _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::gemv_4bit.out", "cuda")
+def _(
+ A: torch.Tensor,
+ B: torch.Tensor,
+ shapeB: Sequence[int],
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ out: torch.Tensor,
+) -> None:
+ torch._check(
+ out.shape == (*A.shape[:-1], shapeB[0]),
+ lambda: f"Expected out.shape == {(*A.shape[:-1], shapeB[0])}, got {out.shape}",
+ )
+ torch._check(out.dtype == A.dtype, lambda: f"Expected out.dtype == {A.dtype}, got {out.dtype}")
+ _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
+
+
+def _gemv_4bit_impl(
+ A: torch.Tensor,
+ B: torch.Tensor,
+ shapeB: Sequence[int],
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ out: torch.Tensor,
+) -> None:
+ torch._check_is_size(blocksize)
+ torch._check(
+ A.numel() == A.size(-1),
+ lambda: f"A must be a vector with leading dimensions of 1, got {A.shape}",
+ )
+ torch._check(
+ A.dtype in [torch.float16, torch.bfloat16, torch.float32],
+ lambda: f"A must be float16, bfloat16, or float32, got {A.dtype}",
+ )
+ torch._check(
+ B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"B must be backed by storage of type uint8, bfloat16, float16, or float32, got {B.dtype}",
+ )
+ torch._check(absmax.dtype == torch.float32, lambda: f"absmax must be float32, got {absmax.dtype}")
+ torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
+
+ m = ct.c_int32(shapeB[0])
+ n = ct.c_int32(1)
+ k = ct.c_int32(shapeB[1])
+
+ lda = m
+ ldb = ct.c_int32((A.shape[-1] + 1) // 2)
+ ldc = m
+
+ stream = _get_tensor_stream(A)
+
+ with _cuda_device_of(A):
+ if A.dtype == torch.float16:
+ lib.cgemm_4bit_inference_naive_fp16(
+ m,
+ n,
+ k,
+ get_ptr(A),
+ get_ptr(B),
+ get_ptr(absmax),
+ get_ptr(code),
+ get_ptr(out),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
+ )
+ elif A.dtype == torch.bfloat16:
+ lib.cgemm_4bit_inference_naive_bf16(
+ m,
+ n,
+ k,
+ get_ptr(A),
+ get_ptr(B),
+ get_ptr(absmax),
+ get_ptr(code),
+ get_ptr(out),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
+ )
+ elif A.dtype == torch.float32:
+ lib.cgemm_4bit_inference_naive_fp32(
+ m,
+ n,
+ k,
+ get_ptr(A),
+ get_ptr(B),
+ get_ptr(absmax),
+ get_ptr(code),
+ get_ptr(out),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
+ )
From b2b4df6d3046a166d6e177de2dbca26f1b0abcab Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 28 May 2025 12:21:15 +0530
Subject: [PATCH 39/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 1059 +++++++++++++++--------------
1 file changed, 538 insertions(+), 521 deletions(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index efdef2871..14878123a 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -1,521 +1,538 @@
-from collections.abc import Sequence
-import ctypes as ct
-from math import prod
-from typing import Optional
-
-import torch
-
-from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
-
-from ..._ops import register_kernel
-from ...cextension import lib
-
-
-@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
-def _(A: torch.Tensor, B: torch.Tensor):
- out = torch.empty((*A.shape[:-1], B.shape[0]), device=A.device, dtype=torch.int32)
- return _int8_linear_matmul_impl(A, B, out)
-
-
-@register_kernel("bitsandbytes::int8_linear_matmul.out", "cuda")
-def _(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
- _int8_linear_matmul_impl(A, B, out)
-
-
-def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
- A, B = B, A
-
- shapeA = A.shape
- shapeB = B.shape
-
- torch._check(A.dtype == torch.int8, lambda: "B must be int8")
- torch._check(B.dtype == torch.int8, lambda: "A must be int8")
- torch._check(A.ndim == 2, lambda: "Only two dimensional matrices are supported for argument B")
- torch._check(B.ndim in [2, 3], lambda: "Only two or three dimensional matrices are supported for argument A")
- torch._check(prod(shapeB) > 0, lambda: f"Input tensor dimensions need to be > 0: {shapeB}")
- torch._check(out.dtype == torch.int32)
-
- shapeC = (*shapeB[:-1], shapeA[0])
- torch._check(out.shape == shapeC, lambda: f"Output shape {out.shape} does not match expected shape {shapeC}")
-
- k, m = shapeA
- n = prod(shapeB[:-1])
- lda = shapeA[-1] # Weights (outputs, inputs)
- ldb = shapeB[-1] # Activations (batch, tokens, inputs)
- ldc = shapeC[-1] # Output (batch, tokens, outputs)
-
- torch._check(
- lda == ldb,
- lambda: f"int8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}",
- )
-
- # cuBLASLt does not support int8 matmul with inner dimensions that are not divisible by 4.
- # We'll fall back to a slower fp32 calculation in this circumstance.
- # Fortunately, this should not be very common.
- if lda % 4 != 0:
- result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
- return out.copy_(result)
-
- with _cuda_device_of(A):
- ctx = CUBLAS_Context.get_instance().get_context(A.device)
- ptrA = get_ptr(A)
- ptrB = get_ptr(B)
- ptrC = get_ptr(out)
- ptrRowScale = None
- m = ct.c_int32(m)
- n = ct.c_int32(n)
- k = ct.c_int32(k)
- lda = ct.c_int32(lda)
- ldb = ct.c_int32(ldb)
- ldc = ct.c_int32(ldc)
- stream = _get_tensor_stream(A)
-
- has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
-
- if has_error:
- if has_error == 100:
- # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
- # TODO: Warn and implement a fallback to fp32 compute?
- raise NotImplementedError("int8_linear_matmul not implemented!")
- else:
- raise RuntimeError(
- f"cublasLt ran into an error!\n\t{shapeA=}, {shapeB=}, {shapeC=}\n\t{(lda, ldb, ldc)=}\n\t{(m, n, k)=}"
- )
-
- return out
-
-
-@register_kernel("bitsandbytes::int8_mm_dequant", "cuda")
-def _(
- A: torch.Tensor,
- row_stats: torch.Tensor,
- col_stats: torch.Tensor,
- dtype: Optional[torch.dtype] = None,
- bias: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
- torch._check(A.dtype == torch.int32, lambda: f"A must be int32, got {A.dtype}")
- torch._check(row_stats.dtype == torch.float32, lambda: f"row_stats must be float32, got {row_stats.dtype}")
- torch._check(col_stats.dtype == torch.float32, lambda: f"col_stats must be float32, got {col_stats.dtype}")
-
- # Note: cuda kernel only currently supports fp16 output.
- # We'll later cast to desired dtype if needed.
- out = torch.empty_like(A, dtype=torch.float16)
-
- ptrA = get_ptr(A)
- ptrOut = get_ptr(out)
- ptrRowStats = get_ptr(row_stats)
- ptrColStats = get_ptr(col_stats)
- numRows = ct.c_int32(prod(A.shape[:-1]))
- numCols = ct.c_int32(A.shape[-1])
-
- # Note: fused bias in the kernel is only supported for fp16
- # TODO(matthewdouglas): Consider supporting bf16 fused bias
- ptrBias = get_ptr(bias) if bias is not None and bias.dtype == torch.float16 else None
-
- with _cuda_device_of(A):
- lib.cdequant_mm_int32_fp16(
- ptrA, ptrRowStats, ptrColStats, ptrOut, ptrBias, numRows, numCols, _get_tensor_stream(A)
- )
-
- # Add bias separately if not fused in kernel
- if bias is not None and bias.dtype != torch.float16:
- out.add_(bias)
-
- return out.to(dtype or torch.float16)
-
-
-@register_kernel("bitsandbytes::int8_vectorwise_quant", "cuda")
-def _(A: torch.Tensor, threshold=0.0):
- torch._check(A.dtype == torch.float16, lambda: f"A must be float16, got {A.dtype}")
- torch._check(threshold >= 0.0, lambda: "threshold must be non-negative")
-
- rows = prod(A.shape[:-1])
- cols = A.shape[-1]
-
- row_stats = torch.empty(rows, device=A.device, dtype=torch.float32)
- out_row = torch.empty(A.shape, device=A.device, dtype=torch.int8)
-
- outlier_cols = None
-
- if threshold > 0.0:
- # TODO we could improve perf of this
- outliers = A.abs() >= threshold
-
- if outliers.any():
- outlier_cols = torch.argwhere(outliers.any(dim=0)).view(-1)
- else:
- # Needed for torch.compile support.
- outlier_cols = torch.empty(0, device=A.device, dtype=torch.int64)
-
- with _cuda_device_of(A):
- lib.cint8_vector_quant(
- get_ptr(A),
- get_ptr(out_row),
- get_ptr(row_stats),
- ct.c_float(threshold),
- ct.c_int32(rows),
- ct.c_int32(cols),
- _get_tensor_stream(A),
- )
-
- # Zero out values from outlier columns across all rows.
- # The kernel will handle this for outliers themselves, so we can optimize for rows=1.
- if rows > 1 and outlier_cols is not None:
- out_row[:, outlier_cols] = 0
-
- return out_row, row_stats, outlier_cols
-
-
-@register_kernel("bitsandbytes::int8_double_quant", "cuda")
-def _(
- A: torch.Tensor,
- threshold=0.0,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
- # Use CUDA kernel for rowwise and COO tensor
- quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant.default(
- A,
- threshold=threshold,
- )
-
- # PyTorch impl for colwise
- col_stats, outlier_mask = _get_col_absmax(A, threshold=threshold)
- if threshold > 0.0 and outlier_mask is not None:
- A = A.masked_fill(outlier_mask, 0.0)
- quant_col = torch.round(A.mul(127.0) / col_stats.unsqueeze(0)).to(torch.int8)
-
- return quant_row, quant_col, row_stats, col_stats.flatten().float(), outlier_cols
-
-
-def _get_col_absmax(
- A: torch.Tensor,
- threshold=0.0,
-) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
- torch._check(A.is_floating_point())
-
- outlier_mask = None
-
- absA = A.abs().view(-1, A.shape[-1])
-
- if threshold > 0.0:
- # Filter outliers from stats when enabled
- outlier_mask = absA >= threshold
- absA.masked_fill_(outlier_mask, 0.0)
-
- # shape [cols]; unsqueeze(0) gives [1,cols]
- col_stats = absA.amax(dim=0, keepdim=False).float()
-
- return col_stats, outlier_mask
-
-
-@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
-def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
- torch._check_is_size(blocksize)
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
-
- n = A.numel()
- blocks = -(n // -blocksize)
- absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
- out = torch.empty_like(A, dtype=torch.uint8)
-
- with _cuda_device_of(A):
- args = (
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int32(blocksize),
- ct.c_int(A.numel()),
- )
-
- if A.dtype == torch.float16:
- lib.cquantize_blockwise_fp16(*args)
- elif A.dtype == torch.bfloat16:
- lib.cquantize_blockwise_bf16(*args)
- elif A.dtype == torch.float32:
- lib.cquantize_blockwise_fp32(*args)
- else:
- raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-
- return out, absmax
-
-
-@register_kernel("bitsandbytes::dequantize_blockwise", "cuda")
-def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
- out = torch.empty_like(A, dtype=dtype)
- _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::dequantize_blockwise.out", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
- torch._check(out.shape == A.shape, lambda: f"Expected out.shape == {A.shape}, got {out.shape}")
- _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
-
-
-def _dequantize_blockwise_impl(
- A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
-) -> None:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
- torch._check(
- dtype in [torch.float16, torch.bfloat16, torch.float32],
- lambda: f"Blockwise dequantization only supports 16bit/32bit floating types, got {dtype}",
- )
-
- with _cuda_device_of(A):
- args = (
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int(blocksize),
- ct.c_int(A.numel()),
- _get_tensor_stream(A),
- )
-
- if dtype == torch.float16:
- lib.cdequantize_blockwise_fp16(*args)
- elif dtype == torch.bfloat16:
- lib.cdequantize_blockwise_bf16(*args)
- elif dtype == torch.float32:
- lib.cdequantize_blockwise_fp32(*args)
-
-
-@register_kernel("bitsandbytes::quantize_4bit", "cuda")
-def _(
- A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
-) -> tuple[torch.Tensor, torch.Tensor]:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- torch._check(quant_type in ["fp4", "nf4"])
- torch._check(
- A.dtype in [torch.bfloat16, torch.float16, torch.float32],
- lambda: f"Blockwise 4bit quantization only supports 16/32-bit floats, but got {A.dtype}",
- )
-
- n = A.numel()
- blocks = -(n // -blocksize)
- absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
- out = torch.empty(((n + 1) // (quant_storage.itemsize * 2), 1), device=A.device, dtype=quant_storage)
-
- with _cuda_device_of(A):
- args = (
- None,
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int32(blocksize),
- ct.c_int(n),
- )
-
- if A.dtype == torch.bfloat16:
- if quant_type == "fp4":
- lib.cquantize_blockwise_bf16_fp4(*args)
- else:
- lib.cquantize_blockwise_bf16_nf4(*args)
- elif A.dtype == torch.float16:
- if quant_type == "fp4":
- lib.cquantize_blockwise_fp16_fp4(*args)
- else:
- lib.cquantize_blockwise_fp16_nf4(*args)
- elif A.dtype == torch.float32:
- if quant_type == "fp4":
- lib.cquantize_blockwise_fp32_fp4(*args)
- else:
- lib.cquantize_blockwise_fp32_nf4(*args)
-
- return out, absmax
-
-
-@register_kernel("bitsandbytes::dequantize_4bit", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- shape: Sequence[int],
- dtype: torch.dtype,
-) -> torch.Tensor:
- out = torch.empty(shape, dtype=dtype, device=A.device)
- _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::dequantize_4bit.out", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- shape: Sequence[int],
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- torch._check(out.shape == shape, lambda: f"Expected out.shape == {shape}, got {out.shape}")
- torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
- _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
-
-
-def _dequantize_4bit_impl(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
- torch._check(quant_type in ["fp4", "nf4"])
- torch._check(
- dtype in [torch.bfloat16, torch.float16, torch.float32],
- lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
- )
-
- with _cuda_device_of(A):
- args = (
- None,
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int(blocksize),
- ct.c_int(out.numel()),
- _get_tensor_stream(A),
- )
-
- if out.dtype == torch.bfloat16:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_bf16_fp4(*args)
- else:
- lib.cdequantize_blockwise_bf16_nf4(*args)
- elif out.dtype == torch.float16:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_fp16_fp4(*args)
- else:
- lib.cdequantize_blockwise_fp16_nf4(*args)
- elif out.dtype == torch.float32:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_fp32_fp4(*args)
- else:
- lib.cdequantize_blockwise_fp32_nf4(*args)
-
-
-@register_kernel("bitsandbytes::gemv_4bit", "cuda")
-def _(
- A: torch.Tensor, B: torch.Tensor, shapeB: Sequence[int], absmax: torch.Tensor, code: torch.Tensor, blocksize: int
-) -> torch.Tensor:
- shape = (*A.shape[:-1], shapeB[0])
- out = torch.empty(shape, device=A.device, dtype=A.dtype)
- _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::gemv_4bit.out", "cuda")
-def _(
- A: torch.Tensor,
- B: torch.Tensor,
- shapeB: Sequence[int],
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- out: torch.Tensor,
-) -> None:
- torch._check(
- out.shape == (*A.shape[:-1], shapeB[0]),
- lambda: f"Expected out.shape == {(*A.shape[:-1], shapeB[0])}, got {out.shape}",
- )
- torch._check(out.dtype == A.dtype, lambda: f"Expected out.dtype == {A.dtype}, got {out.dtype}")
- _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
-
-
-def _gemv_4bit_impl(
- A: torch.Tensor,
- B: torch.Tensor,
- shapeB: Sequence[int],
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- out: torch.Tensor,
-) -> None:
- torch._check_is_size(blocksize)
- torch._check(
- A.numel() == A.size(-1),
- lambda: f"A must be a vector with leading dimensions of 1, got {A.shape}",
- )
- torch._check(
- A.dtype in [torch.float16, torch.bfloat16, torch.float32],
- lambda: f"A must be float16, bfloat16, or float32, got {A.dtype}",
- )
- torch._check(
- B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32],
- lambda: f"B must be backed by storage of type uint8, bfloat16, float16, or float32, got {B.dtype}",
- )
- torch._check(absmax.dtype == torch.float32, lambda: f"absmax must be float32, got {absmax.dtype}")
- torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
-
- m = ct.c_int32(shapeB[0])
- n = ct.c_int32(1)
- k = ct.c_int32(shapeB[1])
-
- lda = m
- ldb = ct.c_int32((A.shape[-1] + 1) // 2)
- ldc = m
-
- stream = _get_tensor_stream(A)
-
- with _cuda_device_of(A):
- if A.dtype == torch.float16:
- lib.cgemm_4bit_inference_naive_fp16(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
- elif A.dtype == torch.bfloat16:
- lib.cgemm_4bit_inference_naive_bf16(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
- elif A.dtype == torch.float32:
- lib.cgemm_4bit_inference_naive_fp32(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
+from collections.abc import Sequence
+import ctypes as ct
+from math import prod
+from typing import Optional
+
+import torch
+
+from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
+
+from ..._ops import register_kernel
+from ...cextension import lib, HIP_ENVIRONMENT
+
+
+@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
+def _(A: torch.Tensor, B: torch.Tensor):
+ out = torch.empty((*A.shape[:-1], B.shape[0]), device=A.device, dtype=torch.int32)
+ return _int8_linear_matmul_impl(A, B, out)
+
+
+@register_kernel("bitsandbytes::int8_linear_matmul.out", "cuda")
+def _(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
+ _int8_linear_matmul_impl(A, B, out)
+
+
+def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
+ A, B = B, A
+
+ shapeA = A.shape
+ shapeB = B.shape
+
+ torch._check(A.dtype == torch.int8, lambda: "B must be int8")
+ torch._check(B.dtype == torch.int8, lambda: "A must be int8")
+ torch._check(A.ndim == 2, lambda: "Only two dimensional matrices are supported for argument B")
+ torch._check(B.ndim in [2, 3], lambda: "Only two or three dimensional matrices are supported for argument A")
+ torch._check(prod(shapeB) > 0, lambda: f"Input tensor dimensions need to be > 0: {shapeB}")
+ torch._check(out.dtype == torch.int32)
+
+ shapeC = (*shapeB[:-1], shapeA[0])
+ torch._check(out.shape == shapeC, lambda: f"Output shape {out.shape} does not match expected shape {shapeC}")
+
+ k, m = shapeA
+ n = prod(shapeB[:-1])
+ lda = shapeA[-1] # Weights (outputs, inputs)
+ ldb = shapeB[-1] # Activations (batch, tokens, inputs)
+ ldc = shapeC[-1] # Output (batch, tokens, outputs)
+
+ torch._check(
+ lda == ldb,
+ lambda: f"int8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}",
+ )
+
+ # cuBLASLt does not support int8 matmul with inner dimensions that are not divisible by 4.
+ # We'll fall back to a slower fp32 calculation in this circumstance.
+ # Fortunately, this should not be very common.
+ if lda % 4 != 0:
+ result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
+ return out.copy_(result)
+
+ with _cuda_device_of(A):
+ ctx = CUBLAS_Context.get_instance().get_context(A.device)
+ ptrA = get_ptr(A)
+ ptrB = get_ptr(B)
+ ptrC = get_ptr(out)
+ ptrRowScale = None
+ m = ct.c_int32(m)
+ n = ct.c_int32(n)
+ k = ct.c_int32(k)
+ lda = ct.c_int32(lda)
+ ldb = ct.c_int32(ldb)
+ ldc = ct.c_int32(ldc)
+ stream = _get_tensor_stream(A)
+
+ has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
+
+ if has_error:
+ if has_error == 100:
+ # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
+ # TODO: Warn and implement a fallback to fp32 compute?
+ raise NotImplementedError("int8_linear_matmul not implemented!")
+ else:
+ raise RuntimeError(
+ f"cublasLt ran into an error!\n\t{shapeA=}, {shapeB=}, {shapeC=}\n\t{(lda, ldb, ldc)=}\n\t{(m, n, k)=}"
+ )
+
+ return out
+
+
+@register_kernel("bitsandbytes::int8_mm_dequant", "cuda")
+def _(
+ A: torch.Tensor,
+ row_stats: torch.Tensor,
+ col_stats: torch.Tensor,
+ dtype: Optional[torch.dtype] = None,
+ bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+ torch._check(A.dtype == torch.int32, lambda: f"A must be int32, got {A.dtype}")
+ torch._check(row_stats.dtype == torch.float32, lambda: f"row_stats must be float32, got {row_stats.dtype}")
+ torch._check(col_stats.dtype == torch.float32, lambda: f"col_stats must be float32, got {col_stats.dtype}")
+
+ # Note: cuda kernel only currently supports fp16 output.
+ # We'll later cast to desired dtype if needed.
+ out = torch.empty_like(A, dtype=torch.float16)
+
+ ptrA = get_ptr(A)
+ ptrOut = get_ptr(out)
+ ptrRowStats = get_ptr(row_stats)
+ ptrColStats = get_ptr(col_stats)
+ numRows = ct.c_int32(prod(A.shape[:-1]))
+ numCols = ct.c_int32(A.shape[-1])
+
+ # Note: fused bias in the kernel is only supported for fp16
+ # TODO(matthewdouglas): Consider supporting bf16 fused bias
+ ptrBias = get_ptr(bias) if bias is not None and bias.dtype == torch.float16 else None
+
+ with _cuda_device_of(A):
+ lib.cdequant_mm_int32_fp16(
+ ptrA, ptrRowStats, ptrColStats, ptrOut, ptrBias, numRows, numCols, _get_tensor_stream(A)
+ )
+
+ # Add bias separately if not fused in kernel
+ if bias is not None and bias.dtype != torch.float16:
+ out.add_(bias)
+
+ return out.to(dtype or torch.float16)
+
+
+@register_kernel("bitsandbytes::int8_vectorwise_quant", "cuda")
+def _(A: torch.Tensor, threshold=0.0):
+ torch._check(A.dtype == torch.float16, lambda: f"A must be float16, got {A.dtype}")
+ torch._check(threshold >= 0.0, lambda: "threshold must be non-negative")
+
+ rows = prod(A.shape[:-1])
+ cols = A.shape[-1]
+
+ row_stats = torch.empty(rows, device=A.device, dtype=torch.float32)
+ out_row = torch.empty(A.shape, device=A.device, dtype=torch.int8)
+
+ outlier_cols = None
+
+ if threshold > 0.0:
+ # TODO we could improve perf of this
+ outliers = A.abs() >= threshold
+
+ if outliers.any():
+ outlier_cols = torch.argwhere(outliers.any(dim=0)).view(-1)
+ else:
+ # Needed for torch.compile support.
+ outlier_cols = torch.empty(0, device=A.device, dtype=torch.int64)
+
+ with _cuda_device_of(A):
+ lib.cint8_vector_quant(
+ get_ptr(A),
+ get_ptr(out_row),
+ get_ptr(row_stats),
+ ct.c_float(threshold),
+ ct.c_int32(rows),
+ ct.c_int32(cols),
+ _get_tensor_stream(A),
+ )
+
+ # Zero out values from outlier columns across all rows.
+ # The kernel will handle this for outliers themselves, so we can optimize for rows=1.
+ if rows > 1 and outlier_cols is not None:
+ out_row[:, outlier_cols] = 0
+
+ return out_row, row_stats, outlier_cols
+
+
+@register_kernel("bitsandbytes::int8_double_quant", "cuda")
+def _(
+ A: torch.Tensor,
+ threshold=0.0,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+ # Use CUDA kernel for rowwise and COO tensor
+ quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant.default(
+ A,
+ threshold=threshold,
+ )
+
+ # PyTorch impl for colwise
+ col_stats, outlier_mask = _get_col_absmax(A, threshold=threshold)
+ if threshold > 0.0 and outlier_mask is not None:
+ A = A.masked_fill(outlier_mask, 0.0)
+ quant_col = torch.round(A.mul(127.0) / col_stats.unsqueeze(0)).to(torch.int8)
+
+ return quant_row, quant_col, row_stats, col_stats.flatten().float(), outlier_cols
+
+
+def _get_col_absmax(
+ A: torch.Tensor,
+ threshold=0.0,
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+ torch._check(A.is_floating_point())
+
+ outlier_mask = None
+
+ absA = A.abs().view(-1, A.shape[-1])
+
+ if threshold > 0.0:
+ # Filter outliers from stats when enabled
+ outlier_mask = absA >= threshold
+ absA.masked_fill_(outlier_mask, 0.0)
+
+ # shape [cols]; unsqueeze(0) gives [1,cols]
+ col_stats = absA.amax(dim=0, keepdim=False).float()
+
+ return col_stats, outlier_mask
+
+
+@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
+def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
+ torch._check_is_size(blocksize)
+
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
+ torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
+
+ n = A.numel()
+ blocks = -(n // -blocksize)
+ absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
+ out = torch.empty_like(A, dtype=torch.uint8)
+
+ with _cuda_device_of(A):
+ args = (
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int32(blocksize),
+ ct.c_int(A.numel()),
+ )
+
+ if A.dtype == torch.float16:
+ lib.cquantize_blockwise_fp16(*args)
+ elif A.dtype == torch.bfloat16:
+ lib.cquantize_blockwise_bf16(*args)
+ elif A.dtype == torch.float32:
+ lib.cquantize_blockwise_fp32(*args)
+ else:
+ raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+
+ return out, absmax
+
+
+@register_kernel("bitsandbytes::dequantize_blockwise", "cuda")
+def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
+ out = torch.empty_like(A, dtype=dtype)
+ _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::dequantize_blockwise.out", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
+ torch._check(out.shape == A.shape, lambda: f"Expected out.shape == {A.shape}, got {out.shape}")
+ _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
+
+
+def _dequantize_blockwise_impl(
+ A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
+) -> None:
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
+ torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
+ torch._check(
+ dtype in [torch.float16, torch.bfloat16, torch.float32],
+ lambda: f"Blockwise dequantization only supports 16bit/32bit floating types, got {dtype}",
+ )
+
+ with _cuda_device_of(A):
+ args = (
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(blocksize),
+ ct.c_int(A.numel()),
+ _get_tensor_stream(A),
+ )
+
+ if dtype == torch.float16:
+ lib.cdequantize_blockwise_fp16(*args)
+ elif dtype == torch.bfloat16:
+ lib.cdequantize_blockwise_bf16(*args)
+ elif dtype == torch.float32:
+ lib.cdequantize_blockwise_fp32(*args)
+
+
+@register_kernel("bitsandbytes::quantize_4bit", "cuda")
+def _(
+ A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
+) -> tuple[torch.Tensor, torch.Tensor]:
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
+ torch._check(quant_type in ["fp4", "nf4"])
+ torch._check(
+ A.dtype in [torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"Blockwise 4bit quantization only supports 16/32-bit floats, but got {A.dtype}",
+ )
+
+ n = A.numel()
+ blocks = -(n // -blocksize)
+ absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
+ out = torch.empty(((n + 1) // (quant_storage.itemsize * 2), 1), device=A.device, dtype=quant_storage)
+
+ with _cuda_device_of(A):
+ args = (
+ None,
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int32(blocksize),
+ ct.c_int(n),
+ )
+
+ if A.dtype == torch.bfloat16:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_bf16_fp4(*args)
+ else:
+ lib.cquantize_blockwise_bf16_nf4(*args)
+ elif A.dtype == torch.float16:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_fp16_fp4(*args)
+ else:
+ lib.cquantize_blockwise_fp16_nf4(*args)
+ elif A.dtype == torch.float32:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_fp32_fp4(*args)
+ else:
+ lib.cquantize_blockwise_fp32_nf4(*args)
+
+ return out, absmax
+
+
+@register_kernel("bitsandbytes::dequantize_4bit", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ shape: Sequence[int],
+ dtype: torch.dtype,
+) -> torch.Tensor:
+ out = torch.empty(shape, dtype=dtype, device=A.device)
+ _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::dequantize_4bit.out", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ shape: Sequence[int],
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ torch._check(out.shape == shape, lambda: f"Expected out.shape == {shape}, got {out.shape}")
+ torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
+ _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+
+
+def _dequantize_4bit_impl(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
+ torch._check(quant_type in ["fp4", "nf4"])
+ torch._check(
+ dtype in [torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
+ )
+
+ with _cuda_device_of(A):
+ args = (
+ None,
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(blocksize),
+ ct.c_int(out.numel()),
+ _get_tensor_stream(A),
+ )
+
+ if out.dtype == torch.bfloat16:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_bf16_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_bf16_nf4(*args)
+ elif out.dtype == torch.float16:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_fp16_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_fp16_nf4(*args)
+ elif out.dtype == torch.float32:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_fp32_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_fp32_nf4(*args)
+
+
+@register_kernel("bitsandbytes::gemv_4bit", "cuda")
+def _(
+ A: torch.Tensor, B: torch.Tensor, shapeB: Sequence[int], absmax: torch.Tensor, code: torch.Tensor, blocksize: int
+) -> torch.Tensor:
+ shape = (*A.shape[:-1], shapeB[0])
+ out = torch.empty(shape, device=A.device, dtype=A.dtype)
+ _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::gemv_4bit.out", "cuda")
+def _(
+ A: torch.Tensor,
+ B: torch.Tensor,
+ shapeB: Sequence[int],
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ out: torch.Tensor,
+) -> None:
+ torch._check(
+ out.shape == (*A.shape[:-1], shapeB[0]),
+ lambda: f"Expected out.shape == {(*A.shape[:-1], shapeB[0])}, got {out.shape}",
+ )
+ torch._check(out.dtype == A.dtype, lambda: f"Expected out.dtype == {A.dtype}, got {out.dtype}")
+ _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
+
+
+def _gemv_4bit_impl(
+ A: torch.Tensor,
+ B: torch.Tensor,
+ shapeB: Sequence[int],
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ out: torch.Tensor,
+) -> None:
+ torch._check_is_size(blocksize)
+ torch._check(
+ A.numel() == A.size(-1),
+ lambda: f"A must be a vector with leading dimensions of 1, got {A.shape}",
+ )
+ torch._check(
+ A.dtype in [torch.float16, torch.bfloat16, torch.float32],
+ lambda: f"A must be float16, bfloat16, or float32, got {A.dtype}",
+ )
+ torch._check(
+ B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"B must be backed by storage of type uint8, bfloat16, float16, or float32, got {B.dtype}",
+ )
+ torch._check(absmax.dtype == torch.float32, lambda: f"absmax must be float32, got {absmax.dtype}")
+ torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
+
+ m = ct.c_int32(shapeB[0])
+ n = ct.c_int32(1)
+ k = ct.c_int32(shapeB[1])
+
+ lda = m
+ ldb = ct.c_int32((A.shape[-1] + 1) // 2)
+ ldc = m
+
+ stream = _get_tensor_stream(A)
+
+ with _cuda_device_of(A):
+ if A.dtype == torch.float16:
+ lib.cgemm_4bit_inference_naive_fp16(
+ m,
+ n,
+ k,
+ get_ptr(A),
+ get_ptr(B),
+ get_ptr(absmax),
+ get_ptr(code),
+ get_ptr(out),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
+ )
+ elif A.dtype == torch.bfloat16:
+ lib.cgemm_4bit_inference_naive_bf16(
+ m,
+ n,
+ k,
+ get_ptr(A),
+ get_ptr(B),
+ get_ptr(absmax),
+ get_ptr(code),
+ get_ptr(out),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
+ )
+ elif A.dtype == torch.float32:
+ lib.cgemm_4bit_inference_naive_fp32(
+ m,
+ n,
+ k,
+ get_ptr(A),
+ get_ptr(B),
+ get_ptr(absmax),
+ get_ptr(code),
+ get_ptr(out),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
+ )
From 8863d0e3d55c73478926c9388080750be2e49690 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 28 May 2025 12:22:01 +0530
Subject: [PATCH 40/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 1059 ++++++++++++++---------------
1 file changed, 521 insertions(+), 538 deletions(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index 14878123a..efdef2871 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -1,538 +1,521 @@
-from collections.abc import Sequence
-import ctypes as ct
-from math import prod
-from typing import Optional
-
-import torch
-
-from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
-
-from ..._ops import register_kernel
-from ...cextension import lib, HIP_ENVIRONMENT
-
-
-@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
-def _(A: torch.Tensor, B: torch.Tensor):
- out = torch.empty((*A.shape[:-1], B.shape[0]), device=A.device, dtype=torch.int32)
- return _int8_linear_matmul_impl(A, B, out)
-
-
-@register_kernel("bitsandbytes::int8_linear_matmul.out", "cuda")
-def _(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
- _int8_linear_matmul_impl(A, B, out)
-
-
-def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
- A, B = B, A
-
- shapeA = A.shape
- shapeB = B.shape
-
- torch._check(A.dtype == torch.int8, lambda: "B must be int8")
- torch._check(B.dtype == torch.int8, lambda: "A must be int8")
- torch._check(A.ndim == 2, lambda: "Only two dimensional matrices are supported for argument B")
- torch._check(B.ndim in [2, 3], lambda: "Only two or three dimensional matrices are supported for argument A")
- torch._check(prod(shapeB) > 0, lambda: f"Input tensor dimensions need to be > 0: {shapeB}")
- torch._check(out.dtype == torch.int32)
-
- shapeC = (*shapeB[:-1], shapeA[0])
- torch._check(out.shape == shapeC, lambda: f"Output shape {out.shape} does not match expected shape {shapeC}")
-
- k, m = shapeA
- n = prod(shapeB[:-1])
- lda = shapeA[-1] # Weights (outputs, inputs)
- ldb = shapeB[-1] # Activations (batch, tokens, inputs)
- ldc = shapeC[-1] # Output (batch, tokens, outputs)
-
- torch._check(
- lda == ldb,
- lambda: f"int8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}",
- )
-
- # cuBLASLt does not support int8 matmul with inner dimensions that are not divisible by 4.
- # We'll fall back to a slower fp32 calculation in this circumstance.
- # Fortunately, this should not be very common.
- if lda % 4 != 0:
- result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
- return out.copy_(result)
-
- with _cuda_device_of(A):
- ctx = CUBLAS_Context.get_instance().get_context(A.device)
- ptrA = get_ptr(A)
- ptrB = get_ptr(B)
- ptrC = get_ptr(out)
- ptrRowScale = None
- m = ct.c_int32(m)
- n = ct.c_int32(n)
- k = ct.c_int32(k)
- lda = ct.c_int32(lda)
- ldb = ct.c_int32(ldb)
- ldc = ct.c_int32(ldc)
- stream = _get_tensor_stream(A)
-
- has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
-
- if has_error:
- if has_error == 100:
- # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
- # TODO: Warn and implement a fallback to fp32 compute?
- raise NotImplementedError("int8_linear_matmul not implemented!")
- else:
- raise RuntimeError(
- f"cublasLt ran into an error!\n\t{shapeA=}, {shapeB=}, {shapeC=}\n\t{(lda, ldb, ldc)=}\n\t{(m, n, k)=}"
- )
-
- return out
-
-
-@register_kernel("bitsandbytes::int8_mm_dequant", "cuda")
-def _(
- A: torch.Tensor,
- row_stats: torch.Tensor,
- col_stats: torch.Tensor,
- dtype: Optional[torch.dtype] = None,
- bias: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
- torch._check(A.dtype == torch.int32, lambda: f"A must be int32, got {A.dtype}")
- torch._check(row_stats.dtype == torch.float32, lambda: f"row_stats must be float32, got {row_stats.dtype}")
- torch._check(col_stats.dtype == torch.float32, lambda: f"col_stats must be float32, got {col_stats.dtype}")
-
- # Note: cuda kernel only currently supports fp16 output.
- # We'll later cast to desired dtype if needed.
- out = torch.empty_like(A, dtype=torch.float16)
-
- ptrA = get_ptr(A)
- ptrOut = get_ptr(out)
- ptrRowStats = get_ptr(row_stats)
- ptrColStats = get_ptr(col_stats)
- numRows = ct.c_int32(prod(A.shape[:-1]))
- numCols = ct.c_int32(A.shape[-1])
-
- # Note: fused bias in the kernel is only supported for fp16
- # TODO(matthewdouglas): Consider supporting bf16 fused bias
- ptrBias = get_ptr(bias) if bias is not None and bias.dtype == torch.float16 else None
-
- with _cuda_device_of(A):
- lib.cdequant_mm_int32_fp16(
- ptrA, ptrRowStats, ptrColStats, ptrOut, ptrBias, numRows, numCols, _get_tensor_stream(A)
- )
-
- # Add bias separately if not fused in kernel
- if bias is not None and bias.dtype != torch.float16:
- out.add_(bias)
-
- return out.to(dtype or torch.float16)
-
-
-@register_kernel("bitsandbytes::int8_vectorwise_quant", "cuda")
-def _(A: torch.Tensor, threshold=0.0):
- torch._check(A.dtype == torch.float16, lambda: f"A must be float16, got {A.dtype}")
- torch._check(threshold >= 0.0, lambda: "threshold must be non-negative")
-
- rows = prod(A.shape[:-1])
- cols = A.shape[-1]
-
- row_stats = torch.empty(rows, device=A.device, dtype=torch.float32)
- out_row = torch.empty(A.shape, device=A.device, dtype=torch.int8)
-
- outlier_cols = None
-
- if threshold > 0.0:
- # TODO we could improve perf of this
- outliers = A.abs() >= threshold
-
- if outliers.any():
- outlier_cols = torch.argwhere(outliers.any(dim=0)).view(-1)
- else:
- # Needed for torch.compile support.
- outlier_cols = torch.empty(0, device=A.device, dtype=torch.int64)
-
- with _cuda_device_of(A):
- lib.cint8_vector_quant(
- get_ptr(A),
- get_ptr(out_row),
- get_ptr(row_stats),
- ct.c_float(threshold),
- ct.c_int32(rows),
- ct.c_int32(cols),
- _get_tensor_stream(A),
- )
-
- # Zero out values from outlier columns across all rows.
- # The kernel will handle this for outliers themselves, so we can optimize for rows=1.
- if rows > 1 and outlier_cols is not None:
- out_row[:, outlier_cols] = 0
-
- return out_row, row_stats, outlier_cols
-
-
-@register_kernel("bitsandbytes::int8_double_quant", "cuda")
-def _(
- A: torch.Tensor,
- threshold=0.0,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
- # Use CUDA kernel for rowwise and COO tensor
- quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant.default(
- A,
- threshold=threshold,
- )
-
- # PyTorch impl for colwise
- col_stats, outlier_mask = _get_col_absmax(A, threshold=threshold)
- if threshold > 0.0 and outlier_mask is not None:
- A = A.masked_fill(outlier_mask, 0.0)
- quant_col = torch.round(A.mul(127.0) / col_stats.unsqueeze(0)).to(torch.int8)
-
- return quant_row, quant_col, row_stats, col_stats.flatten().float(), outlier_cols
-
-
-def _get_col_absmax(
- A: torch.Tensor,
- threshold=0.0,
-) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
- torch._check(A.is_floating_point())
-
- outlier_mask = None
-
- absA = A.abs().view(-1, A.shape[-1])
-
- if threshold > 0.0:
- # Filter outliers from stats when enabled
- outlier_mask = absA >= threshold
- absA.masked_fill_(outlier_mask, 0.0)
-
- # shape [cols]; unsqueeze(0) gives [1,cols]
- col_stats = absA.amax(dim=0, keepdim=False).float()
-
- return col_stats, outlier_mask
-
-
-@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
-def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
- torch._check_is_size(blocksize)
-
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
- torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
-
- n = A.numel()
- blocks = -(n // -blocksize)
- absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
- out = torch.empty_like(A, dtype=torch.uint8)
-
- with _cuda_device_of(A):
- args = (
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int32(blocksize),
- ct.c_int(A.numel()),
- )
-
- if A.dtype == torch.float16:
- lib.cquantize_blockwise_fp16(*args)
- elif A.dtype == torch.bfloat16:
- lib.cquantize_blockwise_bf16(*args)
- elif A.dtype == torch.float32:
- lib.cquantize_blockwise_fp32(*args)
- else:
- raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
-
- return out, absmax
-
-
-@register_kernel("bitsandbytes::dequantize_blockwise", "cuda")
-def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
- out = torch.empty_like(A, dtype=dtype)
- _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::dequantize_blockwise.out", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
- torch._check(out.shape == A.shape, lambda: f"Expected out.shape == {A.shape}, got {out.shape}")
- _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
-
-
-def _dequantize_blockwise_impl(
- A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
-) -> None:
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
- torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
- torch._check(
- dtype in [torch.float16, torch.bfloat16, torch.float32],
- lambda: f"Blockwise dequantization only supports 16bit/32bit floating types, got {dtype}",
- )
-
- with _cuda_device_of(A):
- args = (
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int(blocksize),
- ct.c_int(A.numel()),
- _get_tensor_stream(A),
- )
-
- if dtype == torch.float16:
- lib.cdequantize_blockwise_fp16(*args)
- elif dtype == torch.bfloat16:
- lib.cdequantize_blockwise_bf16(*args)
- elif dtype == torch.float32:
- lib.cdequantize_blockwise_fp32(*args)
-
-
-@register_kernel("bitsandbytes::quantize_4bit", "cuda")
-def _(
- A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
-) -> tuple[torch.Tensor, torch.Tensor]:
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
- torch._check(quant_type in ["fp4", "nf4"])
- torch._check(
- A.dtype in [torch.bfloat16, torch.float16, torch.float32],
- lambda: f"Blockwise 4bit quantization only supports 16/32-bit floats, but got {A.dtype}",
- )
-
- n = A.numel()
- blocks = -(n // -blocksize)
- absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
- out = torch.empty(((n + 1) // (quant_storage.itemsize * 2), 1), device=A.device, dtype=quant_storage)
-
- with _cuda_device_of(A):
- args = (
- None,
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int32(blocksize),
- ct.c_int(n),
- )
-
- if A.dtype == torch.bfloat16:
- if quant_type == "fp4":
- lib.cquantize_blockwise_bf16_fp4(*args)
- else:
- lib.cquantize_blockwise_bf16_nf4(*args)
- elif A.dtype == torch.float16:
- if quant_type == "fp4":
- lib.cquantize_blockwise_fp16_fp4(*args)
- else:
- lib.cquantize_blockwise_fp16_nf4(*args)
- elif A.dtype == torch.float32:
- if quant_type == "fp4":
- lib.cquantize_blockwise_fp32_fp4(*args)
- else:
- lib.cquantize_blockwise_fp32_nf4(*args)
-
- return out, absmax
-
-
-@register_kernel("bitsandbytes::dequantize_4bit", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- shape: Sequence[int],
- dtype: torch.dtype,
-) -> torch.Tensor:
- out = torch.empty(shape, dtype=dtype, device=A.device)
- _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::dequantize_4bit.out", "cuda")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- shape: Sequence[int],
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- torch._check(out.shape == shape, lambda: f"Expected out.shape == {shape}, got {out.shape}")
- torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
- _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
-
-
-def _dequantize_4bit_impl(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- dtype: torch.dtype,
- out: torch.Tensor,
-) -> None:
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
- torch._check(quant_type in ["fp4", "nf4"])
- torch._check(
- dtype in [torch.bfloat16, torch.float16, torch.float32],
- lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
- )
-
- with _cuda_device_of(A):
- args = (
- None,
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_int(blocksize),
- ct.c_int(out.numel()),
- _get_tensor_stream(A),
- )
-
- if out.dtype == torch.bfloat16:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_bf16_fp4(*args)
- else:
- lib.cdequantize_blockwise_bf16_nf4(*args)
- elif out.dtype == torch.float16:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_fp16_fp4(*args)
- else:
- lib.cdequantize_blockwise_fp16_nf4(*args)
- elif out.dtype == torch.float32:
- if quant_type == "fp4":
- lib.cdequantize_blockwise_fp32_fp4(*args)
- else:
- lib.cdequantize_blockwise_fp32_nf4(*args)
-
-
-@register_kernel("bitsandbytes::gemv_4bit", "cuda")
-def _(
- A: torch.Tensor, B: torch.Tensor, shapeB: Sequence[int], absmax: torch.Tensor, code: torch.Tensor, blocksize: int
-) -> torch.Tensor:
- shape = (*A.shape[:-1], shapeB[0])
- out = torch.empty(shape, device=A.device, dtype=A.dtype)
- _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
- return out
-
-
-@register_kernel("bitsandbytes::gemv_4bit.out", "cuda")
-def _(
- A: torch.Tensor,
- B: torch.Tensor,
- shapeB: Sequence[int],
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- out: torch.Tensor,
-) -> None:
- torch._check(
- out.shape == (*A.shape[:-1], shapeB[0]),
- lambda: f"Expected out.shape == {(*A.shape[:-1], shapeB[0])}, got {out.shape}",
- )
- torch._check(out.dtype == A.dtype, lambda: f"Expected out.dtype == {A.dtype}, got {out.dtype}")
- _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
-
-
-def _gemv_4bit_impl(
- A: torch.Tensor,
- B: torch.Tensor,
- shapeB: Sequence[int],
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
- out: torch.Tensor,
-) -> None:
- torch._check_is_size(blocksize)
- torch._check(
- A.numel() == A.size(-1),
- lambda: f"A must be a vector with leading dimensions of 1, got {A.shape}",
- )
- torch._check(
- A.dtype in [torch.float16, torch.bfloat16, torch.float32],
- lambda: f"A must be float16, bfloat16, or float32, got {A.dtype}",
- )
- torch._check(
- B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32],
- lambda: f"B must be backed by storage of type uint8, bfloat16, float16, or float32, got {B.dtype}",
- )
- torch._check(absmax.dtype == torch.float32, lambda: f"absmax must be float32, got {absmax.dtype}")
- torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
-
- m = ct.c_int32(shapeB[0])
- n = ct.c_int32(1)
- k = ct.c_int32(shapeB[1])
-
- lda = m
- ldb = ct.c_int32((A.shape[-1] + 1) // 2)
- ldc = m
-
- stream = _get_tensor_stream(A)
-
- with _cuda_device_of(A):
- if A.dtype == torch.float16:
- lib.cgemm_4bit_inference_naive_fp16(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
- elif A.dtype == torch.bfloat16:
- lib.cgemm_4bit_inference_naive_bf16(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
- elif A.dtype == torch.float32:
- lib.cgemm_4bit_inference_naive_fp32(
- m,
- n,
- k,
- get_ptr(A),
- get_ptr(B),
- get_ptr(absmax),
- get_ptr(code),
- get_ptr(out),
- lda,
- ldb,
- ldc,
- ct.c_int32(blocksize),
- stream,
- )
+from collections.abc import Sequence
+import ctypes as ct
+from math import prod
+from typing import Optional
+
+import torch
+
+from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
+
+from ..._ops import register_kernel
+from ...cextension import lib
+
+
+@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
+def _(A: torch.Tensor, B: torch.Tensor):
+ out = torch.empty((*A.shape[:-1], B.shape[0]), device=A.device, dtype=torch.int32)
+ return _int8_linear_matmul_impl(A, B, out)
+
+
+@register_kernel("bitsandbytes::int8_linear_matmul.out", "cuda")
+def _(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
+ _int8_linear_matmul_impl(A, B, out)
+
+
+def _int8_linear_matmul_impl(A: torch.Tensor, B: torch.Tensor, out: torch.Tensor):
+ A, B = B, A
+
+ shapeA = A.shape
+ shapeB = B.shape
+
+ torch._check(A.dtype == torch.int8, lambda: "B must be int8")
+ torch._check(B.dtype == torch.int8, lambda: "A must be int8")
+ torch._check(A.ndim == 2, lambda: "Only two dimensional matrices are supported for argument B")
+ torch._check(B.ndim in [2, 3], lambda: "Only two or three dimensional matrices are supported for argument A")
+ torch._check(prod(shapeB) > 0, lambda: f"Input tensor dimensions need to be > 0: {shapeB}")
+ torch._check(out.dtype == torch.int32)
+
+ shapeC = (*shapeB[:-1], shapeA[0])
+ torch._check(out.shape == shapeC, lambda: f"Output shape {out.shape} does not match expected shape {shapeC}")
+
+ k, m = shapeA
+ n = prod(shapeB[:-1])
+ lda = shapeA[-1] # Weights (outputs, inputs)
+ ldb = shapeB[-1] # Activations (batch, tokens, inputs)
+ ldc = shapeC[-1] # Output (batch, tokens, outputs)
+
+ torch._check(
+ lda == ldb,
+ lambda: f"int8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = {shapeB} @ {shapeA}",
+ )
+
+ # cuBLASLt does not support int8 matmul with inner dimensions that are not divisible by 4.
+ # We'll fall back to a slower fp32 calculation in this circumstance.
+ # Fortunately, this should not be very common.
+ if lda % 4 != 0:
+ result = torch.matmul(B.float(), A.float().t()).to(torch.int32)
+ return out.copy_(result)
+
+ with _cuda_device_of(A):
+ ctx = CUBLAS_Context.get_instance().get_context(A.device)
+ ptrA = get_ptr(A)
+ ptrB = get_ptr(B)
+ ptrC = get_ptr(out)
+ ptrRowScale = None
+ m = ct.c_int32(m)
+ n = ct.c_int32(n)
+ k = ct.c_int32(k)
+ lda = ct.c_int32(lda)
+ ldb = ct.c_int32(ldb)
+ ldc = ct.c_int32(ldc)
+ stream = _get_tensor_stream(A)
+
+ has_error = lib.cigemmlt_32(ctx, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc, stream)
+
+ if has_error:
+ if has_error == 100:
+ # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu`
+ # TODO: Warn and implement a fallback to fp32 compute?
+ raise NotImplementedError("int8_linear_matmul not implemented!")
+ else:
+ raise RuntimeError(
+ f"cublasLt ran into an error!\n\t{shapeA=}, {shapeB=}, {shapeC=}\n\t{(lda, ldb, ldc)=}\n\t{(m, n, k)=}"
+ )
+
+ return out
+
+
+@register_kernel("bitsandbytes::int8_mm_dequant", "cuda")
+def _(
+ A: torch.Tensor,
+ row_stats: torch.Tensor,
+ col_stats: torch.Tensor,
+ dtype: Optional[torch.dtype] = None,
+ bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+ torch._check(A.dtype == torch.int32, lambda: f"A must be int32, got {A.dtype}")
+ torch._check(row_stats.dtype == torch.float32, lambda: f"row_stats must be float32, got {row_stats.dtype}")
+ torch._check(col_stats.dtype == torch.float32, lambda: f"col_stats must be float32, got {col_stats.dtype}")
+
+ # Note: cuda kernel only currently supports fp16 output.
+ # We'll later cast to desired dtype if needed.
+ out = torch.empty_like(A, dtype=torch.float16)
+
+ ptrA = get_ptr(A)
+ ptrOut = get_ptr(out)
+ ptrRowStats = get_ptr(row_stats)
+ ptrColStats = get_ptr(col_stats)
+ numRows = ct.c_int32(prod(A.shape[:-1]))
+ numCols = ct.c_int32(A.shape[-1])
+
+ # Note: fused bias in the kernel is only supported for fp16
+ # TODO(matthewdouglas): Consider supporting bf16 fused bias
+ ptrBias = get_ptr(bias) if bias is not None and bias.dtype == torch.float16 else None
+
+ with _cuda_device_of(A):
+ lib.cdequant_mm_int32_fp16(
+ ptrA, ptrRowStats, ptrColStats, ptrOut, ptrBias, numRows, numCols, _get_tensor_stream(A)
+ )
+
+ # Add bias separately if not fused in kernel
+ if bias is not None and bias.dtype != torch.float16:
+ out.add_(bias)
+
+ return out.to(dtype or torch.float16)
+
+
+@register_kernel("bitsandbytes::int8_vectorwise_quant", "cuda")
+def _(A: torch.Tensor, threshold=0.0):
+ torch._check(A.dtype == torch.float16, lambda: f"A must be float16, got {A.dtype}")
+ torch._check(threshold >= 0.0, lambda: "threshold must be non-negative")
+
+ rows = prod(A.shape[:-1])
+ cols = A.shape[-1]
+
+ row_stats = torch.empty(rows, device=A.device, dtype=torch.float32)
+ out_row = torch.empty(A.shape, device=A.device, dtype=torch.int8)
+
+ outlier_cols = None
+
+ if threshold > 0.0:
+ # TODO we could improve perf of this
+ outliers = A.abs() >= threshold
+
+ if outliers.any():
+ outlier_cols = torch.argwhere(outliers.any(dim=0)).view(-1)
+ else:
+ # Needed for torch.compile support.
+ outlier_cols = torch.empty(0, device=A.device, dtype=torch.int64)
+
+ with _cuda_device_of(A):
+ lib.cint8_vector_quant(
+ get_ptr(A),
+ get_ptr(out_row),
+ get_ptr(row_stats),
+ ct.c_float(threshold),
+ ct.c_int32(rows),
+ ct.c_int32(cols),
+ _get_tensor_stream(A),
+ )
+
+ # Zero out values from outlier columns across all rows.
+ # The kernel will handle this for outliers themselves, so we can optimize for rows=1.
+ if rows > 1 and outlier_cols is not None:
+ out_row[:, outlier_cols] = 0
+
+ return out_row, row_stats, outlier_cols
+
+
+@register_kernel("bitsandbytes::int8_double_quant", "cuda")
+def _(
+ A: torch.Tensor,
+ threshold=0.0,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+ # Use CUDA kernel for rowwise and COO tensor
+ quant_row, row_stats, outlier_cols = torch.ops.bitsandbytes.int8_vectorwise_quant.default(
+ A,
+ threshold=threshold,
+ )
+
+ # PyTorch impl for colwise
+ col_stats, outlier_mask = _get_col_absmax(A, threshold=threshold)
+ if threshold > 0.0 and outlier_mask is not None:
+ A = A.masked_fill(outlier_mask, 0.0)
+ quant_col = torch.round(A.mul(127.0) / col_stats.unsqueeze(0)).to(torch.int8)
+
+ return quant_row, quant_col, row_stats, col_stats.flatten().float(), outlier_cols
+
+
+def _get_col_absmax(
+ A: torch.Tensor,
+ threshold=0.0,
+) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+ torch._check(A.is_floating_point())
+
+ outlier_mask = None
+
+ absA = A.abs().view(-1, A.shape[-1])
+
+ if threshold > 0.0:
+ # Filter outliers from stats when enabled
+ outlier_mask = absA >= threshold
+ absA.masked_fill_(outlier_mask, 0.0)
+
+ # shape [cols]; unsqueeze(0) gives [1,cols]
+ col_stats = absA.amax(dim=0, keepdim=False).float()
+
+ return col_stats, outlier_mask
+
+
+@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
+def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
+ torch._check_is_size(blocksize)
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
+
+ n = A.numel()
+ blocks = -(n // -blocksize)
+ absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
+ out = torch.empty_like(A, dtype=torch.uint8)
+
+ with _cuda_device_of(A):
+ args = (
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int32(blocksize),
+ ct.c_int(A.numel()),
+ )
+
+ if A.dtype == torch.float16:
+ lib.cquantize_blockwise_fp16(*args)
+ elif A.dtype == torch.bfloat16:
+ lib.cquantize_blockwise_bf16(*args)
+ elif A.dtype == torch.float32:
+ lib.cquantize_blockwise_fp32(*args)
+ else:
+ raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+
+ return out, absmax
+
+
+@register_kernel("bitsandbytes::dequantize_blockwise", "cuda")
+def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
+ out = torch.empty_like(A, dtype=dtype)
+ _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::dequantize_blockwise.out", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
+ torch._check(out.shape == A.shape, lambda: f"Expected out.shape == {A.shape}, got {out.shape}")
+ _dequantize_blockwise_impl(A, absmax, code, blocksize, dtype, out=out)
+
+
+def _dequantize_blockwise_impl(
+ A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
+) -> None:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
+ torch._check(
+ dtype in [torch.float16, torch.bfloat16, torch.float32],
+ lambda: f"Blockwise dequantization only supports 16bit/32bit floating types, got {dtype}",
+ )
+
+ with _cuda_device_of(A):
+ args = (
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(blocksize),
+ ct.c_int(A.numel()),
+ _get_tensor_stream(A),
+ )
+
+ if dtype == torch.float16:
+ lib.cdequantize_blockwise_fp16(*args)
+ elif dtype == torch.bfloat16:
+ lib.cdequantize_blockwise_bf16(*args)
+ elif dtype == torch.float32:
+ lib.cdequantize_blockwise_fp32(*args)
+
+
+@register_kernel("bitsandbytes::quantize_4bit", "cuda")
+def _(
+ A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
+) -> tuple[torch.Tensor, torch.Tensor]:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ torch._check(quant_type in ["fp4", "nf4"])
+ torch._check(
+ A.dtype in [torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"Blockwise 4bit quantization only supports 16/32-bit floats, but got {A.dtype}",
+ )
+
+ n = A.numel()
+ blocks = -(n // -blocksize)
+ absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
+ out = torch.empty(((n + 1) // (quant_storage.itemsize * 2), 1), device=A.device, dtype=quant_storage)
+
+ with _cuda_device_of(A):
+ args = (
+ None,
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int32(blocksize),
+ ct.c_int(n),
+ )
+
+ if A.dtype == torch.bfloat16:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_bf16_fp4(*args)
+ else:
+ lib.cquantize_blockwise_bf16_nf4(*args)
+ elif A.dtype == torch.float16:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_fp16_fp4(*args)
+ else:
+ lib.cquantize_blockwise_fp16_nf4(*args)
+ elif A.dtype == torch.float32:
+ if quant_type == "fp4":
+ lib.cquantize_blockwise_fp32_fp4(*args)
+ else:
+ lib.cquantize_blockwise_fp32_nf4(*args)
+
+ return out, absmax
+
+
+@register_kernel("bitsandbytes::dequantize_4bit", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ shape: Sequence[int],
+ dtype: torch.dtype,
+) -> torch.Tensor:
+ out = torch.empty(shape, dtype=dtype, device=A.device)
+ _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::dequantize_4bit.out", "cuda")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ shape: Sequence[int],
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ torch._check(out.shape == shape, lambda: f"Expected out.shape == {shape}, got {out.shape}")
+ torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
+ _dequantize_4bit_impl(A, absmax, blocksize, quant_type, dtype, out=out)
+
+
+def _dequantize_4bit_impl(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ dtype: torch.dtype,
+ out: torch.Tensor,
+) -> None:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ torch._check(quant_type in ["fp4", "nf4"])
+ torch._check(
+ dtype in [torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
+ )
+
+ with _cuda_device_of(A):
+ args = (
+ None,
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_int(blocksize),
+ ct.c_int(out.numel()),
+ _get_tensor_stream(A),
+ )
+
+ if out.dtype == torch.bfloat16:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_bf16_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_bf16_nf4(*args)
+ elif out.dtype == torch.float16:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_fp16_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_fp16_nf4(*args)
+ elif out.dtype == torch.float32:
+ if quant_type == "fp4":
+ lib.cdequantize_blockwise_fp32_fp4(*args)
+ else:
+ lib.cdequantize_blockwise_fp32_nf4(*args)
+
+
+@register_kernel("bitsandbytes::gemv_4bit", "cuda")
+def _(
+ A: torch.Tensor, B: torch.Tensor, shapeB: Sequence[int], absmax: torch.Tensor, code: torch.Tensor, blocksize: int
+) -> torch.Tensor:
+ shape = (*A.shape[:-1], shapeB[0])
+ out = torch.empty(shape, device=A.device, dtype=A.dtype)
+ _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
+ return out
+
+
+@register_kernel("bitsandbytes::gemv_4bit.out", "cuda")
+def _(
+ A: torch.Tensor,
+ B: torch.Tensor,
+ shapeB: Sequence[int],
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ out: torch.Tensor,
+) -> None:
+ torch._check(
+ out.shape == (*A.shape[:-1], shapeB[0]),
+ lambda: f"Expected out.shape == {(*A.shape[:-1], shapeB[0])}, got {out.shape}",
+ )
+ torch._check(out.dtype == A.dtype, lambda: f"Expected out.dtype == {A.dtype}, got {out.dtype}")
+ _gemv_4bit_impl(A, B, shapeB, absmax, code, blocksize, out=out)
+
+
+def _gemv_4bit_impl(
+ A: torch.Tensor,
+ B: torch.Tensor,
+ shapeB: Sequence[int],
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ out: torch.Tensor,
+) -> None:
+ torch._check_is_size(blocksize)
+ torch._check(
+ A.numel() == A.size(-1),
+ lambda: f"A must be a vector with leading dimensions of 1, got {A.shape}",
+ )
+ torch._check(
+ A.dtype in [torch.float16, torch.bfloat16, torch.float32],
+ lambda: f"A must be float16, bfloat16, or float32, got {A.dtype}",
+ )
+ torch._check(
+ B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"B must be backed by storage of type uint8, bfloat16, float16, or float32, got {B.dtype}",
+ )
+ torch._check(absmax.dtype == torch.float32, lambda: f"absmax must be float32, got {absmax.dtype}")
+ torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
+
+ m = ct.c_int32(shapeB[0])
+ n = ct.c_int32(1)
+ k = ct.c_int32(shapeB[1])
+
+ lda = m
+ ldb = ct.c_int32((A.shape[-1] + 1) // 2)
+ ldc = m
+
+ stream = _get_tensor_stream(A)
+
+ with _cuda_device_of(A):
+ if A.dtype == torch.float16:
+ lib.cgemm_4bit_inference_naive_fp16(
+ m,
+ n,
+ k,
+ get_ptr(A),
+ get_ptr(B),
+ get_ptr(absmax),
+ get_ptr(code),
+ get_ptr(out),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
+ )
+ elif A.dtype == torch.bfloat16:
+ lib.cgemm_4bit_inference_naive_bf16(
+ m,
+ n,
+ k,
+ get_ptr(A),
+ get_ptr(B),
+ get_ptr(absmax),
+ get_ptr(code),
+ get_ptr(out),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
+ )
+ elif A.dtype == torch.float32:
+ lib.cgemm_4bit_inference_naive_fp32(
+ m,
+ n,
+ k,
+ get_ptr(A),
+ get_ptr(B),
+ get_ptr(absmax),
+ get_ptr(code),
+ get_ptr(out),
+ lda,
+ ldb,
+ ldc,
+ ct.c_int32(blocksize),
+ stream,
+ )
From d1a5e8dec4e212e5c722d884809d5645c4772a1b Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 28 May 2025 12:35:33 +0530
Subject: [PATCH 41/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 27 ++++++++++++++++++++++-----
1 file changed, 22 insertions(+), 5 deletions(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index efdef2871..fd7b7b9a2 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -8,7 +8,7 @@
from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
from ..._ops import register_kernel
-from ...cextension import lib
+from ...cextension import lib, HIP_ENVIRONMENT
@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
@@ -210,7 +210,12 @@ def _get_col_absmax(
@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
torch._check_is_size(blocksize)
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
n = A.numel()
@@ -264,7 +269,11 @@ def _(
def _dequantize_blockwise_impl(
A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
) -> None:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
torch._check(
dtype in [torch.float16, torch.bfloat16, torch.float32],
@@ -294,7 +303,11 @@ def _dequantize_blockwise_impl(
def _(
A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
) -> tuple[torch.Tensor, torch.Tensor]:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
torch._check(quant_type in ["fp4", "nf4"])
torch._check(
A.dtype in [torch.bfloat16, torch.float16, torch.float32],
@@ -372,7 +385,11 @@ def _dequantize_4bit_impl(
dtype: torch.dtype,
out: torch.Tensor,
) -> None:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
torch._check(quant_type in ["fp4", "nf4"])
torch._check(
dtype in [torch.bfloat16, torch.float16, torch.float32],
From 843ea338f968e06d586ac70c68e70b3a2c56c228 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 28 May 2025 12:39:54 +0530
Subject: [PATCH 42/85] Update functional.py
---
bitsandbytes/functional.py | 316 +++++++++++++++++--------------------
1 file changed, 147 insertions(+), 169 deletions(-)
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 2ae977e7a..b0092ffd1 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -15,7 +15,7 @@
from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
-from .cextension import lib, HIP_ENVIRONMENT
+from .cextension import lib
name2qmap = {}
@@ -719,159 +719,152 @@ def __eq__(self, other):
)
-def quantize_blockwise(
- A: torch.Tensor,
- code: Optional[torch.Tensor] = None,
- absmax: Optional[torch.Tensor] = None,
- out: Optional[torch.Tensor] = None,
- blocksize=4096,
- nested=False,
-) -> tuple[torch.Tensor, QuantState]:
- """Quantize a tensor in blocks of values.
- The input tensor is quantized by dividing it into blocks of `blocksize` values.
- The the absolute maximum value within these blocks is calculated for scaling
- the non-linear quantization.
- Args:
- A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
- code (`torch.Tensor`, *optional*):
- A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
- For more details, see (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561].
- absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
- out (`torch.Tensor`, *optional*): A tensor to use to store the result.
- blocksize (`int`, *optional*):
- The size of the blocks. Defaults to 4096.
- Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
- nested (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.
- Raises:
- ValueError: Raised when the input data type is not supported.
- Returns:
- `Tuple[torch.Tensor, QuantState]`: A tuple containing the quantization results.
- - `torch.Tensor`: The quantized tensor.
- - [`QuantState`]: The state object used to undo the quantization.
- """
-
- if code is None:
- if "dynamic" not in name2qmap:
- name2qmap["dynamic"] = create_dynamic_map().to(A.device)
- code = name2qmap["dynamic"]
-
- if HIP_ENVIRONMENT:
- assert blocksize in [4096, 2048, 1024, 512, 256, 128]
- else:
- assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
-
- _out, _absmax = torch.ops.bitsandbytes.quantize_blockwise.default(
- A,
- code.to(A.device),
- blocksize,
- )
-
- if nested:
- offset = _absmax.mean()
- _absmax -= offset
- qabsmax, state2 = quantize_blockwise(_absmax, blocksize=blocksize, nested=False)
- quant_state = QuantState(
- absmax=qabsmax,
- code=code,
- blocksize=blocksize,
- dtype=A.dtype,
- offset=offset,
- state2=state2,
- )
- else:
- quant_state = QuantState(absmax=_absmax, code=code.to(A.device), blocksize=blocksize, dtype=A.dtype)
-
- # TODO(matthewdouglas): Deprecate out kwarg
- out = out.copy_(_out) if out is not None else _out
-
- # TODO(matthewdouglas): Deprecate absmax kwarg
- if absmax is not None:
- quant_state.absmax = absmax.copy_(quant_state.absmax)
-
- return out, quant_state
-
-
-def dequantize_blockwise(
- A: torch.Tensor,
- quant_state: Optional[QuantState] = None,
- absmax: Optional[torch.Tensor] = None,
- code: Optional[torch.Tensor] = None,
- out: Optional[torch.Tensor] = None,
- blocksize: int = 4096,
- nested=False,
-) -> torch.Tensor:
- """Dequantize a tensor in blocks of values.
- The input tensor is dequantized by dividing it into blocks of `blocksize` values.
- The the absolute maximum value within these blocks is used for scaling
- the non-linear dequantization.
- Args:
- A (`torch.Tensor`): The quantized input tensor.
- quant_state ([`QuantState`], *optional*):
- The quantization state as returned by [`quantize_blockwise`].
- Required if `absmax` is not provided.
- absmax (`torch.Tensor`, *optional*):
- A tensor containing the scaling values.
- Required if `quant_state` is not provided and ignored otherwise.
- code (`torch.Tensor`, *optional*):
- A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
- For more details, see (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561].
- Ignored when `quant_state` is provided.
- out (`torch.Tensor`, *optional*): A tensor to use to store the result.
- blocksize (`int`, *optional*):
- The size of the blocks. Defaults to 4096.
- Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
- Ignored when `quant_state` is provided.
- Raises:
- ValueError: Raised when the input data type is not supported.
- Returns:
- `torch.Tensor`:
- The dequantized tensor. The datatype is indicated by `quant_state.dtype` and defaults to `torch.float32`.
- """
-
- assert quant_state is not None or absmax is not None
- if code is None and quant_state is None:
- if "dynamic" not in name2qmap:
- name2qmap["dynamic"] = create_dynamic_map().to(A.device)
- code = name2qmap["dynamic"]
-
- if quant_state is None:
- quant_state = QuantState(absmax=absmax, code=code, blocksize=blocksize, dtype=torch.float32)
-
- if HIP_ENVIRONMENT:
- supported_blocksizes = [4096, 2048, 1024, 512, 256, 128]
- else:
- supported_blocksizes = [4096, 2048, 1024, 512, 256, 128, 64]
-
- if quant_state.blocksize not in supported_blocksizes:
- raise ValueError(
- f"The blocksize of {quant_state.blocksize} is not supported. Supported values: {supported_blocksizes}"
- )
-
- absmax = quant_state.absmax
- if quant_state.nested:
- absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
- absmax += quant_state.offset
- if absmax.dtype != torch.float32:
- absmax = absmax.float()
-
- if out is not None:
- torch.ops.bitsandbytes.dequantize_blockwise.out(
- A,
- absmax,
- code.to(A.device),
- blocksize,
- quant_state.dtype,
- out=out,
- )
- return out
-
- return torch.ops.bitsandbytes.dequantize_blockwise.default(
- A,
- absmax,
- quant_state.code.to(A.device),
- quant_state.blocksize,
- quant_state.dtype,
- )
+def quantize_blockwise(
+ A: torch.Tensor,
+ code: Optional[torch.Tensor] = None,
+ absmax: Optional[torch.Tensor] = None,
+ out: Optional[torch.Tensor] = None,
+ blocksize=4096,
+ nested=False,
+) -> tuple[torch.Tensor, QuantState]:
+ """Quantize a tensor in blocks of values.
+
+ The input tensor is quantized by dividing it into blocks of `blocksize` values.
+ The absolute maximum value within each of these blocks is calculated for scaling
+ the non-linear quantization.
+
+ Args:
+ A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
+ code (`torch.Tensor`, *optional*):
+ A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
+ For more details, see [8-Bit Approximations for Parallelism in Deep Learning](https://arxiv.org/abs/1511.04561).
+ absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
+ out (`torch.Tensor`, *optional*): A tensor to use to store the result.
+ blocksize (`int`, *optional*):
+ The size of the blocks. Defaults to 4096.
+ Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
+ nested (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.
+
+ Raises:
+ ValueError: Raised when the input data type is not supported.
+
+ Returns:
+ `Tuple[torch.Tensor, QuantState]`: A tuple containing the quantization results.
+ - `torch.Tensor`: The quantized tensor.
+ - [`QuantState`]: The state object used to undo the quantization.
+ """
+
+ # Fall back to the shared "dynamic" code map, creating and caching it on first use.
+ if code is None:
+ if "dynamic" not in name2qmap:
+ name2qmap["dynamic"] = create_dynamic_map().to(A.device)
+ code = name2qmap["dynamic"]
+
+ _out, _absmax = torch.ops.bitsandbytes.quantize_blockwise.default(
+ A,
+ code.to(A.device),
+ blocksize,
+ )
+
+ if nested:
+ # Subtract the mean from the absmax values (in place), then quantize them with a
+ # second, non-nested blockwise pass. The offset and the inner state are kept on
+ # the returned QuantState.
+ offset = _absmax.mean()
+ _absmax -= offset
+ qabsmax, state2 = quantize_blockwise(_absmax, blocksize=blocksize, nested=False)
+ quant_state = QuantState(
+ absmax=qabsmax,
+ code=code,
+ blocksize=blocksize,
+ dtype=A.dtype,
+ offset=offset,
+ state2=state2,
+ )
+ else:
+ quant_state = QuantState(absmax=_absmax, code=code.to(A.device), blocksize=blocksize, dtype=A.dtype)
+
+ # TODO(matthewdouglas): Deprecate out kwarg
+ # When the caller supplied `out`, the result is copied into it and that tensor is returned.
+ out = out.copy_(_out) if out is not None else _out
+
+ # TODO(matthewdouglas): Deprecate absmax kwarg
+ # When the caller supplied `absmax`, the state's absmax is copied into it and the state points at it.
+ if absmax is not None:
+ quant_state.absmax = absmax.copy_(quant_state.absmax)
+
+ return out, quant_state
+
+
+def dequantize_blockwise(
+ A: torch.Tensor,
+ quant_state: Optional[QuantState] = None,
+ absmax: Optional[torch.Tensor] = None,
+ code: Optional[torch.Tensor] = None,
+ out: Optional[torch.Tensor] = None,
+ blocksize: int = 4096,
+ nested=False,
+) -> torch.Tensor:
+ """Dequantize a tensor in blocks of values.
+
+ The input tensor is dequantized by dividing it into blocks of `blocksize` values.
+ The absolute maximum value within each of these blocks is used for scaling
+ the non-linear dequantization.
+
+ Args:
+ A (`torch.Tensor`): The quantized input tensor.
+ quant_state ([`QuantState`], *optional*):
+ The quantization state as returned by [`quantize_blockwise`].
+ Required if `absmax` is not provided.
+ absmax (`torch.Tensor`, *optional*):
+ A tensor containing the scaling values.
+ Required if `quant_state` is not provided and ignored otherwise.
+ code (`torch.Tensor`, *optional*):
+ A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
+ For more details, see [8-Bit Approximations for Parallelism in Deep Learning](https://arxiv.org/abs/1511.04561).
+ Ignored when `quant_state` is provided.
+ out (`torch.Tensor`, *optional*): A tensor to use to store the result.
+ blocksize (`int`, *optional*):
+ The size of the blocks. Defaults to 4096.
+ Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
+ Ignored when `quant_state` is provided.
+ nested (`bool`, *optional*):
+ Not read by this function; nesting is taken from `quant_state.nested`.
+
+ Raises:
+ ValueError: Raised when the input data type is not supported.
+
+ Returns:
+ `torch.Tensor`:
+ The dequantized tensor. The datatype is indicated by `quant_state.dtype` and defaults to `torch.float32`.
+ """
+
+ assert quant_state is not None or absmax is not None
+ # Only fall back to the cached "dynamic" code map when no state carries a code already.
+ if code is None and quant_state is None:
+ if "dynamic" not in name2qmap:
+ name2qmap["dynamic"] = create_dynamic_map().to(A.device)
+ code = name2qmap["dynamic"]
+
+ # Wrap the loose arguments in a QuantState so the rest of the function has a single source of truth.
+ if quant_state is None:
+ quant_state = QuantState(absmax=absmax, code=code, blocksize=blocksize, dtype=torch.float32)
+
+ absmax = quant_state.absmax
+ if quant_state.nested:
+ # The absmax values were themselves quantized: dequantize them and add the stored offset back.
+ absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
+ absmax += quant_state.offset
+ if absmax.dtype != torch.float32:
+ absmax = absmax.float()
+
+ if out is not None:
+ # Read code and blocksize from the state, exactly like the non-`out` path below.
+ # The raw `code` argument is None whenever a `quant_state` is passed without it,
+ # and the raw `blocksize` argument is documented as ignored in that case.
+ torch.ops.bitsandbytes.dequantize_blockwise.out(
+ A,
+ absmax,
+ quant_state.code.to(A.device),
+ quant_state.blocksize,
+ quant_state.dtype,
+ out=out,
+ )
+ return out
+
+ return torch.ops.bitsandbytes.dequantize_blockwise.default(
+ A,
+ absmax,
+ quant_state.code.to(A.device),
+ quant_state.blocksize,
+ quant_state.dtype,
+ )
def get_4bit_type(typename, device=None, blocksize=64):
@@ -964,8 +957,6 @@ def quantize_fp4(
compress_statistics=False,
quant_storage=torch.uint8,
):
- if HIP_ENVIRONMENT:
- blocksize = 128
return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "fp4", quant_storage)
@@ -977,8 +968,6 @@ def quantize_nf4(
compress_statistics=False,
quant_storage=torch.uint8,
):
- if HIP_ENVIRONMENT:
- blocksize = 128
return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "nf4", quant_storage)
@@ -1014,9 +1003,6 @@ def quantize_4bit(
- `torch.Tensor`: The quantized tensor with packed 4-bit values.
- [`QuantState`]: The state object used to undo the quantization.
"""
- if HIP_ENVIRONMENT:
- blocksize = 128
-
input_shape = A.shape
_out, _absmax = torch.ops.bitsandbytes.quantize_4bit.default(
@@ -1069,8 +1055,6 @@ def dequantize_fp4(
out: Optional[torch.Tensor] = None,
blocksize: int = 64,
) -> torch.Tensor:
- if HIP_ENVIRONMENT:
- blocksize = 128
return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4")
@@ -1081,8 +1065,6 @@ def dequantize_nf4(
out: Optional[torch.Tensor] = None,
blocksize: int = 64,
) -> torch.Tensor:
- if HIP_ENVIRONMENT:
- blocksize = 128
return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4")
@@ -1120,10 +1102,6 @@ def dequantize_4bit(
Returns:
`torch.Tensor`: The dequantized tensor.
"""
-
- if HIP_ENVIRONMENT:
- blocksize = 128
-
if quant_state is None:
assert absmax is not None and out is not None
From d6d2e5f32ffd30070c45f89704b8db20f600b577 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 28 May 2025 12:57:37 +0530
Subject: [PATCH 43/85] Update functional.py
---
bitsandbytes/functional.py | 32 +++++++++++++++++++++++++++++++-
1 file changed, 31 insertions(+), 1 deletion(-)
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index b0092ffd1..959eeb33a 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -15,7 +15,7 @@
from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
-from .cextension import lib
+from .cextension import lib, HIP_ENVIRONMENT
name2qmap = {}
@@ -758,6 +758,11 @@ def quantize_blockwise(
if "dynamic" not in name2qmap:
name2qmap["dynamic"] = create_dynamic_map().to(A.device)
code = name2qmap["dynamic"]
+
+ if HIP_ENVIRONMENT:
+ assert blocksize in [4096, 2048, 1024, 512, 256, 128]
+ else:
+ assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
_out, _absmax = torch.ops.bitsandbytes.quantize_blockwise.default(
A,
@@ -839,6 +844,16 @@ def dequantize_blockwise(
if quant_state is None:
quant_state = QuantState(absmax=absmax, code=code, blocksize=blocksize, dtype=torch.float32)
+
+ if HIP_ENVIRONMENT:
+ supported_blocksizes = [4096, 2048, 1024, 512, 256, 128]
+ else:
+ supported_blocksizes = [4096, 2048, 1024, 512, 256, 128, 64]
+
+ if quant_state.blocksize not in supported_blocksizes:
+ raise ValueError(
+ f"The blocksize of {quant_state.blocksize} is not supported. Supported values: {supported_blocksizes}"
+ )
absmax = quant_state.absmax
if quant_state.nested:
@@ -957,6 +972,8 @@ def quantize_fp4(
compress_statistics=False,
quant_storage=torch.uint8,
):
+ if HIP_ENVIRONMENT:
+ blocksize = 128
return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "fp4", quant_storage)
@@ -968,6 +985,8 @@ def quantize_nf4(
compress_statistics=False,
quant_storage=torch.uint8,
):
+ if HIP_ENVIRONMENT:
+ blocksize = 128
return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "nf4", quant_storage)
@@ -1003,6 +1022,9 @@ def quantize_4bit(
- `torch.Tensor`: The quantized tensor with packed 4-bit values.
- [`QuantState`]: The state object used to undo the quantization.
"""
+ if HIP_ENVIRONMENT:
+ blocksize = 128
+
input_shape = A.shape
_out, _absmax = torch.ops.bitsandbytes.quantize_4bit.default(
@@ -1055,6 +1077,8 @@ def dequantize_fp4(
out: Optional[torch.Tensor] = None,
blocksize: int = 64,
) -> torch.Tensor:
+ if HIP_ENVIRONMENT:
+ blocksize = 128
return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4")
@@ -1065,6 +1089,8 @@ def dequantize_nf4(
out: Optional[torch.Tensor] = None,
blocksize: int = 64,
) -> torch.Tensor:
+ if HIP_ENVIRONMENT:
+ blocksize = 128
return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4")
@@ -1102,6 +1128,10 @@ def dequantize_4bit(
Returns:
`torch.Tensor`: The dequantized tensor.
"""
+
+ if HIP_ENVIRONMENT:
+ blocksize = 128
+
if quant_state is None:
assert absmax is not None and out is not None
From e3f9f21236ac76cac026eacf1da26f15e7a0ad1f Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 28 May 2025 13:23:18 +0530
Subject: [PATCH 44/85] Update functional.py
---
bitsandbytes/functional.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 959eeb33a..f4be0dc2f 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1022,6 +1022,7 @@ def quantize_4bit(
- `torch.Tensor`: The quantized tensor with packed 4-bit values.
- [`QuantState`]: The state object used to undo the quantization.
"""
+
if HIP_ENVIRONMENT:
blocksize = 128
From bc0957daa57fc1364f914c2928bcfb730f97dc9d Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 28 May 2025 17:26:33 +0530
Subject: [PATCH 45/85] Update test_ops.py
---
tests/test_ops.py | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/tests/test_ops.py b/tests/test_ops.py
index 4da1663f0..bb49c7dbb 100644
--- a/tests/test_ops.py
+++ b/tests/test_ops.py
@@ -5,6 +5,7 @@
import bitsandbytes
from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter
+from bitsandbytes.cextension import HIP_ENVIRONMENT
class TestLLMInt8Ops:
@@ -95,7 +96,7 @@ def test_int8_scaled_mm(self, device, dtype, has_bias):
class TestInt8BlockwiseQuantOps:
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
- @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
+ @pytest.mark.parametrize("blocksize", [128, 256, 512] if HIP_ENVIRONMENT else [64, 128, 256, 512])
def test_quantize_blockwise(self, device, dtype, blocksize):
if device == "cpu":
if dtype != torch.float32:
@@ -119,7 +120,7 @@ def test_quantize_blockwise(self, device, dtype, blocksize):
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
- @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
+ @pytest.mark.parametrize("blocksize", [128, 256, 512] if HIP_ENVIRONMENT else [64, 128, 256, 512])
def test_dequantize_blockwise(self, device, dtype, blocksize):
if device == "cpu" and dtype != torch.float32:
pytest.skip("CPU implementation is only available for float32")
@@ -145,7 +146,7 @@ class Test4bitBlockwiseQuantOps:
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
@pytest.mark.parametrize("storage_dtype", [torch.uint8, torch.bfloat16], ids=id_formatter("storage_dtype"))
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
- @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
+ @pytest.mark.parametrize("blocksize", [128, 256, 512] if HIP_ENVIRONMENT else [64, 128, 256, 512])
def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
if device == "cpu" and quant_type != "nf4":
pytest.xfail("CPU implementation is only available for nf4")
@@ -169,7 +170,7 @@ def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
@pytest.mark.parametrize("storage_dtype", [torch.uint8, torch.bfloat16], ids=id_formatter("storage_dtype"))
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
- @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
+ @pytest.mark.parametrize("blocksize", [128, 256, 512] if HIP_ENVIRONMENT else [64, 128, 256, 512])
def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
if device == "cpu":
if quant_type != "nf4":
@@ -206,7 +207,7 @@ def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksi
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
@pytest.mark.parametrize("storage_dtype", [torch.uint8, torch.bfloat16], ids=id_formatter("storage_dtype"))
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
- @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
+ @pytest.mark.parametrize("blocksize", [128, 256, 512] if HIP_ENVIRONMENT else [64, 128, 256, 512])
def test_gemv_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
if device == "cpu":
pytest.xfail("CPU implementation is not available")
From b8247ab109de936bcefb932b7d0ed996168f8445 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 28 May 2025 17:34:22 +0530
Subject: [PATCH 46/85] Update test_functional.py
---
tests/test_functional.py | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 96e77e4f4..3b9b53a24 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -8,6 +8,7 @@
import torch
import bitsandbytes as bnb
+from bitsandbytes.cextension import HIP_ENVIRONMENT
from bitsandbytes import functional as F
from tests.helpers import (
BOOLEAN_TUPLES,
@@ -91,7 +92,7 @@ class Test8BitBlockwiseQuantizeFunctional:
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
@pytest.mark.parametrize("nested", TRUE_FALSE, ids=id_formatter("nested"))
- @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64])
+ @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128] if HIP_ENVIRONMENT else [4096, 2048, 1024, 512, 256, 128, 64] )
@pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed"))
def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, signed):
iters = 100
@@ -147,7 +148,7 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,
@pytest.mark.skipif("cpu" not in get_available_devices(), reason="CPU is required")
@pytest.mark.parametrize("hidden", [128])
- @pytest.mark.parametrize("blocksize", [4096, 16384])
+ @pytest.mark.parametrize("blocksize", [4096] if HIP_ENVIRONMENT else [4096, 16384])
def test_blockwise_cpu_large(self, hidden, blocksize):
diffs = []
reldiffs = []
@@ -1105,7 +1106,7 @@ class TestQuantize4BitFunctional:
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
- @pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096])
+ @pytest.mark.parametrize("blocksize", [128, 256, 512, 1024, 2048, 4096] if HIP_ENVIRONMENT else [64, 128, 256, 512, 1024, 2048, 4096])
def test_4bit_quant(self, device, dtype, quant_type, blocksize):
if device == "cpu" and quant_type != "nf4":
pytest.xfail("fp4 quantization is not supported on CPU")
From 531758a10835e68a10002eb825383a1a0608cb65 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 28 May 2025 20:19:07 +0530
Subject: [PATCH 47/85] Update test_ops.py
---
tests/test_ops.py | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/tests/test_ops.py b/tests/test_ops.py
index bb49c7dbb..a99d080b3 100644
--- a/tests/test_ops.py
+++ b/tests/test_ops.py
@@ -96,7 +96,7 @@ def test_int8_scaled_mm(self, device, dtype, has_bias):
class TestInt8BlockwiseQuantOps:
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
- @pytest.mark.parametrize("blocksize", [128, 256, 512] if HIP_ENVIRONMENT else [64, 128, 256, 512])
+ @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not HIP_ENVIRONMENT else [128, 256, 512])
def test_quantize_blockwise(self, device, dtype, blocksize):
if device == "cpu":
if dtype != torch.float32:
@@ -120,7 +120,7 @@ def test_quantize_blockwise(self, device, dtype, blocksize):
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
- @pytest.mark.parametrize("blocksize", [128, 256, 512] if HIP_ENVIRONMENT else [64, 128, 256, 512])
+ @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not HIP_ENVIRONMENT else [128, 256, 512])
def test_dequantize_blockwise(self, device, dtype, blocksize):
if device == "cpu" and dtype != torch.float32:
pytest.skip("CPU implementation is only available for float32")
@@ -146,7 +146,7 @@ class Test4bitBlockwiseQuantOps:
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
@pytest.mark.parametrize("storage_dtype", [torch.uint8, torch.bfloat16], ids=id_formatter("storage_dtype"))
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
- @pytest.mark.parametrize("blocksize", [128, 256, 512] if HIP_ENVIRONMENT else [64, 128, 256, 512])
+ @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not HIP_ENVIRONMENT else [128, 256, 512])
def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
if device == "cpu" and quant_type != "nf4":
pytest.xfail("CPU implementation is only available for nf4")
@@ -170,7 +170,7 @@ def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
@pytest.mark.parametrize("storage_dtype", [torch.uint8, torch.bfloat16], ids=id_formatter("storage_dtype"))
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
- @pytest.mark.parametrize("blocksize", [128, 256, 512] if HIP_ENVIRONMENT else [64, 128, 256, 512])
+ @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not HIP_ENVIRONMENT else [128, 256, 512])
def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
if device == "cpu":
if quant_type != "nf4":
@@ -207,7 +207,7 @@ def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksi
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
@pytest.mark.parametrize("storage_dtype", [torch.uint8, torch.bfloat16], ids=id_formatter("storage_dtype"))
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
- @pytest.mark.parametrize("blocksize", [128, 256, 512] if HIP_ENVIRONMENT else [64, 128, 256, 512])
+ @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not HIP_ENVIRONMENT else [128, 256, 512])
def test_gemv_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
if device == "cpu":
pytest.xfail("CPU implementation is not available")
From 6d7db8efa3a2d249434378ab09f3e9f5c0d72c26 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 28 May 2025 20:29:23 +0530
Subject: [PATCH 48/85] Update test_functional.py
---
tests/test_functional.py | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 3b9b53a24..4b62c2567 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -92,7 +92,7 @@ class Test8BitBlockwiseQuantizeFunctional:
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
@pytest.mark.parametrize("nested", TRUE_FALSE, ids=id_formatter("nested"))
- @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128] if HIP_ENVIRONMENT else [4096, 2048, 1024, 512, 256, 128, 64] )
+ @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64] if not HIP_ENVIRONMENT else [4096, 2048, 1024, 512, 256, 128] )
@pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed"))
def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, signed):
iters = 100
@@ -148,7 +148,7 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,
@pytest.mark.skipif("cpu" not in get_available_devices(), reason="CPU is required")
@pytest.mark.parametrize("hidden", [128])
- @pytest.mark.parametrize("blocksize", [4096] if HIP_ENVIRONMENT else [4096, 16384])
+ @pytest.mark.parametrize("blocksize", [4096, 16384] if not HIP_ENVIRONMENT else [4096])
def test_blockwise_cpu_large(self, hidden, blocksize):
diffs = []
reldiffs = []
@@ -1106,7 +1106,7 @@ class TestQuantize4BitFunctional:
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
- @pytest.mark.parametrize("blocksize", [128, 256, 512, 1024, 2048, 4096] if HIP_ENVIRONMENT else [64, 128, 256, 512, 1024, 2048, 4096])
+ @pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096] if not HIP_ENVIRONMENT else [128, 256, 512, 1024, 2048, 4096])
def test_4bit_quant(self, device, dtype, quant_type, blocksize):
if device == "cpu" and quant_type != "nf4":
pytest.xfail("fp4 quantization is not supported on CPU")
@@ -1205,7 +1205,10 @@ def test_bench_4bit_dequant(self, quant_type):
# torch.matmul(b, a.t())
# torch.cuda.synchronize()
# print((time.time()-t0)/iters*1e6)
-
+
+ @pytest.mark.skipif(
+ HIP_ENVIRONMENT, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64"
+ )
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}")
@pytest.mark.parametrize("storage_type", ["nf4", "fp4"])
@@ -1369,6 +1372,9 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double
@pytest.mark.parametrize("storage_type", ["nf4", "fp4"], ids=["nf4", "fp4"])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
@pytest.mark.parametrize("double_quant", [False], ids=["DQ_True"])
+ @pytest.mark.skipif(
+ HIP_ENVIRONMENT, reason="this test is not supported on ROCm with gfx90a architecture yet",
+ )
def test_gemv_eye_4bit(self, device, storage_type, dtype, double_quant):
if device == "cpu" and storage_type != "nf4":
pytest.xfail("fp4 quantization is not supported on CPU")
From 632e95b92d9feba37401ede69ad119017b50ae9d Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 28 May 2025 21:05:21 +0530
Subject: [PATCH 49/85] Update test_functional.py
---
tests/test_functional.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 4b62c2567..7ad604d9f 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -1373,7 +1373,7 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
@pytest.mark.parametrize("double_quant", [False], ids=["DQ_True"])
@pytest.mark.skipif(
- HIP_ENVIRONMENT, reason="this test is not supported on ROCm with gfx90a architecture yet",
+ HIP_ENVIRONMENT, reason="this test is not supported on ROCm with gfx90a architecture yet"
)
def test_gemv_eye_4bit(self, device, storage_type, dtype, double_quant):
if device == "cpu" and storage_type != "nf4":
From aaa71d7ecf7fa8b41158c9ec2d023db294b336dc Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Thu, 29 May 2025 00:07:08 +0800
Subject: [PATCH 50/85] Enable CPU/XPU native and ipex path (#1628)
* enable ipex
Signed-off-by: jiqing-feng
* fix cpu 8bit quantization
Signed-off-by: jiqing-feng
* fix int8 and nf4 cpu inference
Signed-off-by: jiqing-feng
* add cpu fp4 and rem
Signed-off-by: jiqing-feng
* fix dequantize nf4 xpu
Signed-off-by: jiqing-feng
* fix ipex op
Signed-off-by: jiqing-feng
* fix dequantize nf4 name
Signed-off-by: jiqing-feng
* fix dequantize nf4 ipex
Signed-off-by: jiqing-feng
* fix matmul8bitfp
Signed-off-by: jiqing-feng
* enable cpu tests
Signed-off-by: jiqing-feng
* fix format
Signed-off-by: jiqing-feng
* fix quantize blockwise output shape
Signed-off-by: jiqing-feng
* fix quant_storage bf16 and gemv cpu
Signed-off-by: jiqing-feng
* fix cpu tests
Signed-off-by: jiqing-feng
* fix xpu tests
Signed-off-by: jiqing-feng
* fix lib
Signed-off-by: jiqing-feng
* skip xpu dequantize blockwise op check
Signed-off-by: jiqing-feng
* fix matmul8bit
Signed-off-by: jiqing-feng
* skip not used function tests
Signed-off-by: jiqing-feng
* fix matmul8bit fp
Signed-off-by: jiqing-feng
* check ipex before MatMul8bitFp
Signed-off-by: jiqing-feng
* update ipex install guide
Signed-off-by: jiqing-feng
* update install guide
Signed-off-by: jiqing-feng
* fix error log
Signed-off-by: jiqing-feng
* fix error log
Signed-off-by: jiqing-feng
* update comment
Signed-off-by: jiqing-feng
* move torch op to default
Signed-off-by: jiqing-feng
* revert ipex check
Signed-off-by: jiqing-feng
* fix code table device
Signed-off-by: jiqing-feng
* fix code table device
Signed-off-by: jiqing-feng
* fix xpu ops
Signed-off-by: jiqing-feng
---------
Signed-off-by: jiqing-feng
---
bitsandbytes/__init__.py | 3 +
bitsandbytes/_ops.py | 21 +++
bitsandbytes/autograd/_functions.py | 73 +++++++++
bitsandbytes/backends/cpu/ops.py | 225 +++++++++-----------------
bitsandbytes/backends/default/ops.py | 159 ++++++++++++++++++
bitsandbytes/backends/utils.py | 57 +++++++
bitsandbytes/backends/xpu/__init__.py | 0
bitsandbytes/backends/xpu/ops.py | 51 ++++++
bitsandbytes/cextension.py | 17 +-
bitsandbytes/functional.py | 79 ++++++++-
bitsandbytes/nn/modules.py | 58 +++++--
bitsandbytes/utils.py | 8 +
docs/source/installation.mdx | 21 ++-
tests/test_autograd.py | 3 -
tests/test_functional.py | 33 +---
tests/test_linear4bit.py | 30 +---
tests/test_modules.py | 15 --
tests/test_ops.py | 26 ++-
18 files changed, 621 insertions(+), 258 deletions(-)
create mode 100755 bitsandbytes/backends/utils.py
create mode 100755 bitsandbytes/backends/xpu/__init__.py
create mode 100755 bitsandbytes/backends/xpu/ops.py
mode change 100644 => 100755 bitsandbytes/functional.py
diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 12088a70c..9a2524953 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -34,6 +34,9 @@
if torch.cuda.is_available():
from .backends.cuda import ops as cuda_ops
+if torch.xpu.is_available():
+ from .backends.xpu import ops as xpu_ops
+
def _import_backends():
"""
diff --git a/bitsandbytes/_ops.py b/bitsandbytes/_ops.py
index 9a3ac46ac..a260852f5 100644
--- a/bitsandbytes/_ops.py
+++ b/bitsandbytes/_ops.py
@@ -4,6 +4,8 @@
import torch
+from .cextension import ipex_cpu, ipex_xpu
+
_IS_TORCH_GTE_24 = False
if hasattr(torch.library, "register_fake"):
@@ -327,3 +329,22 @@ def _(
)
torch._check(out.device == A.device, lambda: f"Expected out.device == {A.device}, got {out.device}")
torch._check(out.dtype == A.dtype, lambda: f"Expected out.dtype == {A.dtype}, got {out.dtype}")
+
+
+if ipex_cpu or ipex_xpu:
+ # Register the dequantize_nf4_ipex implementation
+ torch.library.define(
+ "bitsandbytes::dequantize_nf4_ipex",
+ "(Tensor A, Tensor absmax, int blocksize, int[] shape, ScalarType dtype) -> Tensor",
+ )
+
+ @register_fake("bitsandbytes::dequantize_nf4_ipex")
+ def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ shape: Sequence[int],
+ dtype: torch.dtype,
+ ) -> torch.Tensor:
+ torch._check_is_size(blocksize)
+ return torch.empty(shape, dtype=dtype, device=A.device)
diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index c7ad3a82c..746d6c1ec 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -8,6 +8,7 @@
from typing_extensions import deprecated
import bitsandbytes.functional as F
+from bitsandbytes.functional import ipex_cpu, ipex_xpu
# The inverse transformation for the colTuring and colAmpere format were contributed by Alex Borzunov:
# https://github.com/bigscience-workshop/petals/blob/main/src/petals/utils/linear8bitlt_patch.py
@@ -298,6 +299,63 @@ def backward(ctx: torch.autograd.function.FunctionCtx, grad_output: torch.Tensor
return grad_A, grad_B, None, grad_bias, None
+class MatMul8bitFp(torch.autograd.Function):
+ # For Intel CPU and XPU MatMul8bitFp is much faster (~3x) than MatMul8bitLt in finetune.
+ # Because the MatMul8bitLt has more mechanisms in computing grad.
+ # We don't have fast kernel for quant/dequant 8bit in CPU/XPU, so it's very slow.
+ # We'd like to use dequant + matmul to run finetune with good performance.
+
+ @staticmethod
+ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
+ if state.has_fp16_weights or state.CB is None:
+ has_grad = getattr(B, "grad", None) is not None
+ is_transposed = not B.is_contiguous() and B.shape[0] == B.stride(1)
+ if is_transposed:
+ B = B.contiguous()
+
+ if (state.is_training and not has_grad) or state.CB is None or state.SCB is None:
+ state.reset_grads()
+ state.CB, state.SCB, _ = F.int8_vectorwise_quant(B.to(torch.float16))
+ B = state.CB
+
+ CB = state.CB.data.to(A.dtype).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
+ output = torch.nn.functional.linear(A, CB, bias)
+ # to pass the test: tests/test_modules.py::test_linear8bitlt_no_fp16_weights[2.0-xpu]
+ state.idx = False
+ ctx.state = state
+ ctx.dtype_A = A.dtype
+ ctx.grad_shape = A.shape
+ ctx.A = A
+ ctx.dtype_bias = None if bias is None else bias.dtype
+ return output
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad
+ A = ctx.A
+ state = ctx.state
+ grad_A = grad_B = grad_bias = None
+ if req_gradBias:
+ # compute grad_bias first before changing grad_output dtype
+ grad_bias = grad_output.sum(0, dtype=ctx.dtype_bias)
+
+ # Cast grad_output to fp16
+ if len(grad_output.shape) == 3:
+ grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous()
+
+ if req_gradB:
+ grad_B = torch.matmul(A.t(), grad_output).t()
+
+ if req_gradA:
+ if state.CB is not None:
+ CB = state.CB.to(ctx.dtype_A, copy=True).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
+ grad_A = torch.matmul(grad_output.to(ctx.dtype_A), CB).view(ctx.grad_shape)
+ else:
+ raise Exception("State must contain CB matrix for backward")
+
+ return grad_A, grad_B, None, grad_bias, None
+
+
class MatMul4Bit(torch.autograd.Function):
# forward is the same, but we added the fallback for pre-turing GPUs
# backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None")
@@ -366,6 +424,10 @@ def matmul(
state = state or MatmulLtState()
if threshold > 0.0:
state.threshold = threshold
+ # MatMul8bitLt is slower because no fast kernel for quant/dequant 8bit in CPU/XPU
+ if state.is_training:
+ if (A.device.type == "cpu" and ipex_cpu) or (A.device.type == "xpu" and ipex_xpu):
+ return MatMul8bitFp.apply(A, B, out, bias, state)
return MatMul8bitLt.apply(A, B, out, bias, state)
@@ -378,6 +440,17 @@ def matmul_4bit(
):
assert quant_state is not None
+ if A.device.type in ("cpu", "xpu") and A.requires_grad == False:
+ if getattr(quant_state, "ipex", False):
+ # IPEX CPU will change weight to 4D so don't need transpose
+ B = B.t() if B.dim() == 2 else B
+ out = F.gemv_4bit(A, B, out, state=quant_state)
+ if bias is not None:
+ out += bias
+ return out
+ else:
+ return MatMul4Bit.apply(A, B, out, bias, quant_state)
+
if A.numel() == A.shape[-1] and A.requires_grad == False:
if A.shape[-1] % quant_state.blocksize != 0:
warn(
diff --git a/bitsandbytes/backends/cpu/ops.py b/bitsandbytes/backends/cpu/ops.py
index d5ab9aa88..5f009ea40 100644
--- a/bitsandbytes/backends/cpu/ops.py
+++ b/bitsandbytes/backends/cpu/ops.py
@@ -7,6 +7,7 @@
from ..._ops import register_kernel
from ...cextension import lib
+from ..utils import ipex_cpu
# torch._int_mm for s8@s8->s32 is supported on CPU from torch 2.4+.
# However, we can overflow if we use this without AVX512_VNNI support.
@@ -26,22 +27,42 @@ def _(A: torch.Tensor, B: torch.Tensor):
@register_kernel("bitsandbytes::quantize_blockwise", "cpu")
def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
torch._check_is_size(blocksize)
- torch._check(A.dtype == torch.float32, lambda: f"A must be float32 on cpu, got {A.dtype}")
n = A.numel()
- blocks = -(n // -blocksize)
- absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
- out = torch.empty_like(A, dtype=torch.uint8)
-
- lib.cquantize_blockwise_cpu_fp32(
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_longlong(blocksize),
- ct.c_longlong(n),
- )
+ # Only FP32 has c++ kernel
+ if A.dtype == torch.float32:
+ blocks = -(n // -blocksize)
+
+ absmax = torch.empty((blocks,), device=A.device, dtype=torch.float32)
+ out = torch.empty_like(A, dtype=torch.uint8)
+
+ lib.cquantize_blockwise_cpu_fp32(
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_longlong(blocksize),
+ ct.c_longlong(n),
+ )
+ else:
+ rem = n % blocksize
+ has_rem = rem > 0
+ blocks = n // blocksize + has_rem
+ absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
+ A_reshaped = A.reshape(n)
+ A_com = A_reshaped[: n - rem]
+ A_com_reshaped = A_com.reshape(n // blocksize, blocksize)
+ absmax[: blocks - has_rem] = torch.abs(A_com_reshaped).max(dim=-1)[0]
+ scaled_A = torch.clamp(A_com_reshaped * (1 / absmax[: blocks - has_rem].view(-1, 1)), -1, 1)
+ scaled_A = scaled_A.reshape(-1)
+ if has_rem:
+ absmax[-1] = torch.abs(A_reshaped[n - rem :]).max()
+ scaled_A_rem = torch.clamp(A_reshaped[n - rem :] * (1 / absmax[-1]), -1, 1)
+ scaled_A = torch.cat([scaled_A, scaled_A_rem], dim=0)
+
+ diff = torch.abs(scaled_A.unsqueeze(-1) - code.to(scaled_A.device))
+ out = torch.argmin(diff, dim=-1).to(torch.uint8).to(scaled_A.device).reshape(A.shape)
return out, absmax
@@ -50,144 +71,50 @@ def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor
def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
torch._check_is_size(blocksize)
torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
- torch._check(dtype == torch.float32, lambda: f"dtype must be float32 on cpu, got {dtype}")
-
- out = torch.empty_like(A, dtype=dtype)
- lib.cdequantize_blockwise_cpu_fp32(
- get_ptr(code),
- get_ptr(A),
- get_ptr(absmax),
- get_ptr(out),
- ct.c_longlong(blocksize),
- ct.c_longlong(A.numel()),
- )
+ # Only FP32 has c++ kernel
+ if dtype == torch.float32:
+ out = torch.empty_like(A, dtype=dtype)
+
+ lib.cdequantize_blockwise_cpu_fp32(
+ get_ptr(code),
+ get_ptr(A),
+ get_ptr(absmax),
+ get_ptr(out),
+ ct.c_longlong(blocksize),
+ ct.c_longlong(A.numel()),
+ )
+ else:
+ out = code[A.reshape(-1).int()]
+ blocks = out.shape[-1] // blocksize
+ res = out.shape[-1] % blocksize
+ if res != 0:
+ out = torch.nn.functional.pad(out, (0, blocksize - res), mode="constant", value=0)
+ out = (out.view(-1, blocksize) * absmax.view(-1, 1)).to(dtype).reshape(-1)
+ out = out[: blocks * blocksize + res]
+ out = out.reshape(A.shape)
return out
-_NF4_QUANT_TABLE = torch.tensor(
- [
- -1.0,
- -0.6961928009986877,
- -0.5250730514526367,
- -0.39491748809814453,
- -0.28444138169288635,
- -0.18477343022823334,
- -0.09105003625154495,
- 0.0,
- 0.07958029955625534,
- 0.16093020141124725,
- 0.24611230194568634,
- 0.33791524171829224,
- 0.44070982933044434,
- 0.5626170039176941,
- 0.7229568362236023,
- 1.0,
- ],
- dtype=torch.float32,
- device="cpu",
-)
-
-
-@register_kernel("bitsandbytes::quantize_4bit", "cpu")
-def _(
- A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
-) -> tuple[torch.Tensor, torch.Tensor]:
- torch._check_is_size(blocksize)
- torch._check(quant_type == "nf4", lambda: f"quant_type must be nf4 on CPU, got {quant_type}")
- torch._check(
- A.dtype in [torch.bfloat16, torch.float16, torch.float32],
- lambda: f"Blockwise 4bit quantization only supports 16/32-bit floats, but got {A.dtype}",
- )
-
- n = A.numel()
-
- # TODO: Support when weight matrix is not divisible by blocksize
- torch._check(n % blocksize == 0, lambda: f"n must be divisible by blocksize, got {n} and {blocksize}")
-
- # Divide into blocks and normalize
- blocks = A.reshape(-1, blocksize)
- absmax = blocks.abs().max(dim=1).values.float()
- scaled = blocks / absmax.unsqueeze(-1)
-
- # Quantize with the lookup table
- quantized = torch.argmin(torch.abs(scaled.view(-1, 1) - _NF4_QUANT_TABLE), dim=-1, keepdim=True).to(torch.uint8)
-
- # Pack two quantized values per byte
- packed = quantized[::2] << 4 | quantized[1::2]
-
- if quant_storage != torch.uint8:
- packed = packed.squeeze().view(quant_storage).unsqueeze(1)
-
- return packed, absmax.float()
-
-
-@register_kernel("bitsandbytes::dequantize_4bit", "cpu")
-def _(
- A: torch.Tensor,
- absmax: torch.Tensor,
- blocksize: int,
- quant_type: str,
- shape: Sequence[int],
- dtype: torch.dtype,
-) -> torch.Tensor:
- torch._check_is_size(blocksize)
- torch._check(quant_type == "nf4", lambda: f"quant_type must be nf4 on CPU, got {quant_type}")
- torch._check(
- dtype in [torch.bfloat16, torch.float16, torch.float32],
- lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
- )
- torch._check(
- A.dtype == torch.uint8,
- lambda: f"Blockwise 4bit dequantization on CPU only supports uint8 storage, got {A.dtype}",
- )
-
- A = A.view(-1, 1)
-
- # Grab upper and lower nibbles. Using int64 for indexing in the LUT.
- upper = (A >> 4).to(torch.int64)
- lower = (A & 0x0F).to(torch.int64)
-
- # Expand to blocks
- blocks = torch.cat((upper, lower), dim=1).reshape(-1, blocksize)
-
- # Dequantize
- blocks = _NF4_QUANT_TABLE[blocks] * absmax[:, None]
-
- # Reshape to original shape
- blocks = blocks.reshape(-1, *shape[1:])
-
- return blocks.to(dtype)
-
-
-@register_kernel("bitsandbytes::gemv_4bit", "cpu")
-def _(
- A: torch.Tensor,
- B: torch.Tensor,
- shapeB: Sequence[int],
- absmax: torch.Tensor,
- code: torch.Tensor,
- blocksize: int,
-) -> torch.Tensor:
- # TODO: We need to determine whether `code` is NF4, FP4, or other.
- # Right now we assume NF4, as this is the only one supported on CPU.
-
- B_dq = torch.ops.bitsandbytes.dequantize_4bit.default(
- B,
- absmax,
- blocksize,
- "nf4",
- shape=shapeB,
- dtype=A.dtype,
- )
-
- # User called gemv with B.t(), so we need to transpose it back.
- # if B.shape[0] == 1:
- # B_dq = B_dq.t()
-
- return torch.nn.functional.linear(
- A,
- B_dq,
- bias=None,
- )
+if ipex_cpu:
+ from bitsandbytes.utils import _reverse_4bit_compress_format
+
+ @register_kernel("bitsandbytes::dequantize_nf4_ipex", "cpu")
+ def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ shape: Sequence[int],
+ dtype: torch.dtype,
+ ) -> torch.Tensor:
+ ipex_weight = torch.ops.ipex_prepack.woq_linear_unpack_weight(A, "nf4", shape, 2)
+ A = _reverse_4bit_compress_format(ipex_weight.reshape(-1)).reshape(1, -1)
+ return torch.ops.bitsandbytes.dequantize_4bit.default(
+ A,
+ absmax,
+ blocksize,
+ "nf4",
+ shape,
+ dtype,
+ )
diff --git a/bitsandbytes/backends/default/ops.py b/bitsandbytes/backends/default/ops.py
index 729c2b047..ce5926979 100644
--- a/bitsandbytes/backends/default/ops.py
+++ b/bitsandbytes/backends/default/ops.py
@@ -1,9 +1,11 @@
+from collections.abc import Sequence
from math import prod
from typing import Optional
import torch
from ..._ops import register_kernel
+from ..utils import CODE
@register_kernel("bitsandbytes::int8_mm_dequant", "default")
@@ -142,3 +144,160 @@ def _(A: torch.Tensor, threshold=0.0):
A[outliers] = outlier_restore
return out_row, row_stats, outlier_cols
+
+
+@register_kernel("bitsandbytes::quantize_blockwise", "default")
+def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
+ torch._check_is_size(blocksize)
+
+ n = A.numel()
+ rem = n % blocksize
+ has_rem = rem > 0
+ blocks = n // blocksize + has_rem
+ absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
+ A_reshaped = A.reshape(n)
+ A_com = A_reshaped[: n - rem]
+ A_com_reshaped = A_com.reshape(n // blocksize, blocksize)
+ absmax[: blocks - has_rem] = torch.abs(A_com_reshaped).max(dim=-1)[0]
+ scaled_A = torch.clamp(A_com_reshaped * (1 / absmax[: blocks - has_rem].view(-1, 1)), -1, 1)
+ scaled_A = scaled_A.reshape(-1)
+ if has_rem:
+ absmax[-1] = torch.abs(A_reshaped[n - rem :]).max()
+ scaled_A_rem = torch.clamp(A_reshaped[n - rem :] * (1 / absmax[-1]), -1, 1)
+ scaled_A = torch.cat([scaled_A, scaled_A_rem], dim=0)
+
+ diff = torch.abs(scaled_A.unsqueeze(-1) - code.to(scaled_A.device))
+ out = torch.argmin(diff, dim=-1).to(torch.uint8).to(scaled_A.device).reshape(A.shape)
+
+ return out, absmax
+
+
+@register_kernel("bitsandbytes::dequantize_blockwise", "default")
+def _(A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype) -> torch.Tensor:
+ torch._check_is_size(blocksize)
+ torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
+
+ out = code[A.reshape(-1).int()]
+ blocks = out.shape[-1] // blocksize
+ res = out.shape[-1] % blocksize
+ if res != 0:
+ out = torch.nn.functional.pad(out, (0, blocksize - res), mode="constant", value=0)
+ out = (out.view(-1, blocksize) * absmax.view(-1, 1)).to(dtype).reshape(-1)
+ out = out[: blocks * blocksize + res]
+ out = out.reshape(A.shape)
+
+ return out
+
+
+@register_kernel("bitsandbytes::quantize_4bit", "default")
+def _(
+ A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
+) -> tuple[torch.Tensor, torch.Tensor]:
+ torch._check_is_size(blocksize)
+ torch._check(quant_type in ("nf4", "fp4"), lambda: f"quant_type must be nf4 or fp4, got {quant_type}")
+ torch._check(
+ A.dtype in [torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"Blockwise 4bit quantization only supports 16/32-bit floats, but got {A.dtype}",
+ )
+
+ n = A.numel()
+ full_blocks = n // blocksize
+ rem = n % blocksize
+ blocks = full_blocks + 1 if rem else full_blocks
+ absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
+ A_flattened = A.reshape(n)
+
+ # Scale full blocks of the tensor to [-1, 1]
+ A_full_blocks = A_flattened[: n - rem].reshape(n // blocksize, blocksize)
+ absmax[:full_blocks] = torch.abs(A_full_blocks).max(dim=-1)[0]
+ scaled = torch.clamp(A_full_blocks * (1 / absmax[:full_blocks].view(-1, 1)), -1, 1).reshape(-1)
+
+ # Scale any partial block
+ if rem:
+ A_rem = A_flattened[-rem:]
+ absmax[-1] = torch.abs(A_rem).max()
+ scaled_rem = torch.clamp(A_rem * (1 / absmax[-1]), -1, 1)
+ scaled = torch.cat([scaled, scaled_rem], dim=0)
+
+ # Quantize with the lookup table
+ code = CODE[quant_type].to(scaled.device).to(scaled.dtype)
+ quantized = torch.argmin(torch.abs(scaled.view(-1, 1) - code), dim=-1, keepdim=True).to(torch.uint8)
+
+ # Pack two quantized values per byte
+ packed = quantized[::2] << 4 | quantized[1::2]
+
+ if quant_storage != torch.uint8:
+ packed = packed.squeeze().view(quant_storage).unsqueeze(1)
+
+ return packed, absmax.float()
+
+
+@register_kernel("bitsandbytes::dequantize_4bit", "default")
+def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ quant_type: str,
+ shape: Sequence[int],
+ dtype: torch.dtype,
+) -> torch.Tensor:
+ torch._check_is_size(blocksize)
+ torch._check(quant_type in ("nf4", "fp4"), lambda: f"quant_type must be nf4 or fp4, got {quant_type}")
+ torch._check(
+ dtype in [torch.bfloat16, torch.float16, torch.float32],
+ lambda: f"Blockwise 4bit dequantization only supports 16/32-bit floats, but got {dtype}",
+ )
+
+ # Enable non uint8 dtype
+ if A.dtype != torch.uint8:
+ A = A.view(torch.uint8)
+
+ A = A.reshape(-1)
+ # Map nf4 to [-1, 1]
+ out_dq = torch.empty(A.size(0) * 2, dtype=torch.int32, device=A.device)
+ n = out_dq.numel()
+ out_dq[1::2] = A & 0xF
+ out_dq[::2] = A >> 4
+ # code is fp32, cast to dtype to avoid the mismatch issue
+ code = CODE[quant_type].to(dtype).to(A.device)
+ out_dq = code[out_dq]
+
+ # Apply scales
+ if out_dq.numel() != n:
+ assert out_dq.numel() == n + 1
+ out_dq = torch.narrow(out_dq, 0, 0, n)
+ blocks = n // blocksize
+ blocks += 1 if n % blocksize > 0 else 0
+ rem = n % blocksize
+ has_rem = rem > 0
+
+ out = torch.empty(shape, dtype=dtype, device=A.device).reshape(-1)
+ if has_rem:
+ out[: n - rem] = (out_dq[: n - rem].view(-1, blocksize) * absmax[: blocks - has_rem].view(-1, 1)).reshape(-1)
+ out[n - rem :] = out_dq[n - rem :] * absmax[-1]
+ else:
+ out = out_dq.view(-1, blocksize) * absmax.view(-1, 1)
+
+ out = out.reshape(-1, *shape[1:]).to(dtype)
+
+ return out
+
+
+@register_kernel("bitsandbytes::gemv_4bit", "default")
+def _(
+ A: torch.Tensor,
+ B: torch.Tensor,
+ shapeB: Sequence[int],
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+) -> torch.Tensor:
+ # Applied from dequantize_4bit
+ quant_type = "fp4" if code[1] > 0 else "nf4"
+ B_dq = torch.ops.bitsandbytes.dequantize_4bit.default(B, absmax, blocksize, quant_type, shapeB, A.dtype)
+
+ return torch.nn.functional.linear(
+ A,
+ B_dq,
+ bias=None,
+ )
diff --git a/bitsandbytes/backends/utils.py b/bitsandbytes/backends/utils.py
new file mode 100755
index 000000000..cc88ffae1
--- /dev/null
+++ b/bitsandbytes/backends/utils.py
@@ -0,0 +1,57 @@
+import torch
+
+try:
+ # to support Intel CPU/XPU (IPEX) backend
+ import intel_extension_for_pytorch as ipex
+
+ ipex_cpu = ipex if ipex._C._has_cpu() else None
+ ipex_xpu = ipex if ipex._C._has_xpu() else None
+except BaseException:
+ ipex_cpu = None
+ ipex_xpu = None
+
+_NF4_QUANT_TABLE = torch.tensor(
+ [
+ -1.0,
+ -0.6961928009986877,
+ -0.5250730514526367,
+ -0.39491748809814453,
+ -0.28444138169288635,
+ -0.18477343022823334,
+ -0.09105003625154495,
+ 0.0,
+ 0.07958029955625534,
+ 0.16093020141124725,
+ 0.24611230194568634,
+ 0.33791524171829224,
+ 0.44070982933044434,
+ 0.5626170039176941,
+ 0.7229568362236023,
+ 1.0,
+ ],
+ dtype=torch.float32,
+ device="xpu" if torch.xpu.is_available() else "cpu", # Only cpu/xpu use this table for now.
+)
+_FP4_QUANT_TABLE = torch.tensor(
+ [
+ 0.0000,
+ 0.0052,
+ 0.6667,
+ 1.0000,
+ 0.3333,
+ 0.5000,
+ 0.1667,
+ 0.2500,
+ 0.0000,
+ -0.0052,
+ -0.6667,
+ -1.0000,
+ -0.3333,
+ -0.5000,
+ -0.1667,
+ -0.2500,
+ ],
+ dtype=torch.float32,
+ device="xpu" if torch.xpu.is_available() else "cpu", # Only cpu/xpu use this table for now.
+)
+CODE = {"nf4": _NF4_QUANT_TABLE, "fp4": _FP4_QUANT_TABLE}
diff --git a/bitsandbytes/backends/xpu/__init__.py b/bitsandbytes/backends/xpu/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/bitsandbytes/backends/xpu/ops.py b/bitsandbytes/backends/xpu/ops.py
new file mode 100755
index 000000000..47a3bd009
--- /dev/null
+++ b/bitsandbytes/backends/xpu/ops.py
@@ -0,0 +1,51 @@
+from collections.abc import Sequence
+
+import torch
+
+from ..._ops import register_kernel
+from ..utils import ipex_xpu
+
+if torch.__version__ >= (2, 7):
+
+ @register_kernel("bitsandbytes::int8_linear_matmul", "xpu")
+ def _(A: torch.Tensor, B: torch.Tensor):
+ return torch._int_mm(
+ A.reshape(-1, A.shape[-1]),
+ B.t(),
+ ).reshape(*A.shape[:-1], B.shape[0])
+
+
+if ipex_xpu:
+
+ @register_kernel("bitsandbytes::dequantize_nf4_ipex", "xpu")
+ def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ blocksize: int,
+ shape: Sequence[int],
+ dtype: torch.dtype,
+ ) -> torch.Tensor:
+ return torch.ops.torch_ipex.dequantize_4bit(A, "nf4", shape, absmax, None, blocksize).t().to(dtype)
+
+ @register_kernel("bitsandbytes::dequantize_blockwise", "xpu")
+ def _(
+ A: torch.Tensor,
+ absmax: torch.Tensor,
+ code: torch.Tensor,
+ blocksize: int,
+ dtype: torch.dtype,
+ ) -> torch.Tensor:
+ shape = A.shape
+ out = torch.empty(A.reshape(-1).shape, dtype=dtype, device=A.device)
+ # void cdequantize_blockwise_fp32(
+ # float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n, cudaStream_t stream)
+ if dtype == torch.float16:
+ ipex_xpu.xpu.bitsandbytes.cdequantize_blockwise_fp16(code, A, absmax, out, blocksize, A.numel())
+ elif dtype == torch.bfloat16:
+ ipex_xpu.xpu.bitsandbytes.cdequantize_blockwise_bf16(code, A, absmax, out, blocksize, A.numel())
+ elif dtype == torch.float32:
+ ipex_xpu.xpu.bitsandbytes.cdequantize_blockwise_fp32(code, A, absmax, out, blocksize, A.numel())
+ else:
+ raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {out.dtype}")
+
+ return out.reshape(shape)
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index ebc363991..b112df2f7 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -286,11 +286,26 @@ def get_native_library() -> BNBNativeLibrary:
return BNBNativeLibrary(dll)
+try:
+ # to support Intel CPU/GPU (XPU) backend
+ import intel_extension_for_pytorch as ipex
+
+ ipex_cpu = ipex if ipex._C._has_cpu() else None
+ ipex_xpu = ipex if ipex._C._has_xpu() else None
+except BaseException:
+ ipex_cpu = None
+ ipex_xpu = None
+
+
try:
lib = get_native_library()
except Exception as e:
error_msg = str(e)
- logger.error(f"bitsandbytes library load error: {error_msg}\n", exc_info=True)
+ if not (ipex_cpu or ipex_xpu):
+ logger.error(
+ f"bitsandbytes library load error: {error_msg}\n If you are using Intel CPU/XPU, please install intel_extension_for_pytorch to enable required ops",
+ exc_info=True,
+ )
# create a mock with error messaging as fallback
lib = ErrorHandlerMockBNBNativeLibrary(error_msg)
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
old mode 100644
new mode 100755
index 0bd4c8b4e..ffb66681a
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -13,9 +13,9 @@
from torch import Tensor
from typing_extensions import deprecated
-from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
+from bitsandbytes.utils import _reverse_4bit_compress_format, pack_dict_to_tensor, unpack_tensor_to_dict
-from .cextension import lib
+from .cextension import ipex_cpu, ipex_xpu, lib
name2qmap = {}
@@ -1122,6 +1122,16 @@ def dequantize_4bit(
if absmax.dtype != torch.float32:
absmax = absmax.float()
+ # IPEX format is different, we need extra process.
+ if getattr(quant_state, "ipex", False) and quant_state.quant_type == "nf4":
+ return torch.ops.bitsandbytes.dequantize_nf4_ipex(
+ A,
+ absmax,
+ quant_state.blocksize,
+ quant_state.shape,
+ quant_state.dtype,
+ )
+
if out is not None:
torch.ops.bitsandbytes.dequantize_4bit.out(
A, absmax, quant_state.blocksize, quant_state.quant_type, quant_state.shape, quant_state.dtype, out=out
@@ -1709,6 +1719,25 @@ def gemv_4bit(
if state.nested:
absmax = dequantize_blockwise(absmax, state.state2) + state.offset
+ if getattr(state, "ipex", False) and state.quant_type == "nf4":
+ # compute_dtype: 1 indicates fp16, 2 indicates bf16
+ compute_dtype = 2 if A.dtype == torch.bfloat16 else 1
+ out = torch.ops.torch_ipex.woq_linear(
+ A,
+ B,
+ "nf4",
+ state.shape,
+ state.new_scales,
+ state.new_zeros,
+ None,
+ None,
+ state.blocksize,
+ compute_dtype,
+ 1,
+ state.compensation,
+ )
+ return out
+
if out is not None:
torch.ops.bitsandbytes.gemv_4bit.out(
A,
@@ -2507,3 +2536,49 @@ def vectorwise_mm_dequant(xq, S1, S2, dtype=torch.half, quant_type="vector"):
return x.to(dtype)
else:
return None
+
+
+def _enable_ipex_fusion(linear: torch.nn.Module, x: torch.Tensor):
+ quant_state = linear.weight.quant_state
+
+ if quant_state.nested:
+ absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2)
+ absmax += quant_state.offset
+ if absmax.dtype != torch.float32:
+ absmax = absmax.float()
+
+ quant_state.absmax = absmax
+ quant_state.nested = False
+ delattr(quant_state, "state2")
+
+ if x.device.type == "cpu" and ipex_cpu:
+ converted_weight = _reverse_4bit_compress_format(linear.weight.data)
+ new_weight, new_scales, new_zeros, _, compensation = torch.ops.ipex_prepack.woq_linear_pack_weight(
+ converted_weight.reshape([quant_state.shape[0], quant_state.shape[1] // 2]),
+ "nf4",
+ quant_state.shape, # weight shape
+ quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize), # scales
+ None, # zero_points
+ None, # bias
+ None, # batch_size
+ quant_state.blocksize,
+ 2,
+ )
+ elif x.device.type == "xpu" and ipex_xpu:
+ new_weight = _reverse_4bit_compress_format(linear.weight.data)
+ new_scales = quant_state.absmax.view(quant_state.shape[0], quant_state.shape[1] // quant_state.blocksize)
+ new_zeros = None
+ compensation = None
+ new_scales = list(new_scales)
+ if not linear.training and not x.requires_grad:
+ new_weight = new_weight.reshape([quant_state.shape[0], quant_state.shape[1] // 2])
+ else:
+ raise ValueError(
+ "Please check the device and ipex version. The device should be cpu or xpu while ipex version should >= 2.7"
+ )
+
+ linear.weight.data = new_weight.data
+ linear.weight.quant_state.ipex = True
+ linear.weight.quant_state.new_scales = new_scales
+ linear.weight.quant_state.new_zeros = new_zeros
+ linear.weight.quant_state.compensation = compensation
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 500102ab1..ccd842ce3 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -11,11 +11,12 @@
import torch.nn.functional as F
import bitsandbytes as bnb
-from bitsandbytes.functional import QuantState
+from bitsandbytes.functional import QuantState, _enable_ipex_fusion, ipex_cpu, ipex_xpu
from bitsandbytes.optim import GlobalOptimManager
from bitsandbytes.utils import (
INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING,
OutlierTracer,
+ _reverse_4bit_compress_format,
)
T = TypeVar("T", bound="torch.nn.Module")
@@ -444,6 +445,7 @@ def __init__(
self.compute_type_is_set = False
self.quant_state = None
self.quant_storage = quant_storage
+ self.ipex_linear_is_set = False
def set_compute_type(self, x):
if x.dtype in [torch.float32, torch.bfloat16]:
@@ -470,13 +472,40 @@ def _save_to_state_dict(self, destination, prefix, keep_vars):
save weight and bias,
then fill state_dict with components of quant_state
"""
+ if getattr(self.weight, "quant_state", None) is not None and getattr(self.weight.quant_state, "ipex", False):
+ if self.weight.device.type == "cpu":
+ original_weight = torch.ops.ipex_prepack.woq_linear_unpack_weight(
+ self.weight, "nf4", self.weight.quant_state.shape, 2
+ )
+ self.weight.data = _reverse_4bit_compress_format(original_weight.data)
+ elif self.weight.device.type == "xpu":
+ self.weight.data = _reverse_4bit_compress_format(self.weight.data.reshape(1, -1))
+
+ self.weight.quant_state.ipex = False
+ self.ipex_linear_is_set = False
+
super()._save_to_state_dict(destination, prefix, keep_vars) # saving weight and bias
if getattr(self.weight, "quant_state", None) is not None:
for k, v in self.weight.quant_state.as_dict(packed=True).items():
destination[prefix + "weight." + k] = v if keep_vars else v.detach()
+ def set_ipex_linear(self, x: torch.Tensor):
+ if (  # only not-yet-converted uint8 nf4 weights whose shape[1] is a multiple of blocksize qualify
+ not getattr(self.weight.quant_state, "ipex", False)
+ and self.weight.data.dtype == torch.uint8
+ and self.weight.quant_state.shape[1] % self.weight.quant_state.blocksize == 0
+ and self.weight.quant_state.quant_type == "nf4"
+ ):
+ if x.device.type == "xpu" or (x.device.type == "cpu" and not self.training and not x.requires_grad):
+ _enable_ipex_fusion(self, x)
+
def forward(self, x: torch.Tensor):
+ # Check if ipex fusion can be used
+ if not self.ipex_linear_is_set and (ipex_cpu or ipex_xpu):
+ self.set_ipex_linear(x)
+ self.ipex_linear_is_set = True
+
fix_4bit_weight_quant_state_from_module(self)
# weights are cast automatically as Int8Params, but the bias has to be cast manually
@@ -492,8 +521,10 @@ def forward(self, x: torch.Tensor):
x = x.to(self.compute_dtype)
bias = None if self.bias is None else self.bias.to(self.compute_dtype)
+ # IPEX CPU will change weight to 4D so don't need transpose
+ weight = self.weight.t() if self.weight.dim() == 2 else self.weight
- return bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state).to(inp_dtype)
+ return bnb.matmul_4bit(x, weight, bias=bias, quant_state=self.weight.quant_state).to(inp_dtype)
class LinearFP4(Linear4bit):
@@ -644,17 +675,20 @@ def to(self, *args, **kwargs):
device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
if device is not None and device.type != "meta" and self.data.device.type == "cpu":
- return self._quantize(device)
- else:
- new_param = Int8Params(
- super().to(device=device, dtype=dtype, non_blocking=non_blocking),
- requires_grad=self.requires_grad,
- has_fp16_weights=self.has_fp16_weights,
- )
- new_param.CB = self.CB
- new_param.SCB = self.SCB
+ if device.type != "cpu" or self.data.dtype != torch.int8:
+ return self._quantize(device)
+ elif self.data.dtype == torch.int8 and device.type in ("cpu", "xpu"):
+ self.CB = self.data
- return new_param
+ new_param = Int8Params(
+ super().to(device=device, dtype=dtype, non_blocking=non_blocking),
+ requires_grad=self.requires_grad,
+ has_fp16_weights=self.has_fp16_weights,
+ )
+ new_param.CB = self.CB
+ new_param.SCB = self.SCB
+
+ return new_param
def maybe_rearrange_weight(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index 0828dd295..7920e2188 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -38,6 +38,14 @@ def outlier_hook(module, input):
hook.remove()
+# Swap the high and low 4-bit nibble of each byte, converting (in either direction) between the standard 4-bit packing and the ipex packing
+def _reverse_4bit_compress_format(weight: torch.Tensor):
+ out_1 = (weight & 0xF0) >> 4
+ out_2 = (weight & 0xF) << 4
+ out = out_1 | out_2
+ return out
+
+
class OutlierTracer:
_instance = None
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 11dfbf5ea..e61ce4655 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -238,15 +238,24 @@ pip install -e . # `-e` for "editable" install, when developing BNB (otherwise
#### Intel CPU + XPU
-It does not need compile CPP codes, all required ops are in [intel_extension_for_pytorch](https://pytorch-extension.intel.com/), please follow the instruction to install ipex.
+If you are using an Intel CPU on Linux or an Intel XPU on Linux/Windows, please follow the [instructions](https://pytorch-extension.intel.com/) or use the commands below to install intel_extension_for_pytorch for better performance.
-The below commands are for Linux. For installing on Windows, please adapt the below commands according to the same pattern as described [the section above on compiling from source under the Windows tab](#cuda-compile).
+CPU: `pip install intel_extension_for_pytorch`
+XPU: `pip install intel_extension_for_pytorch --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/`
-```bash
-pip install intel_extension_for_pytorch
-git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
-pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out)
+Install bitsandbytes:
+CPU: the CPU C++ code needs to be built from source:
+```
+git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
+cmake -DCOMPUTE_BACKEND=cpu -S .
+make
+pip install .
+```
+XPU:
```
+pip install git+https://github.com/bitsandbytes-foundation/bitsandbytes.git
+```
+
diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index fc2e7aa6f..5fbe1065f 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -180,9 +180,6 @@ def test_matmul_4bit(
compress_statistics,
quant_type,
):
- if device == "cpu" and quant_type == "fp4":
- pytest.xfail("Only nf4 is supported on CPU")
-
dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2)
dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3)
if has_bias == False:
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 8568d45f0..fa4a14ae9 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -103,10 +103,9 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,
if nested:
pytest.skip("Not a typical use case.")
if blocksize != 256:
- pytest.skip("Only blocksize 256 is the typical one supported on CPU.")
-
+ pytest.skip("Only blocksize 256 is used in CPU/XPU")
if dtype != torch.float32:
- pytest.xfail(f"CPU implementation currently only supports float32, got {dtype}")
+ pytest.skip("Only float32 is used in CPU/XPU")
diffs = []
reldiffs = []
@@ -138,10 +137,11 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,
abserr = sum(diffs) / len(diffs)
relerr = sum(reldiffs) / len(reldiffs)
if signed:
- assert abserr < 0.0035
+ threshold_abserr = 0.0036 if device in ("cpu", "xpu") else 0.0035
+ assert abserr < threshold_abserr
assert relerr < 0.015
else:
- assert abserr < 0.00175
+ assert abserr < (0.00175 if device in ("cpu", "xpu") else 0.0023)
assert relerr < 0.012
assert A2.dtype == dtype
@@ -172,8 +172,8 @@ def test_blockwise_cpu_large(self, hidden, blocksize):
@pytest.mark.parametrize("bits", range(2, 9), ids=id_formatter("bits"))
@pytest.mark.parametrize("method", ["linear", "fp8", "dynamic", "quantile"])
def test_few_bit_quant(self, device, bits, method):
- if device == "cpu" and bits != 8:
- pytest.skip("CPU implementation only supports 8 bits")
+ if device in ("cpu", "xpu") and bits != 8:
+ pytest.skip("CPU/XPU implementation only supports 8 bits")
abserrs = []
relerrs = []
@@ -1080,9 +1080,6 @@ class TestQuantize4BitFunctional:
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096])
def test_4bit_quant(self, device, dtype, quant_type, blocksize):
- if device == "cpu" and quant_type != "nf4":
- pytest.xfail("fp4 quantization is not supported on CPU")
-
A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type)
@@ -1115,9 +1112,6 @@ def test_4bit_quant(self, device, dtype, quant_type, blocksize):
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128], ids=id_formatter("blocksize"))
def test_4bit_compressed_stats(self, device, quant_type, blocksize):
- if device == "cpu" and quant_type != "nf4":
- pytest.xfail("fp4 quantization is not supported on CPU")
-
errs1 = []
errs2 = []
for i in range(10):
@@ -1190,12 +1184,6 @@ def test_bench_4bit_dequant(self, quant_type):
)
@pytest.mark.parametrize("dim", [128, 256, 512, 1024], ids=id_formatter("dim"))
def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double_quant, kind):
- if device == "cpu":
- if storage_type != "nf4":
- pytest.xfail("fp4 quantization is not supported on CPU")
- if quant_storage != torch.uint8:
- pytest.xfail("Only uint8 storage is supported on CPU")
-
errs1 = []
errs2 = []
errs3 = []
@@ -1342,13 +1330,6 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
@pytest.mark.parametrize("double_quant", [False], ids=["DQ_True"])
def test_gemv_eye_4bit(self, device, storage_type, dtype, double_quant):
- if device == "cpu":
- if storage_type != "nf4":
- pytest.xfail("fp4 quantization is not supported on CPU")
-
- if dtype == torch.bfloat16 and torch.__version__ < (2, 3):
- pytest.xfail("eye doe not support bfloat16 on CPU in torch < 2.3")
-
dims = 10
torch.random.manual_seed(np.random.randint(0, 412424242))
dims = get_test_dims(0, 8192, n=dims)
diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index f3673797c..b5db2eb6f 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -32,12 +32,6 @@
@pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
@pytest.mark.parametrize("save_before_forward", TRUE_FALSE, ids=id_formatter("save_before_forward"))
def test_linear_serialization(device, quant_type, compress_statistics, bias, quant_storage, save_before_forward):
- if device == "cpu":
- if quant_type == "fp4":
- pytest.xfail("FP4 is not supported for CPU")
- if quant_storage != "uint8":
- pytest.xfail("Only uint8 storage is supported for CPU")
-
original_dtype = torch.float16
compute_dtype = None
layer_shape = (300, 400)
@@ -194,13 +188,7 @@ def test_linear_serialization(device, quant_type, compress_statistics, bias, qua
@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
def test_copy_param(device, quant_type, blocksize, compress_statistics):
- if device == "cpu":
- if compress_statistics:
- pytest.skip("Currently segfaults on CPU")
- if quant_type == "fp4":
- pytest.xfail("FP4 not supported on CPU")
-
- tensor = torch.linspace(1, blocksize, blocksize)
+ tensor = torch.randn(300, 400)
param = bnb.nn.Params4bit(
data=tensor,
quant_type=quant_type,
@@ -219,13 +207,7 @@ def test_copy_param(device, quant_type, blocksize, compress_statistics):
@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
- if device == "cpu":
- if compress_statistics:
- pytest.skip("Currently segfaults on CPU")
- if quant_type == "fp4":
- pytest.xfail("FP4 not supported on CPU")
-
- tensor = torch.linspace(1, blocksize, blocksize)
+ tensor = torch.randn(300, 400)
param = bnb.nn.Params4bit(
data=tensor,
quant_type=quant_type,
@@ -251,13 +233,7 @@ def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
@pytest.mark.parametrize("blocksize", [64, 128])
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics):
- if device == "cpu":
- if compress_statistics:
- pytest.skip("Currently segfaults on CPU")
- if quant_type == "fp4":
- pytest.xfail("FP4 not supported on CPU")
-
- original_tensor = torch.linspace(1, blocksize, blocksize, dtype=torch.float32)
+ original_tensor = torch.randn(300, 400)
original_param = bnb.nn.Params4bit(
data=original_tensor,
quant_type=quant_type,
diff --git a/tests/test_modules.py b/tests/test_modules.py
index c8ec6311a..aa6f19c9e 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -391,12 +391,6 @@ def test_fp8linear():
ids=lambda x: x.__name__ if inspect.isclass(x) else str(x),
)
def test_embedding_lossless(device, embedding_class, input_shape, embedding_dim, quant_storage):
- if device == "cpu":
- if embedding_class is bnb.nn.EmbeddingFP4:
- pytest.xfail("FP4 is not supported for CPU")
- if quant_storage is not None and quant_storage != torch.uint8:
- pytest.xfail("CPU only supports uint8 storage for 4bit")
-
num_embeddings = 128
src_weight = (torch.randn((num_embeddings, embedding_dim), dtype=torch.float32) > 0).to(
@@ -442,12 +436,6 @@ def test_embedding_lossless(device, embedding_class, input_shape, embedding_dim,
ids=lambda x: x.__name__ if inspect.isclass(x) else str(x),
)
def test_embedding_error(device, embedding_class, input_shape, embedding_dim, quant_storage):
- if device == "cpu":
- if embedding_class is bnb.nn.EmbeddingFP4:
- pytest.xfail("FP4 is not supported for CPU")
- if quant_storage is not None and quant_storage != torch.uint8:
- pytest.xfail("CPU only supports uint8 storage for 4bit")
-
is_8bit = embedding_class is bnb.nn.Embedding8bit
num_embeddings = 128
@@ -482,9 +470,6 @@ def test_embedding_error(device, embedding_class, input_shape, embedding_dim, qu
@pytest.mark.parametrize("device", get_available_devices())
def test_4bit_linear_warnings(device):
- if device == "cpu":
- pytest.xfail("gemv_4bit op is not yet implemented on CPU")
-
dim1 = 64
with pytest.warns(UserWarning, match=r"inference or training"):
diff --git a/tests/test_ops.py b/tests/test_ops.py
index e85bc0ef0..9a0ae3338 100644
--- a/tests/test_ops.py
+++ b/tests/test_ops.py
@@ -143,6 +143,10 @@ def test_dequantize_blockwise(self, device, dtype, blocksize):
assert out.dtype == dtype
assert out.device == A.device
+ # TODO: Enable it
+ if device == "xpu":
+ pytest.skip("XPU implementation have torch.op inside torch.op, it will fail on op check")
+
opcheck(torch.ops.bitsandbytes.dequantize_blockwise.default, (A, absmax, code, blocksize, dtype))
@@ -153,15 +157,9 @@ class Test4bitBlockwiseQuantOps:
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
- if device == "cpu" and quant_type != "nf4":
- pytest.xfail("CPU implementation is only available for nf4")
-
- if storage_dtype != torch.uint8:
- pytest.xfail("Known issue with storage_dtype != uint8")
-
A = torch.randn(1024, 1024, dtype=dtype, device=device)
- out, absmax = torch.ops.bitsandbytes.quantize_4bit(A, blocksize, quant_type, storage_dtype)
+ out, absmax = torch.ops.bitsandbytes.quantize_4bit.default(A, blocksize, quant_type, storage_dtype)
assert out.device == A.device
assert out.dtype == storage_dtype
@@ -169,6 +167,10 @@ def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize
assert absmax.device == A.device
assert absmax.dtype == torch.float32
+ # TODO: Enable it
+ if device in ("cpu", "xpu") and storage_dtype == torch.bfloat16:
+ pytest.skip("CPU bf16 storage_dtype will fail on torch op check")
+
opcheck(torch.ops.bitsandbytes.quantize_4bit, (A, blocksize, quant_type, storage_dtype))
@pytest.mark.parametrize("device", get_available_devices())
@@ -177,13 +179,6 @@ def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
- if device == "cpu":
- if quant_type != "nf4":
- pytest.xfail("CPU implementation is only available for nf4")
-
- if storage_dtype != torch.uint8:
- pytest.xfail("CPU implementation only supports uint8 storage")
-
shape = (128, 128)
n = prod(shape)
@@ -215,9 +210,6 @@ def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksi
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
@pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
def test_gemv_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
- if device == "cpu":
- pytest.xfail("CPU implementation is not available")
-
out_features = 1024
in_features = 256
From 90d9af2c387f05bcf4dc8d409a0ac3e4ef0d8e95 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 28 May 2025 22:04:55 +0530
Subject: [PATCH 51/85] Update functional.py
---
bitsandbytes/functional.py | 36 ++++++++++++++++++------------------
1 file changed, 18 insertions(+), 18 deletions(-)
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index f4be0dc2f..2405a1985 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -968,12 +968,12 @@ def quantize_fp4(
A: torch.Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize=64,
+ blocksize=None,
compress_statistics=False,
quant_storage=torch.uint8,
):
- if HIP_ENVIRONMENT:
- blocksize = 128
+ if blocksize is None:
+ blocksize = 64 if not HIP_ENVIRONMENT else 128
return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "fp4", quant_storage)
@@ -981,12 +981,12 @@ def quantize_nf4(
A: torch.Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize=64,
+ blocksize=None,
compress_statistics=False,
quant_storage=torch.uint8,
):
- if HIP_ENVIRONMENT:
- blocksize = 128
+ if blocksize is None:
+ blocksize = 64 if not HIP_ENVIRONMENT else 128
return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "nf4", quant_storage)
@@ -994,7 +994,7 @@ def quantize_4bit(
A: torch.Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize=64,
+ blocksize=None,
compress_statistics=False,
quant_type="fp4",
quant_storage=torch.uint8,
@@ -1023,8 +1023,8 @@ def quantize_4bit(
- [`QuantState`]: The state object used to undo the quantization.
"""
- if HIP_ENVIRONMENT:
- blocksize = 128
+ if blocksize is None:
+ blocksize = 64 if not HIP_ENVIRONMENT else 128
input_shape = A.shape
@@ -1076,10 +1076,10 @@ def dequantize_fp4(
quant_state: Optional[QuantState] = None,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize: int = 64,
+ blocksize: Optional[int] = None,
) -> torch.Tensor:
- if HIP_ENVIRONMENT:
- blocksize = 128
+ if blocksize is None:
+ blocksize = 64 if not HIP_ENVIRONMENT else 128
return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4")
@@ -1088,10 +1088,10 @@ def dequantize_nf4(
quant_state: Optional[QuantState] = None,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize: int = 64,
+ blocksize: Optional[int] = None,
) -> torch.Tensor:
- if HIP_ENVIRONMENT:
- blocksize = 128
+ if blocksize is None:
+ blocksize = 64 if not HIP_ENVIRONMENT else 128
return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4")
@@ -1100,7 +1100,7 @@ def dequantize_4bit(
quant_state: Optional[QuantState] = None,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
- blocksize: int = 64,
+ blocksize: Optional[int] = None,
quant_type="fp4",
) -> torch.Tensor:
"""Dequantizes a packed 4-bit quantized tensor.
@@ -1130,8 +1130,8 @@ def dequantize_4bit(
`torch.Tensor`: The dequantized tensor.
"""
- if HIP_ENVIRONMENT:
- blocksize = 128
+ if blocksize is None:
+ blocksize = 64 if not HIP_ENVIRONMENT else 128
if quant_state is None:
assert absmax is not None and out is not None
From 80048d89f249509db4c1fb482ce7694fcca3fdcb Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Thu, 29 May 2025 01:38:52 +0530
Subject: [PATCH 52/85] Update functional.py
---
bitsandbytes/functional.py | 15 ---------------
1 file changed, 15 deletions(-)
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 2405a1985..03f6c323d 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -758,11 +758,6 @@ def quantize_blockwise(
if "dynamic" not in name2qmap:
name2qmap["dynamic"] = create_dynamic_map().to(A.device)
code = name2qmap["dynamic"]
-
- if HIP_ENVIRONMENT:
- assert blocksize in [4096, 2048, 1024, 512, 256, 128]
- else:
- assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
_out, _absmax = torch.ops.bitsandbytes.quantize_blockwise.default(
A,
@@ -844,16 +839,6 @@ def dequantize_blockwise(
if quant_state is None:
quant_state = QuantState(absmax=absmax, code=code, blocksize=blocksize, dtype=torch.float32)
-
- if HIP_ENVIRONMENT:
- supported_blocksizes = [4096, 2048, 1024, 512, 256, 128]
- else:
- supported_blocksizes = [4096, 2048, 1024, 512, 256, 128, 64]
-
- if quant_state.blocksize not in supported_blocksizes:
- raise ValueError(
- f"The blocksize of {quant_state.blocksize} is not supported. Supported values: {supported_blocksizes}"
- )
absmax = quant_state.absmax
if quant_state.nested:
From e448ebbadf0313f429005001791c56d092992f01 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Thu, 29 May 2025 02:40:56 +0530
Subject: [PATCH 53/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 12 ++----------
1 file changed, 2 insertions(+), 10 deletions(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index fd7b7b9a2..f03d06599 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -303,11 +303,7 @@ def _dequantize_blockwise_impl(
def _(
A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
) -> tuple[torch.Tensor, torch.Tensor]:
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
torch._check(quant_type in ["fp4", "nf4"])
torch._check(
A.dtype in [torch.bfloat16, torch.float16, torch.float32],
@@ -385,11 +381,7 @@ def _dequantize_4bit_impl(
dtype: torch.dtype,
out: torch.Tensor,
) -> None:
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
torch._check(quant_type in ["fp4", "nf4"])
torch._check(
dtype in [torch.bfloat16, torch.float16, torch.float32],
From 048faa8ce60088fedc05474157c6356b14c3ee80 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Thu, 29 May 2025 02:41:52 +0530
Subject: [PATCH 54/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index f03d06599..29dddc96e 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -381,7 +381,7 @@ def _dequantize_4bit_impl(
dtype: torch.dtype,
out: torch.Tensor,
) -> None:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
torch._check(quant_type in ["fp4", "nf4"])
torch._check(
dtype in [torch.bfloat16, torch.float16, torch.float32],
From c45e9d18c9fa55135cdaea92b68a4e8660d80bf6 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Thu, 29 May 2025 02:44:51 +0530
Subject: [PATCH 55/85] Update test_functional.py
---
tests/test_functional.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 7ad604d9f..07c0d4964 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -148,7 +148,7 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,
@pytest.mark.skipif("cpu" not in get_available_devices(), reason="CPU is required")
@pytest.mark.parametrize("hidden", [128])
- @pytest.mark.parametrize("blocksize", [4096, 16384] if not HIP_ENVIRONMENT else [4096])
+ @pytest.mark.parametrize("blocksize", [4096, 16384])
def test_blockwise_cpu_large(self, hidden, blocksize):
diffs = []
reldiffs = []
From 47a491fb213b5286e0ed3cc9af773bf02f416f24 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Thu, 29 May 2025 03:36:25 +0530
Subject: [PATCH 56/85] Update test_functional.py
---
tests/test_functional.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 07c0d4964..2219efa2f 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -8,7 +8,7 @@
import torch
import bitsandbytes as bnb
-from bitsandbytes.cextension import HIP_ENVIRONMENT
+from bitsandbytes.cextension import HIP_ENVIRONMENT, ROCM_GPU_ARCH
from bitsandbytes import functional as F
from tests.helpers import (
BOOLEAN_TUPLES,
@@ -1373,7 +1373,8 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
@pytest.mark.parametrize("double_quant", [False], ids=["DQ_True"])
@pytest.mark.skipif(
- HIP_ENVIRONMENT, reason="this test is not supported on ROCm with gfx90a architecture yet"
+ HIP_ENVIRONMENT and ROCM_GPU_ARCH == "gfx90a",
+ reason="this test is not supported on ROCm with gfx90a architecture yet",
)
def test_gemv_eye_4bit(self, device, storage_type, dtype, double_quant):
if device == "cpu" and storage_type != "nf4":
From 86976bc22b04bc1415a13648582e453ce594700c Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Thu, 29 May 2025 03:38:53 +0530
Subject: [PATCH 57/85] Update cextension.py
---
bitsandbytes/cextension.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index c8b02fb22..108aa0c9a 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -8,7 +8,7 @@
import torch
from bitsandbytes.consts import DYNAMIC_LIBRARY_SUFFIX, PACKAGE_DIR
-from bitsandbytes.cuda_specs import CUDASpecs, get_cuda_specs, get_cuda_version_tuple
+from bitsandbytes.cuda_specs import CUDASpecs, get_cuda_specs, get_cuda_version_tuple, get_rocm_gpu_arch
logger = logging.getLogger(__name__)
@@ -298,6 +298,8 @@ def get_native_library() -> BNBNativeLibrary:
return BNBNativeLibrary(dll)
+ROCM_GPU_ARCH = get_rocm_gpu_arch()
+
try:
if torch.version.hip:
HIP_ENVIRONMENT, BNB_BACKEND = True, "ROCm"
From 98a142a7c7961fc58c0b90b388f080d56991b94c Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Thu, 29 May 2025 03:41:51 +0530
Subject: [PATCH 58/85] Update cuda_specs.py
---
bitsandbytes/cuda_specs.py | 29 ++++++++++++++++++++++++++++-
1 file changed, 28 insertions(+), 1 deletion(-)
diff --git a/bitsandbytes/cuda_specs.py b/bitsandbytes/cuda_specs.py
index 64903cd49..da34dd608 100644
--- a/bitsandbytes/cuda_specs.py
+++ b/bitsandbytes/cuda_specs.py
@@ -1,6 +1,9 @@
import dataclasses
+import logging
+import re
+import subprocess
from functools import lru_cache
-from typing import Optional
+from typing import Optional, List, Tuple
import torch
@@ -73,3 +76,27 @@ def get_cuda_specs() -> Optional[CUDASpecs]:
)
except Exception:
return None
+
+
+def get_rocm_gpu_arch() -> str:
+ """Return the first ``gfx...`` name matched in ``rocminfo`` output, or "unknown" if ``torch.version.hip`` is unset, nothing matches, or detection raises."""
+ logger = logging.getLogger(__name__)
+ try:
+ if torch.version.hip:
+ result = subprocess.run(["rocminfo"], capture_output=True, text=True)
+ match = re.search(r"Name:\s+gfx([a-zA-Z\d]+)", result.stdout)
+ if match:
+ return "gfx" + match.group(1)
+ else:
+ return "unknown"
+ else:
+ return "unknown"
+ except Exception as e:
+ logger.error(f"Could not detect ROCm GPU architecture: {e}")
+ if torch.cuda.is_available():
+ logger.warning(
+ """
+ROCm GPU architecture detection failed despite ROCm being available.
+ """,
+ )
+ return "unknown"
From 888fe46fee6fe59f377e4c4a3f19468a06094b91 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Thu, 29 May 2025 03:59:01 +0530
Subject: [PATCH 59/85] Update cuda_specs.py
---
bitsandbytes/cuda_specs.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/bitsandbytes/cuda_specs.py b/bitsandbytes/cuda_specs.py
index da34dd608..61d03083c 100644
--- a/bitsandbytes/cuda_specs.py
+++ b/bitsandbytes/cuda_specs.py
@@ -3,7 +3,7 @@
import re
import subprocess
from functools import lru_cache
-from typing import Optional, List, Tuple
+from typing import Optional
import torch
From c9c52b56c1145d9ecd6ccfc4833799eae3bb2ccd Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Thu, 29 May 2025 15:59:13 +0530
Subject: [PATCH 60/85] Update test_functional.py
---
tests/test_functional.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 2219efa2f..41ed7c984 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -1141,7 +1141,7 @@ def test_4bit_quant(self, device, dtype, quant_type, blocksize):
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
- @pytest.mark.parametrize("blocksize", [64, 128], ids=id_formatter("blocksize"))
+ @pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128], ids=id_formatter("blocksize"))
def test_4bit_compressed_stats(self, device, quant_type, blocksize):
if device == "cpu" and quant_type != "nf4":
pytest.xfail("fp4 quantization is not supported on CPU")
From fc29586e8951cbe41aa5693ba0cd3ae3d25b05db Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 30 May 2025 17:23:38 +0530
Subject: [PATCH 61/85] Update test_linear4bit.py
---
tests/test_linear4bit.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index 67b61cb05..474a00a1b 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -7,6 +7,7 @@
import torch
import bitsandbytes as bnb
+from bitsandbytes.cextension import HIP_ENVIRONMENT
from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter, torch_load_from_buffer, torch_save_to_buffer
storage = {
@@ -16,7 +17,7 @@
"float32": torch.float32,
}
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("quant_storage", ["uint8", "float16", "bfloat16", "float32"])
@pytest.mark.parametrize("bias", TRUE_FALSE, ids=id_formatter("bias"))
From 53b8b1c580093e39d43d0018fa47abee6966442c Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 30 May 2025 17:27:39 +0530
Subject: [PATCH 62/85] Update test_cuda_setup_evaluator.py
---
tests/test_cuda_setup_evaluator.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py
index 79406472e..1b2ea85db 100644
--- a/tests/test_cuda_setup_evaluator.py
+++ b/tests/test_cuda_setup_evaluator.py
@@ -1,6 +1,6 @@
import pytest
-from bitsandbytes.cextension import get_cuda_bnb_library_path
+from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
from bitsandbytes.cuda_specs import CUDASpecs
@@ -12,12 +12,12 @@ def cuda120_spec() -> CUDASpecs:
cuda_version_tuple=(12, 0),
)
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm")
def test_get_cuda_bnb_library_path(monkeypatch, cuda120_spec):
monkeypatch.delenv("BNB_CUDA_VERSION", raising=False)
assert get_cuda_bnb_library_path(cuda120_spec).stem == "libbitsandbytes_cuda120"
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm")
def test_get_cuda_bnb_library_path_override(monkeypatch, cuda120_spec, caplog):
monkeypatch.setenv("BNB_CUDA_VERSION", "110")
assert get_cuda_bnb_library_path(cuda120_spec).stem == "libbitsandbytes_cuda110"
From fe1fe7ccd0ab1c2a41da85d865e467de691cefac Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 30 May 2025 17:34:11 +0530
Subject: [PATCH 63/85] Update test_functional.py
---
tests/test_functional.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 41ed7c984..5f5ee488c 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -796,7 +796,7 @@ def test_coo_int8_vectorwise_quant(self, device, dim1, dim2):
A[:, outlier_cols] = 0
torch.testing.assert_close(A * (idx == 0), A2, rtol=0.05, atol=1.5e-2)
-
+@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required")
class TestSpMMFunctional:
@pytest.mark.parametrize("dim1", [256, 1024], ids=id_formatter("dim1"))
From e198824c5c9e23bb15d6eb2aa07a04f09e95446f Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 30 May 2025 17:36:53 +0530
Subject: [PATCH 64/85] Update modules.py
---
bitsandbytes/nn/modules.py | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 937084cf1..6b6490265 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -212,7 +212,7 @@ def __new__(
data: Optional[torch.Tensor] = None,
requires_grad=False, # quantized weights should be frozen by default
quant_state: Optional[QuantState] = None,
- blocksize: int = 64,
+ blocksize: Optional[int] = None,
compress_statistics: bool = True,
quant_type: str = "fp4",
quant_storage: torch.dtype = torch.uint8,
@@ -221,7 +221,10 @@ def __new__(
) -> "Params4bit":
if data is None:
data = torch.empty(0)
-
+
+ if blocksize is None:
+ blocksize = 64 if not HIP_ENVIRONMENT else 128
+
self = torch.Tensor._make_subclass(cls, data, requires_grad)
self.blocksize = blocksize
self.compress_statistics = compress_statistics
From dd58310df17b69c63a9a06186e7f6bb24c98a199 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 30 May 2025 17:37:28 +0530
Subject: [PATCH 65/85] Update modules.py
---
bitsandbytes/nn/modules.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 6b6490265..2383f2c10 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -11,6 +11,7 @@
import torch.nn.functional as F
import bitsandbytes as bnb
+from bitsandbytes.cextension import HIP_ENVIRONMENT
from bitsandbytes.functional import QuantState
from bitsandbytes.optim import GlobalOptimManager
from bitsandbytes.utils import (
From 931bd70d868df8a663d32c3d4b410f72a45c1c3b Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 30 May 2025 17:50:14 +0530
Subject: [PATCH 66/85] Update ops.py
---
bitsandbytes/backends/cuda/ops.py | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index 29dddc96e..fd7b7b9a2 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -303,7 +303,11 @@ def _dequantize_blockwise_impl(
def _(
A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
) -> tuple[torch.Tensor, torch.Tensor]:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
torch._check(quant_type in ["fp4", "nf4"])
torch._check(
A.dtype in [torch.bfloat16, torch.float16, torch.float32],
@@ -381,7 +385,11 @@ def _dequantize_4bit_impl(
dtype: torch.dtype,
out: torch.Tensor,
) -> None:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
+
torch._check(quant_type in ["fp4", "nf4"])
torch._check(
dtype in [torch.bfloat16, torch.float16, torch.float32],
From 9e62d466d226a62bd61e73afd676a694e1d13eac Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Fri, 30 May 2025 18:56:05 +0530
Subject: [PATCH 67/85] Update test_linear4bit.py
---
tests/test_linear4bit.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index 474a00a1b..c241a265d 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -184,7 +184,7 @@ def test_linear_serialization(device, quant_type, compress_statistics, bias, qua
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
-@pytest.mark.parametrize("blocksize", [64, 128])
+@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128])
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
def test_copy_param(device, quant_type, blocksize, compress_statistics):
if device == "cpu":
@@ -209,7 +209,7 @@ def test_copy_param(device, quant_type, blocksize, compress_statistics):
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
-@pytest.mark.parametrize("blocksize", [64, 128])
+@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128])
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
if device == "cpu":
@@ -241,7 +241,7 @@ def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
-@pytest.mark.parametrize("blocksize", [64, 128])
+@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128])
@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics):
if device == "cpu":
From 1f71562a9ba57dd209f844549ffc8ff98bebb06d Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Mon, 2 Jun 2025 19:05:12 +0530
Subject: [PATCH 68/85] Update ops.py
---
bitsandbytes/backends/cpu/ops.py | 93 ++++++++++++++++++++++++++------
1 file changed, 76 insertions(+), 17 deletions(-)
diff --git a/bitsandbytes/backends/cpu/ops.py b/bitsandbytes/backends/cpu/ops.py
index d5ab9aa88..f58be5d2a 100644
--- a/bitsandbytes/backends/cpu/ops.py
+++ b/bitsandbytes/backends/cpu/ops.py
@@ -103,16 +103,39 @@ def _(
n = A.numel()
- # TODO: Support when weight matrix is not divisible by blocksize
- torch._check(n % blocksize == 0, lambda: f"n must be divisible by blocksize, got {n} and {blocksize}")
-
- # Divide into blocks and normalize
- blocks = A.reshape(-1, blocksize)
- absmax = blocks.abs().max(dim=1).values.float()
- scaled = blocks / absmax.unsqueeze(-1)
-
- # Quantize with the lookup table
- quantized = torch.argmin(torch.abs(scaled.view(-1, 1) - _NF4_QUANT_TABLE), dim=-1, keepdim=True).to(torch.uint8)
+ blocks = n // blocksize
+ rem = n % blocksize
+ has_rem = rem > 0
+ if has_rem:
+ blocks += 1
+
+ absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
+ A_reshaped = A.reshape(n)
+
+ if n >= blocksize:
+ A_com = A_reshaped[: n - rem]
+ A_com_reshaped = A_com.reshape(n // blocksize, blocksize)
+ absmax[:blocks - has_rem] = torch.abs(A_com_reshaped).max(dim=1).values.float()
+ scaled_A = torch.clamp(A_com_reshaped * (1 / absmax[:blocks - has_rem].unsqueeze(-1)), -1, 1)
+ scaled_A = scaled_A.reshape(-1)
+
+ if has_rem:
+ absmax[-1] = torch.abs(A_reshaped[n - rem :]).max().float()
+ scaled_A_rem = torch.clamp(A_reshaped[n - rem :] * (1 / absmax[-1]), -1, 1)
+ scaled_A = torch.cat([scaled_A, scaled_A_rem], dim=0)
+
+ # Quantize with the lookup table
+ quantized = torch.argmin(torch.abs(scaled_A.view(-1, 1) - _NF4_QUANT_TABLE), dim=-1, keepdim=True).to(torch.uint8)
+ else:
+ blocks = A.reshape(-1, blocksize)
+ absmax = blocks.abs().max(dim=1).values.float()
+ scaled_A = blocks / absmax.unsqueeze(-1)
+
+ # Quantize with the lookup table
+ quantized = torch.argmin(torch.abs(scaled_A.view(-1, 1) - _NF4_QUANT_TABLE), dim=-1, keepdim=True).to(torch.uint8)
+
+ if quantized.numel() % 2 == 1:
+ quantized = torch.cat([quantized, torch.zeros((1, 1), device=A.device, dtype=torch.uint8)])
# Pack two quantized values per byte
packed = quantized[::2] << 4 | quantized[1::2]
@@ -149,16 +172,52 @@ def _(
upper = (A >> 4).to(torch.int64)
lower = (A & 0x0F).to(torch.int64)
- # Expand to blocks
- blocks = torch.cat((upper, lower), dim=1).reshape(-1, blocksize)
+ # Calculate the total number of elements in the original tensor
+ n = 1
+ for d in shape:
+ n *= d
+
+ # Concatenate upper and lower nibbles
+ indices = torch.cat((upper, lower), dim=1).reshape(-1)
+
+ if indices.numel() > n:
+ indices = indices[:n]
+
+ blocks = n // blocksize
+ rem = n % blocksize
+ has_rem = rem > 0
+ if has_rem:
+ blocks += 1
+
+ if has_rem:
+ out = torch.empty(shape, dtype=dtype, device=A.device)
+ out_reshaped = out.reshape(-1)
+
+ padded_indices = torch.zeros(blocks * blocksize, dtype=indices.dtype, device=indices.device)
+ padded_indices[:n] = indices
+ blocks_data = padded_indices.reshape(-1, blocksize)
+
+ # Dequantize full blocks
+ dequantized = _NF4_QUANT_TABLE[blocks_data]
+
+ # Apply scales to full blocks
+ out_reshaped[:n - rem] = (
+ dequantized[:blocks - 1].reshape(-1, blocksize) * absmax[:blocks - 1].view(-1, 1)
+ ).reshape(-1)
+
+ # Apply scale to remainder block
+ out_reshaped[n - rem:] = dequantized[blocks - 1, :rem] * absmax[-1]
+ else:
+ # Expand to blocks
+ blocks = torch.cat((upper, lower), dim=1).reshape(-1, blocksize)
- # Dequantize
- blocks = _NF4_QUANT_TABLE[blocks] * absmax[:, None]
+ # Dequantize
+ blocks = _NF4_QUANT_TABLE[blocks] * absmax[:, None]
- # Reshape to original shape
- blocks = blocks.reshape(-1, *shape[1:])
+ # Reshape to original shape
+ out = blocks.reshape(-1, *shape[1:])
- return blocks.to(dtype)
+ return out.to(dtype)
@register_kernel("bitsandbytes::gemv_4bit", "cpu")
From eac7632e28043caad307cf2b5e1ff61fc9cbfe12 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Mon, 2 Jun 2025 20:46:28 +0530
Subject: [PATCH 69/85] Update ops.py
---
bitsandbytes/backends/cpu/ops.py | 93 ++++++--------------------------
1 file changed, 17 insertions(+), 76 deletions(-)
diff --git a/bitsandbytes/backends/cpu/ops.py b/bitsandbytes/backends/cpu/ops.py
index f58be5d2a..d5ab9aa88 100644
--- a/bitsandbytes/backends/cpu/ops.py
+++ b/bitsandbytes/backends/cpu/ops.py
@@ -103,39 +103,16 @@ def _(
n = A.numel()
- blocks = n // blocksize
- rem = n % blocksize
- has_rem = rem > 0
- if has_rem:
- blocks += 1
-
- absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32)
- A_reshaped = A.reshape(n)
-
- if n >= blocksize:
- A_com = A_reshaped[: n - rem]
- A_com_reshaped = A_com.reshape(n // blocksize, blocksize)
- absmax[:blocks - has_rem] = torch.abs(A_com_reshaped).max(dim=1).values.float()
- scaled_A = torch.clamp(A_com_reshaped * (1 / absmax[:blocks - has_rem].unsqueeze(-1)), -1, 1)
- scaled_A = scaled_A.reshape(-1)
-
- if has_rem:
- absmax[-1] = torch.abs(A_reshaped[n - rem :]).max().float()
- scaled_A_rem = torch.clamp(A_reshaped[n - rem :] * (1 / absmax[-1]), -1, 1)
- scaled_A = torch.cat([scaled_A, scaled_A_rem], dim=0)
-
- # Quantize with the lookup table
- quantized = torch.argmin(torch.abs(scaled_A.view(-1, 1) - _NF4_QUANT_TABLE), dim=-1, keepdim=True).to(torch.uint8)
- else:
- blocks = A.reshape(-1, blocksize)
- absmax = blocks.abs().max(dim=1).values.float()
- scaled_A = blocks / absmax.unsqueeze(-1)
-
- # Quantize with the lookup table
- quantized = torch.argmin(torch.abs(scaled_A.view(-1, 1) - _NF4_QUANT_TABLE), dim=-1, keepdim=True).to(torch.uint8)
-
- if quantized.numel() % 2 == 1:
- quantized = torch.cat([quantized, torch.zeros((1, 1), device=A.device, dtype=torch.uint8)])
+ # TODO: Support when weight matrix is not divisible by blocksize
+ torch._check(n % blocksize == 0, lambda: f"n must be divisible by blocksize, got {n} and {blocksize}")
+
+ # Divide into blocks and normalize
+ blocks = A.reshape(-1, blocksize)
+ absmax = blocks.abs().max(dim=1).values.float()
+ scaled = blocks / absmax.unsqueeze(-1)
+
+ # Quantize with the lookup table
+ quantized = torch.argmin(torch.abs(scaled.view(-1, 1) - _NF4_QUANT_TABLE), dim=-1, keepdim=True).to(torch.uint8)
# Pack two quantized values per byte
packed = quantized[::2] << 4 | quantized[1::2]
@@ -172,52 +149,16 @@ def _(
upper = (A >> 4).to(torch.int64)
lower = (A & 0x0F).to(torch.int64)
- # Calculate the total number of elements in the original tensor
- n = 1
- for d in shape:
- n *= d
-
- # Concatenate upper and lower nibbles
- indices = torch.cat((upper, lower), dim=1).reshape(-1)
-
- if indices.numel() > n:
- indices = indices[:n]
-
- blocks = n // blocksize
- rem = n % blocksize
- has_rem = rem > 0
- if has_rem:
- blocks += 1
-
- if has_rem:
- out = torch.empty(shape, dtype=dtype, device=A.device)
- out_reshaped = out.reshape(-1)
-
- padded_indices = torch.zeros(blocks * blocksize, dtype=indices.dtype, device=indices.device)
- padded_indices[:n] = indices
- blocks_data = padded_indices.reshape(-1, blocksize)
-
- # Dequantize full blocks
- dequantized = _NF4_QUANT_TABLE[blocks_data]
-
- # Apply scales to full blocks
- out_reshaped[:n - rem] = (
- dequantized[:blocks - 1].reshape(-1, blocksize) * absmax[:blocks - 1].view(-1, 1)
- ).reshape(-1)
-
- # Apply scale to remainder block
- out_reshaped[n - rem:] = dequantized[blocks - 1, :rem] * absmax[-1]
- else:
- # Expand to blocks
- blocks = torch.cat((upper, lower), dim=1).reshape(-1, blocksize)
+ # Expand to blocks
+ blocks = torch.cat((upper, lower), dim=1).reshape(-1, blocksize)
- # Dequantize
- blocks = _NF4_QUANT_TABLE[blocks] * absmax[:, None]
+ # Dequantize
+ blocks = _NF4_QUANT_TABLE[blocks] * absmax[:, None]
- # Reshape to original shape
- out = blocks.reshape(-1, *shape[1:])
+ # Reshape to original shape
+ blocks = blocks.reshape(-1, *shape[1:])
- return out.to(dtype)
+ return blocks.to(dtype)
@register_kernel("bitsandbytes::gemv_4bit", "cpu")
From 66dcfc407f59052fa9d5359cdebf619886100033 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Mon, 2 Jun 2025 21:16:02 +0530
Subject: [PATCH 70/85] Update test_linear4bit.py
---
tests/test_linear4bit.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index c241a265d..1b7a7722c 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -17,7 +17,6 @@
"float32": torch.float32,
}
-@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("quant_storage", ["uint8", "float16", "bfloat16", "float32"])
@pytest.mark.parametrize("bias", TRUE_FALSE, ids=id_formatter("bias"))
From b96905d26c63355884e7decc65297591e108679d Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Mon, 2 Jun 2025 21:17:02 +0530
Subject: [PATCH 71/85] Update test_linear4bit.py
From a2a74edef39e30b66e0b43accf1ceb7d48378f24 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 2 Jun 2025 11:50:12 -0400
Subject: [PATCH 72/85] Bump dev version
---
bitsandbytes/__init__.py | 2 +-
setup.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 9a2524953..5014e8240 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -67,4 +67,4 @@ def _import_backends():
"optim.optimizer.MockArgs": False,
}
-__version__ = "0.46.0"
+__version__ = "0.47.0.dev0"
diff --git a/setup.py b/setup.py
index 3208bf1f0..8c84b2c73 100644
--- a/setup.py
+++ b/setup.py
@@ -12,4 +12,4 @@ def has_ext_modules(self):
return True
-setup(version="0.46.0", packages=find_packages(), distclass=BinaryDistribution)
+setup(version="0.47.0.dev0", packages=find_packages(), distclass=BinaryDistribution)
From ef31c362e22b201551605bc6d808026ea33da59c Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Mon, 2 Jun 2025 23:55:14 +0530
Subject: [PATCH 73/85] Update python-package.yml
---
.github/workflows/python-package.yml | 643 ++++++++++++++-------------
1 file changed, 343 insertions(+), 300 deletions(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index fbaa27d56..10daf0f79 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -1,303 +1,346 @@
-name: Python package
-
-on:
- push: {}
- pull_request:
- branches: [main]
- paths:
- - ".github/workflows/python-package.yml"
- - "bitsandbytes/**"
- - "csrc/**"
- - "include/**"
- - "tests/**"
- - "CMakeLists.txt"
- - "requirements*.txt"
- - "setup.py"
- - "pyproject.toml"
- release:
- types: [published]
- workflow_dispatch: {} # Allow manual trigger
- workflow_call: {} # Allow triggering from other worfkflows
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
- cancel-in-progress: true
-
-jobs:
- ##
- # This job matrix builds the non-CUDA versions of the libraries for all supported platforms.
- ##
- build-shared-libs:
- strategy:
- matrix:
- include:
- - os: ubuntu-22.04
- arch: x86_64
- - os: ubuntu-22.04-arm
- arch: aarch64
- - os: windows-latest
- arch: x86_64
- - os: macos-latest
- arch: arm64
- runs-on: ${{ matrix.os }}
- steps:
- - uses: actions/checkout@v4
- - name: Setup MSVC
- if: startsWith(matrix.os, 'windows')
- uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl
- - name: Build C++
- run: bash .github/scripts/build-cpu.sh
- env:
- build_os: ${{ matrix.os }}
- build_arch: ${{ matrix.arch }}
- - name: Upload build artifact
- uses: actions/upload-artifact@v4
- with:
- name: shared_library_${{ matrix.os }}_${{ matrix.arch }}
- path: output/*
- retention-days: 7
- ##
- # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
- ##
- build-shared-libs-cuda:
- strategy:
- fail-fast: false
- matrix:
- os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest]
- include:
- - os: ubuntu-22.04
- arch: x86_64
- - os: ubuntu-22.04-arm
- arch: aarch64
- - os: windows-latest
- arch: x86_64
- cuda_version:
- ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"]
- runs-on: ${{ matrix.os }}
- steps:
- - uses: actions/checkout@v4
- # Windows: We install Cuda on the agent (slow)
- - uses: Jimver/cuda-toolkit@v0.2.22
- if: startsWith(matrix.os, 'windows')
- id: cuda-toolkit
- with:
- cuda: ${{ matrix.cuda_version }}
- method: "network"
- sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
- linux-local-args: '["--toolkit"]'
- use-github-cache: false
- - name: Setup MSVC
- if: startsWith(matrix.os, 'windows')
- uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl
- - name: Build C++
- run: bash .github/scripts/build-cuda.sh
- env:
- build_os: ${{ matrix.os }}
- build_arch: ${{ matrix.arch }}
- cuda_version: ${{ matrix.cuda_version }}
- - name: Upload build artifact
- uses: actions/upload-artifact@v4
- with:
- name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }}
- path: output/*
- retention-days: 7
-
- build-wheels:
- needs:
- - build-shared-libs
- - build-shared-libs-cuda
- strategy:
- matrix:
- os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]
- include:
- - os: ubuntu-22.04
- arch: x86_64
- - os: ubuntu-22.04-arm
- arch: aarch64
- - os: windows-latest
- arch: x86_64
- - os: macos-latest
- arch: arm64
- # The specific Python version is irrelevant in this context as we are only packaging non-C extension
- # code. This ensures compatibility across Python versions, including Python 3.9, as compatibility is
- # dictated by the packaged code itself, not the Python version used for packaging.
- python-version: ["3.10"]
- runs-on: ${{ matrix.os }}
- steps:
- - uses: actions/checkout@v4
- - name: Download build artifacts
- uses: actions/download-artifact@v4
- with:
- merge-multiple: true
- pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*"
- path: output/
- - name: Copy correct platform shared library
- shell: bash
- run: |
- ls -lR output/
- cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v5
- with:
- python-version: ${{ matrix.python-version }}
- cache: pip
- - run: pip install build wheel
- - run: python -m build .
- - name: Determine and Set Platform Tag, then Tag Wheel
- shell: bash
- run: |
- PLATFORM_TAG=$(python .github/scripts/set_platform_tag.py "${{ matrix.arch }}")
- echo "PLATFORM_TAG=$PLATFORM_TAG"
- wheel tags --remove --abi-tag=none --python-tag=py3 --platform-tag=$PLATFORM_TAG dist/bitsandbytes-*.whl
- - name: Upload build artifact
- uses: actions/upload-artifact@v4
- with:
- name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}
- path: dist/bitsandbytes-*.whl
- retention-days: 7
-
- upload-pre-release-wheels:
- name: Create release and upload artifacts
- runs-on: ubuntu-latest
- if: github.ref_name == 'main'
- permissions:
- contents: write
- needs:
- - build-wheels
- steps:
- - name: Download and rename artifacts
- uses: actions/download-artifact@v4
- with:
- path: tmp/
- pattern: "bdist_wheel_*"
- merge-multiple: true
+name: Python package
- - name: Inspect tmp directory after downloading artifacts
- run: ls -alFR tmp/
+on:
+ push: {}
+ pull_request:
+ branches: [main]
+ paths:
+ - ".github/workflows/python-package.yml"
+ - "bitsandbytes/**"
+ - "csrc/**"
+ - "include/**"
+ - "tests/**"
+ - "CMakeLists.txt"
+ - "requirements*.txt"
+ - "setup.py"
+ - "pyproject.toml"
+ release:
+ types: [published]
+ workflow_dispatch: {} # Allow manual trigger
+ workflow_call: {} # Allow triggering from other worfkflows
- - name: Move and rename wheel files with pattern replacement
- run: |
- mkdir -p wheels/
-
- # The whole point of the continuous release is to have a stable download link and the only way to have a PEP 440–compliant wheel name
- # is to use a stable placeholder version. Otherwise, pip won't let you install the wheel. The cool thing is that we can now install the
- # wheel directly from the GH pre-release which gets updated continuously, e.g.
- # `pip install https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl`
- STABLE_PLACEHOLDER_VERSION="1.33.7.preview"
-
- # exclude macos wheels for now
- find tmp/ -type f -name '*.whl' ! -name '*macos*' -print0 | while IFS= read -r -d '' wheel; do
- wheel_filename=$(basename "$wheel")
-
- # Strip off the original version
- rest=${wheel_filename#bitsandbytes-*-}
- new_name="bitsandbytes-${STABLE_PLACEHOLDER_VERSION}-${rest}"
-
- echo "Renaming $wheel_filename → $new_name"
- mv "$wheel" "wheels/${new_name}"
- done
-
- - name: Inspect wheels directory after renaming files
- run: ls -alFR wheels/
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+ cancel-in-progress: true
- - name: Delete old pre-release (if exists)
- run: |
- gh release delete continuous-release_main --cleanup-tag -y || true
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
- - name: Generate pip install commands for release body
- run: |
- cat > body.md << 'ENDOFMARKDOWN'
- ## Latest `main` Wheel Pre-release
-
- This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch.
-
- **How to install:**
- Pick the correct command for your platform and run it in your terminal:
-
- ENDOFMARKDOWN
-
- for whl in wheels/*.whl; do
- fname=$(basename "$whl")
- url="https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/$fname"
- echo "\`\`\`sh" >> body.md
- echo "pip install $url" >> body.md
- echo "\`\`\`" >> body.md
- echo "" >> body.md
- done
-
- cat >> body.md << 'ENDOFMARKDOWN'
- > **Note:**
- > These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes.
- ENDOFMARKDOWN
-
- # for debugging:
- cat body.md
-
- - name: Create new pre-release and upload artifacts
- uses: softprops/action-gh-release@v2.2.1
- with:
- files: wheels/*.whl
- prerelease: true
- name: Latest `main` wheel
- body_path: body.md
- tag_name: continuous-release_main
- make_latest: false
- draft: false
- target_commitish: ${{ github.sha }}
-
- audit-wheels:
- needs: build-wheels
- strategy:
- matrix:
- os: [ubuntu-22.04, ubuntu-22.04-arm]
- include:
- - os: ubuntu-22.04
- arch: x86_64
- - os: ubuntu-22.04-arm
- arch: aarch64
- runs-on: ${{ matrix.os }}
- env:
- PIP_DISABLE_PIP_VERSION_CHECK: 1
- steps:
- - uses: actions/checkout@v4
- - name: Download wheel
- uses: actions/download-artifact@v4
- with:
- name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}
- path: wheels/
- - name: Set up Python
- uses: actions/setup-python@v5
- with:
- python-version: "3.12"
- - run: pip install auditwheel
- - run: python ./.github/scripts/auditwheel_show.py wheels/* | tee $GITHUB_STEP_SUMMARY
-
- publish-wheels:
- name: Publish wheels to PyPI
- needs: [build-wheels, audit-wheels]
- runs-on: ubuntu-latest
- if: |
- github.repository == 'bitsandbytes-foundation/bitsandbytes'
- && github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
- environment:
- name: release
- url: https://pypi.org/p/bitsandbytes
- permissions:
- id-token: write
- steps:
- - name: Download distribution artifacts
- uses: actions/download-artifact@v4
- with:
- path: dist/
- pattern: "bdist_wheel_*"
- merge-multiple: true
-
- - name: Remove macOS wheels
- run: rm dist/*macos*
-
- - name: Publish to PyPI
- uses: pypa/gh-action-pypi-publish@release/v1
- with:
- print-hash: true
+jobs:
+ ##
+ # This job matrix builds the non-CUDA versions of the libraries for all supported platforms.
+ ##
+ build-shared-libs:
+ strategy:
+ matrix:
+ include:
+ - os: ubuntu-22.04
+ arch: x86_64
+ - os: ubuntu-22.04-arm
+ arch: aarch64
+ - os: windows-latest
+ arch: x86_64
+ - os: macos-latest
+ arch: arm64
+ runs-on: ${{ matrix.os }}
+ steps:
+ - uses: actions/checkout@v4
+ - name: Setup MSVC
+ if: startsWith(matrix.os, 'windows')
+ uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl
+ - name: Build C++
+ run: bash .github/scripts/build-cpu.sh
+ env:
+ build_os: ${{ matrix.os }}
+ build_arch: ${{ matrix.arch }}
+ - name: Upload build artifact
+ uses: actions/upload-artifact@v4
+ with:
+ name: shared_library_${{ matrix.os }}_${{ matrix.arch }}
+ path: output/*
+ retention-days: 7
+ ##
+ # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
+ ##
+ build-shared-libs-cuda:
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest]
+ include:
+ - os: ubuntu-22.04
+ arch: x86_64
+ - os: ubuntu-22.04-arm
+ arch: aarch64
+ - os: windows-latest
+ arch: x86_64
+ cuda_version:
+ ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"]
+ runs-on: ${{ matrix.os }}
+ steps:
+ - uses: actions/checkout@v4
+ # Windows: We install Cuda on the agent (slow)
+ - uses: Jimver/cuda-toolkit@v0.2.22
+ if: startsWith(matrix.os, 'windows')
+ id: cuda-toolkit
+ with:
+ cuda: ${{ matrix.cuda_version }}
+ method: "network"
+ sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
+ linux-local-args: '["--toolkit"]'
+ use-github-cache: false
+ - name: Setup MSVC
+ if: startsWith(matrix.os, 'windows')
+ uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl
+ - name: Build C++
+ run: bash .github/scripts/build-cuda.sh
+ env:
+ build_os: ${{ matrix.os }}
+ build_arch: ${{ matrix.arch }}
+ cuda_version: ${{ matrix.cuda_version }}
+ - name: Upload build artifact
+ uses: actions/upload-artifact@v4
+ with:
+ name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }}
+ path: output/*
+ retention-days: 7
+ build-shared-libs-rocm:
+ strategy:
+ matrix:
+ os: [ubuntu-22.04]
+ arch: [x86_64]
+ rocm_version:
+ ["6.1.2", "6.2.4", "6.3.2"]
+ runs-on: ${{ matrix.os }}
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up Docker multiarch
+ uses: docker/setup-qemu-action@v3
+ - name: Clean up disk space
+ run: |
+ sudo rm -rf \
+ /usr/share/dotnet \
+ /opt/ghc \
+ "/usr/local/share/boost" \
+ "$AGENT_TOOLSDIRECTORY" \
+ /opt/hostedtoolcache \
+ /opt/google/chrome \
+ /opt/microsoft/msedge \
+ /opt/microsoft/powershell \
+ /opt/pipx \
+ /usr/lib/mono \
+ /usr/local/julia* \
+ /usr/local/lib/android \
+ /usr/local/lib/node_modules \
+ /usr/local/share/chromium \
+ /usr/local/share/powershell \
+ /usr/share/swift
+ - name: Build C++
+ run: bash .github/scripts/build-rocm.sh
+ env:
+ build_os: ${{ matrix.os }}
+ build_arch: ${{ matrix.arch }}
+ rocm_version: ${{ matrix.rocm_version }}
+ - name: Upload build artifact
+ uses: actions/upload-artifact@v4
+ with:
+ name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}
+ path: output/*
+ retention-days: 7
+ build-wheels:
+ needs:
+ - build-shared-libs
+ - build-shared-libs-cuda
+ - build-shared-libs-rocm
+ strategy:
+ matrix:
+ os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]
+ include:
+ - os: ubuntu-22.04
+ arch: x86_64
+ - os: ubuntu-22.04-arm
+ arch: aarch64
+ - os: windows-latest
+ arch: x86_64
+ - os: macos-latest
+ arch: arm64
+ # The specific Python version is irrelevant in this context as we are only packaging non-C extension
+ # code. This ensures compatibility across Python versions, including Python 3.9, as compatibility is
+ # dictated by the packaged code itself, not the Python version used for packaging.
+ python-version: ["3.10"]
+ runs-on: ${{ matrix.os }}
+ steps:
+ - uses: actions/checkout@v4
+ - name: Download build artifacts
+ uses: actions/download-artifact@v4
+ with:
+ merge-multiple: true
+ pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*"
+ path: output/
+ - name: Copy correct platform shared library
+ shell: bash
+ run: |
+ ls -lR output/
+ cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: pip
+ - run: pip install build wheel
+ - run: python -m build .
+ - name: Determine and Set Platform Tag, then Tag Wheel
+ shell: bash
+ run: |
+ PLATFORM_TAG=$(python .github/scripts/set_platform_tag.py "${{ matrix.arch }}")
+ echo "PLATFORM_TAG=$PLATFORM_TAG"
+ wheel tags --remove --abi-tag=none --python-tag=py3 --platform-tag=$PLATFORM_TAG dist/bitsandbytes-*.whl
+ - name: Upload build artifact
+ uses: actions/upload-artifact@v4
+ with:
+ name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}
+ path: dist/bitsandbytes-*.whl
+ retention-days: 7
+
+ upload-pre-release-wheels:
+ name: Create release and upload artifacts
+ runs-on: ubuntu-latest
+ if: github.ref_name == 'main'
+ permissions:
+ contents: write
+ needs:
+ - build-wheels
+ steps:
+ - name: Download and rename artifacts
+ uses: actions/download-artifact@v4
+ with:
+ path: tmp/
+ pattern: "bdist_wheel_*"
+ merge-multiple: true
+
+ - name: Inspect tmp directory after downloading artifacts
+ run: ls -alFR tmp/
+
+ - name: Move and rename wheel files with pattern replacement
+ run: |
+ mkdir -p wheels/
+
+ # The whole point of the continuous release is to have a stable download link and the only way to have a PEP 440–compliant wheel name
+ # is to use a stable placeholder version. Otherwise, pip won't let you install the wheel. The cool thing is that we can now install the
+ # wheel directly from the GH pre-release which gets updated continuously, e.g.
+ # `pip install https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl`
+ STABLE_PLACEHOLDER_VERSION="1.33.7.preview"
+
+ # exclude macos wheels for now
+ find tmp/ -type f -name '*.whl' ! -name '*macos*' -print0 | while IFS= read -r -d '' wheel; do
+ wheel_filename=$(basename "$wheel")
+
+ # Strip off the original version
+ rest=${wheel_filename#bitsandbytes-*-}
+ new_name="bitsandbytes-${STABLE_PLACEHOLDER_VERSION}-${rest}"
+
+ echo "Renaming $wheel_filename → $new_name"
+ mv "$wheel" "wheels/${new_name}"
+ done
+
+ - name: Inspect wheels directory after renaming files
+ run: ls -alFR wheels/
+
+ - name: Delete old pre-release (if exists)
+ run: |
+ gh release delete continuous-release_main --cleanup-tag -y || true
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Generate pip install commands for release body
+ run: |
+ cat > body.md << 'ENDOFMARKDOWN'
+ ## Latest `main` Wheel Pre-release
+
+ This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch.
+
+ **How to install:**
+ Pick the correct command for your platform and run it in your terminal:
+
+ ENDOFMARKDOWN
+
+ for whl in wheels/*.whl; do
+ fname=$(basename "$whl")
+ url="https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/$fname"
+ echo "\`\`\`sh" >> body.md
+ echo "pip install $url" >> body.md
+ echo "\`\`\`" >> body.md
+ echo "" >> body.md
+ done
+
+ cat >> body.md << 'ENDOFMARKDOWN'
+ > **Note:**
+ > These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes.
+ ENDOFMARKDOWN
+
+ # for debugging:
+ cat body.md
+
+ - name: Create new pre-release and upload artifacts
+ uses: softprops/action-gh-release@v2.2.1
+ with:
+ files: wheels/*.whl
+ prerelease: true
+ name: Latest `main` wheel
+ body_path: body.md
+ tag_name: continuous-release_main
+ make_latest: false
+ draft: false
+ target_commitish: ${{ github.sha }}
+
+ audit-wheels:
+ needs: build-wheels
+ strategy:
+ matrix:
+ os: [ubuntu-22.04, ubuntu-22.04-arm]
+ include:
+ - os: ubuntu-22.04
+ arch: x86_64
+ - os: ubuntu-22.04-arm
+ arch: aarch64
+ runs-on: ${{ matrix.os }}
+ env:
+ PIP_DISABLE_PIP_VERSION_CHECK: 1
+ steps:
+ - uses: actions/checkout@v4
+ - name: Download wheel
+ uses: actions/download-artifact@v4
+ with:
+ name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}
+ path: wheels/
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: "3.12"
+ - run: pip install auditwheel
+ - run: python ./.github/scripts/auditwheel_show.py wheels/* | tee $GITHUB_STEP_SUMMARY
+
+ publish-wheels:
+ name: Publish wheels to PyPI
+ needs: [build-wheels, audit-wheels]
+ runs-on: ubuntu-latest
+ if: |
+ github.repository == 'bitsandbytes-foundation/bitsandbytes'
+ && github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
+ environment:
+ name: release
+ url: https://pypi.org/p/bitsandbytes
+ permissions:
+ id-token: write
+ steps:
+ - name: Download distribution artifacts
+ uses: actions/download-artifact@v4
+ with:
+ path: dist/
+ pattern: "bdist_wheel_*"
+ merge-multiple: true
+
+ - name: Remove macOS wheels
+ run: rm dist/*macos*
+
+ - name: Publish to PyPI
+ uses: pypa/gh-action-pypi-publish@release/v1
+ with:
+ print-hash: true
From e1435f01776137c3a253228b4234a23535532161 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Mon, 2 Jun 2025 23:57:25 +0530
Subject: [PATCH 74/85] Update python-package.yml
---
.github/workflows/python-package.yml | 643 +++++++++++++--------------
1 file changed, 300 insertions(+), 343 deletions(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 10daf0f79..fbaa27d56 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -1,346 +1,303 @@
-name: Python package
+name: Python package
+
+on:
+ push: {}
+ pull_request:
+ branches: [main]
+ paths:
+ - ".github/workflows/python-package.yml"
+ - "bitsandbytes/**"
+ - "csrc/**"
+ - "include/**"
+ - "tests/**"
+ - "CMakeLists.txt"
+ - "requirements*.txt"
+ - "setup.py"
+ - "pyproject.toml"
+ release:
+ types: [published]
+ workflow_dispatch: {} # Allow manual trigger
+ workflow_call: {} # Allow triggering from other workflows
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ ##
+ # This job matrix builds the non-CUDA versions of the libraries for all supported platforms.
+ ##
+ build-shared-libs:
+ strategy:
+ matrix:
+ include:
+ - os: ubuntu-22.04
+ arch: x86_64
+ - os: ubuntu-22.04-arm
+ arch: aarch64
+ - os: windows-latest
+ arch: x86_64
+ - os: macos-latest
+ arch: arm64
+ runs-on: ${{ matrix.os }}
+ steps:
+ - uses: actions/checkout@v4
+ - name: Setup MSVC
+ if: startsWith(matrix.os, 'windows')
+ uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl
+ - name: Build C++
+ run: bash .github/scripts/build-cpu.sh
+ env:
+ build_os: ${{ matrix.os }}
+ build_arch: ${{ matrix.arch }}
+ - name: Upload build artifact
+ uses: actions/upload-artifact@v4
+ with:
+ name: shared_library_${{ matrix.os }}_${{ matrix.arch }}
+ path: output/*
+ retention-days: 7
+ ##
+ # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
+ ##
+ build-shared-libs-cuda:
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest]
+ include:
+ - os: ubuntu-22.04
+ arch: x86_64
+ - os: ubuntu-22.04-arm
+ arch: aarch64
+ - os: windows-latest
+ arch: x86_64
+ cuda_version:
+ ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"]
+ runs-on: ${{ matrix.os }}
+ steps:
+ - uses: actions/checkout@v4
+ # Windows: We install Cuda on the agent (slow)
+ - uses: Jimver/cuda-toolkit@v0.2.22
+ if: startsWith(matrix.os, 'windows')
+ id: cuda-toolkit
+ with:
+ cuda: ${{ matrix.cuda_version }}
+ method: "network"
+ sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
+ linux-local-args: '["--toolkit"]'
+ use-github-cache: false
+ - name: Setup MSVC
+ if: startsWith(matrix.os, 'windows')
+ uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl
+ - name: Build C++
+ run: bash .github/scripts/build-cuda.sh
+ env:
+ build_os: ${{ matrix.os }}
+ build_arch: ${{ matrix.arch }}
+ cuda_version: ${{ matrix.cuda_version }}
+ - name: Upload build artifact
+ uses: actions/upload-artifact@v4
+ with:
+ name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }}
+ path: output/*
+ retention-days: 7
+
+ build-wheels:
+ needs:
+ - build-shared-libs
+ - build-shared-libs-cuda
+ strategy:
+ matrix:
+ os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]
+ include:
+ - os: ubuntu-22.04
+ arch: x86_64
+ - os: ubuntu-22.04-arm
+ arch: aarch64
+ - os: windows-latest
+ arch: x86_64
+ - os: macos-latest
+ arch: arm64
+ # The specific Python version is irrelevant in this context as we are only packaging non-C extension
+ # code. This ensures compatibility across Python versions, including Python 3.9, as compatibility is
+ # dictated by the packaged code itself, not the Python version used for packaging.
+ python-version: ["3.10"]
+ runs-on: ${{ matrix.os }}
+ steps:
+ - uses: actions/checkout@v4
+ - name: Download build artifacts
+ uses: actions/download-artifact@v4
+ with:
+ merge-multiple: true
+ pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*"
+ path: output/
+ - name: Copy correct platform shared library
+ shell: bash
+ run: |
+ ls -lR output/
+ cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: pip
+ - run: pip install build wheel
+ - run: python -m build .
+ - name: Determine and Set Platform Tag, then Tag Wheel
+ shell: bash
+ run: |
+ PLATFORM_TAG=$(python .github/scripts/set_platform_tag.py "${{ matrix.arch }}")
+ echo "PLATFORM_TAG=$PLATFORM_TAG"
+ wheel tags --remove --abi-tag=none --python-tag=py3 --platform-tag=$PLATFORM_TAG dist/bitsandbytes-*.whl
+ - name: Upload build artifact
+ uses: actions/upload-artifact@v4
+ with:
+ name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}
+ path: dist/bitsandbytes-*.whl
+ retention-days: 7
+
+ upload-pre-release-wheels:
+ name: Create release and upload artifacts
+ runs-on: ubuntu-latest
+ if: github.ref_name == 'main'
+ permissions:
+ contents: write
+ needs:
+ - build-wheels
+ steps:
+ - name: Download and rename artifacts
+ uses: actions/download-artifact@v4
+ with:
+ path: tmp/
+ pattern: "bdist_wheel_*"
+ merge-multiple: true
-on:
- push: {}
- pull_request:
- branches: [main]
- paths:
- - ".github/workflows/python-package.yml"
- - "bitsandbytes/**"
- - "csrc/**"
- - "include/**"
- - "tests/**"
- - "CMakeLists.txt"
- - "requirements*.txt"
- - "setup.py"
- - "pyproject.toml"
- release:
- types: [published]
- workflow_dispatch: {} # Allow manual trigger
- workflow_call: {} # Allow triggering from other worfkflows
+ - name: Inspect tmp directory after downloading artifacts
+ run: ls -alFR tmp/
-concurrency:
- group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
- cancel-in-progress: true
+ - name: Move and rename wheel files with pattern replacement
+ run: |
+ mkdir -p wheels/
+
+ # The whole point of the continuous release is to have a stable download link and the only way to have a PEP 440–compliant wheel name
+ # is to use a stable placeholder version. Otherwise, pip won't let you install the wheel. The cool thing is that we can now install the
+ # wheel directly from the GH pre-release which gets updated continuously, e.g.
+ # `pip install https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl`
+ STABLE_PLACEHOLDER_VERSION="1.33.7.preview"
+
+ # exclude macos wheels for now
+ find tmp/ -type f -name '*.whl' ! -name '*macos*' -print0 | while IFS= read -r -d '' wheel; do
+ wheel_filename=$(basename "$wheel")
+
+ # Strip off the original version
+ rest=${wheel_filename#bitsandbytes-*-}
+ new_name="bitsandbytes-${STABLE_PLACEHOLDER_VERSION}-${rest}"
+
+ echo "Renaming $wheel_filename → $new_name"
+ mv "$wheel" "wheels/${new_name}"
+ done
+
+ - name: Inspect wheels directory after renaming files
+ run: ls -alFR wheels/
-jobs:
- ##
- # This job matrix builds the non-CUDA versions of the libraries for all supported platforms.
- ##
- build-shared-libs:
- strategy:
- matrix:
- include:
- - os: ubuntu-22.04
- arch: x86_64
- - os: ubuntu-22.04-arm
- arch: aarch64
- - os: windows-latest
- arch: x86_64
- - os: macos-latest
- arch: arm64
- runs-on: ${{ matrix.os }}
- steps:
- - uses: actions/checkout@v4
- - name: Setup MSVC
- if: startsWith(matrix.os, 'windows')
- uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl
- - name: Build C++
- run: bash .github/scripts/build-cpu.sh
- env:
- build_os: ${{ matrix.os }}
- build_arch: ${{ matrix.arch }}
- - name: Upload build artifact
- uses: actions/upload-artifact@v4
- with:
- name: shared_library_${{ matrix.os }}_${{ matrix.arch }}
- path: output/*
- retention-days: 7
- ##
- # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
- ##
- build-shared-libs-cuda:
- strategy:
- fail-fast: false
- matrix:
- os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest]
- include:
- - os: ubuntu-22.04
- arch: x86_64
- - os: ubuntu-22.04-arm
- arch: aarch64
- - os: windows-latest
- arch: x86_64
- cuda_version:
- ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"]
- runs-on: ${{ matrix.os }}
- steps:
- - uses: actions/checkout@v4
- # Windows: We install Cuda on the agent (slow)
- - uses: Jimver/cuda-toolkit@v0.2.22
- if: startsWith(matrix.os, 'windows')
- id: cuda-toolkit
- with:
- cuda: ${{ matrix.cuda_version }}
- method: "network"
- sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
- linux-local-args: '["--toolkit"]'
- use-github-cache: false
- - name: Setup MSVC
- if: startsWith(matrix.os, 'windows')
- uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl
- - name: Build C++
- run: bash .github/scripts/build-cuda.sh
- env:
- build_os: ${{ matrix.os }}
- build_arch: ${{ matrix.arch }}
- cuda_version: ${{ matrix.cuda_version }}
- - name: Upload build artifact
- uses: actions/upload-artifact@v4
- with:
- name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }}
- path: output/*
- retention-days: 7
- build-shared-libs-rocm:
- strategy:
- matrix:
- os: [ubuntu-22.04]
- arch: [x86_64]
- rocm_version:
- ["6.1.2", "6.2.4", "6.3.2"]
- runs-on: ${{ matrix.os }}
- steps:
- - uses: actions/checkout@v4
- - name: Set up Docker multiarch
- uses: docker/setup-qemu-action@v3
- - name: Clean up disk space
- run: |
- sudo rm -rf \
- /usr/share/dotnet \
- /opt/ghc \
- "/usr/local/share/boost" \
- "$AGENT_TOOLSDIRECTORY" \
- /opt/hostedtoolcache \
- /opt/google/chrome \
- /opt/microsoft/msedge \
- /opt/microsoft/powershell \
- /opt/pipx \
- /usr/lib/mono \
- /usr/local/julia* \
- /usr/local/lib/android \
- /usr/local/lib/node_modules \
- /usr/local/share/chromium \
- /usr/local/share/powershell \
- /usr/share/swift
- - name: Build C++
- run: bash .github/scripts/build-rocm.sh
- env:
- build_os: ${{ matrix.os }}
- build_arch: ${{ matrix.arch }}
- rocm_version: ${{ matrix.rocm_version }}
- - name: Upload build artifact
- uses: actions/upload-artifact@v4
- with:
- name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}
- path: output/*
- retention-days: 7
- build-wheels:
- needs:
- - build-shared-libs
- - build-shared-libs-cuda
- - build-shared-libs-rocm
- strategy:
- matrix:
- os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]
- include:
- - os: ubuntu-22.04
- arch: x86_64
- - os: ubuntu-22.04-arm
- arch: aarch64
- - os: windows-latest
- arch: x86_64
- - os: macos-latest
- arch: arm64
- # The specific Python version is irrelevant in this context as we are only packaging non-C extension
- # code. This ensures compatibility across Python versions, including Python 3.9, as compatibility is
- # dictated by the packaged code itself, not the Python version used for packaging.
- python-version: ["3.10"]
- runs-on: ${{ matrix.os }}
- steps:
- - uses: actions/checkout@v4
- - name: Download build artifacts
- uses: actions/download-artifact@v4
- with:
- merge-multiple: true
- pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*"
- path: output/
- - name: Copy correct platform shared library
- shell: bash
- run: |
- ls -lR output/
- cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v5
- with:
- python-version: ${{ matrix.python-version }}
- cache: pip
- - run: pip install build wheel
- - run: python -m build .
- - name: Determine and Set Platform Tag, then Tag Wheel
- shell: bash
- run: |
- PLATFORM_TAG=$(python .github/scripts/set_platform_tag.py "${{ matrix.arch }}")
- echo "PLATFORM_TAG=$PLATFORM_TAG"
- wheel tags --remove --abi-tag=none --python-tag=py3 --platform-tag=$PLATFORM_TAG dist/bitsandbytes-*.whl
- - name: Upload build artifact
- uses: actions/upload-artifact@v4
- with:
- name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}
- path: dist/bitsandbytes-*.whl
- retention-days: 7
-
- upload-pre-release-wheels:
- name: Create release and upload artifacts
- runs-on: ubuntu-latest
- if: github.ref_name == 'main'
- permissions:
- contents: write
- needs:
- - build-wheels
- steps:
- - name: Download and rename artifacts
- uses: actions/download-artifact@v4
- with:
- path: tmp/
- pattern: "bdist_wheel_*"
- merge-multiple: true
-
- - name: Inspect tmp directory after downloading artifacts
- run: ls -alFR tmp/
-
- - name: Move and rename wheel files with pattern replacement
- run: |
- mkdir -p wheels/
-
- # The whole point of the continuous release is to have a stable download link and the only way to have a PEP 440–compliant wheel name
- # is to use a stable placeholder version. Otherwise, pip won't let you install the wheel. The cool thing is that we can now install the
- # wheel directly from the GH pre-release which gets updated continuously, e.g.
- # `pip install https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl`
- STABLE_PLACEHOLDER_VERSION="1.33.7.preview"
-
- # exclude macos wheels for now
- find tmp/ -type f -name '*.whl' ! -name '*macos*' -print0 | while IFS= read -r -d '' wheel; do
- wheel_filename=$(basename "$wheel")
-
- # Strip off the original version
- rest=${wheel_filename#bitsandbytes-*-}
- new_name="bitsandbytes-${STABLE_PLACEHOLDER_VERSION}-${rest}"
-
- echo "Renaming $wheel_filename → $new_name"
- mv "$wheel" "wheels/${new_name}"
- done
-
- - name: Inspect wheels directory after renaming files
- run: ls -alFR wheels/
-
- - name: Delete old pre-release (if exists)
- run: |
- gh release delete continuous-release_main --cleanup-tag -y || true
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
- - name: Generate pip install commands for release body
- run: |
- cat > body.md << 'ENDOFMARKDOWN'
- ## Latest `main` Wheel Pre-release
-
- This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch.
-
- **How to install:**
- Pick the correct command for your platform and run it in your terminal:
-
- ENDOFMARKDOWN
-
- for whl in wheels/*.whl; do
- fname=$(basename "$whl")
- url="https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/$fname"
- echo "\`\`\`sh" >> body.md
- echo "pip install $url" >> body.md
- echo "\`\`\`" >> body.md
- echo "" >> body.md
- done
-
- cat >> body.md << 'ENDOFMARKDOWN'
- > **Note:**
- > These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes.
- ENDOFMARKDOWN
-
- # for debugging:
- cat body.md
-
- - name: Create new pre-release and upload artifacts
- uses: softprops/action-gh-release@v2.2.1
- with:
- files: wheels/*.whl
- prerelease: true
- name: Latest `main` wheel
- body_path: body.md
- tag_name: continuous-release_main
- make_latest: false
- draft: false
- target_commitish: ${{ github.sha }}
-
- audit-wheels:
- needs: build-wheels
- strategy:
- matrix:
- os: [ubuntu-22.04, ubuntu-22.04-arm]
- include:
- - os: ubuntu-22.04
- arch: x86_64
- - os: ubuntu-22.04-arm
- arch: aarch64
- runs-on: ${{ matrix.os }}
- env:
- PIP_DISABLE_PIP_VERSION_CHECK: 1
- steps:
- - uses: actions/checkout@v4
- - name: Download wheel
- uses: actions/download-artifact@v4
- with:
- name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}
- path: wheels/
- - name: Set up Python
- uses: actions/setup-python@v5
- with:
- python-version: "3.12"
- - run: pip install auditwheel
- - run: python ./.github/scripts/auditwheel_show.py wheels/* | tee $GITHUB_STEP_SUMMARY
-
- publish-wheels:
- name: Publish wheels to PyPI
- needs: [build-wheels, audit-wheels]
- runs-on: ubuntu-latest
- if: |
- github.repository == 'bitsandbytes-foundation/bitsandbytes'
- && github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
- environment:
- name: release
- url: https://pypi.org/p/bitsandbytes
- permissions:
- id-token: write
- steps:
- - name: Download distribution artifacts
- uses: actions/download-artifact@v4
- with:
- path: dist/
- pattern: "bdist_wheel_*"
- merge-multiple: true
-
- - name: Remove macOS wheels
- run: rm dist/*macos*
-
- - name: Publish to PyPI
- uses: pypa/gh-action-pypi-publish@release/v1
- with:
- print-hash: true
+ - name: Delete old pre-release (if exists)
+ run: |
+ gh release delete continuous-release_main --cleanup-tag -y || true
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: Generate pip install commands for release body
+ run: |
+ cat > body.md << 'ENDOFMARKDOWN'
+ ## Latest `main` Wheel Pre-release
+
+ This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch.
+
+ **How to install:**
+ Pick the correct command for your platform and run it in your terminal:
+
+ ENDOFMARKDOWN
+
+ for whl in wheels/*.whl; do
+ fname=$(basename "$whl")
+ url="https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/$fname"
+ echo "\`\`\`sh" >> body.md
+ echo "pip install $url" >> body.md
+ echo "\`\`\`" >> body.md
+ echo "" >> body.md
+ done
+
+ cat >> body.md << 'ENDOFMARKDOWN'
+ > **Note:**
+ > These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes.
+ ENDOFMARKDOWN
+
+ # for debugging:
+ cat body.md
+
+ - name: Create new pre-release and upload artifacts
+ uses: softprops/action-gh-release@v2.2.1
+ with:
+ files: wheels/*.whl
+ prerelease: true
+ name: Latest `main` wheel
+ body_path: body.md
+ tag_name: continuous-release_main
+ make_latest: false
+ draft: false
+ target_commitish: ${{ github.sha }}
+
+ audit-wheels:
+ needs: build-wheels
+ strategy:
+ matrix:
+ os: [ubuntu-22.04, ubuntu-22.04-arm]
+ include:
+ - os: ubuntu-22.04
+ arch: x86_64
+ - os: ubuntu-22.04-arm
+ arch: aarch64
+ runs-on: ${{ matrix.os }}
+ env:
+ PIP_DISABLE_PIP_VERSION_CHECK: 1
+ steps:
+ - uses: actions/checkout@v4
+ - name: Download wheel
+ uses: actions/download-artifact@v4
+ with:
+ name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}
+ path: wheels/
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: "3.12"
+ - run: pip install auditwheel
+ - run: python ./.github/scripts/auditwheel_show.py wheels/* | tee $GITHUB_STEP_SUMMARY
+
+ publish-wheels:
+ name: Publish wheels to PyPI
+ needs: [build-wheels, audit-wheels]
+ runs-on: ubuntu-latest
+ if: |
+ github.repository == 'bitsandbytes-foundation/bitsandbytes'
+ && github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
+ environment:
+ name: release
+ url: https://pypi.org/p/bitsandbytes
+ permissions:
+ id-token: write
+ steps:
+ - name: Download distribution artifacts
+ uses: actions/download-artifact@v4
+ with:
+ path: dist/
+ pattern: "bdist_wheel_*"
+ merge-multiple: true
+
+ - name: Remove macOS wheels
+ run: rm dist/*macos*
+
+ - name: Publish to PyPI
+ uses: pypa/gh-action-pypi-publish@release/v1
+ with:
+ print-hash: true
From da9a271446295e012cd61263836ab8fea0a06af8 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Tue, 3 Jun 2025 00:06:56 +0530
Subject: [PATCH 75/85] Update python-package.yml
---
.github/workflows/python-package.yml | 53 +++++++++++++++++++++++++---
1 file changed, 49 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index fbaa27d56..8b0bbb374 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -102,10 +102,55 @@ jobs:
path: output/*
retention-days: 7
- build-wheels:
- needs:
- - build-shared-libs
- - build-shared-libs-cuda
+ build-shared-libs-rocm:
+ strategy:
+ matrix:
+ os: [ubuntu-22.04]
+ arch: [x86_64]
+ rocm_version:
+ ["6.1.2", "6.2.4", "6.3.2"]
+ runs-on: ${{ matrix.os }}
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up Docker multiarch
+ uses: docker/setup-qemu-action@v3
+ - name: Clean up disk space
+ run: |
+ sudo rm -rf \
+ /usr/share/dotnet \
+ /opt/ghc \
+ "/usr/local/share/boost" \
+ "$AGENT_TOOLSDIRECTORY" \
+ /opt/hostedtoolcache \
+ /opt/google/chrome \
+ /opt/microsoft/msedge \
+ /opt/microsoft/powershell \
+ /opt/pipx \
+ /usr/lib/mono \
+ /usr/local/julia* \
+ /usr/local/lib/android \
+ /usr/local/lib/node_modules \
+ /usr/local/share/chromium \
+ /usr/local/share/powershell \
+ /usr/share/swift
+ - name: Build C++
+ run: bash .github/scripts/build-rocm.sh
+ env:
+ build_os: ${{ matrix.os }}
+ build_arch: ${{ matrix.arch }}
+ rocm_version: ${{ matrix.rocm_version }}
+ - name: Upload build artifact
+ uses: actions/upload-artifact@v4
+ with:
+ name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}
+ path: output/*
+ retention-days: 7
+
+ build-wheels:
+ needs:
+ - build-shared-libs
+ - build-shared-libs-cuda
+ - build-shared-libs-rocm
strategy:
matrix:
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]
From 08848daddb2ec6bd13f7b5a0720bd6d34988d818 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Tue, 3 Jun 2025 00:12:54 +0530
Subject: [PATCH 76/85] Update python-package.yml
---
.github/workflows/python-package.yml | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 8b0bbb374..a65d0f5bb 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -145,12 +145,12 @@ jobs:
name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}
path: output/*
retention-days: 7
-
- build-wheels:
- needs:
- - build-shared-libs
- - build-shared-libs-cuda
- - build-shared-libs-rocm
+
+ build-wheels:
+ needs:
+ - build-shared-libs
+ - build-shared-libs-cuda
+ - build-shared-libs-rocm
strategy:
matrix:
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]
From 945f7c1d8dda65000a435e359bab9552fe7650b4 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 2 Jun 2025 15:11:34 -0400
Subject: [PATCH 77/85] Fix CI regression (#1666)
* Tests: xfail opcheck for 4bit quantization with floating storage dtypes
* Tests: xfail opcheck for 4bit quantization with floating storage dtypes
* Tests: skip test_gemv_eye_4bit on CPU with bf16 when not supported by torch
* Tests: skip test_gemv_eye_4bit on CPU with bf16 when not supported by torch
---
bitsandbytes/__init__.py | 2 +-
bitsandbytes/backends/utils.py | 8 ++++++--
tests/test_functional.py | 3 +++
tests/test_ops.py | 5 ++---
4 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 5014e8240..c747398ce 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -34,7 +34,7 @@
if torch.cuda.is_available():
from .backends.cuda import ops as cuda_ops
-if torch.xpu.is_available():
+if hasattr(torch, "xpu") and torch.xpu.is_available():
from .backends.xpu import ops as xpu_ops
diff --git a/bitsandbytes/backends/utils.py b/bitsandbytes/backends/utils.py
index cc88ffae1..bf277e7ea 100755
--- a/bitsandbytes/backends/utils.py
+++ b/bitsandbytes/backends/utils.py
@@ -30,7 +30,9 @@
1.0,
],
dtype=torch.float32,
- device="xpu" if torch.xpu.is_available() else "cpu", # Only cpu/xpu use this table for now.
+ device="xpu"
+ if hasattr(torch, "xpu") and torch.xpu.is_available()
+ else "cpu", # Only cpu/xpu use this table for now.
)
_FP4_QUANT_TABLE = torch.tensor(
[
@@ -52,6 +54,8 @@
-0.2500,
],
dtype=torch.float32,
- device="xpu" if torch.xpu.is_available() else "cpu", # Only cpu/xpu use this table for now.
+ device="xpu"
+ if hasattr(torch, "xpu") and torch.xpu.is_available()
+ else "cpu", # Only cpu/xpu use this table for now.
)
CODE = {"nf4": _NF4_QUANT_TABLE, "fp4": _FP4_QUANT_TABLE}
diff --git a/tests/test_functional.py b/tests/test_functional.py
index fa4a14ae9..6a94205e8 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -1330,6 +1330,9 @@ def test_gemv_4bit(self, device, dim, dtype, storage_type, quant_storage, double
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype)
@pytest.mark.parametrize("double_quant", [False], ids=["DQ_True"])
def test_gemv_eye_4bit(self, device, storage_type, dtype, double_quant):
+ if device == "cpu" and dtype == torch.bfloat16 and torch.__version__ < (2, 3):
+     pytest.skip("eye does not support bfloat16 on CPU in torch < 2.3")
+
dims = 10
torch.random.manual_seed(np.random.randint(0, 412424242))
dims = get_test_dims(0, 8192, n=dims)
diff --git a/tests/test_ops.py b/tests/test_ops.py
index 9a0ae3338..7da19c012 100644
--- a/tests/test_ops.py
+++ b/tests/test_ops.py
@@ -167,9 +167,8 @@ def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize
assert absmax.device == A.device
assert absmax.dtype == torch.float32
- # TODO: Enable it
- if device in ("cpu", "xpu") and storage_dtype == torch.bfloat16:
- pytest.skip("CPU bf16 storage_dtype will fail on torch op check")
+ if storage_dtype != torch.uint8:
+ pytest.xfail("opcheck fails for storage_dtype != torch.uint8")
opcheck(torch.ops.bitsandbytes.quantize_4bit, (A, blocksize, quant_type, storage_dtype))
From 978cba3825e3624bc39d594a2bd01c2444e1af69 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Tue, 3 Jun 2025 01:33:00 +0530
Subject: [PATCH 78/85] Create build-rocm.sh
---
.github/scripts/build-rocm.sh | 36 ++++++++++++++++++++++++++++++++++++
1 file changed, 36 insertions(+)
create mode 100644 .github/scripts/build-rocm.sh
diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh
new file mode 100644
index 000000000..b508fac69
--- /dev/null
+++ b/.github/scripts/build-rocm.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Build the ROCm (HIP) shared library inside a rocm/dev-ubuntu-22.04 container
+# and stage the resulting binaries under output/<build_os>/<build_arch>/.
+#
+# Inputs (environment variables, provided by the calling workflow step):
+#   build_os      runner label, e.g. ubuntu-22.04 (only ubuntu* is handled)
+#   build_arch    CPU architecture, e.g. x86_64
+#   rocm_version  ROCm release used to select the image tag, e.g. 6.3.2
+declare build_arch
+declare build_os
+declare rocm_version
+
+set -xeuo pipefail
+# ROCm GPU architectures handed to CMake through BNB_ROCM_ARCH.
+bnb_rocm_arch="gfx90a;gfx942;gfx1100"
+if [ "${build_os:0:6}" == ubuntu ]; then
+  image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
+  echo "Using image $image"
+  docker run --rm --platform "linux/$build_arch" -i \
+    -w /src -v "$PWD:/src" "$image" sh -c \
+    "apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
+    && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
+    && cmake --build ."
+else
+  # Fail fast: without this branch nothing is built, and the staging step
+  # below would either fail in cp or copy stale binaries.
+  echo "Unsupported build_os for ROCm build: ${build_os}" >&2
+  exit 1
+fi
+
+output_dir="output/${build_os}/${build_arch}"
+mkdir -p "${output_dir}"
+# nullglob drops the extensions that have no match on this platform; the
+# subshell keeps the option from leaking into the rest of the script.
+(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")
From 318a86e345840388b50fa466b34d5726123d0ff6 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 2 Jun 2025 17:34:05 -0400
Subject: [PATCH 79/85] Add CPU + IPEX to nightly CI (#1667)
* Tests: add linux x64 cpu+ipex to nightly CI workflow
* typo
* Tests: guard linear8bit compile test for ipex cpu issue
---
.github/workflows/tests.yml | 42 +++++++++++++++++++++++++++++++++++--
tests/test_linear8bitlt.py | 7 +++++--
2 files changed, 45 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index b93bff4f0..b4c38ba6d 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -161,6 +161,46 @@ jobs:
- name: Run tests
run: pytest --durations=100
+ test-cpu-ipex:
+ if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
+ needs: build-cpu
+ runs-on: banb-aws-general-8-plus-use1-public-80
+ env:
+ BNB_TEST_DEVICE: cpu
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Download build artifact
+ uses: actions/download-artifact@v4
+ with:
+ name: lib_cpu_ubuntu-22.04_x86_64
+ path: bitsandbytes/
+ merge-multiple: true
+
+ - name: Setup Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: 3.9
+
+ - name: Install dependencies
+ run: |
+ pip install torch==2.7.0 --index-url https://download.pytorch.org/whl/cpu
+ pip install intel_extension_for_pytorch==2.7.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+ pip install -e ".[test]"
+ pip install pytest-cov
+
+ - name: Show installed packages
+ run: pip list
+
+ - name: Show environment information
+ run: python -m torch.utils.collect_env
+
+ - name: IPEX smoke test
+ run: python -c "import torch; import intel_extension_for_pytorch as ipex; print(torch.__version__); print(ipex.__version__);"
+
+ - name: Run tests
+ run: pytest --durations=100
+
# test-cuda-aarch64:
# if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
# needs: build-cuda
@@ -182,8 +222,6 @@ jobs:
# - name: Show pip packages
# run: pip list
-
-
test-cuda:
if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
needs: build-cuda
diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py
index a77c693e0..271920b11 100644
--- a/tests/test_linear8bitlt.py
+++ b/tests/test_linear8bitlt.py
@@ -271,11 +271,14 @@ def test_linear8bitlt_torch_compile(device, threshold, bias, fullgraph, mode):
# Test with gradients. Currently only works with threshold=0.
# Has a strange regression on Linux aarch64 CPU in torch==2.6.0.
+ # There is also an issue with torch==2.7.0 on x86-64 with IPEX.
is_broken_platform = (
device == "cpu"
- and platform.machine() == "aarch64"
and platform.system() == "Linux"
- and ((2, 7) > torch.__version__ >= (2, 6))
+ and (
+ (platform.machine() == "aarch64" and (2, 6) <= torch.__version__ < (2, 7))
+ or (platform.machine() == "x86_64" and bnb.functional.ipex_cpu)
+ )
)
if threshold == 0 and not is_broken_platform:
From 10bee2509d810aec16678e14ab8ab30bac80a7bb Mon Sep 17 00:00:00 2001
From: MISHANMAUYRA
Date: Tue, 3 Jun 2025 17:43:55 +0530
Subject: [PATCH 80/85] Make ROCm build script executable
---
.github/scripts/build-rocm.sh | 0
1 file changed, 0 insertions(+), 0 deletions(-)
mode change 100644 => 100755 .github/scripts/build-rocm.sh
diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh
old mode 100644
new mode 100755
From 55ebaac70f2eecf336840caeef4676d3a23cf994 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 3 Jun 2025 11:09:48 -0400
Subject: [PATCH 81/85] Tests: don't require grad on weights for
test_kbit_backprop
---
tests/test_modules.py | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/tests/test_modules.py b/tests/test_modules.py
index aa6f19c9e..319e67714 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -285,9 +285,6 @@ def test_linear_kbit_fp32_bias(device, module):
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("module", module_dict.values(), ids=module_dict.keys())
def test_kbit_backprop(device, module):
- if device == "cpu":
- pytest.xfail("Test is not yet supported on CPU")
-
b = 16
dim1 = 36
dim2 = 84
@@ -295,14 +292,15 @@ def test_kbit_backprop(device, module):
# dim2 = 83
ref = nn.Sequential(*[torch.nn.Linear(dim1, dim2), torch.nn.Linear(dim2, 128)])
- # ref[1].weight.requires_grad = False
torch.nn.init.kaiming_normal_(ref[0].weight)
torch.nn.init.kaiming_normal_(ref[1].weight)
+ ref[1].weight.requires_grad_(False)
kbit = nn.Sequential(*[torch.nn.Linear(dim1, dim2), module(dim2, 128)])
kbit[0].weight.detach().copy_(ref[0].weight)
kbit[1].weight.detach().copy_(ref[1].weight)
kbit[0].bias.detach().copy_(ref[0].bias)
kbit[1].bias.detach().copy_(ref[1].bias)
+ kbit[1].weight.requires_grad_(False)
ref = ref.half().to(device)
kbit = kbit.half().to(device)
kbit = kbit.half().to(device)
From 49c10f7cd38a9c32c625ac2c28396022a21d7d07 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Tue, 3 Jun 2025 21:26:18 +0530
Subject: [PATCH 82/85] Update main.py
---
bitsandbytes/diagnostics/main.py | 112 +++++++++++++++----------------
1 file changed, 56 insertions(+), 56 deletions(-)
diff --git a/bitsandbytes/diagnostics/main.py b/bitsandbytes/diagnostics/main.py
index 4ce06d2d9..24a9d5730 100644
--- a/bitsandbytes/diagnostics/main.py
+++ b/bitsandbytes/diagnostics/main.py
@@ -5,14 +5,14 @@
import torch
-from bitsandbytes import __version__ as bnb_version
-from bitsandbytes.cextension import BNB_BACKEND, HIP_ENVIRONMENT
-from bitsandbytes.consts import PACKAGE_GITHUB_URL
-from bitsandbytes.cuda_specs import get_cuda_specs
-from bitsandbytes.diagnostics.cuda import (
- print_diagnostics,
- print_runtime_diagnostics,
-)
+from bitsandbytes import __version__ as bnb_version
+from bitsandbytes.cextension import BNB_BACKEND, HIP_ENVIRONMENT
+from bitsandbytes.consts import PACKAGE_GITHUB_URL
+from bitsandbytes.cuda_specs import get_cuda_specs
+from bitsandbytes.diagnostics.cuda import (
+ print_diagnostics,
+ print_runtime_diagnostics,
+)
from bitsandbytes.diagnostics.utils import print_dedented, print_header
_RELATED_PACKAGES = [
@@ -71,51 +71,51 @@ def show_environment():
print(f" {pkg}: {version}")
-def main():
- print_header(f"bitsandbytes v{bnb_version}")
- show_environment()
- print_header("")
-
- cuda_specs = get_cuda_specs()
- if HIP_ENVIRONMENT:
- rocm_specs = f" rocm_version_string='{cuda_specs.cuda_version_string}',"
- rocm_specs += f" rocm_version_tuple={cuda_specs.cuda_version_tuple}"
- print(f"{BNB_BACKEND} specs:{rocm_specs}")
- else:
- print(f"{BNB_BACKEND} specs:{cuda_specs}")
- if not torch.cuda.is_available():
- print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:")
- if not HIP_ENVIRONMENT: print(f"- {BNB_BACKEND} driver not installed")
- print(f"- {BNB_BACKEND} not installed")
- print(f"- You have multiple conflicting {BNB_BACKEND} libraries")
- if cuda_specs:
- print_diagnostics(cuda_specs)
- print_runtime_diagnostics()
- print_header("")
- print_header("DEBUG INFO END")
- print_header("")
- print(f"Checking that the library is importable and {BNB_BACKEND} is callable...")
- try:
- sanity_check()
- print("SUCCESS!")
- print("Installation was successful!")
- return
- except RuntimeError as e:
- if "not available in CPU-only" in str(e):
- print(
- f"WARNING: {__package__} is currently running as CPU-only!\n"
- "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n"
- f"If you think that this is so erroneously,\nplease report an issue!",
- )
- else:
- raise e
- except Exception:
- traceback.print_exc()
- print_dedented(
- f"""
- Above we output some debug information.
- Please provide this info when creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose
- WARNING: Please be sure to sanitize sensitive info from the output before posting it.
- """,
- )
- sys.exit(1)
+def main():
+ print_header(f"bitsandbytes v{bnb_version}")
+ show_environment()
+ print_header("")
+
+ cuda_specs = get_cuda_specs()
+ if HIP_ENVIRONMENT:
+ rocm_specs = f" rocm_version_string='{cuda_specs.cuda_version_string}',"
+ rocm_specs += f" rocm_version_tuple={cuda_specs.cuda_version_tuple}"
+ print(f"{BNB_BACKEND} specs:{rocm_specs}")
+ else:
+ print(f"{BNB_BACKEND} specs:{cuda_specs}")
+ if not torch.cuda.is_available():
+ print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:")
+ if not HIP_ENVIRONMENT: print(f"- {BNB_BACKEND} driver not installed")
+ print(f"- {BNB_BACKEND} not installed")
+ print(f"- You have multiple conflicting {BNB_BACKEND} libraries")
+ if cuda_specs:
+ print_diagnostics(cuda_specs)
+ print_runtime_diagnostics()
+ print_header("")
+ print_header("DEBUG INFO END")
+ print_header("")
+ print(f"Checking that the library is importable and {BNB_BACKEND} is callable...")
+ try:
+ sanity_check()
+ print("SUCCESS!")
+ print("Installation was successful!")
+ return
+ except RuntimeError as e:
+ if "not available in CPU-only" in str(e):
+ print(
+ f"WARNING: {__package__} is currently running as CPU-only!\n"
+ "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n"
+ f"If you think that this is so erroneously,\nplease report an issue!",
+ )
+ else:
+ raise e
+ except Exception:
+ traceback.print_exc()
+ print_dedented(
+ f"""
+ Above we output some debug information.
+ Please provide this info when creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose
+ WARNING: Please be sure to sanitize sensitive info from the output before posting it.
+ """,
+ )
+ sys.exit(1)
From 26ac4e8fec9a005d41a47f7290448e4342b40a6e Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Tue, 3 Jun 2025 21:29:07 +0530
Subject: [PATCH 83/85] Update cuda_specs.py
---
bitsandbytes/cuda_specs.py | 48 +++++++++++++++++++-------------------
1 file changed, 24 insertions(+), 24 deletions(-)
diff --git a/bitsandbytes/cuda_specs.py b/bitsandbytes/cuda_specs.py
index 61d03083c..bbdf457cc 100644
--- a/bitsandbytes/cuda_specs.py
+++ b/bitsandbytes/cuda_specs.py
@@ -1,6 +1,6 @@
import dataclasses
-import logging
-import re
+import logging
+import re
import subprocess
from functools import lru_cache
from typing import Optional
@@ -78,25 +78,25 @@ def get_cuda_specs() -> Optional[CUDASpecs]:
return None
-def get_rocm_gpu_arch() -> str:
- """Get ROCm GPU architecture."""
- logger = logging.getLogger(__name__)
- try:
- if torch.version.hip:
- result = subprocess.run(["rocminfo"], capture_output=True, text=True)
- match = re.search(r"Name:\s+gfx([a-zA-Z\d]+)", result.stdout)
- if match:
- return "gfx" + match.group(1)
- else:
- return "unknown"
- else:
- return "unknown"
- except Exception as e:
- logger.error(f"Could not detect ROCm GPU architecture: {e}")
- if torch.cuda.is_available():
- logger.warning(
- """
-ROCm GPU architecture detection failed despite ROCm being available.
- """,
- )
- return "unknown"
+def get_rocm_gpu_arch() -> str:
+ """Get ROCm GPU architecture."""
+ logger = logging.getLogger(__name__)
+ try:
+ if torch.version.hip:
+ result = subprocess.run(["rocminfo"], capture_output=True, text=True)
+ match = re.search(r"Name:\s+gfx([a-zA-Z\d]+)", result.stdout)
+ if match:
+ return "gfx" + match.group(1)
+ else:
+ return "unknown"
+ else:
+ return "unknown"
+ except Exception as e:
+ logger.error(f"Could not detect ROCm GPU architecture: {e}")
+ if torch.cuda.is_available():
+ logger.warning(
+ """
+ROCm GPU architecture detection failed despite ROCm being available.
+ """,
+ )
+ return "unknown"
From fbfd590156601210683b9e545b3a90f238e2b2b9 Mon Sep 17 00:00:00 2001
From: MISHANMAUYRA
Date: Wed, 4 Jun 2025 01:46:21 +0530
Subject: [PATCH 84/85] Formatting
---
.github/workflows/python-package.yml | 86 +++---
bitsandbytes/backends/cuda/ops.py | 36 +--
bitsandbytes/cextension.py | 58 ++--
bitsandbytes/cuda_specs.py | 2 +-
bitsandbytes/diagnostics/cuda.py | 15 +-
bitsandbytes/diagnostics/main.py | 5 +-
bitsandbytes/functional.py | 10 +-
bitsandbytes/nn/modules.py | 8 +-
conflicts.diff | 382 +++++++++++++++++++++++++++
csrc/common_hip.cuh | 2 +-
csrc/kernels.hip | 26 +-
csrc/ops.hip | 10 +-
tests/test_cuda_setup_evaluator.py | 2 +
tests/test_functional.py | 15 +-
tests/test_linear4bit.py | 20 +-
tests/test_ops.py | 2 +-
16 files changed, 535 insertions(+), 144 deletions(-)
create mode 100644 conflicts.diff
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 3fa033e27..99ad52c71 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -102,49 +102,49 @@ jobs:
path: output/*
retention-days: 7
- build-shared-libs-rocm:
- strategy:
- matrix:
- os: [ubuntu-22.04]
- arch: [x86_64]
- rocm_version:
- ["6.1.2", "6.2.4", "6.3.2"]
- runs-on: ${{ matrix.os }}
- steps:
- - uses: actions/checkout@v4
- - name: Set up Docker multiarch
- uses: docker/setup-qemu-action@v3
- - name: Clean up disk space
- run: |
- sudo rm -rf \
- /usr/share/dotnet \
- /opt/ghc \
- "/usr/local/share/boost" \
- "$AGENT_TOOLSDIRECTORY" \
- /opt/hostedtoolcache \
- /opt/google/chrome \
- /opt/microsoft/msedge \
- /opt/microsoft/powershell \
- /opt/pipx \
- /usr/lib/mono \
- /usr/local/julia* \
- /usr/local/lib/android \
- /usr/local/lib/node_modules \
- /usr/local/share/chromium \
- /usr/local/share/powershell \
- /usr/share/swift
- - name: Build C++
- run: bash .github/scripts/build-rocm.sh
- env:
- build_os: ${{ matrix.os }}
- build_arch: ${{ matrix.arch }}
- rocm_version: ${{ matrix.rocm_version }}
- - name: Upload build artifact
- uses: actions/upload-artifact@v4
- with:
- name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}
- path: output/*
- retention-days: 7
+ build-shared-libs-rocm:
+ strategy:
+ matrix:
+ os: [ubuntu-22.04]
+ arch: [x86_64]
+ rocm_version:
+ ["6.1.2", "6.2.4", "6.3.2"]
+ runs-on: ${{ matrix.os }}
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up Docker multiarch
+ uses: docker/setup-qemu-action@v3
+ - name: Clean up disk space
+ run: |
+ sudo rm -rf \
+ /usr/share/dotnet \
+ /opt/ghc \
+ "/usr/local/share/boost" \
+ "$AGENT_TOOLSDIRECTORY" \
+ /opt/hostedtoolcache \
+ /opt/google/chrome \
+ /opt/microsoft/msedge \
+ /opt/microsoft/powershell \
+ /opt/pipx \
+ /usr/lib/mono \
+ /usr/local/julia* \
+ /usr/local/lib/android \
+ /usr/local/lib/node_modules \
+ /usr/local/share/chromium \
+ /usr/local/share/powershell \
+ /usr/share/swift
+ - name: Build C++
+ run: bash .github/scripts/build-rocm.sh
+ env:
+ build_os: ${{ matrix.os }}
+ build_arch: ${{ matrix.arch }}
+ rocm_version: ${{ matrix.rocm_version }}
+ - name: Upload build artifact
+ uses: actions/upload-artifact@v4
+ with:
+ name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}
+ path: output/*
+ retention-days: 7
build-wheels:
needs:
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index 4083576e5..13359bbd8 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -8,7 +8,7 @@
from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
from ..._ops import register_kernel
-from ...cextension import lib, HIP_ENVIRONMENT
+from ...cextension import HIP_ENVIRONMENT, lib
@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
@@ -210,12 +210,12 @@ def _get_col_absmax(
@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
torch._check_is_size(blocksize)
-
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
+
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
+
torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
n = A.numel()
@@ -269,11 +269,11 @@ def _(
def _dequantize_blockwise_impl(
A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
) -> None:
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
+
torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
torch._check(
dtype in [torch.float16, torch.bfloat16, torch.float32],
@@ -303,11 +303,11 @@ def _dequantize_blockwise_impl(
def _(
A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
) -> tuple[torch.Tensor, torch.Tensor]:
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
+
torch._check(quant_type in ["fp4", "nf4"])
torch._check(
A.dtype in [torch.bfloat16, torch.float16, torch.float32],
@@ -385,11 +385,11 @@ def _dequantize_4bit_impl(
dtype: torch.dtype,
out: torch.Tensor,
) -> None:
- if HIP_ENVIRONMENT:
- torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
- else:
+ if HIP_ENVIRONMENT:
+ torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+ else:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-
+
torch._check(quant_type in ["fp4", "nf4"])
torch._check(
dtype in [torch.bfloat16, torch.float16, torch.float32],
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index c7474c521..7f5483531 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -84,7 +84,7 @@ def get_available_cuda_binary_versions() -> list[str]:
lib_pattern = f"libbitsandbytes_{BNB_BACKEND.lower()}*{DYNAMIC_LIBRARY_SUFFIX}"
versions = []
for lib in Path(__file__).parent.glob(lib_pattern):
- pattern = r"{}(\d+)".format(BNB_BACKEND.lower())
+ pattern = rf"{BNB_BACKEND.lower()}(\d+)"
match = re.search(pattern, lib.name)
if match:
ver_code = int(match.group(1))
@@ -202,18 +202,16 @@ def _format_lib_error_message(
)
compile_instructions = (
- (
- "COMPILE FROM SOURCE for CPU-only:\n `cmake -DCOMPUTE_BACKEND=cpu -S . && make`\n\n"
- ) if not no_cuda_lib_found
- else
- (
+ ("COMPILE FROM SOURCE for CPU-only:\n `cmake -DCOMPUTE_BACKEND=cpu -S . && make`\n\n")
+ if not no_cuda_lib_found
+ else (
"You have two options:\n"
"1. COMPILE FROM SOURCE (required if no binary exists):\n"
" https://huggingface.co/docs/bitsandbytes/main/en/installation#cuda-compile\n"
"2. Use BNB_CUDA_VERSION to specify a DIFFERENT CUDA version from the detected one, which is installed on your machine and matching an available pre-compiled version listed above\n\n"
- ) if not HIP_ENVIRONMENT
- else
- (
+ )
+ if not HIP_ENVIRONMENT
+ else (
"You can COMPILE FROM SOURCE as mentioned here:\n"
" https://huggingface.co/docs/bitsandbytes/main/en/installation?backend=AMD+ROCm#amd-gpu\n"
)
@@ -301,27 +299,27 @@ def get_native_library() -> BNBNativeLibrary:
return BNBNativeLibrary(dll)
-ROCM_GPU_ARCH = get_rocm_gpu_arch()
-
-try:
- # to support Intel CPU/GPU (XPU) backend
- import intel_extension_for_pytorch as ipex
-
- ipex_cpu = ipex if ipex._C._has_cpu() else None
- ipex_xpu = ipex if ipex._C._has_xpu() else None
-except BaseException:
- ipex_cpu = None
- ipex_xpu = None
-
-try:
- if torch.version.hip:
- HIP_ENVIRONMENT, BNB_BACKEND = True, "ROCm"
- else:
- HIP_ENVIRONMENT, BNB_BACKEND = False, "CUDA"
-
- lib = get_native_library()
-except Exception as e:
- error_msg = str(e)
+ROCM_GPU_ARCH = get_rocm_gpu_arch()
+
+try:
+ # to support Intel CPU/GPU (XPU) backend
+ import intel_extension_for_pytorch as ipex
+
+ ipex_cpu = ipex if ipex._C._has_cpu() else None
+ ipex_xpu = ipex if ipex._C._has_xpu() else None
+except BaseException:
+ ipex_cpu = None
+ ipex_xpu = None
+
+try:
+ if torch.version.hip:
+ HIP_ENVIRONMENT, BNB_BACKEND = True, "ROCm"
+ else:
+ HIP_ENVIRONMENT, BNB_BACKEND = False, "CUDA"
+
+ lib = get_native_library()
+except Exception as e:
+ error_msg = str(e)
if not (ipex_cpu or ipex_xpu):
logger.error(
f"bitsandbytes library load error: {error_msg}\n If you are using Intel CPU/XPU, please install intel_extension_for_pytorch to enable required ops",
diff --git a/bitsandbytes/cuda_specs.py b/bitsandbytes/cuda_specs.py
index bbdf457cc..32563a159 100644
--- a/bitsandbytes/cuda_specs.py
+++ b/bitsandbytes/cuda_specs.py
@@ -1,8 +1,8 @@
import dataclasses
+from functools import lru_cache
import logging
import re
import subprocess
-from functools import lru_cache
from typing import Optional
import torch
diff --git a/bitsandbytes/diagnostics/cuda.py b/bitsandbytes/diagnostics/cuda.py
index e3d177ec4..29a9a66e1 100644
--- a/bitsandbytes/diagnostics/cuda.py
+++ b/bitsandbytes/diagnostics/cuda.py
@@ -5,8 +5,7 @@
import torch
-from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
-from bitsandbytes.consts import NONPYTORCH_DOC_URL
+from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
from bitsandbytes.cuda_specs import CUDASpecs
from bitsandbytes.diagnostics.utils import print_dedented
@@ -33,11 +32,13 @@
}
CUDA_RUNTIME_LIB_PATTERNS = (
- "libamdhip64.so*",
-) if HIP_ENVIRONMENT else (
- "cudart64*.dll", # Windows
- "libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
- "nvcuda*.dll", # Windows
+ ("libamdhip64.so*",)
+ if HIP_ENVIRONMENT
+ else (
+ "cudart64*.dll", # Windows
+ "libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
+ "nvcuda*.dll", # Windows
+ )
)
logger = logging.getLogger(__name__)
diff --git a/bitsandbytes/diagnostics/main.py b/bitsandbytes/diagnostics/main.py
index 24a9d5730..7cd04e209 100644
--- a/bitsandbytes/diagnostics/main.py
+++ b/bitsandbytes/diagnostics/main.py
@@ -75,7 +75,7 @@ def main():
print_header(f"bitsandbytes v{bnb_version}")
show_environment()
print_header("")
-
+
cuda_specs = get_cuda_specs()
if HIP_ENVIRONMENT:
rocm_specs = f" rocm_version_string='{cuda_specs.cuda_version_string}',"
@@ -85,7 +85,8 @@ def main():
print(f"{BNB_BACKEND} specs:{cuda_specs}")
if not torch.cuda.is_available():
print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:")
- if not HIP_ENVIRONMENT: print(f"- {BNB_BACKEND} driver not installed")
+ if not HIP_ENVIRONMENT:
+ print(f"- {BNB_BACKEND} driver not installed")
print(f"- {BNB_BACKEND} not installed")
print(f"- You have multiple conflicting {BNB_BACKEND} libraries")
if cuda_specs:
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index cfbb4e309..56e2e7b28 100755
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -15,7 +15,7 @@
from bitsandbytes.utils import _reverse_4bit_compress_format, pack_dict_to_tensor, unpack_tensor_to_dict
-from .cextension import lib, HIP_ENVIRONMENT, ipex_cpu, ipex_xpu
+from .cextension import HIP_ENVIRONMENT, ipex_cpu, ipex_xpu, lib
name2qmap = {}
@@ -1007,10 +1007,10 @@ def quantize_4bit(
- `torch.Tensor`: The quantized tensor with packed 4-bit values.
- [`QuantState`]: The state object used to undo the quantization.
"""
-
+
if blocksize is None:
blocksize = 64 if not HIP_ENVIRONMENT else 128
-
+
input_shape = A.shape
_out, _absmax = torch.ops.bitsandbytes.quantize_4bit.default(
@@ -1114,10 +1114,10 @@ def dequantize_4bit(
Returns:
`torch.Tensor`: The dequantized tensor.
"""
-
+
if blocksize is None:
blocksize = 64 if not HIP_ENVIRONMENT else 128
-
+
if quant_state is None:
assert absmax is not None and out is not None
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index d8558bfef..a6d62b7d7 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -11,8 +11,8 @@
import torch.nn.functional as F
import bitsandbytes as bnb
-from bitsandbytes.cextension import HIP_ENVIRONMENT
-from bitsandbytes.functional import QuantState, _enable_ipex_fusion, ipex_cpu, ipex_xpu
+from bitsandbytes.cextension import HIP_ENVIRONMENT
+from bitsandbytes.functional import QuantState, _enable_ipex_fusion, ipex_cpu, ipex_xpu
from bitsandbytes.optim import GlobalOptimManager
from bitsandbytes.utils import (
INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING,
@@ -223,10 +223,10 @@ def __new__(
) -> "Params4bit":
if data is None:
data = torch.empty(0)
-
+
if blocksize is None:
blocksize = 64 if not HIP_ENVIRONMENT else 128
-
+
self = torch.Tensor._make_subclass(cls, data, requires_grad)
self.blocksize = blocksize
self.compress_statistics = compress_statistics
diff --git a/conflicts.diff b/conflicts.diff
new file mode 100644
index 000000000..cab8c6ea7
--- /dev/null
+++ b/conflicts.diff
@@ -0,0 +1,382 @@
+diff --cc bitsandbytes/cextension.py
+index 108aa0c,b112df2..0000000
+--- a/bitsandbytes/cextension.py
++++ b/bitsandbytes/cextension.py
+@@@ -28,17 -28,10 +29,15 @@@ def get_cuda_bnb_library_path(cuda_spec
+ override_value = os.environ.get("BNB_CUDA_VERSION")
+ if override_value:
+ library_name = re.sub(r"cuda\d+", f"cuda{override_value}", library_name, count=1)
+ + if torch.version.hip:
+ + raise RuntimeError(
+ + f"BNB_CUDA_VERSION={override_value} detected for ROCm!! \n"
+ + f"Clear the variable and retry: export BNB_CUDA_VERSION=\n"
+ + )
+ logger.warning(
+ f"WARNING: BNB_CUDA_VERSION={override_value} environment variable detected; loading {library_name}.\n"
+- "This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n"
++ "This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.\n"
+ "If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n"
+- "If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n"
+- "For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: BNBNativeLi
+ return BNBNativeLibrary(dll)
+
+
+ +ROCM_GPU_ARCH = get_rocm_gpu_arch()
+ +
+ try:
+++<<<<<<< HEAD
+ + if torch.version.hip:
+ + HIP_ENVIRONMENT, BNB_BACKEND = True, "ROCm"
+ + else:
+ + HIP_ENVIRONMENT, BNB_BACKEND = False, "CUDA"
+ +
+++=======
++ # to support Intel CPU/GPU (XPU) backend
++ import intel_extension_for_pytorch as ipex
++
++ ipex_cpu = ipex if ipex._C._has_cpu() else None
++ ipex_xpu = ipex if ipex._C._has_xpu() else None
++ except BaseException:
++ ipex_cpu = None
++ ipex_xpu = None
++
++
++ try:
+++>>>>>>> upstream/main
+ lib = get_native_library()
+ except Exception as e:
+ error_msg = str(e)
+diff --cc bitsandbytes/diagnostics/cuda.py
+index b9de27f,e763ef2..0000000
+--- a/bitsandbytes/diagnostics/cuda.py
++++ b/bitsandbytes/diagnostics/cuda.py
+@@@ -5,8 -5,7 +5,12 @@@ from pathlib import Pat
+
+ import torch
+
+++<<<<<<< HEAD
+ +from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
+ +from bitsandbytes.consts import NONPYTORCH_DOC_URL
+++=======
++ from bitsandbytes.cextension import get_cuda_bnb_library_path
+++>>>>>>> upstream/main
+ from bitsandbytes.cuda_specs import CUDASpecs
+ from bitsandbytes.diagnostics.utils import print_dedented
+
+@@@ -146,42 -127,8 +134,38 @@@ def _print_cuda_diagnostics(cuda_specs
+ """,
+ )
+
+- # TODO:
+- # (1) CUDA missing cases (no CUDA installed by CUDA driver (nvidia-smi accessible)
+- # (2) Multiple CUDA versions installed
+-
+
+ -def print_cuda_runtime_diagnostics() -> None:
+ +def _print_hip_diagnostics(cuda_specs: CUDASpecs) -> None:
+ + print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}")
+ +
+ + binary_path = get_cuda_bnb_library_path(cuda_specs)
+ + if not binary_path.exists():
+ + print_dedented(
+ + f"""
+ + Library not found: {binary_path}.
+ + Maybe you need to compile it from source? If you compiled from source, check that ROCm version
+ + in PyTorch Settings matches your ROCm install. If not, reinstall PyTorch for your ROCm version
+ + and rebuild bitsandbytes.
+ + """,
+ + )
+ +
+ + hip_major, hip_minor = cuda_specs.cuda_version_tuple
+ + if (hip_major, hip_minor) < (6, 1):
+ + print_dedented(
+ + """
+ + WARNING: bitsandbytes is fully supported only from ROCm 6.1.
+ + """,
+ + )
+ +
+ +
+ +def print_diagnostics(cuda_specs: CUDASpecs) -> None:
+ + if HIP_ENVIRONMENT:
+ + _print_hip_diagnostics(cuda_specs)
+ + else:
+ + _print_cuda_diagnostics(cuda_specs)
+ +
+ +
+ +def _print_cuda_runtime_diagnostics() -> None:
+ cudart_paths = list(find_cudart_libraries())
+ if not cudart_paths:
+ print("CUDA SETUP: WARNING! CUDA runtime files not found in any environmental path.")
+diff --cc bitsandbytes/diagnostics/main.py
+index 8e2bc2a,aa4cb30..0000000
+--- a/bitsandbytes/diagnostics/main.py
++++ b/bitsandbytes/diagnostics/main.py
+@@@ -3,12 -5,11 +5,20 @@@ import tracebac
+
+ import torch
+
+++<<<<<<< HEAD
+ +from bitsandbytes.cextension import BNB_BACKEND, HIP_ENVIRONMENT
+ +from bitsandbytes.consts import PACKAGE_GITHUB_URL
+ +from bitsandbytes.cuda_specs import get_cuda_specs
+ +from bitsandbytes.diagnostics.cuda import (
+ + print_diagnostics,
+ + print_runtime_diagnostics,
+++=======
++ from bitsandbytes import __version__ as bnb_version
++ from bitsandbytes.consts import PACKAGE_GITHUB_URL
++ from bitsandbytes.cuda_specs import get_cuda_specs
++ from bitsandbytes.diagnostics.cuda import (
++ print_cuda_diagnostics,
+++>>>>>>> upstream/main
+ )
+ from bitsandbytes.diagnostics.utils import print_dedented, print_header
+
+@@@ -28,52 -41,77 +50,122 @@@ def sanity_check()
+ assert p1 != p2
+
+
++ def get_package_version(name: str) -> str:
++ try:
++ version = importlib.metadata.version(name)
++ except importlib.metadata.PackageNotFoundError:
++ version = "not found"
++ return version
++
++
++ def show_environment():
++ """Simple utility to print out environment information."""
++
++ print(f"Platform: {platform.platform()}")
++ if platform.system() == "Linux":
++ print(f" libc: {'-'.join(platform.libc_ver())}")
++
++ print(f"Python: {platform.python_version()}")
++
++ print(f"PyTorch: {torch.__version__}")
++ print(f" CUDA: {torch.version.cuda or 'N/A'}")
++ print(f" HIP: {torch.version.hip or 'N/A'}")
++ print(f" XPU: {getattr(torch.version, 'xpu', 'N/A') or 'N/A'}")
++
++ print("Related packages:")
++ for pkg in _RELATED_PACKAGES:
++ version = get_package_version(pkg)
++ print(f" {pkg}: {version}")
++
++
+ def main():
+- print_header("")
+- print_header("BUG REPORT INFORMATION")
++ print_header(f"bitsandbytes v{bnb_version}")
++ show_environment()
+ print_header("")
+
+- print_header("OTHER")
+ cuda_specs = get_cuda_specs()
+++<<<<<<< HEAD
+ + if HIP_ENVIRONMENT:
+ + rocm_specs = f" rocm_version_string='{cuda_specs.cuda_version_string}',"
+ + rocm_specs += f" rocm_version_tuple={cuda_specs.cuda_version_tuple}"
+ + print(f"{BNB_BACKEND} specs:{rocm_specs}")
+ + else:
+ + print(f"{BNB_BACKEND} specs:{cuda_specs}")
+ + if not torch.cuda.is_available():
+ + print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:")
+ + if not HIP_ENVIRONMENT: print(f"- {BNB_BACKEND} driver not installed")
+ + print(f"- {BNB_BACKEND} not installed")
+ + print(f"- You have multiple conflicting {BNB_BACKEND} libraries")
+ + if cuda_specs:
+ + print_diagnostics(cuda_specs)
+ + print_runtime_diagnostics()
+ + print_header("")
+ + print_header("DEBUG INFO END")
+ + print_header("")
+ + print(f"Checking that the library is importable and {BNB_BACKEND} is callable...")
+ + try:
+ + sanity_check()
+ + print("SUCCESS!")
+ + print("Installation was successful!")
+ + return
+ + except RuntimeError as e:
+ + if "not available in CPU-only" in str(e):
+ + print(
+ + f"WARNING: {__package__} is currently running as CPU-only!\n"
+ + "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n"
+ + f"If you think that this is so erroneously,\nplease report an issue!",
+ + )
+ + else:
+ + raise e
+ + except Exception:
+ + traceback.print_exc()
+ + print_dedented(
+ + f"""
+ + Above we output some debug information.
+ + Please provide this info when creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose
+ + WARNING: Please be sure to sanitize sensitive info from the output before posting it.
+ + """,
+ + )
+ + sys.exit(1)
+++=======
++
++ if cuda_specs:
++ print_cuda_diagnostics(cuda_specs)
++
++ # TODO: There's a lot of noise in this; needs improvement.
++ # print_cuda_runtime_diagnostics()
++
++ if not torch.cuda.is_available():
++ print("PyTorch says CUDA is not available. Possible reasons:")
++ print("1. CUDA driver not installed")
++ print("2. Using a CPU-only PyTorch build")
++ print("3. No GPU detected")
++
++ else:
++ print("Checking that the library is importable and CUDA is callable...")
++
++ try:
++ sanity_check()
++ print("SUCCESS!")
++ return
++ except RuntimeError as e:
++ if "not available in CPU-only" in str(e):
++ print(
++ f"WARNING: {__package__} is currently running as CPU-only!\n"
++ "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n"
++ f"If you think that this is so erroneously,\nplease report an issue!",
++ )
++ else:
++ raise e
++ except Exception:
++ traceback.print_exc()
++
++ print_dedented(
++ f"""
++ Above we output some debug information.
++ Please provide this info when creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose
++ WARNING: Please be sure to sanitize sensitive info from the output before posting it.
++ """,
++ )
++ sys.exit(1)
+++>>>>>>> upstream/main
+diff --cc bitsandbytes/functional.py
+index 03f6c32,ffb6668..0000000
+mode 100644,100755..100755
+--- a/bitsandbytes/functional.py
++++ b/bitsandbytes/functional.py
+@@@ -13,9 -13,9 +13,13 @@@ import torc
+ from torch import Tensor
+ from typing_extensions import deprecated
+
+- from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
++ from bitsandbytes.utils import _reverse_4bit_compress_format, pack_dict_to_tensor, unpack_tensor_to_dict
+
+++<<<<<<< HEAD
+ +from .cextension import lib, HIP_ENVIRONMENT
+++=======
++ from .cextension import ipex_cpu, ipex_xpu, lib
+++>>>>>>> upstream/main
+
+ name2qmap = {}
+
+diff --cc bitsandbytes/nn/modules.py
+index 2383f2c,ccd842c..0000000
+--- a/bitsandbytes/nn/modules.py
++++ b/bitsandbytes/nn/modules.py
+@@@ -11,8 -11,7 +11,12 @@@ from torch import Tensor, device, dtype
+ import torch.nn.functional as F
+
+ import bitsandbytes as bnb
+++<<<<<<< HEAD
+ +from bitsandbytes.cextension import HIP_ENVIRONMENT
+ +from bitsandbytes.functional import QuantState
+++=======
++ from bitsandbytes.functional import QuantState, _enable_ipex_fusion, ipex_cpu, ipex_xpu
+++>>>>>>> upstream/main
+ from bitsandbytes.optim import GlobalOptimManager
+ from bitsandbytes.utils import (
+ INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING,
+diff --cc tests/test_linear4bit.py
+index 1b7a772,b5db2eb..0000000
+--- a/tests/test_linear4bit.py
++++ b/tests/test_linear4bit.py
+@@@ -7,8 -8,14 +8,19 @@@ import pytes
+ import torch
+
+ import bitsandbytes as bnb
+++<<<<<<< HEAD
+ +from bitsandbytes.cextension import HIP_ENVIRONMENT
+ +from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter, torch_load_from_buffer, torch_save_to_buffer
+++=======
++ from tests.helpers import (
++ TRUE_FALSE,
++ describe_dtype,
++ get_available_devices,
++ id_formatter,
++ torch_load_from_buffer,
++ torch_save_to_buffer,
++ )
+++>>>>>>> upstream/main
+
+ storage = {
+ "uint8": torch.uint8,
+@@@ -183,16 -185,10 +189,10 @@@ def test_linear_serialization(device, q
+
+ @pytest.mark.parametrize("device", get_available_devices())
+ @pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
+ -@pytest.mark.parametrize("blocksize", [64, 128])
+ +@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128])
+ @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
+ def test_copy_param(device, quant_type, blocksize, compress_statistics):
+- if device == "cpu":
+- if compress_statistics:
+- pytest.skip("Currently segfaults on CPU")
+- if quant_type == "fp4":
+- pytest.xfail("FP4 not supported on CPU")
+-
+- tensor = torch.linspace(1, blocksize, blocksize)
++ tensor = torch.randn(300, 400)
+ param = bnb.nn.Params4bit(
+ data=tensor,
+ quant_type=quant_type,
+@@@ -208,16 -204,10 +208,10 @@@
+
+ @pytest.mark.parametrize("device", get_available_devices())
+ @pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
+ -@pytest.mark.parametrize("blocksize", [64, 128])
+ +@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128])
+ @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
+ def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
+- if device == "cpu":
+- if compress_statistics:
+- pytest.skip("Currently segfaults on CPU")
+- if quant_type == "fp4":
+- pytest.xfail("FP4 not supported on CPU")
+-
+- tensor = torch.linspace(1, blocksize, blocksize)
++ tensor = torch.randn(300, 400)
+ param = bnb.nn.Params4bit(
+ data=tensor,
+ quant_type=quant_type,
+@@@ -240,16 -230,10 +234,10 @@@
+
+ @pytest.mark.parametrize("device", get_available_devices())
+ @pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
+ -@pytest.mark.parametrize("blocksize", [64, 128])
+ +@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128])
+ @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
+ def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics):
+- if device == "cpu":
+- if compress_statistics:
+- pytest.skip("Currently segfaults on CPU")
+- if quant_type == "fp4":
+- pytest.xfail("FP4 not supported on CPU")
+-
+- original_tensor = torch.linspace(1, blocksize, blocksize, dtype=torch.float32)
++ original_tensor = torch.randn(300, 400)
+ original_param = bnb.nn.Params4bit(
+ data=original_tensor,
+ quant_type=quant_type,
diff --git a/csrc/common_hip.cuh b/csrc/common_hip.cuh
index e7fc4eb81..105179535 100644
--- a/csrc/common_hip.cuh
+++ b/csrc/common_hip.cuh
@@ -1,6 +1,6 @@
#pragma once
-#define BNB_WARP_SIZE warpSize
+#define BNB_WARP_SIZE warpSize
// These are set based on current BNB support for CDNA 2 & RDNA 3. Update as needed for future archs
#define BNB_MAX_THREADS_PER_SM 2048
diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index 368788f39..56e1d54db 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -532,7 +532,7 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float
absmax[i / BLOCK_SIZE] = local_abs_max;
}
__syncthreads();
-
+
local_abs_max = smem_absmax_value[0];
if(STOCHASTIC)
@@ -610,7 +610,7 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * abs
valid_items_load = min(TILE_SIZE, n - i);
valid_items_store = valid_items_load;
}
-
+
// Since blocksize will always be a power-of-2, we avoid more expensive
// division by the blocksize and instead use a shift operation.
// This is equivalent to (i+threadId.x*NUM_PER_TH)/blocksize.
@@ -811,7 +811,7 @@ __global__ void kOptimizer32bit2State(T* g, T* p,
LoadFloat(temp_storage.loadf).Load(&(state2[i]), s2_vals, valid_items);
__syncthreads();
Load(temp_storage.load).Load(&(p[i]), p_vals, valid_items);
-
+
// Load additional state1 data for AdEMAMix
// TODO: Make constexpr after updating min compiler
if (OPTIMIZER == ADEMAMIX) {
@@ -1607,7 +1607,7 @@ kOptimizerStatic8bit2StateBlockwise(
unsigned char c1s[N_PER_TH];
unsigned char c2s[N_PER_TH];
unsigned char c3s[N_PER_TH];
-
+
T g_vals[N_PER_TH];
T p_vals[N_PER_TH];
typedef hipcub::BlockLoad LoadT;
@@ -1712,7 +1712,7 @@ kOptimizerStatic8bit2StateBlockwise(
new_local_abs_max1 = fmaxf(new_local_abs_max1, fabsf(s1_vals[j]));
new_local_abs_max2 = fmaxf(new_local_abs_max2, fabsf(s2_vals[j]));
-
+
if (OPTIMIZER == ADEMAMIX) {
new_local_abs_max3 = fmaxf(new_local_abs_max3, fabsf(s3_vals[j]));
}
@@ -1776,7 +1776,7 @@ kOptimizerStatic8bit2StateBlockwise(
} else {
p_vals[j] = (T)(((float)p_vals[j]) + ((step_size*(__fdividef(s1_vals[j],(sqrtf(s2_vals[j])+(correction2*eps)))))));
}
-
+
if(weight_decay > 0.0f)
p_vals[j] = ((float)p_vals[j])*(1.0f-(lr*weight_decay));
}
@@ -2148,27 +2148,27 @@ __global__ void kdequant_mm_int32_fp16(
int local_values[ITEMS_PER_THREAD];
half local_output[ITEMS_PER_THREAD];
-
+
float local_rowStats[ITEMS_PER_THREAD];
float local_colStats[ITEMS_PER_THREAD];
float local_biasValue[ITEMS_PER_THREAD];
typedef hipcub::BlockLoad LoadInt32;
__shared__ typename LoadInt32::TempStorage loadint32;
-
+
int row_idx, col_idx;
-
+
#pragma unroll ITEMS_PER_THREAD
for(int j = 0; j < ITEMS_PER_THREAD; j++)
{
row_idx = (block_offset + thread_offset + j) / numCols;
col_idx = (block_offset + thread_offset + j) % numCols;
-
+
local_colStats[j] = col_idx >= numCols ? 0.0f : colStats[col_idx];
- local_rowStats[j] = row_idx >= numRows ? 0.0f : rowStats[row_idx];
+ local_rowStats[j] = row_idx >= numRows ? 0.0f : rowStats[row_idx];
local_biasValue[j] = ((bias == nullptr) || (col_idx >= numCols)) ? 0.0f : __half2float(bias[col_idx]);
}
-
+
// Each block loads THREADS * ITEMS_PER_THREAD values from A
int valid_items = block_offset + THREADS * ITEMS_PER_THREAD < n_out
? THREADS * ITEMS_PER_THREAD
@@ -2188,7 +2188,7 @@ __global__ void kdequant_mm_int32_fp16(
if (outIdx < n_out) {
out[outIdx] = local_output[j];
}
- }
+ }
}
#define DENORM 1.0f/127.0f
diff --git a/csrc/ops.hip b/csrc/ops.hip
index 4d077d19a..eef616d48 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -199,10 +199,10 @@ template void optimizerStatic8bit(T* p, T* g,
}
}
-#define BLOCKSIZE_2STATE 256
-#define NUM_2STATE 1
-#define BLOCKSIZE_1STATE 256
-#define NUM_1STATE 1
+#define BLOCKSIZE_2STATE 256
+#define NUM_2STATE 1
+#define BLOCKSIZE_1STATE 256
+#define NUM_1STATE 1
template void optimizerStatic8bitBlockwise(
T* p,
@@ -443,7 +443,7 @@ static std::string hipError_to_string(const hipError_t ret)
}
template int igemmlt(
- hipblasLtHandle_t ltHandle,
+ hipblasLtHandle_t ltHandle,
int m, int n, int k,
const int8_t *A,
const int8_t *B,
diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py
index 1b2ea85db..3d8b688ee 100644
--- a/tests/test_cuda_setup_evaluator.py
+++ b/tests/test_cuda_setup_evaluator.py
@@ -12,11 +12,13 @@ def cuda120_spec() -> CUDASpecs:
cuda_version_tuple=(12, 0),
)
+
@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm")
def test_get_cuda_bnb_library_path(monkeypatch, cuda120_spec):
monkeypatch.delenv("BNB_CUDA_VERSION", raising=False)
assert get_cuda_bnb_library_path(cuda120_spec).stem == "libbitsandbytes_cuda120"
+
@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm")
def test_get_cuda_bnb_library_path_override(monkeypatch, cuda120_spec, caplog):
monkeypatch.setenv("BNB_CUDA_VERSION", "110")
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 2638033f1..e7c569442 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -8,8 +8,8 @@
import torch
import bitsandbytes as bnb
-from bitsandbytes.cextension import HIP_ENVIRONMENT, ROCM_GPU_ARCH
from bitsandbytes import functional as F
+from bitsandbytes.cextension import HIP_ENVIRONMENT, ROCM_GPU_ARCH
from tests.helpers import (
BOOLEAN_TUPLES,
TRUE_FALSE,
@@ -92,7 +92,10 @@ class Test8BitBlockwiseQuantizeFunctional:
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
@pytest.mark.parametrize("nested", TRUE_FALSE, ids=id_formatter("nested"))
- @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64] if not HIP_ENVIRONMENT else [4096, 2048, 1024, 512, 256, 128] )
+ @pytest.mark.parametrize(
+ "blocksize",
+ [4096, 2048, 1024, 512, 256, 128, 64] if not HIP_ENVIRONMENT else [4096, 2048, 1024, 512, 256, 128],
+ )
@pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed"))
def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, signed):
iters = 100
@@ -802,6 +805,7 @@ def test_coo_int8_vectorwise_quant(self, device, dim1, dim2):
A[:, outlier_cols] = 0
torch.testing.assert_close(A * (idx == 0), A2, rtol=0.05, atol=1.5e-2)
+
@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required")
class TestSpMMFunctional:
@@ -1079,7 +1083,10 @@ class TestQuantize4BitFunctional:
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
@pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
- @pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096] if not HIP_ENVIRONMENT else [128, 256, 512, 1024, 2048, 4096])
+ @pytest.mark.parametrize(
+ "blocksize",
+ [64, 128, 256, 512, 1024, 2048, 4096] if not HIP_ENVIRONMENT else [128, 256, 512, 1024, 2048, 4096],
+ )
def test_4bit_quant(self, device, dtype, quant_type, blocksize):
A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
@@ -1172,7 +1179,7 @@ def test_bench_4bit_dequant(self, quant_type):
# torch.matmul(b, a.t())
# torch.cuda.synchronize()
# print((time.time()-t0)/iters*1e6)
-
+
@pytest.mark.skipif(
HIP_ENVIRONMENT, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64"
)
diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index 0426e5f76..09b6186db 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -8,16 +8,15 @@
import torch
import bitsandbytes as bnb
-
-from bitsandbytes.cextension import HIP_ENVIRONMENT
-from tests.helpers import (
- TRUE_FALSE,
- describe_dtype,
- get_available_devices,
- id_formatter,
- torch_load_from_buffer,
- torch_save_to_buffer,
-)
+from bitsandbytes.cextension import HIP_ENVIRONMENT
+from tests.helpers import (
+ TRUE_FALSE,
+ describe_dtype,
+ get_available_devices,
+ id_formatter,
+ torch_load_from_buffer,
+ torch_save_to_buffer,
+)
storage = {
"uint8": torch.uint8,
@@ -26,6 +25,7 @@
"float32": torch.float32,
}
+
@pytest.mark.parametrize("device", get_available_devices())
@pytest.mark.parametrize("quant_storage", ["uint8", "float16", "bfloat16", "float32"])
@pytest.mark.parametrize("bias", TRUE_FALSE, ids=id_formatter("bias"))
diff --git a/tests/test_ops.py b/tests/test_ops.py
index bfcc8260c..25cd1e9d0 100644
--- a/tests/test_ops.py
+++ b/tests/test_ops.py
@@ -4,8 +4,8 @@
import torch
import bitsandbytes
-from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter
from bitsandbytes.cextension import HIP_ENVIRONMENT
+from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter
# torch.library.opcheck is only available in torch 2.4 and later.
# When testing with older versions, we will skip it as a no-op.
From cd8bd2d61251beacc860b8719e2d2e18d9b6d3c3 Mon Sep 17 00:00:00 2001
From: mklabunde <37873854+mklabunde@users.noreply.github.com>
Date: Tue, 3 Jun 2025 22:41:07 +0200
Subject: [PATCH 85/85] pass current bnb_quantized when moving quantized
Params4bit to different device (#1665)
---
bitsandbytes/nn/modules.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index ccd842ce3..e349cc843 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -354,6 +354,7 @@ def to(self, *args, **kwargs):
compress_statistics=self.compress_statistics,
quant_type=self.quant_type,
quant_storage=self.quant_storage,
+ bnb_quantized=self.bnb_quantized,
)
return new_param