Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .github/scripts/build-rocm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Build the bitsandbytes ROCm/HIP shared library inside an AMD ROCm dev
# container and collect the resulting binaries into an output directory.
#
# Expected environment variables (provided by the CI workflow):
#   build_arch   - target CPU architecture, e.g. "x86_64"
#   build_os     - runner OS label, e.g. "ubuntu-22.04"
#   rocm_version - ROCm version tag of the dev image, e.g. "6.3.2"
declare build_arch
declare build_os
declare rocm_version

set -xeuo pipefail
# GPU architecture targets to compile HIP kernels for.
bnb_rocm_arch="gfx90a;gfx942;gfx1100"
if [ "${build_os:0:6}" == ubuntu ]; then
    image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
    echo "Using image $image"
    # Configure and build with the HIP backend inside the ROCm container,
    # mounting the checkout at /src so artifacts land back in the workspace.
    docker run --rm --platform "linux/$build_arch" -i \
        -w /src -v "$PWD:/src" "$image" sh -c \
        "apt-get update \
        && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
        && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
        && cmake --build ."
fi

output_dir="output/${build_os}/${build_arch}"
mkdir -p "${output_dir}"
# Copy whichever shared-library flavors were produced; nullglob drops
# extensions with no matches so cp does not receive literal glob patterns.
(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")
55 changes: 50 additions & 5 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,55 @@ jobs:
path: output/*
retention-days: 7

  # Build the ROCm/HIP shared libraries for each supported ROCm version
  # using the container-based build in .github/scripts/build-rocm.sh.
  build-shared-libs-rocm:
    strategy:
      matrix:
        os: [ubuntu-22.04]
        arch: [x86_64]
        rocm_version:
          ["6.1.2", "6.2.4", "6.3.2"]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v4
      - name: Set up Docker multiarch
        uses: docker/setup-qemu-action@v3
      # The ROCm "-complete" dev images are very large; remove preinstalled
      # toolchains to free enough disk space on the hosted runner.
      - name: Clean up disk space
        run: |
          sudo rm -rf \
            /usr/share/dotnet \
            /opt/ghc \
            "/usr/local/share/boost" \
            "$AGENT_TOOLSDIRECTORY" \
            /opt/hostedtoolcache \
            /opt/google/chrome \
            /opt/microsoft/msedge \
            /opt/microsoft/powershell \
            /opt/pipx \
            /usr/lib/mono \
            /usr/local/julia* \
            /usr/local/lib/android \
            /usr/local/lib/node_modules \
            /usr/local/share/chromium \
            /usr/local/share/powershell \
            /usr/share/swift
      - name: Build C++
        run: bash .github/scripts/build-rocm.sh
        env:
          build_os: ${{ matrix.os }}
          build_arch: ${{ matrix.arch }}
          rocm_version: ${{ matrix.rocm_version }}
      - name: Upload build artifact
        uses: actions/upload-artifact@v4
        with:
          name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}
          path: output/*
          retention-days: 7

build-wheels:
needs:
- build-shared-libs
- build-shared-libs-cuda
- build-shared-libs-rocm
strategy:
matrix:
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]
Expand Down Expand Up @@ -171,10 +216,10 @@ jobs:
path: tmp/
pattern: "bdist_wheel_*"
merge-multiple: true

- name: Inspect tmp directory after downloading artifacts
run: ls -alFR tmp/

- name: Move and rename wheel files with pattern replacement
run: |
mkdir -p wheels/
Expand All @@ -199,7 +244,7 @@ jobs:

- name: Inspect wheels directory after renaming files
run: ls -alFR wheels/

- name: Delete old pre-release (if exists)
run: |
gh release delete continuous-release_main --cleanup-tag -y || true
Expand All @@ -213,7 +258,7 @@ jobs:

This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch.

**How to install:**
**How to install:**
Pick the correct command for your platform and run it in your terminal:

ENDOFMARKDOWN
Expand All @@ -228,7 +273,7 @@ jobs:
done

cat >> body.md << 'ENDOFMARKDOWN'
> **Note:**
> **Note:**
> These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes.
ENDOFMARKDOWN

Expand Down
36 changes: 18 additions & 18 deletions bitsandbytes/backends/cuda/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr

from ..._ops import register_kernel
from ...cextension import lib, HIP_ENVIRONMENT
from ...cextension import HIP_ENVIRONMENT, lib


@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
Expand Down Expand Up @@ -210,12 +210,12 @@ def _get_col_absmax(
@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
torch._check_is_size(blocksize)
if HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
else:

if HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
else:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])

torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")

n = A.numel()
Expand Down Expand Up @@ -269,11 +269,11 @@ def _(
def _dequantize_blockwise_impl(
A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
) -> None:
if HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
else:
if HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
else:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])

torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
torch._check(
dtype in [torch.float16, torch.bfloat16, torch.float32],
Expand Down Expand Up @@ -303,11 +303,11 @@ def _dequantize_blockwise_impl(
def _(
A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
) -> tuple[torch.Tensor, torch.Tensor]:
if HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
else:
if HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
else:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])

torch._check(quant_type in ["fp4", "nf4"])
torch._check(
A.dtype in [torch.bfloat16, torch.float16, torch.float32],
Expand Down Expand Up @@ -385,11 +385,11 @@ def _dequantize_4bit_impl(
dtype: torch.dtype,
out: torch.Tensor,
) -> None:
if HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
else:
if HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
else:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])

torch._check(quant_type in ["fp4", "nf4"])
torch._check(
dtype in [torch.bfloat16, torch.float16, torch.float32],
Expand Down
16 changes: 7 additions & 9 deletions bitsandbytes/cextension.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def get_available_cuda_binary_versions() -> list[str]:
lib_pattern = f"libbitsandbytes_{BNB_BACKEND.lower()}*{DYNAMIC_LIBRARY_SUFFIX}"
versions = []
for lib in Path(__file__).parent.glob(lib_pattern):
pattern = r"{}(\d+)".format(BNB_BACKEND.lower())
pattern = rf"{BNB_BACKEND.lower()}(\d+)"
match = re.search(pattern, lib.name)
if match:
ver_code = int(match.group(1))
Expand Down Expand Up @@ -199,18 +199,16 @@ def _format_lib_error_message(
)

compile_instructions = (
(
"COMPILE FROM SOURCE for CPU-only:\n `cmake -DCOMPUTE_BACKEND=cpu -S . && make`\n\n"
) if not no_cuda_lib_found
else
(
("COMPILE FROM SOURCE for CPU-only:\n `cmake -DCOMPUTE_BACKEND=cpu -S . && make`\n\n")
if not no_cuda_lib_found
else (
"You have two options:\n"
"1. COMPILE FROM SOURCE (required if no binary exists):\n"
" https://huggingface.co/docs/bitsandbytes/main/en/installation#cuda-compile\n"
"2. Use BNB_CUDA_VERSION to specify a DIFFERENT CUDA version from the detected one, which is installed on your machine and matching an available pre-compiled version listed above\n\n"
) if not HIP_ENVIRONMENT
else
(
)
if not HIP_ENVIRONMENT
else (
"You can COMPILE FROM SOURCE as mentioned here:\n"
" https://huggingface.co/docs/bitsandbytes/main/en/installation?backend=AMD+ROCm#amd-gpu\n"
)
Expand Down
50 changes: 25 additions & 25 deletions bitsandbytes/cuda_specs.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import dataclasses
import logging
import re
import subprocess
from functools import lru_cache
import logging
import re
import subprocess
from typing import Optional

import torch
Expand Down Expand Up @@ -78,25 +78,25 @@ def get_cuda_specs() -> Optional[CUDASpecs]:
return None


def get_rocm_gpu_arch() -> str:
"""Get ROCm GPU architecture."""
logger = logging.getLogger(__name__)
try:
if torch.version.hip:
result = subprocess.run(["rocminfo"], capture_output=True, text=True)
match = re.search(r"Name:\s+gfx([a-zA-Z\d]+)", result.stdout)
if match:
return "gfx" + match.group(1)
else:
return "unknown"
else:
return "unknown"
except Exception as e:
logger.error(f"Could not detect ROCm GPU architecture: {e}")
if torch.cuda.is_available():
logger.warning(
"""
ROCm GPU architecture detection failed despite ROCm being available.
""",
)
return "unknown"
def get_rocm_gpu_arch() -> str:
    """Get ROCm GPU architecture.

    Returns:
        The architecture string of the first GPU agent reported by
        ``rocminfo`` (e.g. ``"gfx90a"``), or ``"unknown"`` when PyTorch is
        not a ROCm build or when detection fails for any reason.
    """
    logger = logging.getLogger(__name__)
    try:
        # torch.version.hip is non-empty only for ROCm builds of PyTorch.
        if torch.version.hip:
            # Parse the first "Name: gfx..." agent entry from rocminfo output.
            # NOTE(review): only the first match is used — presumably a
            # single-architecture system is assumed; verify for mixed-GPU hosts.
            result = subprocess.run(["rocminfo"], capture_output=True, text=True)
            match = re.search(r"Name:\s+gfx([a-zA-Z\d]+)", result.stdout)
            if match:
                return "gfx" + match.group(1)
            else:
                return "unknown"
        else:
            return "unknown"
    except Exception as e:
        # Best-effort probe: never propagate (e.g. rocminfo missing from PATH);
        # log and fall back to "unknown".
        logger.error(f"Could not detect ROCm GPU architecture: {e}")
        if torch.cuda.is_available():
            logger.warning(
                """
ROCm GPU architecture detection failed despite ROCm being available.
""",
            )
        return "unknown"
12 changes: 7 additions & 5 deletions bitsandbytes/diagnostics/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,13 @@
}

CUDA_RUNTIME_LIB_PATTERNS = (
"libamdhip64.so*",
) if HIP_ENVIRONMENT else (
"cudart64*.dll", # Windows
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
"nvcuda*.dll", # Windows
("libamdhip64.so*",)
if HIP_ENVIRONMENT
else (
"cudart64*.dll", # Windows
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
"nvcuda*.dll", # Windows
)
)

logger = logging.getLogger(__name__)
Expand Down
3 changes: 2 additions & 1 deletion bitsandbytes/diagnostics/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ def main():
print(f"{BNB_BACKEND} specs:{cuda_specs}")
if not torch.cuda.is_available():
print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:")
if not HIP_ENVIRONMENT: print(f"- {BNB_BACKEND} driver not installed")
if not HIP_ENVIRONMENT:
print(f"- {BNB_BACKEND} driver not installed")
print(f"- {BNB_BACKEND} not installed")
print(f"- You have multiple conflicting {BNB_BACKEND} libraries")
if cuda_specs:
Expand Down
10 changes: 5 additions & 5 deletions bitsandbytes/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict

from .cextension import lib, HIP_ENVIRONMENT
from .cextension import HIP_ENVIRONMENT, lib

name2qmap = {}

Expand Down Expand Up @@ -1007,10 +1007,10 @@ def quantize_4bit(
- `torch.Tensor`: The quantized tensor with packed 4-bit values.
- [`QuantState`]: The state object used to undo the quantization.
"""

if blocksize is None:
blocksize = 64 if not HIP_ENVIRONMENT else 128

input_shape = A.shape

_out, _absmax = torch.ops.bitsandbytes.quantize_4bit.default(
Expand Down Expand Up @@ -1114,10 +1114,10 @@ def dequantize_4bit(
Returns:
`torch.Tensor`: The dequantized tensor.
"""

if blocksize is None:
blocksize = 64 if not HIP_ENVIRONMENT else 128

if quant_state is None:
assert absmax is not None and out is not None

Expand Down
4 changes: 2 additions & 2 deletions bitsandbytes/nn/modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,10 +222,10 @@ def __new__(
) -> "Params4bit":
if data is None:
data = torch.empty(0)

if blocksize is None:
blocksize = 64 if not HIP_ENVIRONMENT else 128

self = torch.Tensor._make_subclass(cls, data, requires_grad)
self.blocksize = blocksize
self.compress_statistics = compress_statistics
Expand Down
2 changes: 1 addition & 1 deletion csrc/common_hip.cuh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#pragma once

#define BNB_WARP_SIZE warpSize
#define BNB_WARP_SIZE warpSize

// These are set based on current BNB support for CDNA 2 & RDNA 3. Update as needed for future archs
#define BNB_MAX_THREADS_PER_SM 2048
Expand Down
Loading