Skip to content

Commit 47ac97d

Browse files
authored
Merge pull request #70 from MISHANMAURYA/upstream_main_mm
Add ROCm build targets
2 parents 79fc632 + 93768d0 commit 47ac97d

16 files changed

Lines changed: 171 additions & 94 deletions

File tree

.github/scripts/build-rocm.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
2+
declare build_arch
3+
declare build_os
4+
declare rocm_version
5+
6+
set -xeuo pipefail
7+
bnb_rocm_arch="gfx90a;gfx942;gfx1100"
8+
if [ "${build_os:0:6}" == ubuntu ]; then
9+
image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
10+
echo "Using image $image"
11+
docker run --rm --platform "linux/$build_arch" -i \
12+
-w /src -v "$PWD:/src" "$image" sh -c \
13+
"apt-get update \
14+
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
15+
&& cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
16+
&& cmake --build ."
17+
fi
18+
19+
output_dir="output/${build_os}/${build_arch}"
20+
mkdir -p "${output_dir}"
21+
(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")

.github/workflows/python-package.yml

Lines changed: 50 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,55 @@ jobs:
102102
path: output/*
103103
retention-days: 7
104104

105+
build-shared-libs-rocm:
106+
strategy:
107+
matrix:
108+
os: [ubuntu-22.04]
109+
arch: [x86_64]
110+
rocm_version:
111+
["6.1.2", "6.2.4", "6.3.2"]
112+
runs-on: ${{ matrix.os }}
113+
steps:
114+
- uses: actions/checkout@v4
115+
- name: Set up Docker multiarch
116+
uses: docker/setup-qemu-action@v3
117+
- name: Clean up disk space
118+
run: |
119+
sudo rm -rf \
120+
/usr/share/dotnet \
121+
/opt/ghc \
122+
"/usr/local/share/boost" \
123+
"$AGENT_TOOLSDIRECTORY" \
124+
/opt/hostedtoolcache \
125+
/opt/google/chrome \
126+
/opt/microsoft/msedge \
127+
/opt/microsoft/powershell \
128+
/opt/pipx \
129+
/usr/lib/mono \
130+
/usr/local/julia* \
131+
/usr/local/lib/android \
132+
/usr/local/lib/node_modules \
133+
/usr/local/share/chromium \
134+
/usr/local/share/powershell \
135+
/usr/share/swift
136+
- name: Build C++
137+
run: bash .github/scripts/build-rocm.sh
138+
env:
139+
build_os: ${{ matrix.os }}
140+
build_arch: ${{ matrix.arch }}
141+
rocm_version: ${{ matrix.rocm_version }}
142+
- name: Upload build artifact
143+
uses: actions/upload-artifact@v4
144+
with:
145+
name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}
146+
path: output/*
147+
retention-days: 7
148+
105149
build-wheels:
106150
needs:
107151
- build-shared-libs
108152
- build-shared-libs-cuda
153+
- build-shared-libs-rocm
109154
strategy:
110155
matrix:
111156
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]
@@ -171,10 +216,10 @@ jobs:
171216
path: tmp/
172217
pattern: "bdist_wheel_*"
173218
merge-multiple: true
174-
219+
175220
- name: Inspect tmp directory after downloading artifacts
176221
run: ls -alFR tmp/
177-
222+
178223
- name: Move and rename wheel files with pattern replacement
179224
run: |
180225
mkdir -p wheels/
@@ -199,7 +244,7 @@ jobs:
199244
200245
- name: Inspect wheels directory after renaming files
201246
run: ls -alFR wheels/
202-
247+
203248
- name: Delete old pre-release (if exists)
204249
run: |
205250
gh release delete continuous-release_main --cleanup-tag -y || true
@@ -213,7 +258,7 @@ jobs:
213258
214259
This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch.
215260
216-
**How to install:**
261+
**How to install:**
217262
Pick the correct command for your platform and run it in your terminal:
218263
219264
ENDOFMARKDOWN
@@ -228,7 +273,7 @@ jobs:
228273
done
229274
230275
cat >> body.md << 'ENDOFMARKDOWN'
231-
> **Note:**
276+
> **Note:**
232277
> These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes.
233278
ENDOFMARKDOWN
234279

bitsandbytes/backends/cuda/ops.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
99

1010
from ..._ops import register_kernel
11-
from ...cextension import lib, HIP_ENVIRONMENT
11+
from ...cextension import HIP_ENVIRONMENT, lib
1212

1313

1414
@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
@@ -210,12 +210,12 @@ def _get_col_absmax(
210210
@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
211211
def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
212212
torch._check_is_size(blocksize)
213-
214-
if HIP_ENVIRONMENT:
215-
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
216-
else:
213+
214+
if HIP_ENVIRONMENT:
215+
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
216+
else:
217217
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
218-
218+
219219
torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
220220

221221
n = A.numel()
@@ -269,11 +269,11 @@ def _(
269269
def _dequantize_blockwise_impl(
270270
A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
271271
) -> None:
272-
if HIP_ENVIRONMENT:
273-
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
274-
else:
272+
if HIP_ENVIRONMENT:
273+
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
274+
else:
275275
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
276-
276+
277277
torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
278278
torch._check(
279279
dtype in [torch.float16, torch.bfloat16, torch.float32],
@@ -303,11 +303,11 @@ def _dequantize_blockwise_impl(
303303
def _(
304304
A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
305305
) -> tuple[torch.Tensor, torch.Tensor]:
306-
if HIP_ENVIRONMENT:
307-
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
308-
else:
306+
if HIP_ENVIRONMENT:
307+
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
308+
else:
309309
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
310-
310+
311311
torch._check(quant_type in ["fp4", "nf4"])
312312
torch._check(
313313
A.dtype in [torch.bfloat16, torch.float16, torch.float32],
@@ -385,11 +385,11 @@ def _dequantize_4bit_impl(
385385
dtype: torch.dtype,
386386
out: torch.Tensor,
387387
) -> None:
388-
if HIP_ENVIRONMENT:
389-
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
390-
else:
388+
if HIP_ENVIRONMENT:
389+
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
390+
else:
391391
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
392-
392+
393393
torch._check(quant_type in ["fp4", "nf4"])
394394
torch._check(
395395
dtype in [torch.bfloat16, torch.float16, torch.float32],

bitsandbytes/cextension.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ def get_available_cuda_binary_versions() -> list[str]:
8181
lib_pattern = f"libbitsandbytes_{BNB_BACKEND.lower()}*{DYNAMIC_LIBRARY_SUFFIX}"
8282
versions = []
8383
for lib in Path(__file__).parent.glob(lib_pattern):
84-
pattern = r"{}(\d+)".format(BNB_BACKEND.lower())
84+
pattern = rf"{BNB_BACKEND.lower()}(\d+)"
8585
match = re.search(pattern, lib.name)
8686
if match:
8787
ver_code = int(match.group(1))
@@ -199,18 +199,16 @@ def _format_lib_error_message(
199199
)
200200

201201
compile_instructions = (
202-
(
203-
"COMPILE FROM SOURCE for CPU-only:\n `cmake -DCOMPUTE_BACKEND=cpu -S . && make`\n\n"
204-
) if not no_cuda_lib_found
205-
else
206-
(
202+
("COMPILE FROM SOURCE for CPU-only:\n `cmake -DCOMPUTE_BACKEND=cpu -S . && make`\n\n")
203+
if not no_cuda_lib_found
204+
else (
207205
"You have two options:\n"
208206
"1. COMPILE FROM SOURCE (required if no binary exists):\n"
209207
" https://huggingface.co/docs/bitsandbytes/main/en/installation#cuda-compile\n"
210208
"2. Use BNB_CUDA_VERSION to specify a DIFFERENT CUDA version from the detected one, which is installed on your machine and matching an available pre-compiled version listed above\n\n"
211-
) if not HIP_ENVIRONMENT
212-
else
213-
(
209+
)
210+
if not HIP_ENVIRONMENT
211+
else (
214212
"You can COMPILE FROM SOURCE as mentioned here:\n"
215213
" https://huggingface.co/docs/bitsandbytes/main/en/installation?backend=AMD+ROCm#amd-gpu\n"
216214
)

bitsandbytes/cuda_specs.py

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import dataclasses
2-
import logging
3-
import re
4-
import subprocess
52
from functools import lru_cache
3+
import logging
4+
import re
5+
import subprocess
66
from typing import Optional
77

88
import torch
@@ -78,25 +78,25 @@ def get_cuda_specs() -> Optional[CUDASpecs]:
7878
return None
7979

8080

81-
def get_rocm_gpu_arch() -> str:
82-
"""Get ROCm GPU architecture."""
83-
logger = logging.getLogger(__name__)
84-
try:
85-
if torch.version.hip:
86-
result = subprocess.run(["rocminfo"], capture_output=True, text=True)
87-
match = re.search(r"Name:\s+gfx([a-zA-Z\d]+)", result.stdout)
88-
if match:
89-
return "gfx" + match.group(1)
90-
else:
91-
return "unknown"
92-
else:
93-
return "unknown"
94-
except Exception as e:
95-
logger.error(f"Could not detect ROCm GPU architecture: {e}")
96-
if torch.cuda.is_available():
97-
logger.warning(
98-
"""
99-
ROCm GPU architecture detection failed despite ROCm being available.
100-
""",
101-
)
102-
return "unknown"
81+
def get_rocm_gpu_arch() -> str:
82+
"""Get ROCm GPU architecture."""
83+
logger = logging.getLogger(__name__)
84+
try:
85+
if torch.version.hip:
86+
result = subprocess.run(["rocminfo"], capture_output=True, text=True)
87+
match = re.search(r"Name:\s+gfx([a-zA-Z\d]+)", result.stdout)
88+
if match:
89+
return "gfx" + match.group(1)
90+
else:
91+
return "unknown"
92+
else:
93+
return "unknown"
94+
except Exception as e:
95+
logger.error(f"Could not detect ROCm GPU architecture: {e}")
96+
if torch.cuda.is_available():
97+
logger.warning(
98+
"""
99+
ROCm GPU architecture detection failed despite ROCm being available.
100+
""",
101+
)
102+
return "unknown"

bitsandbytes/diagnostics/cuda.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,13 @@
3333
}
3434

3535
CUDA_RUNTIME_LIB_PATTERNS = (
36-
"libamdhip64.so*",
37-
) if HIP_ENVIRONMENT else (
38-
"cudart64*.dll", # Windows
39-
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
40-
"nvcuda*.dll", # Windows
36+
("libamdhip64.so*",)
37+
if HIP_ENVIRONMENT
38+
else (
39+
"cudart64*.dll", # Windows
40+
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
41+
"nvcuda*.dll", # Windows
42+
)
4143
)
4244

4345
logger = logging.getLogger(__name__)

bitsandbytes/diagnostics/main.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ def main():
4343
print(f"{BNB_BACKEND} specs:{cuda_specs}")
4444
if not torch.cuda.is_available():
4545
print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:")
46-
if not HIP_ENVIRONMENT: print(f"- {BNB_BACKEND} driver not installed")
46+
if not HIP_ENVIRONMENT:
47+
print(f"- {BNB_BACKEND} driver not installed")
4748
print(f"- {BNB_BACKEND} not installed")
4849
print(f"- You have multiple conflicting {BNB_BACKEND} libraries")
4950
if cuda_specs:

bitsandbytes/functional.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
1717

18-
from .cextension import lib, HIP_ENVIRONMENT
18+
from .cextension import HIP_ENVIRONMENT, lib
1919

2020
name2qmap = {}
2121

@@ -1007,10 +1007,10 @@ def quantize_4bit(
10071007
- `torch.Tensor`: The quantized tensor with packed 4-bit values.
10081008
- [`QuantState`]: The state object used to undo the quantization.
10091009
"""
1010-
1010+
10111011
if blocksize is None:
10121012
blocksize = 64 if not HIP_ENVIRONMENT else 128
1013-
1013+
10141014
input_shape = A.shape
10151015

10161016
_out, _absmax = torch.ops.bitsandbytes.quantize_4bit.default(
@@ -1114,10 +1114,10 @@ def dequantize_4bit(
11141114
Returns:
11151115
`torch.Tensor`: The dequantized tensor.
11161116
"""
1117-
1117+
11181118
if blocksize is None:
11191119
blocksize = 64 if not HIP_ENVIRONMENT else 128
1120-
1120+
11211121
if quant_state is None:
11221122
assert absmax is not None and out is not None
11231123

bitsandbytes/nn/modules.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -222,10 +222,10 @@ def __new__(
222222
) -> "Params4bit":
223223
if data is None:
224224
data = torch.empty(0)
225-
225+
226226
if blocksize is None:
227227
blocksize = 64 if not HIP_ENVIRONMENT else 128
228-
228+
229229
self = torch.Tensor._make_subclass(cls, data, requires_grad)
230230
self.blocksize = blocksize
231231
self.compress_statistics = compress_statistics

csrc/common_hip.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#pragma once
22

3-
#define BNB_WARP_SIZE warpSize
3+
#define BNB_WARP_SIZE warpSize
44

55
// These are set based on current BNB support for CDNA 2 & RDNA 3. Update as needed for future archs
66
#define BNB_MAX_THREADS_PER_SM 2048

0 commit comments

Comments (0)