Skip to content

Commit 47ac97d

Browse files
authored
Merge pull request #70 from MISHANMAURYA/upstream_main_mm
Add ROCm build targets
2 parents 79fc632 + 93768d0 commit 47ac97d

16 files changed

Lines changed: 171 additions & 94 deletions

File tree

.github/scripts/build-rocm.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
2+
declare build_arch
3+
declare build_os
4+
declare rocm_version
5+
6+
set -xeuo pipefail
7+
bnb_rocm_arch="gfx90a;gfx942;gfx1100"
8+
if [ "${build_os:0:6}" == ubuntu ]; then
9+
image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
10+
echo "Using image $image"
11+
docker run --rm --platform "linux/$build_arch" -i \
12+
-w /src -v "$PWD:/src" "$image" sh -c \
13+
"apt-get update \
14+
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
15+
&& cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
16+
&& cmake --build ."
17+
fi
18+
19+
output_dir="output/${build_os}/${build_arch}"
20+
mkdir -p "${output_dir}"
21+
(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")

.github/workflows/python-package.yml

Lines changed: 50 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,55 @@ jobs:
102102
path: output/*
103103
retention-days: 7
104104

105+
build-shared-libs-rocm:
106+
strategy:
107+
matrix:
108+
os: [ubuntu-22.04]
109+
arch: [x86_64]
110+
rocm_version:
111+
["6.1.2", "6.2.4", "6.3.2"]
112+
runs-on: ${{ matrix.os }}
113+
steps:
114+
- uses: actions/checkout@v4
115+
- name: Set up Docker multiarch
116+
uses: docker/setup-qemu-action@v3
117+
- name: Clean up disk space
118+
run: |
119+
sudo rm -rf \
120+
/usr/share/dotnet \
121+
/opt/ghc \
122+
"/usr/local/share/boost" \
123+
"$AGENT_TOOLSDIRECTORY" \
124+
/opt/hostedtoolcache \
125+
/opt/google/chrome \
126+
/opt/microsoft/msedge \
127+
/opt/microsoft/powershell \
128+
/opt/pipx \
129+
/usr/lib/mono \
130+
/usr/local/julia* \
131+
/usr/local/lib/android \
132+
/usr/local/lib/node_modules \
133+
/usr/local/share/chromium \
134+
/usr/local/share/powershell \
135+
/usr/share/swift
136+
- name: Build C++
137+
run: bash .github/scripts/build-rocm.sh
138+
env:
139+
build_os: ${{ matrix.os }}
140+
build_arch: ${{ matrix.arch }}
141+
rocm_version: ${{ matrix.rocm_version }}
142+
- name: Upload build artifact
143+
uses: actions/upload-artifact@v4
144+
with:
145+
name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}
146+
path: output/*
147+
retention-days: 7
148+
105149
build-wheels:
106150
needs:
107151
- build-shared-libs
108152
- build-shared-libs-cuda
153+
- build-shared-libs-rocm
109154
strategy:
110155
matrix:
111156
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]
@@ -171,10 +216,10 @@ jobs:
171216
path: tmp/
172217
pattern: "bdist_wheel_*"
173218
merge-multiple: true
174-
219+
175220
- name: Inspect tmp directory after downloading artifacts
176221
run: ls -alFR tmp/
177-
222+
178223
- name: Move and rename wheel files with pattern replacement
179224
run: |
180225
mkdir -p wheels/
@@ -199,7 +244,7 @@ jobs:
199244
200245
- name: Inspect wheels directory after renaming files
201246
run: ls -alFR wheels/
202-
247+
203248
- name: Delete old pre-release (if exists)
204249
run: |
205250
gh release delete continuous-release_main --cleanup-tag -y || true
@@ -213,7 +258,7 @@ jobs:
213258
214259
This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch.
215260
216-
**How to install:**
261+
**How to install:**
217262
Pick the correct command for your platform and run it in your terminal:
218263
219264
ENDOFMARKDOWN
@@ -228,7 +273,7 @@ jobs:
228273
done
229274
230275
cat >> body.md << 'ENDOFMARKDOWN'
231-
> **Note:**
276+
> **Note:**
232277
> These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes.
233278
ENDOFMARKDOWN
234279

bitsandbytes/backends/cuda/ops.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
99

1010
from ..._ops import register_kernel
11-
from ...cextension import lib, HIP_ENVIRONMENT
11+
from ...cextension import HIP_ENVIRONMENT, lib
1212

1313

1414
@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
@@ -210,12 +210,12 @@ def _get_col_absmax(
210210
@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
211211
def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
212212
torch._check_is_size(blocksize)
213-
214-
if HIP_ENVIRONMENT:
215-
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
216-
else:
213+
214+
if HIP_ENVIRONMENT:
215+
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
216+
else:
217217
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
218-
218+
219219
torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
220220

221221
n = A.numel()
@@ -269,11 +269,11 @@ def _(
269269
def _dequantize_blockwise_impl(
270270
A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
271271
) -> None:
272-
if HIP_ENVIRONMENT:
273-
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
274-
else:
272+
if HIP_ENVIRONMENT:
273+
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
274+
else:
275275
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
276-
276+
277277
torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
278278
torch._check(
279279
dtype in [torch.float16, torch.bfloat16, torch.float32],
@@ -303,11 +303,11 @@ def _dequantize_blockwise_impl(
303303
def _(
304304
A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
305305
) -> tuple[torch.Tensor, torch.Tensor]:
306-
if HIP_ENVIRONMENT:
307-
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
308-
else:
306+
if HIP_ENVIRONMENT:
307+
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
308+
else:
309309
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
310-
310+
311311
torch._check(quant_type in ["fp4", "nf4"])
312312
torch._check(
313313
A.dtype in [torch.bfloat16, torch.float16, torch.float32],
@@ -385,11 +385,11 @@ def _dequantize_4bit_impl(
385385
dtype: torch.dtype,
386386
out: torch.Tensor,
387387
) -> None:
388-
if HIP_ENVIRONMENT:
389-
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
390-
else:
388+
if HIP_ENVIRONMENT:
389+
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
390+
else:
391391
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
392-
392+
393393
torch._check(quant_type in ["fp4", "nf4"])
394394
torch._check(
395395
dtype in [torch.bfloat16, torch.float16, torch.float32],

bitsandbytes/cextension.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ def get_available_cuda_binary_versions() -> list[str]:
8181
lib_pattern = f"libbitsandbytes_{BNB_BACKEND.lower()}*{DYNAMIC_LIBRARY_SUFFIX}"
8282
versions = []
8383
for lib in Path(__file__).parent.glob(lib_pattern):
84-
pattern = r"{}(\d+)".format(BNB_BACKEND.lower())
84+
pattern = rf"{BNB_BACKEND.lower()}(\d+)"
8585
match = re.search(pattern, lib.name)
8686
if match:
8787
ver_code = int(match.group(1))
@@ -199,18 +199,16 @@ def _format_lib_error_message(
199199
)
200200

201201
compile_instructions = (
202-
(
203-
"COMPILE FROM SOURCE for CPU-only:\n `cmake -DCOMPUTE_BACKEND=cpu -S . && make`\n\n"
204-
) if not no_cuda_lib_found
205-
else
206-
(
202+
("COMPILE FROM SOURCE for CPU-only:\n `cmake -DCOMPUTE_BACKEND=cpu -S . && make`\n\n")
203+
if not no_cuda_lib_found
204+
else (
207205
"You have two options:\n"
208206
"1. COMPILE FROM SOURCE (required if no binary exists):\n"
209207
" https://huggingface.co/docs/bitsandbytes/main/en/installation#cuda-compile\n"
210208
"2. Use BNB_CUDA_VERSION to specify a DIFFERENT CUDA version from the detected one, which is installed on your machine and matching an available pre-compiled version listed above\n\n"
211-
) if not HIP_ENVIRONMENT
212-
else
213-
(
209+
)
210+
if not HIP_ENVIRONMENT
211+
else (
214212
"You can COMPILE FROM SOURCE as mentioned here:\n"
215213
" https://huggingface.co/docs/bitsandbytes/main/en/installation?backend=AMD+ROCm#amd-gpu\n"
216214
)

bitsandbytes/cuda_specs.py

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import dataclasses
2-
import logging
3-
import re
4-
import subprocess
52
from functools import lru_cache
3+
import logging
4+
import re
5+
import subprocess
66
from typing import Optional
77

88
import torch
@@ -78,25 +78,25 @@ def get_cuda_specs() -> Optional[CUDASpecs]:
7878
return None
7979

8080

81-
def get_rocm_gpu_arch() -> str:
82-
"""Get ROCm GPU architecture."""
83-
logger = logging.getLogger(__name__)
84-
try:
85-
if torch.version.hip:
86-
result = subprocess.run(["rocminfo"], capture_output=True, text=True)
87-
match = re.search(r"Name:\s+gfx([a-zA-Z\d]+)", result.stdout)
88-
if match:
89-
return "gfx" + match.group(1)
90-
else:
91-
return "unknown"
92-
else:
93-
return "unknown"
94-
except Exception as e:
95-
logger.error(f"Could not detect ROCm GPU architecture: {e}")
96-
if torch.cuda.is_available():
97-
logger.warning(
98-
"""
99-
ROCm GPU architecture detection failed despite ROCm being available.
100-
""",
101-
)
102-
return "unknown"
81+
def get_rocm_gpu_arch() -> str:
82+
"""Get ROCm GPU architecture."""
83+
logger = logging.getLogger(__name__)
84+
try:
85+
if torch.version.hip:
86+
result = subprocess.run(["rocminfo"], capture_output=True, text=True)
87+
match = re.search(r"Name:\s+gfx([a-zA-Z\d]+)", result.stdout)
88+
if match:
89+
return "gfx" + match.group(1)
90+
else:
91+
return "unknown"
92+
else:
93+
return "unknown"
94+
except Exception as e:
95+
logger.error(f"Could not detect ROCm GPU architecture: {e}")
96+
if torch.cuda.is_available():
97+
logger.warning(
98+
"""
99+
ROCm GPU architecture detection failed despite ROCm being available.
100+
""",
101+
)
102+
return "unknown"

bitsandbytes/diagnostics/cuda.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,13 @@
3333
}
3434

3535
CUDA_RUNTIME_LIB_PATTERNS = (
36-
"libamdhip64.so*",
37-
) if HIP_ENVIRONMENT else (
38-
"cudart64*.dll", # Windows
39-
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
40-
"nvcuda*.dll", # Windows
36+
("libamdhip64.so*",)
37+
if HIP_ENVIRONMENT
38+
else (
39+
"cudart64*.dll", # Windows
40+
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
41+
"nvcuda*.dll", # Windows
42+
)
4143
)
4244

4345
logger = logging.getLogger(__name__)

bitsandbytes/diagnostics/main.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ def main():
4343
print(f"{BNB_BACKEND} specs:{cuda_specs}")
4444
if not torch.cuda.is_available():
4545
print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:")
46-
if not HIP_ENVIRONMENT: print(f"- {BNB_BACKEND} driver not installed")
46+
if not HIP_ENVIRONMENT:
47+
print(f"- {BNB_BACKEND} driver not installed")
4748
print(f"- {BNB_BACKEND} not installed")
4849
print(f"- You have multiple conflicting {BNB_BACKEND} libraries")
4950
if cuda_specs:

bitsandbytes/functional.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
1717

18-
from .cextension import lib, HIP_ENVIRONMENT
18+
from .cextension import HIP_ENVIRONMENT, lib
1919

2020
name2qmap = {}
2121

@@ -1007,10 +1007,10 @@ def quantize_4bit(
10071007
- `torch.Tensor`: The quantized tensor with packed 4-bit values.
10081008
- [`QuantState`]: The state object used to undo the quantization.
10091009
"""
1010-
1010+
10111011
if blocksize is None:
10121012
blocksize = 64 if not HIP_ENVIRONMENT else 128
1013-
1013+
10141014
input_shape = A.shape
10151015

10161016
_out, _absmax = torch.ops.bitsandbytes.quantize_4bit.default(
@@ -1114,10 +1114,10 @@ def dequantize_4bit(
11141114
Returns:
11151115
`torch.Tensor`: The dequantized tensor.
11161116
"""
1117-
1117+
11181118
if blocksize is None:
11191119
blocksize = 64 if not HIP_ENVIRONMENT else 128
1120-
1120+
11211121
if quant_state is None:
11221122
assert absmax is not None and out is not None
11231123

bitsandbytes/nn/modules.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -222,10 +222,10 @@ def __new__(
222222
) -> "Params4bit":
223223
if data is None:
224224
data = torch.empty(0)
225-
225+
226226
if blocksize is None:
227227
blocksize = 64 if not HIP_ENVIRONMENT else 128
228-
228+
229229
self = torch.Tensor._make_subclass(cls, data, requires_grad)
230230
self.blocksize = blocksize
231231
self.compress_statistics = compress_statistics

csrc/common_hip.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#pragma once
22

3-
#define BNB_WARP_SIZE warpSize
3+
#define BNB_WARP_SIZE warpSize
44

55
// These are set based on current BNB support for CDNA 2 & RDNA 3. Update as needed for future archs
66
#define BNB_MAX_THREADS_PER_SM 2048

0 commit comments

Comments (0)