Skip to content

Commit 0507a45

Browse files
author
root
committed
Merge upstream/main into ROCm/rocm_enabled
2 parents 7524c09 + e817036 commit 0507a45

55 files changed

Lines changed: 4395 additions & 1043 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/FUNDING.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
open_collective: bitsandbytes

.github/scripts/build-cuda.sh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,14 @@ if [[ -v cuda_targets ]]; then
1111
elif [ "${build_arch}" = "aarch64" ]; then
1212
build_capability="75;80;90"
1313

14-
# CUDA 12.8: Add sm100
15-
[[ "${cuda_version}" == 12.8.* ]] && build_capability="75;80;90;100"
14+
# CUDA 12.8+: Add sm100/sm120
15+
[[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="75;80;90;100;120"
1616
else
17-
# By default, target Maxwell through Hopper.
18-
build_capability="50;52;60;61;70;75;80;86;89;90"
17+
# By default, target Pascal through Hopper.
18+
build_capability="60;70;75;80;86;89;90"
1919

20-
# CUDA 12.8: Add sm100 and sm120; remove < sm75 to align with PyTorch 2.7+cu128 minimum
21-
[[ "${cuda_version}" == 12.8.* ]] && build_capability="75;80;86;89;90;100;120"
20+
# CUDA 12.8+: Add sm100 and sm120; remove < sm70 to align with PyTorch 2.8+cu128 minimum
21+
[[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="70;75;80;86;89;90;100;120"
2222
fi
2323

2424
[[ "${build_os}" = windows-* ]] && python3 -m pip install ninja

.github/workflows/python-package.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,16 +72,17 @@ jobs:
7272
- os: windows-latest
7373
arch: x86_64
7474
cuda_version:
75-
["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"]
75+
["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1", "12.9.1"]
7676
runs-on: ${{ matrix.os }}
7777
steps:
7878
- uses: actions/checkout@v4
7979
# Windows: We install Cuda on the agent (slow)
80-
- uses: Jimver/cuda-toolkit@v0.2.22
80+
- uses: Jimver/cuda-toolkit@c35baa1a18fd1fc9dcf47c5bd839bf30559c0bc3 # v0.2.24
8181
if: startsWith(matrix.os, 'windows')
8282
id: cuda-toolkit
8383
with:
84-
cuda: ${{ matrix.cuda_version }}
84+
# Temporary: Use CUDA 12.9.0 for Windows until 12.9.1 is supported by this action.
85+
cuda: ${{ matrix.cuda_version == '12.9.1' && '12.9.0' || matrix.cuda_version }}
8586
method: "network"
8687
sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
8788
linux-local-args: '["--toolkit"]'

.github/workflows/tests.yml

Lines changed: 23 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -49,22 +49,23 @@ jobs:
4949
build-cuda:
5050
strategy:
5151
matrix:
52-
cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
53-
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025]
52+
cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
53+
os: [ubuntu-22.04, ubuntu-22.04-arm]
5454
include:
5555
- os: ubuntu-22.04
5656
arch: x86_64
5757
- os: ubuntu-22.04-arm
5858
arch: aarch64
5959
- os: windows-2025
6060
arch: x86_64
61+
cuda_version: "11.8.0"
6162
runs-on: ${{ matrix.os }}
6263

6364
steps:
6465
- uses: actions/checkout@v4
6566

6667
- name: Install CUDA Toolkit
67-
uses: Jimver/cuda-toolkit@v0.2.23
68+
uses: Jimver/cuda-toolkit@c35baa1a18fd1fc9dcf47c5bd839bf30559c0bc3 # v0.2.24
6869
if: startsWith(matrix.os, 'windows')
6970
id: cuda-toolkit
7071
with:
@@ -100,8 +101,8 @@ jobs:
100101
fail-fast: false
101102
matrix:
102103
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
103-
# Test with the oldest supported torch version and the two newest.
104-
torch_version: ["2.2.2", "2.6.0", "2.7.1"]
104+
# Test with the oldest supported torch version and the two newest stable/RC releases.
105+
torch_version: ["2.3.1", "2.7.1", "2.8.0"]
105106
include:
106107
- os: ubuntu-22.04
107108
arch: x86_64
@@ -117,7 +118,7 @@ jobs:
117118
arch: arm64
118119
exclude:
119120
- os: ubuntu-22.04-arm
120-
torch_version: "2.2.2"
121+
torch_version: "2.3.1"
121122

122123
runs-on: ${{ matrix.runner || matrix.os }}
123124
env:
@@ -147,9 +148,10 @@ jobs:
147148
pip install -e ".[test]"
148149
pip install pytest-cov
149150
150-
# We need to downgrade to numpy<2 for torch<2.3 compatibility.
151+
# We need to downgrade to numpy<2 for torch<2.4.1 compatibility on Windows
152+
# See: https://github.com/pytorch/pytorch/issues/131668
151153
- name: Downgrade NumPy
152-
if: startsWith(matrix.torch_version, '2.2.')
154+
if: startsWith(matrix.os, 'windows') && startsWith(matrix.torch_version, '2.3.')
153155
run: pip install "numpy<2"
154156

155157
- name: Show installed packages
@@ -161,7 +163,7 @@ jobs:
161163
- name: Run tests
162164
run: pytest --durations=100
163165

164-
test-cpu-ipex:
166+
test-cpu-intel:
165167
if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
166168
needs: build-cpu
167169
runs-on: banb-aws-general-8-plus-use1-public-80
@@ -185,7 +187,6 @@ jobs:
185187
- name: Install dependencies
186188
run: |
187189
pip install torch==2.7.1 --index-url https://download.pytorch.org/whl/cpu
188-
pip install intel_extension_for_pytorch==2.7.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
189190
pip install -e ".[test]"
190191
pip install pytest-cov
191192
@@ -195,9 +196,6 @@ jobs:
195196
- name: Show environment information
196197
run: python -m torch.utils.collect_env
197198

198-
- name: IPEX smoke test
199-
run: python -c "import torch; import intel_extension_for_pytorch as ipex; print(torch.__version__); print(ipex.__version__);"
200-
201199
- name: Run tests
202200
run: pytest --durations=100
203201

@@ -223,7 +221,7 @@ jobs:
223221
# run: pip list
224222

225223
test-hpu:
226-
if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
224+
if: false # github.repository == 'bitsandbytes-foundation/bitsandbytes'
227225
needs: build-cpu
228226
strategy:
229227
fail-fast: false
@@ -279,21 +277,12 @@ jobs:
279277
run: pytest --durations=100
280278

281279
test-xpu:
282-
if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
280+
if: false # github.repository == 'bitsandbytes-foundation/bitsandbytes'
283281
needs: build-cpu
284282
strategy:
285283
fail-fast: false
286284
matrix:
287285
torch_version: ["2.7.1"] #["2.6.0", "2.7.1"]
288-
ipex: [false]
289-
# ipex: [true, false]
290-
# include:
291-
# - torch_version: "2.6.0"
292-
# ipex: true
293-
# ipex_version: "2.6.10+xpu"
294-
# - torch_version: "2.7.1"
295-
# ipex: true
296-
# ipex_version: "2.7.10+xpu"
297286
runs-on:
298287
group: bandb-itac-bmsprpvc1550-8-1gpu
299288
env:
@@ -329,10 +318,6 @@ jobs:
329318
- name: Install PyTorch
330319
run: pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/xpu
331320

332-
- name: Install IPEX
333-
if: matrix.ipex == true
334-
run: pip install intel_extension_for_pytorch==${{ matrix.ipex_version }} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
335-
336321
- name: Install dependencies
337322
run: |
338323
pip install -e ".[test]"
@@ -358,17 +343,20 @@ jobs:
358343
os: [ubuntu-22.04, windows-2025]
359344
arch: [x86_64]
360345
gpu: [T4, L40S]
361-
cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
346+
cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
362347
include:
363348
- cuda_version: "11.8.0"
364-
torch_version: "2.2.2"
349+
torch_version: "2.3.1"
365350
pypi_index: "https://download.pytorch.org/whl/cu118"
366351
- cuda_version: "12.6.3"
367352
torch_version: "2.6.0"
368353
pypi_index: "https://download.pytorch.org/whl/cu126"
369354
- cuda_version: "12.8.1"
370355
torch_version: "2.7.1"
371356
pypi_index: "https://download.pytorch.org/whl/cu128"
357+
- cuda_version: "12.9.1"
358+
torch_version: "2.8.0"
359+
pypi_index: "https://download.pytorch.org/whl/cu129"
372360

373361

374362
# Linux L40S runners
@@ -387,7 +375,7 @@ jobs:
387375
gpu: T4
388376
runner: CUDA-Windows-x64
389377
cuda_version: "11.8.0"
390-
torch_version: "2.2.0"
378+
torch_version: "2.3.1"
391379
pypi_index: "https://download.pytorch.org/whl/cu118"
392380
- os: windows-2025
393381
arch: x86_64
@@ -401,12 +389,14 @@ jobs:
401389
gpu: T4
402390
runner: CUDA-Windows-x64
403391
cuda_version: "11.8.0"
404-
torch_version: "2.7.1"
392+
torch_version: "2.7.1" # Note: this is the last PyTorch release supporting CUDA 11.8.
405393
pypi_index: "https://download.pytorch.org/whl/cu118"
406394

407395
exclude:
408396
# Our current T4 Windows runner has a driver that is too old (471.11)
409397
# and cannot support CUDA 12+. Skip for now.
398+
- os: windows-2025
399+
cuda_version: "12.9.1"
410400
- os: windows-2025
411401
cuda_version: "12.8.1"
412402
- os: windows-2025
@@ -438,15 +428,9 @@ jobs:
438428

439429
- name: Install dependencies
440430
run: |
441-
pip install torch==${{ matrix.torch_version }} --index-url ${{ matrix.pypi_index }}
431+
pip install --pre torch~=${{ matrix.torch_version }}.dev0 --index-url ${{ matrix.pypi_index }}
442432
pip install -e ".[test]"
443433
pip install pytest-cov
444-
445-
# We need to downgrade to numpy<2 for torch<2.3 compatibility.
446-
- name: Downgrade NumPy
447-
if: startsWith(matrix.torch_version, '2.2.')
448-
run: pip install "numpy<2"
449-
450434
- name: Show installed packages
451435
run: pip list
452436

CMakeLists.txt

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,12 @@ set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
2828
set(HIP_FILES csrc/ops.hip csrc/kernels.hip)
2929
set(MPS_FILES csrc/mps_ops.mm)
3030
set(METAL_FILES csrc/mps_kernels.metal)
31+
set(XPU_FILES csrc/xpu_ops.cpp csrc/xpu_kernels.cpp)
3132
# C++ sources are always included
3233
list(APPEND SRC_FILES ${CPP_FILES})
3334

34-
set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps)")
35-
set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps)
35+
set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps, xpu)")
36+
set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps xpu)
3637
option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
3738

3839
if(APPLE)
@@ -64,10 +65,19 @@ elseif(${COMPUTE_BACKEND} STREQUAL "mps")
6465
set(BUILD_CUDA OFF)
6566
set(BUILD_HIP OFF)
6667
set(BUILD_MPS ON)
68+
elseif(${COMPUTE_BACKEND} STREQUAL "xpu")
69+
if(APPLE)
70+
message(FATAL_ERROR "XPU is not supported on macOS" )
71+
endif()
72+
set(BUILD_CUDA OFF)
73+
set(BUILD_HIP OFF)
74+
set(BUILD_MPS OFF)
75+
set(BUILD_XPU ON)
6776
else()
6877
set(BUILD_CUDA OFF)
6978
set(BUILD_HIP OFF)
7079
set(BUILD_MPS OFF)
80+
set(BUILD_XPU OFF)
7181
endif()
7282

7383

@@ -217,6 +227,15 @@ elseif(BUILD_MPS)
217227
COMMENT "Compiling Metal kernels"
218228
VERBATIM)
219229
add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib")
230+
elseif(BUILD_XPU)
231+
list(APPEND SRC_FILES ${XPU_FILES})
232+
string(APPEND BNB_OUTPUT_NAME "_xpu")
233+
add_compile_definitions(BUILD_XPU)
234+
set(CMAKE_C_COMPILER icx)
235+
set(CMAKE_CXX_COMPILER icpx)
236+
if(WIN32)
237+
set(CMAKE_CXX_COMPILER icx)
238+
endif()
220239
else()
221240
string(APPEND BNB_OUTPUT_NAME "_cpu")
222241
set(GPU_SOURCES)
@@ -285,6 +304,15 @@ if(BUILD_MPS)
285304
add_dependencies(bitsandbytes metallib)
286305
target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
287306
endif()
307+
if(BUILD_XPU)
308+
set(SYCL_LINK_FLAGS "-fsycl;--offload-compress;-fsycl-targets=spir64_gen,spir64;-Xs;-device pvc,xe-lpg,ats-m150 -options ' -cl-intel-enable-auto-large-GRF-mode -cl-poison-unsupported-fp64-kernels -cl-intel-greater-than-4GB-buffer-required'")
309+
set(SYCL_COMPILE_FLAGS "-fsycl;-fhonor-nans;-fhonor-infinities;-fno-associative-math;-fno-approx-func;-fno-sycl-instrument-device-code;--offload-compress;-fsycl-targets=spir64_gen,spir64;")
310+
311+
set_property(TARGET bitsandbytes PROPERTY CXX_STANDARD 20)
312+
target_compile_options(bitsandbytes PRIVATE ${SYCL_COMPILE_FLAGS})
313+
target_link_options(bitsandbytes PRIVATE ${SYCL_LINK_FLAGS})
314+
315+
endif()
288316

289317
if(WIN32)
290318
set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")

MANIFEST.in

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
include CMakeLists.txt
2+
graft csrc
3+
graft include

0 commit comments

Comments
 (0)