bitsandbytes-foundation
diff --git a/‎.gitattributes‎
Lines changed: 1 addition & 0 deletions b/‎.gitattributes‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/scripts/build-cuda.sh‎
Lines changed: 9 additions & 3 deletions b/‎.github/scripts/build-cuda.sh‎
Lines changed: 9 additions & 3 deletions
diff --git a/‎.github/scripts/build-rocm.sh‎
Lines changed: 15 additions & 8 deletions b/‎.github/scripts/build-rocm.sh‎
Lines changed: 15 additions & 8 deletions
diff --git a/‎.github/scripts/build-xpu-windows.bat‎
Lines changed: 34 additions & 0 deletions b/‎.github/scripts/build-xpu-windows.bat‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎.github/scripts/build-xpu.sh‎
Lines changed: 22 additions & 0 deletions b/‎.github/scripts/build-xpu.sh‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎.github/scripts/set_platform_tag.py‎
Lines changed: 1 addition & 1 deletion b/‎.github/scripts/set_platform_tag.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/python-package.yml‎
Lines changed: 73 additions & 49 deletions b/‎.github/workflows/python-package.yml‎
Lines changed: 73 additions & 49 deletions
@@ -0,0 +1 @@
+*.bat text eol=crlf
@@ -11,14 +11,20 @@ if [[ -v cuda_targets ]]; then
 elif [ "${build_arch}" = "aarch64" ]; then
     build_capability="75;80;90"
 
-    # CUDA 12.8+: Add sm100/sm120
+    # CUDA 12.8-12.9: Add sm100/sm120
     [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="75;80;90;100;120"
+
+    # CUDA 13.0+: Add sm100/sm110/sm120/sm121
+    [[ "${cuda_version}" == 13.*.* ]] && build_capability="75;80;90;100;110;120;121"
 else
     # By default, target Pascal through Hopper.
     build_capability="60;70;75;80;86;89;90"
 
     # CUDA 12.8+: Add sm100 and sm120; remove < sm70 to align with PyTorch 2.8+cu128 minimum
     [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="70;75;80;86;89;90;100;120"
+
+    # CUDA 13.0+: Remove < sm75 to align with PyTorch 2.9+cu130 minimum
+    [[ "${cuda_version}" == 13.*.* ]] && build_capability="75;80;86;89;90;100;120"
 fi
 
 [[ "${build_os}" = windows-* ]] && python3 -m pip install ninja
@@ -29,8 +35,8 @@ if [ "${build_os:0:6}" == ubuntu ]; then
     echo "Using image $image"
 
     docker run -i -w /src -v "$PWD:/src" "$image" bash -c \
-        "dnf update -y \
-        && dnf install cmake gcc-toolset-11 -y \
+        "dnf -y --refresh update --security \
+        && dnf -y install cmake gcc-toolset-11 --setopt=install_weak_deps=False --setopt=tsflags=nodocs \
         && source scl_source enable gcc-toolset-11 \
         && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" . \
         && cmake --build . --config Release"
 
@@ -4,15 +4,22 @@ declare build_os
 declare rocm_version
 
 set -xeuo pipefail
-bnb_rocm_arch="gfx90a;gfx942;gfx1100"
+bnb_rocm_arch="gfx90a;gfx942;gfx1100;gfx1101"
+
+# ROCm 6.4+ - Add gfx1150/gfx1151/gfx1200/gfx1201. Note we assume >=6.4.4.
+[[ "${rocm_version}" == 6.4.* || "${rocm_version}" == 7.* ]] && bnb_rocm_arch="${bnb_rocm_arch};gfx1150;gfx1151;gfx1200;gfx1201"
+
+# ROCm 7.0+ - Add gfx950
+[[ "${rocm_version}" == 7.* ]] && bnb_rocm_arch="${bnb_rocm_arch};gfx950"
+
 if [ "${build_os:0:6}" == ubuntu ]; then
-	image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
-	echo "Using image $image"
-	docker run --rm --platform "linux/$build_arch" -i \
-		-w /src -v "$PWD:/src" "$image" sh -c \
-		"apt-get update \
-      && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-      && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
+    image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
+    echo "Using image $image"
+    docker run --rm --platform "linux/$build_arch" -i \
+        -w /src -v "$PWD:/src" "$image" sh -c \
+        "apt-get update \
+      && pip install cmake==3.31.6 \
+      && cmake -DCOMPUTE_BACKEND=hip -DCMAKE_BUILD_TYPE=MinSizeRel -DCMAKE_HIP_FLAGS=\"--offload-compress\" -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
       && cmake --build ."
 fi
 
 
@@ -0,0 +1,48 @@
+rem Windows build script for the XPU backend.
+rem Downloads and installs Intel Deep Learning Essentials, configures and builds
+rem with CMake and Ninja, then copies the built DLLs into the output directory.
+rem Reads the RUNNER_TEMP and build_os environment variables.
+set INTEL_DLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
+set INTEL_DLE_TMP=%RUNNER_TEMP%\intel_dle
+set INTEL_DLE_LOG=%RUNNER_TEMP%\intel_dle_log.txt
+
+echo ::group::Intel Deep Learning Essentials Installation
+rem --fail makes curl exit non-zero on an HTTP error instead of saving the error page.
+curl --fail -o intel-dle-installer.exe %INTEL_DLE_URL%
+if ERRORLEVEL 1 (
+    echo Failed to download Intel Deep Learning Essentials
+    exit /b 1
+)
+rem The installer exit code is saved right away because the type command that
+rem prints the log sets ERRORLEVEL itself and would hide an installer failure.
+start /wait "Intel DLE Install" intel-dle-installer.exe -f %INTEL_DLE_TMP% -l %INTEL_DLE_LOG% --silent -a --eula=accept -p=NEED_VS2022_INTEGRATION=0
+set INTEL_DLE_RC=%ERRORLEVEL%
+type %INTEL_DLE_LOG%
+if not "%INTEL_DLE_RC%"=="0" (
+    echo Failed to install Intel Deep Learning Essentials
+    exit /b 1
+)
+echo ::endgroup::
+
+echo ::group::Build Environment Setup
+rem Load the oneAPI environment, then configure; the check covers the cmake step.
+call "%ProgramFiles(x86)%\Intel\oneAPI\setvars.bat"
+cmake -G Ninja -DCOMPUTE_BACKEND=xpu -DCMAKE_BUILD_TYPE=Release .
+if ERRORLEVEL 1 (
+    echo Failed to setup environment
+    exit /b 1
+)
+echo ::endgroup::
+
+echo ::group::Building with XPU backend
+cmake --build . --config Release
+if ERRORLEVEL 1 (
+    echo Build failed
+    exit /b 1
+)
+echo ::endgroup::
+
+rem Stage the built DLLs under the output directory, keyed by build_os.
+set output_dir=output\%build_os%\x86_64
+if not exist "%output_dir%" mkdir "%output_dir%"
+copy bitsandbytes\*.dll "%output_dir%\" 2>nul
@@ -0,0 +1,22 @@
+#!/bin/bash
+declare build_os
+
+set -xeuo pipefail
+
+# We currently only build XPU on Linux.
+if [ "${build_os:0:6}" == ubuntu ]; then
+    # TODO: We might want to pre-build this as our own customized image in the future.
+    image=intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu22.04
+    echo "Using image $image"
+    docker run --rm -i \
+        -w /src -v "$PWD:/src" "$image" sh -c \
+        "apt-get update \
+      && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        cmake bison intel-fw-gpu intel-ocloc \
+      && cmake -DCOMPUTE_BACKEND=xpu . \
+      && cmake --build . --config Release"
+fi
+
+output_dir="output/${build_os}/x86_64"
+mkdir -p "${output_dir}"
+(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")
@@ -9,7 +9,7 @@ def get_platform_tag(architecture):
     if system == "Linux":
         tag = "manylinux_2_24_x86_64" if architecture == "x86_64" else "manylinux_2_24_aarch64"
     elif system == "Darwin":
-        tag = "macosx_13_1_x86_64" if architecture == "x86_64" else "macosx_13_1_arm64"
+        tag = "macosx_14_0_arm64"
     elif system == "Windows":
         tag = "win_amd64" if architecture == "x86_64" else "win_arm64"
     else:
 
@@ -6,12 +6,13 @@ on:
     branches: [main]
     paths:
       - ".github/workflows/python-package.yml"
+      - ".github/scripts/**"
       - "bitsandbytes/**"
       - "csrc/**"
       - "include/**"
       - "tests/**"
       - "CMakeLists.txt"
-      - "requirements*.txt"
+      - "MANIFEST.in"
       - "setup.py"
       - "pyproject.toml"
   release:
@@ -25,19 +26,19 @@ concurrency:
 
 jobs:
   ##
-  # This job matrix builds the non-CUDA versions of the libraries for all supported platforms.
+  # This job matrix builds the CPU versions of the libraries for all supported platforms.
   ##
-  build-shared-libs:
+  build-cpu:
     strategy:
       matrix:
         include:
           - os: ubuntu-22.04
             arch: x86_64
           - os: ubuntu-22.04-arm
             arch: aarch64
-          - os: windows-latest
+          - os: windows-2025
             arch: x86_64
-          - os: macos-latest
+          - os: macos-15
             arch: arm64
     runs-on: ${{ matrix.os }}
     steps:
@@ -56,37 +57,39 @@ jobs:
           name: shared_library_${{ matrix.os }}_${{ matrix.arch }}
           path: output/*
           retention-days: 7
+
   ##
   # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
   ##
-  build-shared-libs-cuda:
+  build-cuda:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest]
+        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025]
         include:
           - os: ubuntu-22.04
             arch: x86_64
           - os: ubuntu-22.04-arm
             arch: aarch64
-          - os: windows-latest
+          - os: windows-2025
             arch: x86_64
         cuda_version:
-          ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1", "12.9.1"]
+          ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1", "12.9.1", "13.0.2"]
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
         # Windows: We install Cuda on the agent (slow)
-      - uses: Jimver/cuda-toolkit@c35baa1a18fd1fc9dcf47c5bd839bf30559c0bc3 # v0.2.24
+      - uses: Jimver/cuda-toolkit@6008063726ffe3309d1b22e413d9e88fed91a2f2 # v0.2.29
         if: startsWith(matrix.os, 'windows')
         id: cuda-toolkit
         with:
-          # Temporary: Use CUDA 12.9.0 for Windows until 12.9.1 is supported with this action.
-          cuda: ${{ matrix.cuda_version == '12.9.1' && '12.9.0' || matrix.cuda_version }}
+          cuda: ${{ matrix.cuda_version }}
           method: "network"
-          sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
-          linux-local-args: '["--toolkit"]'
+          # The "crt" "nvvm" and "nvptxcompiler" components are added for CUDA 13.
+          sub-packages: ${{ format('["nvcc"{0},"cudart","cusparse","cublas","thrust","cublas_dev","cusparse_dev"]', startsWith(matrix.cuda_version, '13.') && ',"crt","nvvm","nvptxcompiler"' || '') }}
           use-github-cache: false
+          use-local-cache: false
+          log-file-suffix: ${{matrix.os}}-${{matrix.cuda_version}}.txt
       - name: Setup MSVC
         if: startsWith(matrix.os, 'windows')
         uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl
@@ -103,37 +106,56 @@ jobs:
           path: output/*
           retention-days: 7
 
-  build-shared-libs-rocm:
+  build-xpu:
+    strategy:
+      matrix:
+        os: [ubuntu-22.04, windows-2025]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Build C++ (Linux)
+        if: runner.os == 'Linux'
+        run: bash .github/scripts/build-xpu.sh
+        env:
+          build_os: ${{ matrix.os }}
+      - name: Build C++ (Windows)
+        if: runner.os == 'Windows'
+        run: .github/scripts/build-xpu-windows.bat
+        shell: cmd
+        env:
+          build_os: ${{ matrix.os }}
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: shared_library_xpu_${{ matrix.os }}_x86_64
+          path: output/*
+          retention-days: 7
+
+  build-rocm:
     strategy:
       matrix:
         os: [ubuntu-22.04]
         arch: [x86_64]
-        rocm_version:
-          ["6.1.2", "6.2.4", "6.3.2"]
+        rocm_version: ["6.2.4", "6.3.4", "6.4.4", "7.0.2", "7.1", "7.2"]
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
-      - name: Set up Docker multiarch
-        uses: docker/setup-qemu-action@v3
       - name: Clean up disk space
         run: |
+          echo "Disk space before cleanup:"
+          df -h
+
+          # These are the biggest disk space hogs.
           sudo rm -rf \
-              /usr/share/dotnet \
-              /opt/ghc \
-              "/usr/local/share/boost" \
-              "$AGENT_TOOLSDIRECTORY" \
-              /opt/hostedtoolcache \
-              /opt/google/chrome \
-              /opt/microsoft/msedge \
-              /opt/microsoft/powershell \
-              /opt/pipx \
-              /usr/lib/mono \
-              /usr/local/julia* \
-              /usr/local/lib/android \
-              /usr/local/lib/node_modules \
-              /usr/local/share/chromium \
-              /usr/local/share/powershell \
-              /usr/share/swift
+            /opt/hostedtoolcache/CodeQL \
+            /usr/lib/dotnet \
+            /usr/lib/jvm \
+            /usr/local/.ghcup \
+            /usr/local/lib/android \
+            /usr/share/swift
+
+          echo "Disk space after cleanup:"
+          df -h
       - name: Build C++
         run: bash .github/scripts/build-rocm.sh
         env:
@@ -148,24 +170,28 @@ jobs:
           retention-days: 7
 
   build-wheels:
+    env:
+      # Skip rebuilding the CPU library when building the wheels.
+      BNB_SKIP_CMAKE: 1
     needs:
-      - build-shared-libs
-      - build-shared-libs-cuda
-      - build-shared-libs-rocm
+      - build-cpu
+      - build-cuda
+      - build-rocm
+      - build-xpu
     strategy:
       matrix:
-        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]
+        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
         include:
           - os: ubuntu-22.04
             arch: x86_64
           - os: ubuntu-22.04-arm
             arch: aarch64
-          - os: windows-latest
+          - os: windows-2025
             arch: x86_64
-          - os: macos-latest
+          - os: macos-15
             arch: arm64
         # The specific Python version is irrelevant in this context as we are only packaging non-C extension
-        # code. This ensures compatibility across Python versions, including Python 3.9, as compatibility is
+        # code. This ensures compatibility across Python versions, as compatibility is
         # dictated by the packaged code itself, not the Python version used for packaging.
         python-version: ["3.10"]
     runs-on: ${{ matrix.os }}
@@ -239,8 +265,7 @@ jobs:
           # `pip install https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl`
           STABLE_PLACEHOLDER_VERSION="1.33.7.preview"
 
-          # exclude macos wheels for now
-          find tmp/ -type f -name '*.whl' ! -name '*macos*' -print0 | while IFS= read -r -d '' wheel; do
+          find tmp/ -type f -name '*.whl' -print0 | while IFS= read -r -d '' wheel; do
             wheel_filename=$(basename "$wheel")
 
             # Strip off the original version
@@ -291,9 +316,11 @@ jobs:
             if [[ "$fname" == *"manylinux_2_24_x86_64"* ]]; then
               echo "### Linux (x86_64)" >> body.md
             elif [[ "$fname" == *"manylinux_2_24_aarch64"* ]]; then
-              echo "### Linux (ARM/aarch64)" >> body.md
+              echo "### Linux (aarch64)" >> body.md
             elif [[ "$fname" == *"win_amd64"* ]]; then
               echo "### Windows (x86_64)" >> body.md
+            elif [[ "$fname" == *"macosx"* ]]; then
+              echo "### macOS 14+ (arm64)" >> body.md
             else
               echo "### Other platform" >> body.md
             fi
@@ -313,7 +340,7 @@ jobs:
           > pip install https://.../bitsandbytes-1.33.7-preview-py3-none-manylinux_2_24_x86_64.whl
           Collecting bitsandbytes==1.33.7rc0
           ...
-          Successfully installed bitsandbytes-0.46.0.dev0
+          Successfully installed bitsandbytes-0.49.0.dev0
           ```
           ENDOFMARKDOWN
 
@@ -378,9 +405,6 @@ jobs:
           pattern: "bdist_wheel_*"
           merge-multiple: true
 
-      - name: Remove macOS wheels
-        run: rm dist/*macos*
-
       - name: Publish to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1
         with: