ROCm
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
Lines changed: 1 addition & 0 deletions
diff --git a/.github/scripts/build-cuda.sh b/.github/scripts/build-cuda.sh
Lines changed: 6 additions & 6 deletions
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
Lines changed: 4 additions & 3 deletions
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
Lines changed: 23 additions & 39 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
Lines changed: 30 additions & 2 deletions
diff --git a/MANIFEST.in b/MANIFEST.in
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1 @@
+open_collective: bitsandbytes
@@ -11,14 +11,14 @@ if [[ -v cuda_targets ]]; then
 elif [ "${build_arch}" = "aarch64" ]; then
     build_capability="75;80;90"
 
-    # CUDA 12.8: Add sm100
-    [[ "${cuda_version}" == 12.8.* ]] && build_capability="75;80;90;100"
+    # CUDA 12.8+: Add sm100/sm120
+    [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="75;80;90;100;120"
 else
-    # By default, target Maxwell through Hopper.
-    build_capability="50;52;60;61;70;75;80;86;89;90"
+    # By default, target Pascal through Hopper.
+    build_capability="60;70;75;80;86;89;90"
 
-    # CUDA 12.8: Add sm100 and sm120; remove < sm75 to align with PyTorch 2.7+cu128 minimum
-    [[ "${cuda_version}" == 12.8.* ]] && build_capability="75;80;86;89;90;100;120"
+    # CUDA 12.8+: Add sm100 and sm120; remove < sm70 to align with PyTorch 2.8+cu128 minimum
+    [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="70;75;80;86;89;90;100;120"
 fi
 
 [[ "${build_os}" = windows-* ]] && python3 -m pip install ninja
 
@@ -72,16 +72,17 @@ jobs:
           - os: windows-latest
             arch: x86_64
         cuda_version:
-          ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"]
+          ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1", "12.9.1"]
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
         # Windows: We install Cuda on the agent (slow)
-      - uses: Jimver/cuda-toolkit@v0.2.22
+      - uses: Jimver/cuda-toolkit@c35baa1a18fd1fc9dcf47c5bd839bf30559c0bc3 # v0.2.24
         if: startsWith(matrix.os, 'windows')
         id: cuda-toolkit
         with:
-          cuda: ${{ matrix.cuda_version }}
+          # Temporary: Use CUDA 12.9.0 for Windows until 12.9.1 is supported with this action.
+          cuda: ${{ matrix.cuda_version == '12.9.1' && '12.9.0' || matrix.cuda_version }}
           method: "network"
           sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
           linux-local-args: '["--toolkit"]'
 
@@ -49,22 +49,23 @@ jobs:
   build-cuda:
     strategy:
       matrix:
-        cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
-        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
+        os: [ubuntu-22.04, ubuntu-22.04-arm]
         include:
           - os: ubuntu-22.04
             arch: x86_64
           - os: ubuntu-22.04-arm
             arch: aarch64
           - os: windows-2025
             arch: x86_64
+            cuda_version: "11.8.0"
     runs-on: ${{ matrix.os }}
 
     steps:
       - uses: actions/checkout@v4
 
       - name: Install CUDA Toolkit
-        uses: Jimver/cuda-toolkit@v0.2.23
+        uses: Jimver/cuda-toolkit@c35baa1a18fd1fc9dcf47c5bd839bf30559c0bc3 # v0.2.24
         if: startsWith(matrix.os, 'windows')
         id: cuda-toolkit
         with:
@@ -100,8 +101,8 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
-        # Test with the oldest supported torch version and the two newest.
-        torch_version: ["2.2.2", "2.6.0", "2.7.1"]
+        # Test with the oldest supported torch version and the two newest stable/RC releases.
+        torch_version: ["2.3.1", "2.7.1", "2.8.0"]
         include:
           - os: ubuntu-22.04
             arch: x86_64
@@ -117,7 +118,7 @@ jobs:
             arch: arm64
         exclude:
           - os: ubuntu-22.04-arm
-            torch_version: "2.2.2"
+            torch_version: "2.3.1"
 
     runs-on: ${{ matrix.runner || matrix.os }}
     env:
@@ -147,9 +148,10 @@ jobs:
           pip install -e ".[test]"
           pip install pytest-cov
 
-      # We need to downgrade to numpy<2 for torch<2.3 compatibility.
+      # We need to downgrade to numpy<2 for torch<2.4.1 compatibility on Windows
+      # See: https://github.com/pytorch/pytorch/issues/131668
       - name: Downgrade NumPy
-        if: startsWith(matrix.torch_version, '2.2.')
+        if: startsWith(matrix.os, 'windows') && startsWith(matrix.torch_version, '2.3.')
         run: pip install "numpy<2"
 
       - name: Show installed packages
@@ -161,7 +163,7 @@ jobs:
       - name: Run tests
         run: pytest --durations=100
 
-  test-cpu-ipex:
+  test-cpu-intel:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cpu
     runs-on: banb-aws-general-8-plus-use1-public-80
@@ -185,7 +187,6 @@ jobs:
       - name: Install dependencies
         run: |
           pip install torch==2.7.1 --index-url https://download.pytorch.org/whl/cpu
-          pip install intel_extension_for_pytorch==2.7.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
           pip install -e ".[test]"
           pip install pytest-cov
 
@@ -195,9 +196,6 @@ jobs:
       - name: Show environment information
         run: python -m torch.utils.collect_env
 
-      - name: IPEX smoke test
-        run: python -c "import torch; import intel_extension_for_pytorch as ipex; print(torch.__version__); print(ipex.__version__);"
-
       - name: Run tests
         run: pytest --durations=100
 
@@ -223,7 +221,7 @@ jobs:
   #       run: pip list
 
   test-hpu:
-    if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
+    if: false # github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cpu
     strategy:
       fail-fast: false
@@ -279,21 +277,12 @@ jobs:
         run: pytest --durations=100
 
   test-xpu:
-    if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
+    if: false # github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cpu
     strategy:
       fail-fast: false
       matrix:
         torch_version: ["2.7.1"] #["2.6.0", "2.7.1"]
-        ipex: [false]
-        # ipex: [true, false]
-        # include:
-        #   - torch_version: "2.6.0"
-        #     ipex: true
-        #     ipex_version: "2.6.10+xpu"
-        #   - torch_version: "2.7.1"
-        #     ipex: true
-        #     ipex_version: "2.7.10+xpu"
     runs-on:
       group: bandb-itac-bmsprpvc1550-8-1gpu
     env:
@@ -329,10 +318,6 @@ jobs:
       - name: Install PyTorch
         run: pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/xpu
 
-      - name: Install IPEX
-        if: matrix.ipex == true
-        run: pip install intel_extension_for_pytorch==${{ matrix.ipex_version }} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-
       - name: Install dependencies
         run: |
           pip install -e ".[test]"
@@ -358,17 +343,20 @@ jobs:
         os: [ubuntu-22.04, windows-2025]
         arch: [x86_64]
         gpu: [T4, L40S]
-        cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
         include:
           - cuda_version: "11.8.0"
-            torch_version: "2.2.2"
+            torch_version: "2.3.1"
             pypi_index: "https://download.pytorch.org/whl/cu118"
           - cuda_version: "12.6.3"
             torch_version: "2.6.0"
             pypi_index: "https://download.pytorch.org/whl/cu126"
           - cuda_version: "12.8.1"
             torch_version: "2.7.1"
             pypi_index: "https://download.pytorch.org/whl/cu128"
+          - cuda_version: "12.9.1"
+            torch_version: "2.8.0"
+            pypi_index: "https://download.pytorch.org/whl/cu129"
 
 
           # Linux L40S runners
@@ -387,7 +375,7 @@ jobs:
             gpu: T4
             runner: CUDA-Windows-x64
             cuda_version: "11.8.0"
-            torch_version: "2.2.0"
+            torch_version: "2.3.1"
             pypi_index: "https://download.pytorch.org/whl/cu118"
           - os: windows-2025
             arch: x86_64
@@ -401,12 +389,14 @@ jobs:
             gpu: T4
             runner: CUDA-Windows-x64
             cuda_version: "11.8.0"
-            torch_version: "2.7.1"
+            torch_version: "2.7.1"    # Note: this is the last PyTorch release supporting CUDA 11.8.
             pypi_index: "https://download.pytorch.org/whl/cu118"
 
         exclude:
           # Our current T4 Windows runner has a driver too old (471.11)
           # and cannot support CUDA 12+. Skip for now.
+          - os: windows-2025
+            cuda_version: "12.9.1"
           - os: windows-2025
             cuda_version: "12.8.1"
           - os: windows-2025
@@ -438,15 +428,9 @@ jobs:
 
       - name: Install dependencies
         run: |
-          pip install torch==${{ matrix.torch_version }} --index-url ${{ matrix.pypi_index }}
+          pip install --pre torch~=${{ matrix.torch_version }}.dev0 --index-url ${{ matrix.pypi_index }}
           pip install -e ".[test]"
           pip install pytest-cov
-
-        # We need to downgrade to numpy<2 for torch<2.3 compatibility.
-      - name: Downgrade NumPy
-        if: startsWith(matrix.torch_version, '2.2.')
-        run: pip install "numpy<2"
-
       - name: Show installed packages
         run: pip list
 
 
@@ -28,11 +28,12 @@ set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
 set(HIP_FILES csrc/ops.hip csrc/kernels.hip)
 set(MPS_FILES csrc/mps_ops.mm)
 set(METAL_FILES csrc/mps_kernels.metal)
+set(XPU_FILES csrc/xpu_ops.cpp csrc/xpu_kernels.cpp)
 # C++ sources are always included
 list(APPEND SRC_FILES ${CPP_FILES})
 
-set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps)")
-set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps)
+set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps, xpu)")
+set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps xpu)
 option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
 
 if(APPLE)
@@ -64,10 +65,19 @@ elseif(${COMPUTE_BACKEND} STREQUAL "mps")
     set(BUILD_CUDA OFF)
     set(BUILD_HIP OFF)
     set(BUILD_MPS ON)
+elseif(${COMPUTE_BACKEND} STREQUAL "xpu")
+    if(APPLE)
+        message(FATAL_ERROR "XPU is not supported on macOS" )
+    endif()
+    set(BUILD_CUDA OFF)
+    set(BUILD_HIP OFF)
+    set(BUILD_MPS OFF)
+    set(BUILD_XPU ON)
 else()
     set(BUILD_CUDA OFF)
     set(BUILD_HIP OFF)
     set(BUILD_MPS OFF)
+    set(BUILD_XPU OFF)
 endif()
 
 
@@ -217,6 +227,15 @@ elseif(BUILD_MPS)
                 COMMENT "Compiling Metal kernels"
                 VERBATIM)
     add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib")
+elseif(BUILD_XPU)
+    list(APPEND SRC_FILES ${XPU_FILES})
+    string(APPEND BNB_OUTPUT_NAME "_xpu")
+    add_compile_definitions(BUILD_XPU)
+    set(CMAKE_C_COMPILER icx)
+    set(CMAKE_CXX_COMPILER icpx)
+    if(WIN32)
+        set(CMAKE_CXX_COMPILER icx)
+    endif()
 else()
     string(APPEND BNB_OUTPUT_NAME "_cpu")
     set(GPU_SOURCES)
@@ -285,6 +304,15 @@ if(BUILD_MPS)
     add_dependencies(bitsandbytes metallib)
     target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
 endif()
+if(BUILD_XPU)
+    set(SYCL_LINK_FLAGS "-fsycl;--offload-compress;-fsycl-targets=spir64_gen,spir64;-Xs;-device pvc,xe-lpg,ats-m150 -options ' -cl-intel-enable-auto-large-GRF-mode -cl-poison-unsupported-fp64-kernels -cl-intel-greater-than-4GB-buffer-required'")
+    set(SYCL_COMPILE_FLAGS "-fsycl;-fhonor-nans;-fhonor-infinities;-fno-associative-math;-fno-approx-func;-fno-sycl-instrument-device-code;--offload-compress;-fsycl-targets=spir64_gen,spir64;")
+
+    set_property(TARGET bitsandbytes PROPERTY CXX_STANDARD 20)
+    target_compile_options(bitsandbytes PRIVATE ${SYCL_COMPILE_FLAGS})
+    target_link_options(bitsandbytes PRIVATE ${SYCL_LINK_FLAGS})
+
+endif()
 
 if(WIN32)
     set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
 
@@ -0,0 +1,3 @@
+include CMakeLists.txt
+graft csrc
+graft include
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -0,0 +1,3 @@` |
| | `1` | `+include CMakeLists.txt` |
| | `2` | `+graft csrc` |
| | `3` | `+graft include` |