From ef31c362e22b201551605bc6d808026ea33da59c Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Mon, 2 Jun 2025 23:55:14 +0530
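Subject: [PATCH 1/8] Add ROCm shared-library build job to CI workflow

Add a build-shared-libs-rocm matrix job (ROCm 6.1.2, 6.2.4 and 6.3.2 on
ubuntu-22.04 x86_64) that runs .github/scripts/build-rocm.sh, and make
build-wheels wait for it.

This change also appends trailing whitespace to nearly every line of
python-package.yml, which is why the diff rewrites the whole file.
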
---
 .github/workflows/python-package.yml | 643 ++++++++++++++-------------
 1 file changed, 343 insertions(+), 300 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index fbaa27d56..10daf0f79 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -1,303 +1,346 @@
-name: Python package
-
-on:
-  push: {}
-  pull_request:
-    branches: [main]
-    paths:
-      - ".github/workflows/python-package.yml"
-      - "bitsandbytes/**"
-      - "csrc/**"
-      - "include/**"
-      - "tests/**"
-      - "CMakeLists.txt"
-      - "requirements*.txt"
-      - "setup.py"
-      - "pyproject.toml"
-  release:
-    types: [published]
-  workflow_dispatch: {} # Allow manual trigger
-  workflow_call: {} # Allow triggering from other worfkflows
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  ##
-  # This job matrix builds the non-CUDA versions of the libraries for all supported platforms.
-  ##
-  build-shared-libs:
-    strategy:
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            arch: x86_64
-          - os: ubuntu-22.04-arm
-            arch: aarch64
-          - os: windows-latest
-            arch: x86_64
-          - os: macos-latest
-            arch: arm64
-    runs-on: ${{ matrix.os }}
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup MSVC
-        if: startsWith(matrix.os, 'windows')
-        uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl
-      - name: Build C++
-        run: bash .github/scripts/build-cpu.sh
-        env:
-          build_os: ${{ matrix.os }}
-          build_arch: ${{ matrix.arch }}
-      - name: Upload build artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: shared_library_${{ matrix.os }}_${{ matrix.arch }}
-          path: output/*
-          retention-days: 7
-  ##
-  # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
-  ##
-  build-shared-libs-cuda:
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest]
-        include:
-          - os: ubuntu-22.04
-            arch: x86_64
-          - os: ubuntu-22.04-arm
-            arch: aarch64
-          - os: windows-latest
-            arch: x86_64
-        cuda_version:
-          ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"]
-    runs-on: ${{ matrix.os }}
-    steps:
-      - uses: actions/checkout@v4
-        # Windows: We install Cuda on the agent (slow)
-      - uses: Jimver/cuda-toolkit@v0.2.22
-        if: startsWith(matrix.os, 'windows')
-        id: cuda-toolkit
-        with:
-          cuda: ${{ matrix.cuda_version }}
-          method: "network"
-          sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
-          linux-local-args: '["--toolkit"]'
-          use-github-cache: false
-      - name: Setup MSVC
-        if: startsWith(matrix.os, 'windows')
-        uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl
-      - name: Build C++
-        run: bash .github/scripts/build-cuda.sh
-        env:
-          build_os: ${{ matrix.os }}
-          build_arch: ${{ matrix.arch }}
-          cuda_version: ${{ matrix.cuda_version }}
-      - name: Upload build artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }}
-          path: output/*
-          retention-days: 7
-
-  build-wheels:
-    needs:
-      - build-shared-libs
-      - build-shared-libs-cuda
-    strategy:
-      matrix:
-        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]
-        include:
-          - os: ubuntu-22.04
-            arch: x86_64
-          - os: ubuntu-22.04-arm
-            arch: aarch64
-          - os: windows-latest
-            arch: x86_64
-          - os: macos-latest
-            arch: arm64
-        # The specific Python version is irrelevant in this context as we are only packaging non-C extension
-        # code. This ensures compatibility across Python versions, including Python 3.9, as compatibility is
-        # dictated by the packaged code itself, not the Python version used for packaging.
-        python-version: ["3.10"]
-    runs-on: ${{ matrix.os }}
-    steps:
-      - uses: actions/checkout@v4
-      - name: Download build artifacts
-        uses: actions/download-artifact@v4
-        with:
-          merge-multiple: true
-          pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*"
-          path: output/
-      - name: Copy correct platform shared library
-        shell: bash
-        run: |
-          ls -lR output/
-          cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-          cache: pip
-      - run: pip install build wheel
-      - run: python -m build .
-      - name: Determine and Set Platform Tag, then Tag Wheel
-        shell: bash
-        run: |
-          PLATFORM_TAG=$(python .github/scripts/set_platform_tag.py "${{ matrix.arch }}")
-          echo "PLATFORM_TAG=$PLATFORM_TAG"
-          wheel tags --remove --abi-tag=none --python-tag=py3 --platform-tag=$PLATFORM_TAG dist/bitsandbytes-*.whl
-      - name: Upload build artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}
-          path: dist/bitsandbytes-*.whl
-          retention-days: 7
-
-  upload-pre-release-wheels:
-    name: Create release and upload artifacts
-    runs-on: ubuntu-latest
-    if: github.ref_name == 'main'
-    permissions:
-      contents: write
-    needs:
-      - build-wheels
-    steps:
-      - name: Download and rename artifacts
-        uses: actions/download-artifact@v4
-        with:
-          path: tmp/
-          pattern: "bdist_wheel_*"
-          merge-multiple: true
+name: Python package  
   
-      - name: Inspect tmp directory after downloading artifacts
-        run: ls -alFR tmp/
+on:  
+  push: {}  
+  pull_request:  
+    branches: [main]  
+    paths:  
+      - ".github/workflows/python-package.yml"  
+      - "bitsandbytes/**"  
+      - "csrc/**"  
+      - "include/**"  
+      - "tests/**"  
+      - "CMakeLists.txt"  
+      - "requirements*.txt"  
+      - "setup.py"  
+      - "pyproject.toml"  
+  release:  
+    types: [published]  
+  workflow_dispatch: {} # Allow manual trigger  
+  workflow_call: {} # Allow triggering from other worfkflows  
   
-      - name: Move and rename wheel files with pattern replacement
-        run: |
-          mkdir -p wheels/
-
-          # The whole point of the continuous release is to have a stable download link and the only way to have a PEP 440–compliant wheel name
-          # is to use a stable placeholder version. Otherwise, pip won't let you install the wheel. The cool thing is that we can now install the
-          # wheel directly from the GH pre-release which gets updated continuously, e.g.
-          # `pip install https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl`
-          STABLE_PLACEHOLDER_VERSION="1.33.7.preview"
-
-          # exclude macos wheels for now
-          find tmp/ -type f -name '*.whl' ! -name '*macos*' -print0 | while IFS= read -r -d '' wheel; do
-            wheel_filename=$(basename "$wheel")
-
-            # Strip off the original version
-            rest=${wheel_filename#bitsandbytes-*-}
-            new_name="bitsandbytes-${STABLE_PLACEHOLDER_VERSION}-${rest}"
-
-            echo "Renaming $wheel_filename → $new_name"
-            mv "$wheel" "wheels/${new_name}"
-          done
-
-      - name: Inspect wheels directory after renaming files
-        run: ls -alFR wheels/
+concurrency:  
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}  
+  cancel-in-progress: true  
   
-      - name: Delete old pre-release (if exists)
-        run: |
-          gh release delete continuous-release_main --cleanup-tag -y || true
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Generate pip install commands for release body
-        run: |
-          cat > body.md << 'ENDOFMARKDOWN'
-          ## Latest `main` Wheel Pre-release
-
-          This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch.
-
-          **How to install:**  
-          Pick the correct command for your platform and run it in your terminal:
-
-          ENDOFMARKDOWN
-
-          for whl in wheels/*.whl; do
-            fname=$(basename "$whl")
-            url="https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/$fname"
-            echo "\`\`\`sh" >> body.md
-            echo "pip install $url" >> body.md
-            echo "\`\`\`" >> body.md
-            echo "" >> body.md
-          done
-
-          cat >> body.md << 'ENDOFMARKDOWN'
-          > **Note:**  
-          > These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes.
-          ENDOFMARKDOWN
-
-          # for debugging:
-          cat body.md
-
-      - name: Create new pre-release and upload artifacts
-        uses: softprops/action-gh-release@v2.2.1
-        with:
-          files: wheels/*.whl
-          prerelease: true
-          name: Latest `main` wheel
-          body_path: body.md
-          tag_name: continuous-release_main
-          make_latest: false
-          draft: false
-          target_commitish: ${{ github.sha }}
-
-  audit-wheels:
-    needs: build-wheels
-    strategy:
-      matrix:
-        os: [ubuntu-22.04, ubuntu-22.04-arm]
-        include:
-          - os: ubuntu-22.04
-            arch: x86_64
-          - os: ubuntu-22.04-arm
-            arch: aarch64
-    runs-on: ${{ matrix.os }}
-    env:
-      PIP_DISABLE_PIP_VERSION_CHECK: 1
-    steps:
-      - uses: actions/checkout@v4
-      - name: Download wheel
-        uses: actions/download-artifact@v4
-        with:
-          name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}
-          path: wheels/
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.12"
-      - run: pip install auditwheel
-      - run: python ./.github/scripts/auditwheel_show.py wheels/* | tee $GITHUB_STEP_SUMMARY
-
-  publish-wheels:
-    name: Publish wheels to PyPI
-    needs: [build-wheels, audit-wheels]
-    runs-on: ubuntu-latest
-    if: |
-      github.repository == 'bitsandbytes-foundation/bitsandbytes'
-      && github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
-    environment:
-      name: release
-      url: https://pypi.org/p/bitsandbytes
-    permissions:
-      id-token: write
-    steps:
-      - name: Download distribution artifacts
-        uses: actions/download-artifact@v4
-        with:
-          path: dist/
-          pattern: "bdist_wheel_*"
-          merge-multiple: true
-
-      - name: Remove macOS wheels
-        run: rm dist/*macos*
-
-      - name: Publish to PyPI
-        uses: pypa/gh-action-pypi-publish@release/v1
-        with:
-          print-hash: true
+jobs:  
+  ##  
+  # This job matrix builds the non-CUDA versions of the libraries for all supported platforms.  
+  ##  
+  build-shared-libs:  
+    strategy:  
+      matrix:  
+        include:  
+          - os: ubuntu-22.04  
+            arch: x86_64  
+          - os: ubuntu-22.04-arm  
+            arch: aarch64  
+          - os: windows-latest  
+            arch: x86_64  
+          - os: macos-latest  
+            arch: arm64  
+    runs-on: ${{ matrix.os }}  
+    steps:  
+      - uses: actions/checkout@v4  
+      - name: Setup MSVC  
+        if: startsWith(matrix.os, 'windows')  
+        uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl  
+      - name: Build C++  
+        run: bash .github/scripts/build-cpu.sh  
+        env:  
+          build_os: ${{ matrix.os }}  
+          build_arch: ${{ matrix.arch }}  
+      - name: Upload build artifact  
+        uses: actions/upload-artifact@v4  
+        with:  
+          name: shared_library_${{ matrix.os }}_${{ matrix.arch }}  
+          path: output/*  
+          retention-days: 7  
+  ##  
+  # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)  
+  ##  
+  build-shared-libs-cuda:  
+    strategy:  
+      fail-fast: false  
+      matrix:  
+        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest]  
+        include:  
+          - os: ubuntu-22.04  
+            arch: x86_64  
+          - os: ubuntu-22.04-arm  
+            arch: aarch64  
+          - os: windows-latest  
+            arch: x86_64  
+        cuda_version:  
+          ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"]  
+    runs-on: ${{ matrix.os }}  
+    steps:  
+      - uses: actions/checkout@v4  
+        # Windows: We install Cuda on the agent (slow)  
+      - uses: Jimver/cuda-toolkit@v0.2.22  
+        if: startsWith(matrix.os, 'windows')  
+        id: cuda-toolkit  
+        with:  
+          cuda: ${{ matrix.cuda_version }}  
+          method: "network"  
+          sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'  
+          linux-local-args: '["--toolkit"]'  
+          use-github-cache: false  
+      - name: Setup MSVC  
+        if: startsWith(matrix.os, 'windows')  
+        uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl  
+      - name: Build C++  
+        run: bash .github/scripts/build-cuda.sh  
+        env:  
+          build_os: ${{ matrix.os }}  
+          build_arch: ${{ matrix.arch }}  
+          cuda_version: ${{ matrix.cuda_version }}  
+      - name: Upload build artifact  
+        uses: actions/upload-artifact@v4  
+        with:  
+          name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }}  
+          path: output/*  
+          retention-days: 7  
+  build-shared-libs-rocm:  
+    strategy:  
+      matrix:  
+        os: [ubuntu-22.04]  
+        arch: [x86_64]  
+        rocm_version:  
+          ["6.1.2", "6.2.4", "6.3.2"]  
+    runs-on: ${{ matrix.os }}  
+    steps:  
+      - uses: actions/checkout@v4  
+      - name: Set up Docker multiarch  
+        uses: docker/setup-qemu-action@v3  
+      - name: Clean up disk space  
+        run: |  
+          sudo rm -rf \  
+              /usr/share/dotnet \  
+              /opt/ghc \  
+              "/usr/local/share/boost" \  
+              "$AGENT_TOOLSDIRECTORY" \  
+              /opt/hostedtoolcache \  
+              /opt/google/chrome \  
+              /opt/microsoft/msedge \  
+              /opt/microsoft/powershell \  
+              /opt/pipx \  
+              /usr/lib/mono \  
+              /usr/local/julia* \  
+              /usr/local/lib/android \  
+              /usr/local/lib/node_modules \  
+              /usr/local/share/chromium \  
+              /usr/local/share/powershell \  
+              /usr/share/swift  
+      - name: Build C++  
+        run: bash .github/scripts/build-rocm.sh  
+        env:  
+          build_os: ${{ matrix.os }}  
+          build_arch: ${{ matrix.arch }}  
+          rocm_version: ${{ matrix.rocm_version }}  
+      - name: Upload build artifact  
+        uses: actions/upload-artifact@v4  
+        with:  
+          name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}  
+          path: output/*  
+          retention-days: 7    
+  build-wheels:  
+    needs:  
+      - build-shared-libs  
+      - build-shared-libs-cuda  
+      - build-shared-libs-rocm  
+    strategy:  
+      matrix:  
+        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]  
+        include:  
+          - os: ubuntu-22.04  
+            arch: x86_64  
+          - os: ubuntu-22.04-arm  
+            arch: aarch64  
+          - os: windows-latest  
+            arch: x86_64  
+          - os: macos-latest  
+            arch: arm64  
+        # The specific Python version is irrelevant in this context as we are only packaging non-C extension  
+        # code. This ensures compatibility across Python versions, including Python 3.9, as compatibility is  
+        # dictated by the packaged code itself, not the Python version used for packaging.  
+        python-version: ["3.10"]  
+    runs-on: ${{ matrix.os }}  
+    steps:  
+      - uses: actions/checkout@v4  
+      - name: Download build artifacts  
+        uses: actions/download-artifact@v4  
+        with:  
+          merge-multiple: true  
+          pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*"  
+          path: output/  
+      - name: Copy correct platform shared library  
+        shell: bash  
+        run: |  
+          ls -lR output/  
+          cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/  
+      - name: Set up Python ${{ matrix.python-version }}  
+        uses: actions/setup-python@v5  
+        with:  
+          python-version: ${{ matrix.python-version }}  
+          cache: pip  
+      - run: pip install build wheel  
+      - run: python -m build .  
+      - name: Determine and Set Platform Tag, then Tag Wheel  
+        shell: bash  
+        run: |  
+          PLATFORM_TAG=$(python .github/scripts/set_platform_tag.py "${{ matrix.arch }}")  
+          echo "PLATFORM_TAG=$PLATFORM_TAG"  
+          wheel tags --remove --abi-tag=none --python-tag=py3 --platform-tag=$PLATFORM_TAG dist/bitsandbytes-*.whl  
+      - name: Upload build artifact  
+        uses: actions/upload-artifact@v4  
+        with:  
+          name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}  
+          path: dist/bitsandbytes-*.whl  
+          retention-days: 7  
+  
+  upload-pre-release-wheels:  
+    name: Create release and upload artifacts  
+    runs-on: ubuntu-latest  
+    if: github.ref_name == 'main'  
+    permissions:  
+      contents: write  
+    needs:  
+      - build-wheels  
+    steps:  
+      - name: Download and rename artifacts  
+        uses: actions/download-artifact@v4  
+        with:  
+          path: tmp/  
+          pattern: "bdist_wheel_*"  
+          merge-multiple: true  
+    
+      - name: Inspect tmp directory after downloading artifacts  
+        run: ls -alFR tmp/  
+    
+      - name: Move and rename wheel files with pattern replacement  
+        run: |  
+          mkdir -p wheels/  
+  
+          # The whole point of the continuous release is to have a stable download link and the only way to have a PEP 440–compliant wheel name  
+          # is to use a stable placeholder version. Otherwise, pip won't let you install the wheel. The cool thing is that we can now install the  
+          # wheel directly from the GH pre-release which gets updated continuously, e.g.  
+          # `pip install https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl`  
+          STABLE_PLACEHOLDER_VERSION="1.33.7.preview"  
+  
+          # exclude macos wheels for now  
+          find tmp/ -type f -name '*.whl' ! -name '*macos*' -print0 | while IFS= read -r -d '' wheel; do  
+            wheel_filename=$(basename "$wheel")  
+  
+            # Strip off the original version  
+            rest=${wheel_filename#bitsandbytes-*-}  
+            new_name="bitsandbytes-${STABLE_PLACEHOLDER_VERSION}-${rest}"  
+  
+            echo "Renaming $wheel_filename → $new_name"  
+            mv "$wheel" "wheels/${new_name}"  
+          done  
+  
+      - name: Inspect wheels directory after renaming files  
+        run: ls -alFR wheels/  
+    
+      - name: Delete old pre-release (if exists)  
+        run: |  
+          gh release delete continuous-release_main --cleanup-tag -y || true  
+        env:  
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}  
+  
+      - name: Generate pip install commands for release body  
+        run: |  
+          cat > body.md << 'ENDOFMARKDOWN'  
+          ## Latest `main` Wheel Pre-release  
+  
+          This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch.  
+  
+          **How to install:**    
+          Pick the correct command for your platform and run it in your terminal:  
+  
+          ENDOFMARKDOWN  
+  
+          for whl in wheels/*.whl; do  
+            fname=$(basename "$whl")  
+            url="https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/$fname"  
+            echo "\`\`\`sh" >> body.md  
+            echo "pip install $url" >> body.md  
+            echo "\`\`\`" >> body.md  
+            echo "" >> body.md  
+          done  
+  
+          cat >> body.md << 'ENDOFMARKDOWN'  
+          > **Note:**    
+          > These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes.  
+          ENDOFMARKDOWN  
+  
+          # for debugging:  
+          cat body.md  
+  
+      - name: Create new pre-release and upload artifacts  
+        uses: softprops/action-gh-release@v2.2.1  
+        with:  
+          files: wheels/*.whl  
+          prerelease: true  
+          name: Latest `main` wheel  
+          body_path: body.md  
+          tag_name: continuous-release_main  
+          make_latest: false  
+          draft: false  
+          target_commitish: ${{ github.sha }}  
+  
+  audit-wheels:  
+    needs: build-wheels  
+    strategy:  
+      matrix:  
+        os: [ubuntu-22.04, ubuntu-22.04-arm]  
+        include:  
+          - os: ubuntu-22.04  
+            arch: x86_64  
+          - os: ubuntu-22.04-arm  
+            arch: aarch64  
+    runs-on: ${{ matrix.os }}  
+    env:  
+      PIP_DISABLE_PIP_VERSION_CHECK: 1  
+    steps:  
+      - uses: actions/checkout@v4  
+      - name: Download wheel  
+        uses: actions/download-artifact@v4  
+        with:  
+          name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}  
+          path: wheels/  
+      - name: Set up Python  
+        uses: actions/setup-python@v5  
+        with:  
+          python-version: "3.12"  
+      - run: pip install auditwheel  
+      - run: python ./.github/scripts/auditwheel_show.py wheels/* | tee $GITHUB_STEP_SUMMARY  
+  
+  publish-wheels:  
+    name: Publish wheels to PyPI  
+    needs: [build-wheels, audit-wheels]  
+    runs-on: ubuntu-latest  
+    if: |  
+      github.repository == 'bitsandbytes-foundation/bitsandbytes'  
+      && github.event_name == 'push' && startsWith(github.ref, 'refs/tags')  
+    environment:  
+      name: release  
+      url: https://pypi.org/p/bitsandbytes  
+    permissions:  
+      id-token: write  
+    steps:  
+      - name: Download distribution artifacts  
+        uses: actions/download-artifact@v4  
+        with:  
+          path: dist/  
+          pattern: "bdist_wheel_*"  
+          merge-multiple: true  
+  
+      - name: Remove macOS wheels  
+        run: rm dist/*macos*  
+  
+      - name: Publish to PyPI  
+        uses: pypa/gh-action-pypi-publish@release/v1  
+        with:  
+          print-hash: true  

From e1435f01776137c3a253228b4234a23535532161 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Mon, 2 Jun 2025 23:57:25 +0530
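Subject: [PATCH 2/8] Revert python-package.yml to its previous contents

Restore the workflow to blob fbaa27d56: drop the build-shared-libs-rocm
job and the trailing whitespace added by the preceding patch.
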
---
 .github/workflows/python-package.yml | 643 +++++++++++++--------------
 1 file changed, 300 insertions(+), 343 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 10daf0f79..fbaa27d56 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -1,346 +1,303 @@
-name: Python package  
+name: Python package
+
+on:
+  push: {}
+  pull_request:
+    branches: [main]
+    paths:
+      - ".github/workflows/python-package.yml"
+      - "bitsandbytes/**"
+      - "csrc/**"
+      - "include/**"
+      - "tests/**"
+      - "CMakeLists.txt"
+      - "requirements*.txt"
+      - "setup.py"
+      - "pyproject.toml"
+  release:
+    types: [published]
+  workflow_dispatch: {} # Allow manual trigger
+  workflow_call: {} # Allow triggering from other worfkflows
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  ##
+  # This job matrix builds the non-CUDA versions of the libraries for all supported platforms.
+  ##
+  build-shared-libs:
+    strategy:
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            arch: x86_64
+          - os: ubuntu-22.04-arm
+            arch: aarch64
+          - os: windows-latest
+            arch: x86_64
+          - os: macos-latest
+            arch: arm64
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup MSVC
+        if: startsWith(matrix.os, 'windows')
+        uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl
+      - name: Build C++
+        run: bash .github/scripts/build-cpu.sh
+        env:
+          build_os: ${{ matrix.os }}
+          build_arch: ${{ matrix.arch }}
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: shared_library_${{ matrix.os }}_${{ matrix.arch }}
+          path: output/*
+          retention-days: 7
+  ##
+  # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
+  ##
+  build-shared-libs-cuda:
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest]
+        include:
+          - os: ubuntu-22.04
+            arch: x86_64
+          - os: ubuntu-22.04-arm
+            arch: aarch64
+          - os: windows-latest
+            arch: x86_64
+        cuda_version:
+          ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+        # Windows: We install Cuda on the agent (slow)
+      - uses: Jimver/cuda-toolkit@v0.2.22
+        if: startsWith(matrix.os, 'windows')
+        id: cuda-toolkit
+        with:
+          cuda: ${{ matrix.cuda_version }}
+          method: "network"
+          sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
+          linux-local-args: '["--toolkit"]'
+          use-github-cache: false
+      - name: Setup MSVC
+        if: startsWith(matrix.os, 'windows')
+        uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl
+      - name: Build C++
+        run: bash .github/scripts/build-cuda.sh
+        env:
+          build_os: ${{ matrix.os }}
+          build_arch: ${{ matrix.arch }}
+          cuda_version: ${{ matrix.cuda_version }}
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }}
+          path: output/*
+          retention-days: 7
+
+  build-wheels:
+    needs:
+      - build-shared-libs
+      - build-shared-libs-cuda
+    strategy:
+      matrix:
+        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]
+        include:
+          - os: ubuntu-22.04
+            arch: x86_64
+          - os: ubuntu-22.04-arm
+            arch: aarch64
+          - os: windows-latest
+            arch: x86_64
+          - os: macos-latest
+            arch: arm64
+        # The specific Python version is irrelevant in this context as we are only packaging non-C extension
+        # code. This ensures compatibility across Python versions, including Python 3.9, as compatibility is
+        # dictated by the packaged code itself, not the Python version used for packaging.
+        python-version: ["3.10"]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Download build artifacts
+        uses: actions/download-artifact@v4
+        with:
+          merge-multiple: true
+          pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*"
+          path: output/
+      - name: Copy correct platform shared library
+        shell: bash
+        run: |
+          ls -lR output/
+          cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: pip
+      - run: pip install build wheel
+      - run: python -m build .
+      - name: Determine and Set Platform Tag, then Tag Wheel
+        shell: bash
+        run: |
+          PLATFORM_TAG=$(python .github/scripts/set_platform_tag.py "${{ matrix.arch }}")
+          echo "PLATFORM_TAG=$PLATFORM_TAG"
+          wheel tags --remove --abi-tag=none --python-tag=py3 --platform-tag=$PLATFORM_TAG dist/bitsandbytes-*.whl
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}
+          path: dist/bitsandbytes-*.whl
+          retention-days: 7
+
+  upload-pre-release-wheels:
+    name: Create release and upload artifacts
+    runs-on: ubuntu-latest
+    if: github.ref_name == 'main'
+    permissions:
+      contents: write
+    needs:
+      - build-wheels
+    steps:
+      - name: Download and rename artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: tmp/
+          pattern: "bdist_wheel_*"
+          merge-multiple: true
   
-on:  
-  push: {}  
-  pull_request:  
-    branches: [main]  
-    paths:  
-      - ".github/workflows/python-package.yml"  
-      - "bitsandbytes/**"  
-      - "csrc/**"  
-      - "include/**"  
-      - "tests/**"  
-      - "CMakeLists.txt"  
-      - "requirements*.txt"  
-      - "setup.py"  
-      - "pyproject.toml"  
-  release:  
-    types: [published]  
-  workflow_dispatch: {} # Allow manual trigger  
-  workflow_call: {} # Allow triggering from other worfkflows  
+      - name: Inspect tmp directory after downloading artifacts
+        run: ls -alFR tmp/
   
-concurrency:  
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}  
-  cancel-in-progress: true  
+      - name: Move and rename wheel files with pattern replacement
+        run: |
+          mkdir -p wheels/
+
+          # The whole point of the continuous release is to have a stable download link and the only way to have a PEP 440–compliant wheel name
+          # is to use a stable placeholder version. Otherwise, pip won't let you install the wheel. The cool thing is that we can now install the
+          # wheel directly from the GH pre-release which gets updated continuously, e.g.
+          # `pip install https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl`
+          STABLE_PLACEHOLDER_VERSION="1.33.7.preview"
+
+          # exclude macos wheels for now
+          find tmp/ -type f -name '*.whl' ! -name '*macos*' -print0 | while IFS= read -r -d '' wheel; do
+            wheel_filename=$(basename "$wheel")
+
+            # Strip off the original version
+            rest=${wheel_filename#bitsandbytes-*-}
+            new_name="bitsandbytes-${STABLE_PLACEHOLDER_VERSION}-${rest}"
+
+            echo "Renaming $wheel_filename → $new_name"
+            mv "$wheel" "wheels/${new_name}"
+          done
+
+      - name: Inspect wheels directory after renaming files
+        run: ls -alFR wheels/
   
-jobs:  
-  ##  
-  # This job matrix builds the non-CUDA versions of the libraries for all supported platforms.  
-  ##  
-  build-shared-libs:  
-    strategy:  
-      matrix:  
-        include:  
-          - os: ubuntu-22.04  
-            arch: x86_64  
-          - os: ubuntu-22.04-arm  
-            arch: aarch64  
-          - os: windows-latest  
-            arch: x86_64  
-          - os: macos-latest  
-            arch: arm64  
-    runs-on: ${{ matrix.os }}  
-    steps:  
-      - uses: actions/checkout@v4  
-      - name: Setup MSVC  
-        if: startsWith(matrix.os, 'windows')  
-        uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl  
-      - name: Build C++  
-        run: bash .github/scripts/build-cpu.sh  
-        env:  
-          build_os: ${{ matrix.os }}  
-          build_arch: ${{ matrix.arch }}  
-      - name: Upload build artifact  
-        uses: actions/upload-artifact@v4  
-        with:  
-          name: shared_library_${{ matrix.os }}_${{ matrix.arch }}  
-          path: output/*  
-          retention-days: 7  
-  ##  
-  # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)  
-  ##  
-  build-shared-libs-cuda:  
-    strategy:  
-      fail-fast: false  
-      matrix:  
-        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest]  
-        include:  
-          - os: ubuntu-22.04  
-            arch: x86_64  
-          - os: ubuntu-22.04-arm  
-            arch: aarch64  
-          - os: windows-latest  
-            arch: x86_64  
-        cuda_version:  
-          ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"]  
-    runs-on: ${{ matrix.os }}  
-    steps:  
-      - uses: actions/checkout@v4  
-        # Windows: We install Cuda on the agent (slow)  
-      - uses: Jimver/cuda-toolkit@v0.2.22  
-        if: startsWith(matrix.os, 'windows')  
-        id: cuda-toolkit  
-        with:  
-          cuda: ${{ matrix.cuda_version }}  
-          method: "network"  
-          sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'  
-          linux-local-args: '["--toolkit"]'  
-          use-github-cache: false  
-      - name: Setup MSVC  
-        if: startsWith(matrix.os, 'windows')  
-        uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl  
-      - name: Build C++  
-        run: bash .github/scripts/build-cuda.sh  
-        env:  
-          build_os: ${{ matrix.os }}  
-          build_arch: ${{ matrix.arch }}  
-          cuda_version: ${{ matrix.cuda_version }}  
-      - name: Upload build artifact  
-        uses: actions/upload-artifact@v4  
-        with:  
-          name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }}  
-          path: output/*  
-          retention-days: 7  
-  build-shared-libs-rocm:  
-    strategy:  
-      matrix:  
-        os: [ubuntu-22.04]  
-        arch: [x86_64]  
-        rocm_version:  
-          ["6.1.2", "6.2.4", "6.3.2"]  
-    runs-on: ${{ matrix.os }}  
-    steps:  
-      - uses: actions/checkout@v4  
-      - name: Set up Docker multiarch  
-        uses: docker/setup-qemu-action@v3  
-      - name: Clean up disk space  
-        run: |  
-          sudo rm -rf \  
-              /usr/share/dotnet \  
-              /opt/ghc \  
-              "/usr/local/share/boost" \  
-              "$AGENT_TOOLSDIRECTORY" \  
-              /opt/hostedtoolcache \  
-              /opt/google/chrome \  
-              /opt/microsoft/msedge \  
-              /opt/microsoft/powershell \  
-              /opt/pipx \  
-              /usr/lib/mono \  
-              /usr/local/julia* \  
-              /usr/local/lib/android \  
-              /usr/local/lib/node_modules \  
-              /usr/local/share/chromium \  
-              /usr/local/share/powershell \  
-              /usr/share/swift  
-      - name: Build C++  
-        run: bash .github/scripts/build-rocm.sh  
-        env:  
-          build_os: ${{ matrix.os }}  
-          build_arch: ${{ matrix.arch }}  
-          rocm_version: ${{ matrix.rocm_version }}  
-      - name: Upload build artifact  
-        uses: actions/upload-artifact@v4  
-        with:  
-          name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}  
-          path: output/*  
-          retention-days: 7    
-  build-wheels:  
-    needs:  
-      - build-shared-libs  
-      - build-shared-libs-cuda  
-      - build-shared-libs-rocm  
-    strategy:  
-      matrix:  
-        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]  
-        include:  
-          - os: ubuntu-22.04  
-            arch: x86_64  
-          - os: ubuntu-22.04-arm  
-            arch: aarch64  
-          - os: windows-latest  
-            arch: x86_64  
-          - os: macos-latest  
-            arch: arm64  
-        # The specific Python version is irrelevant in this context as we are only packaging non-C extension  
-        # code. This ensures compatibility across Python versions, including Python 3.9, as compatibility is  
-        # dictated by the packaged code itself, not the Python version used for packaging.  
-        python-version: ["3.10"]  
-    runs-on: ${{ matrix.os }}  
-    steps:  
-      - uses: actions/checkout@v4  
-      - name: Download build artifacts  
-        uses: actions/download-artifact@v4  
-        with:  
-          merge-multiple: true  
-          pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*"  
-          path: output/  
-      - name: Copy correct platform shared library  
-        shell: bash  
-        run: |  
-          ls -lR output/  
-          cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/  
-      - name: Set up Python ${{ matrix.python-version }}  
-        uses: actions/setup-python@v5  
-        with:  
-          python-version: ${{ matrix.python-version }}  
-          cache: pip  
-      - run: pip install build wheel  
-      - run: python -m build .  
-      - name: Determine and Set Platform Tag, then Tag Wheel  
-        shell: bash  
-        run: |  
-          PLATFORM_TAG=$(python .github/scripts/set_platform_tag.py "${{ matrix.arch }}")  
-          echo "PLATFORM_TAG=$PLATFORM_TAG"  
-          wheel tags --remove --abi-tag=none --python-tag=py3 --platform-tag=$PLATFORM_TAG dist/bitsandbytes-*.whl  
-      - name: Upload build artifact  
-        uses: actions/upload-artifact@v4  
-        with:  
-          name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}  
-          path: dist/bitsandbytes-*.whl  
-          retention-days: 7  
-  
-  upload-pre-release-wheels:  
-    name: Create release and upload artifacts  
-    runs-on: ubuntu-latest  
-    if: github.ref_name == 'main'  
-    permissions:  
-      contents: write  
-    needs:  
-      - build-wheels  
-    steps:  
-      - name: Download and rename artifacts  
-        uses: actions/download-artifact@v4  
-        with:  
-          path: tmp/  
-          pattern: "bdist_wheel_*"  
-          merge-multiple: true  
-    
-      - name: Inspect tmp directory after downloading artifacts  
-        run: ls -alFR tmp/  
-    
-      - name: Move and rename wheel files with pattern replacement  
-        run: |  
-          mkdir -p wheels/  
-  
-          # The whole point of the continuous release is to have a stable download link and the only way to have a PEP 440–compliant wheel name  
-          # is to use a stable placeholder version. Otherwise, pip won't let you install the wheel. The cool thing is that we can now install the  
-          # wheel directly from the GH pre-release which gets updated continuously, e.g.  
-          # `pip install https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl`  
-          STABLE_PLACEHOLDER_VERSION="1.33.7.preview"  
-  
-          # exclude macos wheels for now  
-          find tmp/ -type f -name '*.whl' ! -name '*macos*' -print0 | while IFS= read -r -d '' wheel; do  
-            wheel_filename=$(basename "$wheel")  
-  
-            # Strip off the original version  
-            rest=${wheel_filename#bitsandbytes-*-}  
-            new_name="bitsandbytes-${STABLE_PLACEHOLDER_VERSION}-${rest}"  
-  
-            echo "Renaming $wheel_filename → $new_name"  
-            mv "$wheel" "wheels/${new_name}"  
-          done  
-  
-      - name: Inspect wheels directory after renaming files  
-        run: ls -alFR wheels/  
-    
-      - name: Delete old pre-release (if exists)  
-        run: |  
-          gh release delete continuous-release_main --cleanup-tag -y || true  
-        env:  
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}  
-  
-      - name: Generate pip install commands for release body  
-        run: |  
-          cat > body.md << 'ENDOFMARKDOWN'  
-          ## Latest `main` Wheel Pre-release  
-  
-          This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch.  
-  
-          **How to install:**    
-          Pick the correct command for your platform and run it in your terminal:  
-  
-          ENDOFMARKDOWN  
-  
-          for whl in wheels/*.whl; do  
-            fname=$(basename "$whl")  
-            url="https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/$fname"  
-            echo "\`\`\`sh" >> body.md  
-            echo "pip install $url" >> body.md  
-            echo "\`\`\`" >> body.md  
-            echo "" >> body.md  
-          done  
-  
-          cat >> body.md << 'ENDOFMARKDOWN'  
-          > **Note:**    
-          > These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes.  
-          ENDOFMARKDOWN  
-  
-          # for debugging:  
-          cat body.md  
-  
-      - name: Create new pre-release and upload artifacts  
-        uses: softprops/action-gh-release@v2.2.1  
-        with:  
-          files: wheels/*.whl  
-          prerelease: true  
-          name: Latest `main` wheel  
-          body_path: body.md  
-          tag_name: continuous-release_main  
-          make_latest: false  
-          draft: false  
-          target_commitish: ${{ github.sha }}  
-  
-  audit-wheels:  
-    needs: build-wheels  
-    strategy:  
-      matrix:  
-        os: [ubuntu-22.04, ubuntu-22.04-arm]  
-        include:  
-          - os: ubuntu-22.04  
-            arch: x86_64  
-          - os: ubuntu-22.04-arm  
-            arch: aarch64  
-    runs-on: ${{ matrix.os }}  
-    env:  
-      PIP_DISABLE_PIP_VERSION_CHECK: 1  
-    steps:  
-      - uses: actions/checkout@v4  
-      - name: Download wheel  
-        uses: actions/download-artifact@v4  
-        with:  
-          name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}  
-          path: wheels/  
-      - name: Set up Python  
-        uses: actions/setup-python@v5  
-        with:  
-          python-version: "3.12"  
-      - run: pip install auditwheel  
-      - run: python ./.github/scripts/auditwheel_show.py wheels/* | tee $GITHUB_STEP_SUMMARY  
-  
-  publish-wheels:  
-    name: Publish wheels to PyPI  
-    needs: [build-wheels, audit-wheels]  
-    runs-on: ubuntu-latest  
-    if: |  
-      github.repository == 'bitsandbytes-foundation/bitsandbytes'  
-      && github.event_name == 'push' && startsWith(github.ref, 'refs/tags')  
-    environment:  
-      name: release  
-      url: https://pypi.org/p/bitsandbytes  
-    permissions:  
-      id-token: write  
-    steps:  
-      - name: Download distribution artifacts  
-        uses: actions/download-artifact@v4  
-        with:  
-          path: dist/  
-          pattern: "bdist_wheel_*"  
-          merge-multiple: true  
-  
-      - name: Remove macOS wheels  
-        run: rm dist/*macos*  
-  
-      - name: Publish to PyPI  
-        uses: pypa/gh-action-pypi-publish@release/v1  
-        with:  
-          print-hash: true  
+      - name: Delete old pre-release (if exists)
+        run: |
+          gh release delete continuous-release_main --cleanup-tag -y || true
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Generate pip install commands for release body
+        run: |
+          cat > body.md << 'ENDOFMARKDOWN'
+          ## Latest `main` Wheel Pre-release
+
+          This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch.
+
+          **How to install:**  
+          Pick the correct command for your platform and run it in your terminal:
+
+          ENDOFMARKDOWN
+
+          for whl in wheels/*.whl; do
+            fname=$(basename "$whl")
+            url="https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/$fname"
+            echo "\`\`\`sh" >> body.md
+            echo "pip install $url" >> body.md
+            echo "\`\`\`" >> body.md
+            echo "" >> body.md
+          done
+
+          cat >> body.md << 'ENDOFMARKDOWN'
+          > **Note:**  
+          > These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes.
+          ENDOFMARKDOWN
+
+          # for debugging:
+          cat body.md
+
+      - name: Create new pre-release and upload artifacts
+        uses: softprops/action-gh-release@v2.2.1
+        with:
+          files: wheels/*.whl
+          prerelease: true
+          name: Latest `main` wheel
+          body_path: body.md
+          tag_name: continuous-release_main
+          make_latest: false
+          draft: false
+          target_commitish: ${{ github.sha }}
+
+  audit-wheels:  # Runs auditwheel_show.py over each Linux wheel and publishes the output in the job summary.
+    needs: build-wheels  # consumes the bdist_wheel_* artifacts uploaded by build-wheels
+    strategy:
+      matrix:
+        os: [ubuntu-22.04, ubuntu-22.04-arm]
+        include:  # attaches the matching arch to each os entry above
+          - os: ubuntu-22.04
+            arch: x86_64
+          - os: ubuntu-22.04-arm
+            arch: aarch64
+    runs-on: ${{ matrix.os }}
+    env:
+      PIP_DISABLE_PIP_VERSION_CHECK: 1  # silence pip's "new version available" notice
+    steps:
+      - uses: actions/checkout@v4
+      - name: Download wheel
+        uses: actions/download-artifact@v4
+        with:
+          name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}  # same artifact name build-wheels uploads under
+          path: wheels/
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - run: pip install auditwheel
+      - run: python ./.github/scripts/auditwheel_show.py wheels/* | tee $GITHUB_STEP_SUMMARY  # NOTE(review): default run shell has no pipefail, so a failing script may be masked by tee — confirm intended
+
+  publish-wheels:  # Publishes the built wheels (macOS excluded) to PyPI; gated by the `if` below to tag pushes in the upstream repo.
+    name: Publish wheels to PyPI
+    needs: [build-wheels, audit-wheels]  # publish only after the wheels are built and audited
+    runs-on: ubuntu-latest
+    if: |
+      github.repository == 'bitsandbytes-foundation/bitsandbytes'
+      && github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
+    environment:
+      name: release
+      url: https://pypi.org/p/bitsandbytes
+    permissions:
+      id-token: write  # presumably for PyPI trusted publishing (OIDC); no token is passed to the publish step — confirm
+    steps:
+      - name: Download distribution artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: dist/
+          pattern: "bdist_wheel_*"
+          merge-multiple: true  # extract every matched artifact into the same dist/ directory
+
+      - name: Remove macOS wheels
+        run: rm dist/*macos*  # plain rm (no -f): the step fails if no macOS wheel was downloaded
+
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          print-hash: true  # log the hashes of the files being uploaded

From da9a271446295e012cd61263836ab8fea0a06af8 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Tue, 3 Jun 2025 00:06:56 +0530
Subject: [PATCH 3/8] Update python-package.yml

---
 .github/workflows/python-package.yml | 53 +++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index fbaa27d56..8b0bbb374 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -102,10 +102,55 @@ jobs:
           path: output/*
           retention-days: 7
 
-  build-wheels:
-    needs:
-      - build-shared-libs
-      - build-shared-libs-cuda
+  build-shared-libs-rocm:  
+    strategy:  
+      matrix:  
+        os: [ubuntu-22.04]  
+        arch: [x86_64]  
+        rocm_version:  
+          ["6.1.2", "6.2.4", "6.3.2"]  
+    runs-on: ${{ matrix.os }}  
+    steps:  
+      - uses: actions/checkout@v4  
+      - name: Set up Docker multiarch  
+        uses: docker/setup-qemu-action@v3  
+      - name: Clean up disk space  
+        run: |  
+          sudo rm -rf \  
+              /usr/share/dotnet \  
+              /opt/ghc \  
+              "/usr/local/share/boost" \  
+              "$AGENT_TOOLSDIRECTORY" \  
+              /opt/hostedtoolcache \  
+              /opt/google/chrome \  
+              /opt/microsoft/msedge \  
+              /opt/microsoft/powershell \  
+              /opt/pipx \  
+              /usr/lib/mono \  
+              /usr/local/julia* \  
+              /usr/local/lib/android \  
+              /usr/local/lib/node_modules \  
+              /usr/local/share/chromium \  
+              /usr/local/share/powershell \  
+              /usr/share/swift  
+      - name: Build C++  
+        run: bash .github/scripts/build-rocm.sh  
+        env:  
+          build_os: ${{ matrix.os }}  
+          build_arch: ${{ matrix.arch }}  
+          rocm_version: ${{ matrix.rocm_version }}  
+      - name: Upload build artifact  
+        uses: actions/upload-artifact@v4  
+        with:  
+          name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}  
+          path: output/*  
+          retention-days: 7  
+  
+  build-wheels:  
+    needs:  
+      - build-shared-libs  
+      - build-shared-libs-cuda  
+      - build-shared-libs-rocm 
     strategy:
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]

From 08848daddb2ec6bd13f7b5a0720bd6d34988d818 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Tue, 3 Jun 2025 00:12:54 +0530
Subject: [PATCH 4/8] Update python-package.yml

---
 .github/workflows/python-package.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 8b0bbb374..a65d0f5bb 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -145,12 +145,12 @@ jobs:
           name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}  
           path: output/*  
           retention-days: 7  
-  
-  build-wheels:  
-    needs:  
-      - build-shared-libs  
-      - build-shared-libs-cuda  
-      - build-shared-libs-rocm 
+
+  build-wheels:
+    needs:
+      - build-shared-libs
+      - build-shared-libs-cuda
+      - build-shared-libs-rocm
     strategy:
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]

From 978cba3825e3624bc39d594a2bd01c2444e1af69 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Tue, 3 Jun 2025 01:33:00 +0530
Subject: [PATCH 5/8] Create build-rocm.sh

---
 .github/scripts/build-rocm.sh | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 .github/scripts/build-rocm.sh

diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh
new file mode 100644
index 000000000..b508fac69
--- /dev/null
+++ b/.github/scripts/build-rocm.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+declare build_arch  # e.g. x86_64; expected from the caller's environment
+declare build_os  # e.g. ubuntu-22.04; expected from the caller's environment
+declare rocm_version  # e.g. 6.3.2; expected from the caller's environment
+
+set -xeuo pipefail  # trace commands; abort on errors, unset variables and failed pipeline stages
+bnb_rocm_arch="gfx90a;gfx942;gfx1100"  # GPU targets forwarded to CMake as BNB_ROCM_ARCH
+if [ "${build_os:0:6}" == ubuntu ]; then  # only ubuntu* runners build; anything else skips straight to the copy step
+	image=rocm/dev-ubuntu-22.04:${rocm_version}-complete  # image tag is derived from the requested ROCm version
+	echo "Using image $image"
+	docker run --rm --platform "linux/$build_arch" -i \
+		-w /src -v "$PWD:/src" "$image" sh -c \
+		"apt-get update \
+      && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
+      && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
+      && cmake --build ."
+fi
+
+output_dir="output/${build_os}/${build_arch}"  # artifact layout: output/<os>/<arch>
+mkdir -p "${output_dir}"
+(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")  # subshell keeps nullglob local; patterns with no match expand to nothing

From af6561aec6d7df66f58d4f667e1f1307aef57011 Mon Sep 17 00:00:00 2001
From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Date: Wed, 4 Jun 2025 00:34:30 +0530
Subject: [PATCH 6/8] Update cuda_specs.py

---
 bitsandbytes/cuda_specs.py | 48 +++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/bitsandbytes/cuda_specs.py b/bitsandbytes/cuda_specs.py
index 61d03083c..bbdf457cc 100644
--- a/bitsandbytes/cuda_specs.py
+++ b/bitsandbytes/cuda_specs.py
@@ -1,6 +1,6 @@
 import dataclasses
-import logging  
-import re  
+import logging
+import re
 import subprocess
 from functools import lru_cache
 from typing import Optional
@@ -78,25 +78,25 @@ def get_cuda_specs() -> Optional[CUDASpecs]:
         return None
 
 
-def get_rocm_gpu_arch() -> str:  
-    """Get ROCm GPU architecture."""  
-    logger = logging.getLogger(__name__)  
-    try:  
-        if torch.version.hip:  
-            result = subprocess.run(["rocminfo"], capture_output=True, text=True)  
-            match = re.search(r"Name:\s+gfx([a-zA-Z\d]+)", result.stdout)  
-            if match:  
-                return "gfx" + match.group(1)  
-            else:  
-                return "unknown"  
-        else:  
-            return "unknown"  
-    except Exception as e:  
-        logger.error(f"Could not detect ROCm GPU architecture: {e}")  
-        if torch.cuda.is_available():  
-            logger.warning(  
-                """  
-ROCm GPU architecture detection failed despite ROCm being available.  
-                """,  
-            )  
-        return "unknown"  
+def get_rocm_gpu_arch() -> str:
+    """Return the first gfx* architecture reported by rocminfo, or "unknown" when it cannot be determined."""
+    logger = logging.getLogger(__name__)
+    try:
+        if torch.version.hip:  # only query rocminfo when torch is a ROCm (HIP) build
+            result = subprocess.run(["rocminfo"], capture_output=True, text=True)
+            match = re.search(r"Name:\s+gfx([a-zA-Z\d]+)", result.stdout)  # first match only
+            if match:
+                return "gfx" + match.group(1)
+            else:
+                return "unknown"
+        else:
+            return "unknown"
+    except Exception as e:  # e.g. rocminfo is not on PATH
+        logger.error(f"Could not detect ROCm GPU architecture: {e}")
+        if torch.cuda.is_available():  # NOTE(review): presumably True on ROCm builds with a visible GPU — confirm
+            logger.warning(
+                """
+ROCm GPU architecture detection failed despite ROCm being available.
+                """,
+            )
+        return "unknown"

From 405b4843fe2dffc0ab8059f82a4e3fb399ed10f0 Mon Sep 17 00:00:00 2001
From: MISHANMAUYRA <mishanmaurya31081@gmail.com>
Date: Wed, 4 Jun 2025 00:54:11 +0530
Subject: [PATCH 7/8] Fix trailing whitespace

---
 .github/workflows/python-package.yml |  96 +++----
 bitsandbytes/backends/cuda/ops.py    |  36 +--
 bitsandbytes/cextension.py           |  16 +-
 bitsandbytes/cuda_specs.py           |   2 +-
 bitsandbytes/diagnostics/cuda.py     |  12 +-
 bitsandbytes/diagnostics/main.py     |   3 +-
 bitsandbytes/functional.py           |  10 +-
 bitsandbytes/nn/modules.py           |   4 +-
 conflicts.diff                       | 382 +++++++++++++++++++++++++++
 csrc/common_hip.cuh                  |   2 +-
 csrc/kernels.hip                     |  26 +-
 csrc/ops.hip                         |  10 +-
 tests/test_cuda_setup_evaluator.py   |   2 +
 tests/test_functional.py             |  15 +-
 tests/test_linear4bit.py             |   1 +
 tests/test_ops.py                    |   2 +-
 16 files changed, 506 insertions(+), 113 deletions(-)
 create mode 100644 conflicts.diff

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index a65d0f5bb..3673ac608 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -102,49 +102,49 @@ jobs:
           path: output/*
           retention-days: 7
 
-  build-shared-libs-rocm:  
-    strategy:  
-      matrix:  
-        os: [ubuntu-22.04]  
-        arch: [x86_64]  
-        rocm_version:  
-          ["6.1.2", "6.2.4", "6.3.2"]  
-    runs-on: ${{ matrix.os }}  
-    steps:  
-      - uses: actions/checkout@v4  
-      - name: Set up Docker multiarch  
-        uses: docker/setup-qemu-action@v3  
-      - name: Clean up disk space  
-        run: |  
-          sudo rm -rf \  
-              /usr/share/dotnet \  
-              /opt/ghc \  
-              "/usr/local/share/boost" \  
-              "$AGENT_TOOLSDIRECTORY" \  
-              /opt/hostedtoolcache \  
-              /opt/google/chrome \  
-              /opt/microsoft/msedge \  
-              /opt/microsoft/powershell \  
-              /opt/pipx \  
-              /usr/lib/mono \  
-              /usr/local/julia* \  
-              /usr/local/lib/android \  
-              /usr/local/lib/node_modules \  
-              /usr/local/share/chromium \  
-              /usr/local/share/powershell \  
-              /usr/share/swift  
-      - name: Build C++  
-        run: bash .github/scripts/build-rocm.sh  
-        env:  
-          build_os: ${{ matrix.os }}  
-          build_arch: ${{ matrix.arch }}  
-          rocm_version: ${{ matrix.rocm_version }}  
-      - name: Upload build artifact  
-        uses: actions/upload-artifact@v4  
-        with:  
-          name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}  
-          path: output/*  
-          retention-days: 7  
+  build-shared-libs-rocm:  # Builds the ROCm (HIP) shared library, one job per ROCm version in the matrix.
+    strategy:
+      matrix:
+        os: [ubuntu-22.04]
+        arch: [x86_64]
+        rocm_version:  # each entry is handed to the build step as the rocm_version env var
+          ["6.1.2", "6.2.4", "6.3.2"]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Docker multiarch
+        uses: docker/setup-qemu-action@v3
+      - name: Clean up disk space  # deletes preinstalled SDKs/toolchains; presumably to free disk for the build — confirm
+        run: |
+          sudo rm -rf \
+              /usr/share/dotnet \
+              /opt/ghc \
+              "/usr/local/share/boost" \
+              "$AGENT_TOOLSDIRECTORY" \
+              /opt/hostedtoolcache \
+              /opt/google/chrome \
+              /opt/microsoft/msedge \
+              /opt/microsoft/powershell \
+              /opt/pipx \
+              /usr/lib/mono \
+              /usr/local/julia* \
+              /usr/local/lib/android \
+              /usr/local/lib/node_modules \
+              /usr/local/share/chromium \
+              /usr/local/share/powershell \
+              /usr/share/swift
+      - name: Build C++
+        run: bash .github/scripts/build-rocm.sh
+        env:  # inputs the build script reads from its environment
+          build_os: ${{ matrix.os }}
+          build_arch: ${{ matrix.arch }}
+          rocm_version: ${{ matrix.rocm_version }}
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}
+          path: output/*  # everything the build script placed under output/
+          retention-days: 7  # artifact expires after this many days
 
   build-wheels:
     needs:
@@ -216,10 +216,10 @@ jobs:
           path: tmp/
           pattern: "bdist_wheel_*"
           merge-multiple: true
-  
+
       - name: Inspect tmp directory after downloading artifacts
         run: ls -alFR tmp/
-  
+
       - name: Move and rename wheel files with pattern replacement
         run: |
           mkdir -p wheels/
@@ -244,7 +244,7 @@ jobs:
 
       - name: Inspect wheels directory after renaming files
         run: ls -alFR wheels/
-  
+
       - name: Delete old pre-release (if exists)
         run: |
           gh release delete continuous-release_main --cleanup-tag -y || true
@@ -258,7 +258,7 @@ jobs:
 
           This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch.
 
-          **How to install:**  
+          **How to install:**
           Pick the correct command for your platform and run it in your terminal:
 
           ENDOFMARKDOWN
@@ -273,7 +273,7 @@ jobs:
           done
 
           cat >> body.md << 'ENDOFMARKDOWN'
-          > **Note:**  
+          > **Note:**
           > These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes.
           ENDOFMARKDOWN
 
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
index fd7b7b9a2..9089d6fc2 100644
--- a/bitsandbytes/backends/cuda/ops.py
+++ b/bitsandbytes/backends/cuda/ops.py
@@ -8,7 +8,7 @@
 from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
 
 from ..._ops import register_kernel
-from ...cextension import lib, HIP_ENVIRONMENT
+from ...cextension import HIP_ENVIRONMENT, lib
 
 
 @register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
@@ -210,12 +210,12 @@ def _get_col_absmax(
 @register_kernel("bitsandbytes::quantize_blockwise", "cuda")
 def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
     torch._check_is_size(blocksize)
-    
-    if HIP_ENVIRONMENT:  
-        torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])  
-    else:  
+
+    if HIP_ENVIRONMENT:
+        torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+    else:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-        
+
     torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
 
     n = A.numel()
@@ -269,11 +269,11 @@ def _(
 def _dequantize_blockwise_impl(
     A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
 ) -> None:
-    if HIP_ENVIRONMENT:  
-        torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])  
-    else:  
+    if HIP_ENVIRONMENT:
+        torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+    else:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-        
+
     torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
     torch._check(
         dtype in [torch.float16, torch.bfloat16, torch.float32],
@@ -303,11 +303,11 @@ def _dequantize_blockwise_impl(
 def _(
     A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    if HIP_ENVIRONMENT:  
-        torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])  
-    else:  
+    if HIP_ENVIRONMENT:
+        torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+    else:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-        
+
     torch._check(quant_type in ["fp4", "nf4"])
     torch._check(
         A.dtype in [torch.bfloat16, torch.float16, torch.float32],
@@ -385,11 +385,11 @@ def _dequantize_4bit_impl(
     dtype: torch.dtype,
     out: torch.Tensor,
 ) -> None:
-    if HIP_ENVIRONMENT:  
-        torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])  
-    else:  
+    if HIP_ENVIRONMENT:
+        torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
+    else:
         torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
-        
+
     torch._check(quant_type in ["fp4", "nf4"])
     torch._check(
         dtype in [torch.bfloat16, torch.float16, torch.float32],
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 108aa0c9a..5283df93e 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -81,7 +81,7 @@ def get_available_cuda_binary_versions() -> list[str]:
     lib_pattern = f"libbitsandbytes_{BNB_BACKEND.lower()}*{DYNAMIC_LIBRARY_SUFFIX}"
     versions = []
     for lib in Path(__file__).parent.glob(lib_pattern):
-        pattern = r"{}(\d+)".format(BNB_BACKEND.lower())
+        pattern = rf"{BNB_BACKEND.lower()}(\d+)"
         match = re.search(pattern, lib.name)
         if match:
             ver_code = int(match.group(1))
@@ -199,18 +199,16 @@ def _format_lib_error_message(
         )
 
         compile_instructions = (
-            (
-                "COMPILE FROM SOURCE for CPU-only:\n  `cmake -DCOMPUTE_BACKEND=cpu -S . && make`\n\n"
-            ) if not no_cuda_lib_found 
-            else
-            (
+            ("COMPILE FROM SOURCE for CPU-only:\n  `cmake -DCOMPUTE_BACKEND=cpu -S . && make`\n\n")
+            if not no_cuda_lib_found
+            else (
                 "You have two options:\n"
                 "1. COMPILE FROM SOURCE (required if no binary exists):\n"
                 "   https://huggingface.co/docs/bitsandbytes/main/en/installation#cuda-compile\n"
                 "2. Use BNB_CUDA_VERSION to specify a DIFFERENT CUDA version from the detected one, which is installed on your machine and matching an available pre-compiled version listed above\n\n"
-            ) if not HIP_ENVIRONMENT
-            else
-            (
+            )
+            if not HIP_ENVIRONMENT
+            else (
                 "You can COMPILE FROM SOURCE as mentioned here:\n"
                 "   https://huggingface.co/docs/bitsandbytes/main/en/installation?backend=AMD+ROCm#amd-gpu\n"
             )
diff --git a/bitsandbytes/cuda_specs.py b/bitsandbytes/cuda_specs.py
index bbdf457cc..32563a159 100644
--- a/bitsandbytes/cuda_specs.py
+++ b/bitsandbytes/cuda_specs.py
@@ -1,8 +1,8 @@
 import dataclasses
+from functools import lru_cache
 import logging
 import re
 import subprocess
-from functools import lru_cache
 from typing import Optional
 
 import torch
diff --git a/bitsandbytes/diagnostics/cuda.py b/bitsandbytes/diagnostics/cuda.py
index b9de27fd7..b9db101ab 100644
--- a/bitsandbytes/diagnostics/cuda.py
+++ b/bitsandbytes/diagnostics/cuda.py
@@ -33,11 +33,13 @@
 }
 
 CUDA_RUNTIME_LIB_PATTERNS = (
-    "libamdhip64.so*",
-) if HIP_ENVIRONMENT else (
-    "cudart64*.dll",  # Windows
-    "libcudart*.so*",  # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
-    "nvcuda*.dll",  # Windows
+    ("libamdhip64.so*",)
+    if HIP_ENVIRONMENT
+    else (
+        "cudart64*.dll",  # Windows
+        "libcudart*.so*",  # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
+        "nvcuda*.dll",  # Windows
+    )
 )
 
 logger = logging.getLogger(__name__)
diff --git a/bitsandbytes/diagnostics/main.py b/bitsandbytes/diagnostics/main.py
index 8e2bc2a7b..bf31d7978 100644
--- a/bitsandbytes/diagnostics/main.py
+++ b/bitsandbytes/diagnostics/main.py
@@ -43,7 +43,8 @@ def main():
         print(f"{BNB_BACKEND} specs:{cuda_specs}")
     if not torch.cuda.is_available():
         print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:")
-        if not HIP_ENVIRONMENT: print(f"- {BNB_BACKEND} driver not installed")
+        if not HIP_ENVIRONMENT:
+            print(f"- {BNB_BACKEND} driver not installed")
         print(f"- {BNB_BACKEND} not installed")
         print(f"- You have multiple conflicting {BNB_BACKEND} libraries")
     if cuda_specs:
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 03f6c323d..9b7ce2da9 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -15,7 +15,7 @@
 
 from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
 
-from .cextension import lib, HIP_ENVIRONMENT
+from .cextension import HIP_ENVIRONMENT, lib
 
 name2qmap = {}
 
@@ -1007,10 +1007,10 @@ def quantize_4bit(
         - `torch.Tensor`: The quantized tensor with packed 4-bit values.
         - [`QuantState`]: The state object used to undo the quantization.
     """
-    
+
     if blocksize is None:
         blocksize = 64 if not HIP_ENVIRONMENT else 128
-        
+
     input_shape = A.shape
 
     _out, _absmax = torch.ops.bitsandbytes.quantize_4bit.default(
@@ -1114,10 +1114,10 @@ def dequantize_4bit(
     Returns:
         `torch.Tensor`: The dequantized tensor.
     """
-    
+
     if blocksize is None:
         blocksize = 64 if not HIP_ENVIRONMENT else 128
-        
+
     if quant_state is None:
         assert absmax is not None and out is not None
 
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 2383f2c10..a2facac28 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -222,10 +222,10 @@ def __new__(
     ) -> "Params4bit":
         if data is None:
             data = torch.empty(0)
-            
+
         if blocksize is None:
             blocksize = 64 if not HIP_ENVIRONMENT else 128
-            
+
         self = torch.Tensor._make_subclass(cls, data, requires_grad)
         self.blocksize = blocksize
         self.compress_statistics = compress_statistics
diff --git a/conflicts.diff b/conflicts.diff
new file mode 100644
index 000000000..cab8c6ea7
--- /dev/null
+++ b/conflicts.diff
@@ -0,0 +1,382 @@
+diff --cc bitsandbytes/cextension.py
+index 108aa0c,b112df2..0000000
+--- a/bitsandbytes/cextension.py
++++ b/bitsandbytes/cextension.py
+@@@ -28,17 -28,10 +29,15 @@@ def get_cuda_bnb_library_path(cuda_spec
+      override_value = os.environ.get("BNB_CUDA_VERSION")
+      if override_value:
+          library_name = re.sub(r"cuda\d+", f"cuda{override_value}", library_name, count=1)
+ +        if torch.version.hip:
+ +            raise RuntimeError(
+ +                f"BNB_CUDA_VERSION={override_value} detected for ROCm!! \n"
+ +                f"Clear the variable and retry: export BNB_CUDA_VERSION=\n"
+ +            )
+          logger.warning(
+              f"WARNING: BNB_CUDA_VERSION={override_value} environment variable detected; loading {library_name}.\n"
+-             "This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n"
++             "This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.\n"
+              "If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n"
+-             "If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n"
+-             "For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64\n",
+          )
+
+      return PACKAGE_DIR / library_name
+@@@ -298,14 -286,18 +301,28 @@@ def get_native_library() -> BNBNativeLi
+      return BNBNativeLibrary(dll)
+
+
+ +ROCM_GPU_ARCH = get_rocm_gpu_arch()
+ +
+  try:
+++<<<<<<< HEAD
+ +    if torch.version.hip:
+ +        HIP_ENVIRONMENT, BNB_BACKEND = True, "ROCm"
+ +    else:
+ +        HIP_ENVIRONMENT, BNB_BACKEND = False, "CUDA"
+ +
+++=======
++     # to support Intel CPU/GPU (XPU) backend
++     import intel_extension_for_pytorch as ipex
++
++     ipex_cpu = ipex if ipex._C._has_cpu() else None
++     ipex_xpu = ipex if ipex._C._has_xpu() else None
++ except BaseException:
++     ipex_cpu = None
++     ipex_xpu = None
++
++
++ try:
+++>>>>>>> upstream/main
+      lib = get_native_library()
+  except Exception as e:
+      error_msg = str(e)
+diff --cc bitsandbytes/diagnostics/cuda.py
+index b9de27f,e763ef2..0000000
+--- a/bitsandbytes/diagnostics/cuda.py
++++ b/bitsandbytes/diagnostics/cuda.py
+@@@ -5,8 -5,7 +5,12 @@@ from pathlib import Pat
+
+  import torch
+
+++<<<<<<< HEAD
+ +from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
+ +from bitsandbytes.consts import NONPYTORCH_DOC_URL
+++=======
++ from bitsandbytes.cextension import get_cuda_bnb_library_path
+++>>>>>>> upstream/main
+  from bitsandbytes.cuda_specs import CUDASpecs
+  from bitsandbytes.diagnostics.utils import print_dedented
+
+@@@ -146,42 -127,8 +134,38 @@@ def _print_cuda_diagnostics(cuda_specs
+              """,
+          )
+
+-     # TODO:
+-     # (1) CUDA missing cases (no CUDA installed by CUDA driver (nvidia-smi accessible)
+-     # (2) Multiple CUDA versions installed
+-
+
+ -def print_cuda_runtime_diagnostics() -> None:
+ +def _print_hip_diagnostics(cuda_specs: CUDASpecs) -> None:
+ +    print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}")
+ +
+ +    binary_path = get_cuda_bnb_library_path(cuda_specs)
+ +    if not binary_path.exists():
+ +        print_dedented(
+ +            f"""
+ +        Library not found: {binary_path}.
+ +        Maybe you need to compile it from source? If you compiled from source, check that ROCm version
+ +        in PyTorch Settings matches your ROCm install. If not, reinstall PyTorch for your ROCm version
+ +        and rebuild bitsandbytes.
+ +        """,
+ +        )
+ +
+ +    hip_major, hip_minor = cuda_specs.cuda_version_tuple
+ +    if (hip_major, hip_minor) < (6, 1):
+ +        print_dedented(
+ +            """
+ +            WARNING: bitsandbytes is fully supported only from ROCm 6.1.
+ +            """,
+ +        )
+ +
+ +
+ +def print_diagnostics(cuda_specs: CUDASpecs) -> None:
+ +    if HIP_ENVIRONMENT:
+ +        _print_hip_diagnostics(cuda_specs)
+ +    else:
+ +        _print_cuda_diagnostics(cuda_specs)
+ +
+ +
+ +def _print_cuda_runtime_diagnostics() -> None:
+      cudart_paths = list(find_cudart_libraries())
+      if not cudart_paths:
+          print("CUDA SETUP: WARNING! CUDA runtime files not found in any environmental path.")
+diff --cc bitsandbytes/diagnostics/main.py
+index 8e2bc2a,aa4cb30..0000000
+--- a/bitsandbytes/diagnostics/main.py
++++ b/bitsandbytes/diagnostics/main.py
+@@@ -3,12 -5,11 +5,20 @@@ import tracebac
+
+  import torch
+
+++<<<<<<< HEAD
+ +from bitsandbytes.cextension import BNB_BACKEND, HIP_ENVIRONMENT
+ +from bitsandbytes.consts import PACKAGE_GITHUB_URL
+ +from bitsandbytes.cuda_specs import get_cuda_specs
+ +from bitsandbytes.diagnostics.cuda import (
+ +    print_diagnostics,
+ +    print_runtime_diagnostics,
+++=======
++ from bitsandbytes import __version__ as bnb_version
++ from bitsandbytes.consts import PACKAGE_GITHUB_URL
++ from bitsandbytes.cuda_specs import get_cuda_specs
++ from bitsandbytes.diagnostics.cuda import (
++     print_cuda_diagnostics,
+++>>>>>>> upstream/main
+  )
+  from bitsandbytes.diagnostics.utils import print_dedented, print_header
+
+@@@ -28,52 -41,77 +50,122 @@@ def sanity_check()
+      assert p1 != p2
+
+
++ def get_package_version(name: str) -> str:
++     try:
++         version = importlib.metadata.version(name)
++     except importlib.metadata.PackageNotFoundError:
++         version = "not found"
++     return version
++
++
++ def show_environment():
++     """Simple utility to print out environment information."""
++
++     print(f"Platform: {platform.platform()}")
++     if platform.system() == "Linux":
++         print(f"  libc: {'-'.join(platform.libc_ver())}")
++
++     print(f"Python: {platform.python_version()}")
++
++     print(f"PyTorch: {torch.__version__}")
++     print(f"  CUDA: {torch.version.cuda or 'N/A'}")
++     print(f"  HIP: {torch.version.hip or 'N/A'}")
++     print(f"  XPU: {getattr(torch.version, 'xpu', 'N/A') or 'N/A'}")
++
++     print("Related packages:")
++     for pkg in _RELATED_PACKAGES:
++         version = get_package_version(pkg)
++         print(f"  {pkg}: {version}")
++
++
+  def main():
+-     print_header("")
+-     print_header("BUG REPORT INFORMATION")
++     print_header(f"bitsandbytes v{bnb_version}")
++     show_environment()
+      print_header("")
+
+-     print_header("OTHER")
+      cuda_specs = get_cuda_specs()
+++<<<<<<< HEAD
+ +    if HIP_ENVIRONMENT:
+ +        rocm_specs = f" rocm_version_string='{cuda_specs.cuda_version_string}',"
+ +        rocm_specs += f" rocm_version_tuple={cuda_specs.cuda_version_tuple}"
+ +        print(f"{BNB_BACKEND} specs:{rocm_specs}")
+ +    else:
+ +        print(f"{BNB_BACKEND} specs:{cuda_specs}")
+ +    if not torch.cuda.is_available():
+ +        print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:")
+ +        if not HIP_ENVIRONMENT: print(f"- {BNB_BACKEND} driver not installed")
+ +        print(f"- {BNB_BACKEND} not installed")
+ +        print(f"- You have multiple conflicting {BNB_BACKEND} libraries")
+ +    if cuda_specs:
+ +        print_diagnostics(cuda_specs)
+ +    print_runtime_diagnostics()
+ +    print_header("")
+ +    print_header("DEBUG INFO END")
+ +    print_header("")
+ +    print(f"Checking that the library is importable and {BNB_BACKEND} is callable...")
+ +    try:
+ +        sanity_check()
+ +        print("SUCCESS!")
+ +        print("Installation was successful!")
+ +        return
+ +    except RuntimeError as e:
+ +        if "not available in CPU-only" in str(e):
+ +            print(
+ +                f"WARNING: {__package__} is currently running as CPU-only!\n"
+ +                "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n"
+ +                f"If you think that this is so erroneously,\nplease report an issue!",
+ +            )
+ +        else:
+ +            raise e
+ +    except Exception:
+ +        traceback.print_exc()
+ +    print_dedented(
+ +        f"""
+ +        Above we output some debug information.
+ +        Please provide this info when creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose
+ +        WARNING: Please be sure to sanitize sensitive info from the output before posting it.
+ +        """,
+ +    )
+ +    sys.exit(1)
+++=======
++
++     if cuda_specs:
++         print_cuda_diagnostics(cuda_specs)
++
++     # TODO: There's a lot of noise in this; needs improvement.
++     # print_cuda_runtime_diagnostics()
++
++     if not torch.cuda.is_available():
++         print("PyTorch says CUDA is not available. Possible reasons:")
++         print("1. CUDA driver not installed")
++         print("2. Using a CPU-only PyTorch build")
++         print("3. No GPU detected")
++
++     else:
++         print("Checking that the library is importable and CUDA is callable...")
++
++         try:
++             sanity_check()
++             print("SUCCESS!")
++             return
++         except RuntimeError as e:
++             if "not available in CPU-only" in str(e):
++                 print(
++                     f"WARNING: {__package__} is currently running as CPU-only!\n"
++                     "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n"
++                     f"If you think that this is so erroneously,\nplease report an issue!",
++                 )
++             else:
++                 raise e
++         except Exception:
++             traceback.print_exc()
++
++         print_dedented(
++             f"""
++             Above we output some debug information.
++             Please provide this info when creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose
++             WARNING: Please be sure to sanitize sensitive info from the output before posting it.
++             """,
++         )
++         sys.exit(1)
+++>>>>>>> upstream/main
+diff --cc bitsandbytes/functional.py
+index 03f6c32,ffb6668..0000000
+mode 100644,100755..100755
+--- a/bitsandbytes/functional.py
++++ b/bitsandbytes/functional.py
+@@@ -13,9 -13,9 +13,13 @@@ import torc
+  from torch import Tensor
+  from typing_extensions import deprecated
+
+- from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
++ from bitsandbytes.utils import _reverse_4bit_compress_format, pack_dict_to_tensor, unpack_tensor_to_dict
+
+++<<<<<<< HEAD
+ +from .cextension import lib, HIP_ENVIRONMENT
+++=======
++ from .cextension import ipex_cpu, ipex_xpu, lib
+++>>>>>>> upstream/main
+
+  name2qmap = {}
+
+diff --cc bitsandbytes/nn/modules.py
+index 2383f2c,ccd842c..0000000
+--- a/bitsandbytes/nn/modules.py
++++ b/bitsandbytes/nn/modules.py
+@@@ -11,8 -11,7 +11,12 @@@ from torch import Tensor, device, dtype
+  import torch.nn.functional as F
+
+  import bitsandbytes as bnb
+++<<<<<<< HEAD
+ +from bitsandbytes.cextension import HIP_ENVIRONMENT
+ +from bitsandbytes.functional import QuantState
+++=======
++ from bitsandbytes.functional import QuantState, _enable_ipex_fusion, ipex_cpu, ipex_xpu
+++>>>>>>> upstream/main
+  from bitsandbytes.optim import GlobalOptimManager
+  from bitsandbytes.utils import (
+      INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING,
+diff --cc tests/test_linear4bit.py
+index 1b7a772,b5db2eb..0000000
+--- a/tests/test_linear4bit.py
++++ b/tests/test_linear4bit.py
+@@@ -7,8 -8,14 +8,19 @@@ import pytes
+  import torch
+
+  import bitsandbytes as bnb
+++<<<<<<< HEAD
+ +from bitsandbytes.cextension import HIP_ENVIRONMENT
+ +from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter, torch_load_from_buffer, torch_save_to_buffer
+++=======
++ from tests.helpers import (
++     TRUE_FALSE,
++     describe_dtype,
++     get_available_devices,
++     id_formatter,
++     torch_load_from_buffer,
++     torch_save_to_buffer,
++ )
+++>>>>>>> upstream/main
+
+  storage = {
+      "uint8": torch.uint8,
+@@@ -183,16 -185,10 +189,10 @@@ def test_linear_serialization(device, q
+
+  @pytest.mark.parametrize("device", get_available_devices())
+  @pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
+ -@pytest.mark.parametrize("blocksize", [64, 128])
+ +@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128])
+  @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
+  def test_copy_param(device, quant_type, blocksize, compress_statistics):
+-     if device == "cpu":
+-         if compress_statistics:
+-             pytest.skip("Currently segfaults on CPU")
+-         if quant_type == "fp4":
+-             pytest.xfail("FP4 not supported on CPU")
+-
+-     tensor = torch.linspace(1, blocksize, blocksize)
++     tensor = torch.randn(300, 400)
+      param = bnb.nn.Params4bit(
+          data=tensor,
+          quant_type=quant_type,
+@@@ -208,16 -204,10 +208,10 @@@
+
+  @pytest.mark.parametrize("device", get_available_devices())
+  @pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
+ -@pytest.mark.parametrize("blocksize", [64, 128])
+ +@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128])
+  @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
+  def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
+-     if device == "cpu":
+-         if compress_statistics:
+-             pytest.skip("Currently segfaults on CPU")
+-         if quant_type == "fp4":
+-             pytest.xfail("FP4 not supported on CPU")
+-
+-     tensor = torch.linspace(1, blocksize, blocksize)
++     tensor = torch.randn(300, 400)
+      param = bnb.nn.Params4bit(
+          data=tensor,
+          quant_type=quant_type,
+@@@ -240,16 -230,10 +234,10 @@@
+
+  @pytest.mark.parametrize("device", get_available_devices())
+  @pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
+ -@pytest.mark.parametrize("blocksize", [64, 128])
+ +@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128])
+  @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
+  def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics):
+-     if device == "cpu":
+-         if compress_statistics:
+-             pytest.skip("Currently segfaults on CPU")
+-         if quant_type == "fp4":
+-             pytest.xfail("FP4 not supported on CPU")
+-
+-     original_tensor = torch.linspace(1, blocksize, blocksize, dtype=torch.float32)
++     original_tensor = torch.randn(300, 400)
+      original_param = bnb.nn.Params4bit(
+          data=original_tensor,
+          quant_type=quant_type,
diff --git a/csrc/common_hip.cuh b/csrc/common_hip.cuh
index e7fc4eb81..105179535 100644
--- a/csrc/common_hip.cuh
+++ b/csrc/common_hip.cuh
@@ -1,6 +1,6 @@
 #pragma once
 
-#define BNB_WARP_SIZE  warpSize 
+#define BNB_WARP_SIZE  warpSize
 
 // These are set based on current BNB support for CDNA 2 & RDNA 3. Update as needed for future archs
 #define BNB_MAX_THREADS_PER_SM 2048
diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index 368788f39..56e1d54db 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -532,7 +532,7 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float
       absmax[i / BLOCK_SIZE] = local_abs_max;
     }
     __syncthreads();
-    
+
     local_abs_max = smem_absmax_value[0];
 
     if(STOCHASTIC)
@@ -610,7 +610,7 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * abs
       valid_items_load = min(TILE_SIZE, n - i);
       valid_items_store = valid_items_load;
     }
-    
+
     // Since blocksize will always be a power-of-2, we avoid more expensive
     // division by the blocksize and instead use a shift operation.
     // This is equivalent to (i+threadId.x*NUM_PER_TH)/blocksize.
@@ -811,7 +811,7 @@ __global__ void kOptimizer32bit2State(T* g, T* p,
       LoadFloat(temp_storage.loadf).Load(&(state2[i]), s2_vals, valid_items);
       __syncthreads();
       Load(temp_storage.load).Load(&(p[i]), p_vals, valid_items);
-      
+
       // Load additional state1 data for AdEMAMix
       // TODO: Make constexpr after updating min compiler
       if (OPTIMIZER == ADEMAMIX) {
@@ -1607,7 +1607,7 @@ kOptimizerStatic8bit2StateBlockwise(
     unsigned char c1s[N_PER_TH];
     unsigned char c2s[N_PER_TH];
     unsigned char c3s[N_PER_TH];
-    
+
     T g_vals[N_PER_TH];
     T p_vals[N_PER_TH];
     typedef hipcub::BlockLoad<T, BLOCK_SIZE/N_PER_TH, N_PER_TH, hipcub::BLOCK_LOAD_WARP_TRANSPOSE> LoadT;
@@ -1712,7 +1712,7 @@ kOptimizerStatic8bit2StateBlockwise(
 
             new_local_abs_max1 = fmaxf(new_local_abs_max1, fabsf(s1_vals[j]));
             new_local_abs_max2 = fmaxf(new_local_abs_max2, fabsf(s2_vals[j]));
-        
+
             if (OPTIMIZER == ADEMAMIX) {
               new_local_abs_max3 = fmaxf(new_local_abs_max3, fabsf(s3_vals[j]));
             }
@@ -1776,7 +1776,7 @@ kOptimizerStatic8bit2StateBlockwise(
               } else {
 		p_vals[j] = (T)(((float)p_vals[j]) + ((step_size*(__fdividef(s1_vals[j],(sqrtf(s2_vals[j])+(correction2*eps)))))));
               }
-	
+
 	      if(weight_decay > 0.0f)
 									p_vals[j] = ((float)p_vals[j])*(1.0f-(lr*weight_decay));
 						}
@@ -2148,27 +2148,27 @@ __global__ void kdequant_mm_int32_fp16(
 
   int local_values[ITEMS_PER_THREAD];
   half local_output[ITEMS_PER_THREAD];
-  
+
   float local_rowStats[ITEMS_PER_THREAD];
   float local_colStats[ITEMS_PER_THREAD];
   float local_biasValue[ITEMS_PER_THREAD];
 
   typedef hipcub::BlockLoad<int, THREADS, ITEMS_PER_THREAD, hipcub::BLOCK_LOAD_DIRECT> LoadInt32;
   __shared__ typename LoadInt32::TempStorage loadint32;
-  
+
   int row_idx, col_idx;
-  
+
   #pragma unroll ITEMS_PER_THREAD
   for(int j = 0; j < ITEMS_PER_THREAD; j++)
   {
        row_idx = (block_offset + thread_offset + j) / numCols;
        col_idx = (block_offset + thread_offset + j) % numCols;
-       
+
        local_colStats[j] = col_idx >= numCols ? 0.0f : colStats[col_idx];
-       local_rowStats[j] = row_idx >= numRows ? 0.0f : rowStats[row_idx]; 
+       local_rowStats[j] = row_idx >= numRows ? 0.0f : rowStats[row_idx];
        local_biasValue[j] = ((bias == nullptr) || (col_idx >= numCols)) ? 0.0f : __half2float(bias[col_idx]);
   }
- 
+
   // Each block loads THREADS * ITEMS_PER_THREAD values from A
   int valid_items = block_offset + THREADS * ITEMS_PER_THREAD < n_out
     ? THREADS * ITEMS_PER_THREAD
@@ -2188,7 +2188,7 @@ __global__ void kdequant_mm_int32_fp16(
     if (outIdx < n_out) {
       out[outIdx] = local_output[j];
     }
-  } 
+  }
 }
 
 #define DENORM 1.0f/127.0f
diff --git a/csrc/ops.hip b/csrc/ops.hip
index 4d077d19a..eef616d48 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -199,10 +199,10 @@ template<typename T, int OPTIMIZER> void optimizerStatic8bit(T* p, T* g,
 	}
 }
 
-#define BLOCKSIZE_2STATE 256 
-#define NUM_2STATE 1 
-#define BLOCKSIZE_1STATE 256 
-#define NUM_1STATE 1 
+#define BLOCKSIZE_2STATE 256
+#define NUM_2STATE 1
+#define BLOCKSIZE_1STATE 256
+#define NUM_1STATE 1
 
 template<typename T, int OPTIMIZER> void optimizerStatic8bitBlockwise(
 	T* p,
@@ -443,7 +443,7 @@ static std::string hipError_to_string(const hipError_t ret)
 }
 
 template <int DTYPE_OUT, int SCALE_ROWS> int igemmlt(
-  hipblasLtHandle_t ltHandle, 
+  hipblasLtHandle_t ltHandle,
   int m, int n, int k,
   const int8_t *A,
   const int8_t *B,
diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py
index 1b2ea85db..3d8b688ee 100644
--- a/tests/test_cuda_setup_evaluator.py
+++ b/tests/test_cuda_setup_evaluator.py
@@ -12,11 +12,13 @@ def cuda120_spec() -> CUDASpecs:
         cuda_version_tuple=(12, 0),
     )
 
+
 @pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm")
 def test_get_cuda_bnb_library_path(monkeypatch, cuda120_spec):
     monkeypatch.delenv("BNB_CUDA_VERSION", raising=False)
     assert get_cuda_bnb_library_path(cuda120_spec).stem == "libbitsandbytes_cuda120"
 
+
 @pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm")
 def test_get_cuda_bnb_library_path_override(monkeypatch, cuda120_spec, caplog):
     monkeypatch.setenv("BNB_CUDA_VERSION", "110")
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 5f5ee488c..a2964c733 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -8,8 +8,8 @@
 import torch
 
 import bitsandbytes as bnb
-from bitsandbytes.cextension import HIP_ENVIRONMENT, ROCM_GPU_ARCH
 from bitsandbytes import functional as F
+from bitsandbytes.cextension import HIP_ENVIRONMENT, ROCM_GPU_ARCH
 from tests.helpers import (
     BOOLEAN_TUPLES,
     TRUE_FALSE,
@@ -92,7 +92,10 @@ class Test8BitBlockwiseQuantizeFunctional:
     @pytest.mark.parametrize("device", get_available_devices())
     @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
     @pytest.mark.parametrize("nested", TRUE_FALSE, ids=id_formatter("nested"))
-    @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64] if not HIP_ENVIRONMENT else [4096, 2048, 1024, 512, 256, 128] )
+    @pytest.mark.parametrize(
+        "blocksize",
+        [4096, 2048, 1024, 512, 256, 128, 64] if not HIP_ENVIRONMENT else [4096, 2048, 1024, 512, 256, 128],
+    )
     @pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed"))
     def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, signed):
         iters = 100
@@ -796,6 +799,7 @@ def test_coo_int8_vectorwise_quant(self, device, dim1, dim2):
                 A[:, outlier_cols] = 0
                 torch.testing.assert_close(A * (idx == 0), A2, rtol=0.05, atol=1.5e-2)
 
+
 @pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required")
 class TestSpMMFunctional:
@@ -1106,7 +1110,10 @@ class TestQuantize4BitFunctional:
     @pytest.mark.parametrize("device", get_available_devices())
     @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype)
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
-    @pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096] if not HIP_ENVIRONMENT else [128, 256, 512, 1024, 2048, 4096])
+    @pytest.mark.parametrize(
+        "blocksize",
+        [64, 128, 256, 512, 1024, 2048, 4096] if not HIP_ENVIRONMENT else [128, 256, 512, 1024, 2048, 4096],
+    )
     def test_4bit_quant(self, device, dtype, quant_type, blocksize):
         if device == "cpu" and quant_type != "nf4":
             pytest.xfail("fp4 quantization is not supported on CPU")
@@ -1205,7 +1212,7 @@ def test_bench_4bit_dequant(self, quant_type):
         #    torch.matmul(b, a.t())
         # torch.cuda.synchronize()
         # print((time.time()-t0)/iters*1e6)
-    
+
     @pytest.mark.skipif(
         HIP_ENVIRONMENT, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64"
     )
diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index 1b7a7722c..60c163477 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -17,6 +17,7 @@
     "float32": torch.float32,
 }
 
+
 @pytest.mark.parametrize("device", get_available_devices())
 @pytest.mark.parametrize("quant_storage", ["uint8", "float16", "bfloat16", "float32"])
 @pytest.mark.parametrize("bias", TRUE_FALSE, ids=id_formatter("bias"))
diff --git a/tests/test_ops.py b/tests/test_ops.py
index a99d080b3..a433a0c4b 100644
--- a/tests/test_ops.py
+++ b/tests/test_ops.py
@@ -4,8 +4,8 @@
 import torch
 
 import bitsandbytes
-from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter
 from bitsandbytes.cextension import HIP_ENVIRONMENT
+from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter
 
 
 class TestLLMInt8Ops:

From 93768d07b1b753790a784f1472e5b6b1f9fa5c73 Mon Sep 17 00:00:00 2001
From: MISHANMAUYRA <mishanmaurya31081@gmail.com>
Date: Wed, 4 Jun 2025 01:24:09 +0530
Subject: [PATCH 8/8] Remove conflicts.diff

---
 conflicts.diff | 382 -------------------------------------------------
 1 file changed, 382 deletions(-)
 delete mode 100644 conflicts.diff

diff --git a/conflicts.diff b/conflicts.diff
deleted file mode 100644
index cab8c6ea7..000000000
--- a/conflicts.diff
+++ /dev/null
@@ -1,382 +0,0 @@
-diff --cc bitsandbytes/cextension.py
-index 108aa0c,b112df2..0000000
---- a/bitsandbytes/cextension.py
-+++ b/bitsandbytes/cextension.py
-@@@ -28,17 -28,10 +29,15 @@@ def get_cuda_bnb_library_path(cuda_spec
-      override_value = os.environ.get("BNB_CUDA_VERSION")
-      if override_value:
-          library_name = re.sub(r"cuda\d+", f"cuda{override_value}", library_name, count=1)
- +        if torch.version.hip:
- +            raise RuntimeError(
- +                f"BNB_CUDA_VERSION={override_value} detected for ROCm!! \n"
- +                f"Clear the variable and retry: export BNB_CUDA_VERSION=\n"
- +            )
-          logger.warning(
-              f"WARNING: BNB_CUDA_VERSION={override_value} environment variable detected; loading {library_name}.\n"
--             "This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n"
-+             "This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.\n"
-              "If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n"
--             "If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n"
--             "For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64\n",
-          )
-
-      return PACKAGE_DIR / library_name
-@@@ -298,14 -286,18 +301,28 @@@ def get_native_library() -> BNBNativeLi
-      return BNBNativeLibrary(dll)
-
-
- +ROCM_GPU_ARCH = get_rocm_gpu_arch()
- +
-  try:
-++<<<<<<< HEAD
- +    if torch.version.hip:
- +        HIP_ENVIRONMENT, BNB_BACKEND = True, "ROCm"
- +    else:
- +        HIP_ENVIRONMENT, BNB_BACKEND = False, "CUDA"
- +
-++=======
-+     # to support Intel CPU/GPU (XPU) backend
-+     import intel_extension_for_pytorch as ipex
-+
-+     ipex_cpu = ipex if ipex._C._has_cpu() else None
-+     ipex_xpu = ipex if ipex._C._has_xpu() else None
-+ except BaseException:
-+     ipex_cpu = None
-+     ipex_xpu = None
-+
-+
-+ try:
-++>>>>>>> upstream/main
-      lib = get_native_library()
-  except Exception as e:
-      error_msg = str(e)
-diff --cc bitsandbytes/diagnostics/cuda.py
-index b9de27f,e763ef2..0000000
---- a/bitsandbytes/diagnostics/cuda.py
-+++ b/bitsandbytes/diagnostics/cuda.py
-@@@ -5,8 -5,7 +5,12 @@@ from pathlib import Pat
-
-  import torch
-
-++<<<<<<< HEAD
- +from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
- +from bitsandbytes.consts import NONPYTORCH_DOC_URL
-++=======
-+ from bitsandbytes.cextension import get_cuda_bnb_library_path
-++>>>>>>> upstream/main
-  from bitsandbytes.cuda_specs import CUDASpecs
-  from bitsandbytes.diagnostics.utils import print_dedented
-
-@@@ -146,42 -127,8 +134,38 @@@ def _print_cuda_diagnostics(cuda_specs
-              """,
-          )
-
--     # TODO:
--     # (1) CUDA missing cases (no CUDA installed by CUDA driver (nvidia-smi accessible)
--     # (2) Multiple CUDA versions installed
--
-
- -def print_cuda_runtime_diagnostics() -> None:
- +def _print_hip_diagnostics(cuda_specs: CUDASpecs) -> None:
- +    print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}")
- +
- +    binary_path = get_cuda_bnb_library_path(cuda_specs)
- +    if not binary_path.exists():
- +        print_dedented(
- +            f"""
- +        Library not found: {binary_path}.
- +        Maybe you need to compile it from source? If you compiled from source, check that ROCm version
- +        in PyTorch Settings matches your ROCm install. If not, reinstall PyTorch for your ROCm version
- +        and rebuild bitsandbytes.
- +        """,
- +        )
- +
- +    hip_major, hip_minor = cuda_specs.cuda_version_tuple
- +    if (hip_major, hip_minor) < (6, 1):
- +        print_dedented(
- +            """
- +            WARNING: bitsandbytes is fully supported only from ROCm 6.1.
- +            """,
- +        )
- +
- +
- +def print_diagnostics(cuda_specs: CUDASpecs) -> None:
- +    if HIP_ENVIRONMENT:
- +        _print_hip_diagnostics(cuda_specs)
- +    else:
- +        _print_cuda_diagnostics(cuda_specs)
- +
- +
- +def _print_cuda_runtime_diagnostics() -> None:
-      cudart_paths = list(find_cudart_libraries())
-      if not cudart_paths:
-          print("CUDA SETUP: WARNING! CUDA runtime files not found in any environmental path.")
-diff --cc bitsandbytes/diagnostics/main.py
-index 8e2bc2a,aa4cb30..0000000
---- a/bitsandbytes/diagnostics/main.py
-+++ b/bitsandbytes/diagnostics/main.py
-@@@ -3,12 -5,11 +5,20 @@@ import tracebac
-
-  import torch
-
-++<<<<<<< HEAD
- +from bitsandbytes.cextension import BNB_BACKEND, HIP_ENVIRONMENT
- +from bitsandbytes.consts import PACKAGE_GITHUB_URL
- +from bitsandbytes.cuda_specs import get_cuda_specs
- +from bitsandbytes.diagnostics.cuda import (
- +    print_diagnostics,
- +    print_runtime_diagnostics,
-++=======
-+ from bitsandbytes import __version__ as bnb_version
-+ from bitsandbytes.consts import PACKAGE_GITHUB_URL
-+ from bitsandbytes.cuda_specs import get_cuda_specs
-+ from bitsandbytes.diagnostics.cuda import (
-+     print_cuda_diagnostics,
-++>>>>>>> upstream/main
-  )
-  from bitsandbytes.diagnostics.utils import print_dedented, print_header
-
-@@@ -28,52 -41,77 +50,122 @@@ def sanity_check()
-      assert p1 != p2
-
-
-+ def get_package_version(name: str) -> str:
-+     try:
-+         version = importlib.metadata.version(name)
-+     except importlib.metadata.PackageNotFoundError:
-+         version = "not found"
-+     return version
-+
-+
-+ def show_environment():
-+     """Simple utility to print out environment information."""
-+
-+     print(f"Platform: {platform.platform()}")
-+     if platform.system() == "Linux":
-+         print(f"  libc: {'-'.join(platform.libc_ver())}")
-+
-+     print(f"Python: {platform.python_version()}")
-+
-+     print(f"PyTorch: {torch.__version__}")
-+     print(f"  CUDA: {torch.version.cuda or 'N/A'}")
-+     print(f"  HIP: {torch.version.hip or 'N/A'}")
-+     print(f"  XPU: {getattr(torch.version, 'xpu', 'N/A') or 'N/A'}")
-+
-+     print("Related packages:")
-+     for pkg in _RELATED_PACKAGES:
-+         version = get_package_version(pkg)
-+         print(f"  {pkg}: {version}")
-+
-+
-  def main():
--     print_header("")
--     print_header("BUG REPORT INFORMATION")
-+     print_header(f"bitsandbytes v{bnb_version}")
-+     show_environment()
-      print_header("")
-
--     print_header("OTHER")
-      cuda_specs = get_cuda_specs()
-++<<<<<<< HEAD
- +    if HIP_ENVIRONMENT:
- +        rocm_specs = f" rocm_version_string='{cuda_specs.cuda_version_string}',"
- +        rocm_specs += f" rocm_version_tuple={cuda_specs.cuda_version_tuple}"
- +        print(f"{BNB_BACKEND} specs:{rocm_specs}")
- +    else:
- +        print(f"{BNB_BACKEND} specs:{cuda_specs}")
- +    if not torch.cuda.is_available():
- +        print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:")
- +        if not HIP_ENVIRONMENT: print(f"- {BNB_BACKEND} driver not installed")
- +        print(f"- {BNB_BACKEND} not installed")
- +        print(f"- You have multiple conflicting {BNB_BACKEND} libraries")
- +    if cuda_specs:
- +        print_diagnostics(cuda_specs)
- +    print_runtime_diagnostics()
- +    print_header("")
- +    print_header("DEBUG INFO END")
- +    print_header("")
- +    print(f"Checking that the library is importable and {BNB_BACKEND} is callable...")
- +    try:
- +        sanity_check()
- +        print("SUCCESS!")
- +        print("Installation was successful!")
- +        return
- +    except RuntimeError as e:
- +        if "not available in CPU-only" in str(e):
- +            print(
- +                f"WARNING: {__package__} is currently running as CPU-only!\n"
- +                "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n"
- +                f"If you think that this is so erroneously,\nplease report an issue!",
- +            )
- +        else:
- +            raise e
- +    except Exception:
- +        traceback.print_exc()
- +    print_dedented(
- +        f"""
- +        Above we output some debug information.
- +        Please provide this info when creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose
- +        WARNING: Please be sure to sanitize sensitive info from the output before posting it.
- +        """,
- +    )
- +    sys.exit(1)
-++=======
-+
-+     if cuda_specs:
-+         print_cuda_diagnostics(cuda_specs)
-+
-+     # TODO: There's a lot of noise in this; needs improvement.
-+     # print_cuda_runtime_diagnostics()
-+
-+     if not torch.cuda.is_available():
-+         print("PyTorch says CUDA is not available. Possible reasons:")
-+         print("1. CUDA driver not installed")
-+         print("2. Using a CPU-only PyTorch build")
-+         print("3. No GPU detected")
-+
-+     else:
-+         print("Checking that the library is importable and CUDA is callable...")
-+
-+         try:
-+             sanity_check()
-+             print("SUCCESS!")
-+             return
-+         except RuntimeError as e:
-+             if "not available in CPU-only" in str(e):
-+                 print(
-+                     f"WARNING: {__package__} is currently running as CPU-only!\n"
-+                     "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n"
-+                     f"If you think that this is so erroneously,\nplease report an issue!",
-+                 )
-+             else:
-+                 raise e
-+         except Exception:
-+             traceback.print_exc()
-+
-+         print_dedented(
-+             f"""
-+             Above we output some debug information.
-+             Please provide this info when creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose
-+             WARNING: Please be sure to sanitize sensitive info from the output before posting it.
-+             """,
-+         )
-+         sys.exit(1)
-++>>>>>>> upstream/main
-diff --cc bitsandbytes/functional.py
-index 03f6c32,ffb6668..0000000
-mode 100644,100755..100755
---- a/bitsandbytes/functional.py
-+++ b/bitsandbytes/functional.py
-@@@ -13,9 -13,9 +13,13 @@@ import torc
-  from torch import Tensor
-  from typing_extensions import deprecated
-
-- from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict
-+ from bitsandbytes.utils import _reverse_4bit_compress_format, pack_dict_to_tensor, unpack_tensor_to_dict
-
-++<<<<<<< HEAD
- +from .cextension import lib, HIP_ENVIRONMENT
-++=======
-+ from .cextension import ipex_cpu, ipex_xpu, lib
-++>>>>>>> upstream/main
-
-  name2qmap = {}
-
-diff --cc bitsandbytes/nn/modules.py
-index 2383f2c,ccd842c..0000000
---- a/bitsandbytes/nn/modules.py
-+++ b/bitsandbytes/nn/modules.py
-@@@ -11,8 -11,7 +11,12 @@@ from torch import Tensor, device, dtype
-  import torch.nn.functional as F
-
-  import bitsandbytes as bnb
-++<<<<<<< HEAD
- +from bitsandbytes.cextension import HIP_ENVIRONMENT
- +from bitsandbytes.functional import QuantState
-++=======
-+ from bitsandbytes.functional import QuantState, _enable_ipex_fusion, ipex_cpu, ipex_xpu
-++>>>>>>> upstream/main
-  from bitsandbytes.optim import GlobalOptimManager
-  from bitsandbytes.utils import (
-      INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING,
-diff --cc tests/test_linear4bit.py
-index 1b7a772,b5db2eb..0000000
---- a/tests/test_linear4bit.py
-+++ b/tests/test_linear4bit.py
-@@@ -7,8 -8,14 +8,19 @@@ import pytes
-  import torch
-
-  import bitsandbytes as bnb
-++<<<<<<< HEAD
- +from bitsandbytes.cextension import HIP_ENVIRONMENT
- +from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter, torch_load_from_buffer, torch_save_to_buffer
-++=======
-+ from tests.helpers import (
-+     TRUE_FALSE,
-+     describe_dtype,
-+     get_available_devices,
-+     id_formatter,
-+     torch_load_from_buffer,
-+     torch_save_to_buffer,
-+ )
-++>>>>>>> upstream/main
-
-  storage = {
-      "uint8": torch.uint8,
-@@@ -183,16 -185,10 +189,10 @@@ def test_linear_serialization(device, q
-
-  @pytest.mark.parametrize("device", get_available_devices())
-  @pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
- -@pytest.mark.parametrize("blocksize", [64, 128])
- +@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128])
-  @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
-  def test_copy_param(device, quant_type, blocksize, compress_statistics):
--     if device == "cpu":
--         if compress_statistics:
--             pytest.skip("Currently segfaults on CPU")
--         if quant_type == "fp4":
--             pytest.xfail("FP4 not supported on CPU")
--
--     tensor = torch.linspace(1, blocksize, blocksize)
-+     tensor = torch.randn(300, 400)
-      param = bnb.nn.Params4bit(
-          data=tensor,
-          quant_type=quant_type,
-@@@ -208,16 -204,10 +208,10 @@@
-
-  @pytest.mark.parametrize("device", get_available_devices())
-  @pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
- -@pytest.mark.parametrize("blocksize", [64, 128])
- +@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128])
-  @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
-  def test_deepcopy_param(device, quant_type, blocksize, compress_statistics):
--     if device == "cpu":
--         if compress_statistics:
--             pytest.skip("Currently segfaults on CPU")
--         if quant_type == "fp4":
--             pytest.xfail("FP4 not supported on CPU")
--
--     tensor = torch.linspace(1, blocksize, blocksize)
-+     tensor = torch.randn(300, 400)
-      param = bnb.nn.Params4bit(
-          data=tensor,
-          quant_type=quant_type,
-@@@ -240,16 -230,10 +234,10 @@@
-
-  @pytest.mark.parametrize("device", get_available_devices())
-  @pytest.mark.parametrize("quant_type", ["nf4", "fp4"])
- -@pytest.mark.parametrize("blocksize", [64, 128])
- +@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128])
-  @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics"))
-  def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics):
--     if device == "cpu":
--         if compress_statistics:
--             pytest.skip("Currently segfaults on CPU")
--         if quant_type == "fp4":
--             pytest.xfail("FP4 not supported on CPU")
--
--     original_tensor = torch.linspace(1, blocksize, blocksize, dtype=torch.float32)
-+     original_tensor = torch.randn(300, 400)
-      original_param = bnb.nn.Params4bit(
-          data=original_tensor,
-          quant_type=quant_type,