From ef31c362e22b201551605bc6d808026ea33da59c Mon Sep 17 00:00:00 2001 From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com> Date: Mon, 2 Jun 2025 23:55:14 +0530 Subject: [PATCH 1/8] Update python-package.yml --- .github/workflows/python-package.yml | 643 ++++++++++++++------------- 1 file changed, 343 insertions(+), 300 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index fbaa27d56..10daf0f79 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,303 +1,346 @@ -name: Python package - -on: - push: {} - pull_request: - branches: [main] - paths: - - ".github/workflows/python-package.yml" - - "bitsandbytes/**" - - "csrc/**" - - "include/**" - - "tests/**" - - "CMakeLists.txt" - - "requirements*.txt" - - "setup.py" - - "pyproject.toml" - release: - types: [published] - workflow_dispatch: {} # Allow manual trigger - workflow_call: {} # Allow triggering from other worfkflows - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - ## - # This job matrix builds the non-CUDA versions of the libraries for all supported platforms. - ## - build-shared-libs: - strategy: - matrix: - include: - - os: ubuntu-22.04 - arch: x86_64 - - os: ubuntu-22.04-arm - arch: aarch64 - - os: windows-latest - arch: x86_64 - - os: macos-latest - arch: arm64 - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v4 - - name: Setup MSVC - if: startsWith(matrix.os, 'windows') - uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl - - name: Build C++ - run: bash .github/scripts/build-cpu.sh - env: - build_os: ${{ matrix.os }} - build_arch: ${{ matrix.arch }} - - name: Upload build artifact - uses: actions/upload-artifact@v4 - with: - name: shared_library_${{ matrix.os }}_${{ matrix.arch }} - path: output/* - retention-days: 7 - ## - # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64) - ## - build-shared-libs-cuda: - strategy: - fail-fast: false - matrix: - os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest] - include: - - os: ubuntu-22.04 - arch: x86_64 - - os: ubuntu-22.04-arm - arch: aarch64 - - os: windows-latest - arch: x86_64 - cuda_version: - ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"] - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v4 - # Windows: We install Cuda on the agent (slow) - - uses: Jimver/cuda-toolkit@v0.2.22 - if: startsWith(matrix.os, 'windows') - id: cuda-toolkit - with: - cuda: ${{ matrix.cuda_version }} - method: "network" - sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]' - linux-local-args: '["--toolkit"]' - use-github-cache: false - - name: Setup MSVC - if: startsWith(matrix.os, 'windows') - uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl - - name: Build C++ - run: bash .github/scripts/build-cuda.sh - env: - build_os: ${{ matrix.os }} - build_arch: ${{ matrix.arch }} - cuda_version: ${{ matrix.cuda_version }} - - name: Upload build artifact - uses: actions/upload-artifact@v4 - with: - name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }} - path: output/* - retention-days: 7 - - build-wheels: - needs: - - build-shared-libs - - build-shared-libs-cuda - strategy: - matrix: - os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest] - include: - - os: ubuntu-22.04 - arch: x86_64 - - os: ubuntu-22.04-arm - arch: aarch64 - - os: windows-latest - arch: x86_64 - - os: macos-latest - arch: arm64 - # The specific Python version is irrelevant in this context as we are only packaging non-C extension - # code. This ensures compatibility across Python versions, including Python 3.9, as compatibility is - # dictated by the packaged code itself, not the Python version used for packaging. - python-version: ["3.10"] - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v4 - - name: Download build artifacts - uses: actions/download-artifact@v4 - with: - merge-multiple: true - pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*" - path: output/ - - name: Copy correct platform shared library - shell: bash - run: | - ls -lR output/ - cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - cache: pip - - run: pip install build wheel - - run: python -m build . - - name: Determine and Set Platform Tag, then Tag Wheel - shell: bash - run: | - PLATFORM_TAG=$(python .github/scripts/set_platform_tag.py "${{ matrix.arch }}") - echo "PLATFORM_TAG=$PLATFORM_TAG" - wheel tags --remove --abi-tag=none --python-tag=py3 --platform-tag=$PLATFORM_TAG dist/bitsandbytes-*.whl - - name: Upload build artifact - uses: actions/upload-artifact@v4 - with: - name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }} - path: dist/bitsandbytes-*.whl - retention-days: 7 - - upload-pre-release-wheels: - name: Create release and upload artifacts - runs-on: ubuntu-latest - if: github.ref_name == 'main' - permissions: - contents: write - needs: - - build-wheels - steps: - - name: Download and rename artifacts - uses: actions/download-artifact@v4 - with: - path: tmp/ - pattern: "bdist_wheel_*" - merge-multiple: true +name: Python package - - name: Inspect tmp directory after downloading artifacts - run: ls -alFR tmp/ +on: + push: {} + pull_request: + branches: [main] + paths: + - ".github/workflows/python-package.yml" + - "bitsandbytes/**" + - "csrc/**" + - "include/**" + - "tests/**" + - "CMakeLists.txt" + - "requirements*.txt" + - "setup.py" + - "pyproject.toml" + release: + types: [published] + workflow_dispatch: {} # Allow manual trigger + workflow_call: {} # Allow triggering from other worfkflows - - name: Move and rename wheel files with pattern replacement - run: | - mkdir -p wheels/ - - # The whole point of the continuous release is to have a stable download link and the only way to have a PEP 440–compliant wheel name - # is to use a stable placeholder version. Otherwise, pip won't let you install the wheel. The cool thing is that we can now install the - # wheel directly from the GH pre-release which gets updated continuously, e.g. - # `pip install https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl` - STABLE_PLACEHOLDER_VERSION="1.33.7.preview" - - # exclude macos wheels for now - find tmp/ -type f -name '*.whl' ! -name '*macos*' -print0 | while IFS= read -r -d '' wheel; do - wheel_filename=$(basename "$wheel") - - # Strip off the original version - rest=${wheel_filename#bitsandbytes-*-} - new_name="bitsandbytes-${STABLE_PLACEHOLDER_VERSION}-${rest}" - - echo "Renaming $wheel_filename → $new_name" - mv "$wheel" "wheels/${new_name}" - done - - - name: Inspect wheels directory after renaming files - run: ls -alFR wheels/ +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true - - name: Delete old pre-release (if exists) - run: | - gh release delete continuous-release_main --cleanup-tag -y || true - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Generate pip install commands for release body - run: | - cat > body.md << 'ENDOFMARKDOWN' - ## Latest `main` Wheel Pre-release - - This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch. - - **How to install:** - Pick the correct command for your platform and run it in your terminal: - - ENDOFMARKDOWN - - for whl in wheels/*.whl; do - fname=$(basename "$whl") - url="https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/$fname" - echo "\`\`\`sh" >> body.md - echo "pip install $url" >> body.md - echo "\`\`\`" >> body.md - echo "" >> body.md - done - - cat >> body.md << 'ENDOFMARKDOWN' - > **Note:** - > These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes. - ENDOFMARKDOWN - - # for debugging: - cat body.md - - - name: Create new pre-release and upload artifacts - uses: softprops/action-gh-release@v2.2.1 - with: - files: wheels/*.whl - prerelease: true - name: Latest `main` wheel - body_path: body.md - tag_name: continuous-release_main - make_latest: false - draft: false - target_commitish: ${{ github.sha }} - - audit-wheels: - needs: build-wheels - strategy: - matrix: - os: [ubuntu-22.04, ubuntu-22.04-arm] - include: - - os: ubuntu-22.04 - arch: x86_64 - - os: ubuntu-22.04-arm - arch: aarch64 - runs-on: ${{ matrix.os }} - env: - PIP_DISABLE_PIP_VERSION_CHECK: 1 - steps: - - uses: actions/checkout@v4 - - name: Download wheel - uses: actions/download-artifact@v4 - with: - name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }} - path: wheels/ - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - run: pip install auditwheel - - run: python ./.github/scripts/auditwheel_show.py wheels/* | tee $GITHUB_STEP_SUMMARY - - publish-wheels: - name: Publish wheels to PyPI - needs: [build-wheels, audit-wheels] - runs-on: ubuntu-latest - if: | - github.repository == 'bitsandbytes-foundation/bitsandbytes' - && github.event_name == 'push' && startsWith(github.ref, 'refs/tags') - environment: - name: release - url: https://pypi.org/p/bitsandbytes - permissions: - id-token: write - steps: - - name: Download distribution artifacts - uses: actions/download-artifact@v4 - with: - path: dist/ - pattern: "bdist_wheel_*" - merge-multiple: true - - - name: Remove macOS wheels - run: rm dist/*macos* - - - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - print-hash: true +jobs: + ## + # This job matrix builds the non-CUDA versions of the libraries for all supported platforms. + ## + build-shared-libs: + strategy: + matrix: + include: + - os: ubuntu-22.04 + arch: x86_64 + - os: ubuntu-22.04-arm + arch: aarch64 + - os: windows-latest + arch: x86_64 + - os: macos-latest + arch: arm64 + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Setup MSVC + if: startsWith(matrix.os, 'windows') + uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl + - name: Build C++ + run: bash .github/scripts/build-cpu.sh + env: + build_os: ${{ matrix.os }} + build_arch: ${{ matrix.arch }} + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: shared_library_${{ matrix.os }}_${{ matrix.arch }} + path: output/* + retention-days: 7 + ## + # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64) + ## + build-shared-libs-cuda: + strategy: + fail-fast: false + matrix: + os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest] + include: + - os: ubuntu-22.04 + arch: x86_64 + - os: ubuntu-22.04-arm + arch: aarch64 + - os: windows-latest + arch: x86_64 + cuda_version: + ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + # Windows: We install Cuda on the agent (slow) + - uses: Jimver/cuda-toolkit@v0.2.22 + if: startsWith(matrix.os, 'windows') + id: cuda-toolkit + with: + cuda: ${{ matrix.cuda_version }} + method: "network" + sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]' + linux-local-args: '["--toolkit"]' + use-github-cache: false + - name: Setup MSVC + if: startsWith(matrix.os, 'windows') + uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl + - name: Build C++ + run: bash .github/scripts/build-cuda.sh + env: + build_os: ${{ matrix.os }} + build_arch: ${{ matrix.arch }} + cuda_version: ${{ matrix.cuda_version }} + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }} + path: output/* + retention-days: 7 + build-shared-libs-rocm: + strategy: + matrix: + os: [ubuntu-22.04] + arch: [x86_64] + rocm_version: + ["6.1.2", "6.2.4", "6.3.2"] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up Docker multiarch + uses: docker/setup-qemu-action@v3 + - name: Clean up disk space + run: | + sudo rm -rf \ + /usr/share/dotnet \ + /opt/ghc \ + "/usr/local/share/boost" \ + "$AGENT_TOOLSDIRECTORY" \ + /opt/hostedtoolcache \ + /opt/google/chrome \ + /opt/microsoft/msedge \ + /opt/microsoft/powershell \ + /opt/pipx \ + /usr/lib/mono \ + /usr/local/julia* \ + /usr/local/lib/android \ + /usr/local/lib/node_modules \ + /usr/local/share/chromium \ + /usr/local/share/powershell \ + /usr/share/swift + - name: Build C++ + run: bash .github/scripts/build-rocm.sh + env: + build_os: ${{ matrix.os }} + build_arch: ${{ matrix.arch }} + rocm_version: ${{ matrix.rocm_version }} + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }} + path: output/* + retention-days: 7 + build-wheels: + needs: + - build-shared-libs + - build-shared-libs-cuda + - build-shared-libs-rocm + strategy: + matrix: + os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest] + include: + - os: ubuntu-22.04 + arch: x86_64 + - os: ubuntu-22.04-arm + arch: aarch64 + - os: windows-latest + arch: x86_64 + - os: macos-latest + arch: arm64 + # The specific Python version is irrelevant in this context as we are only packaging non-C extension + # code. This ensures compatibility across Python versions, including Python 3.9, as compatibility is + # dictated by the packaged code itself, not the Python version used for packaging. + python-version: ["3.10"] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Download build artifacts + uses: actions/download-artifact@v4 + with: + merge-multiple: true + pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*" + path: output/ + - name: Copy correct platform shared library + shell: bash + run: | + ls -lR output/ + cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + - run: pip install build wheel + - run: python -m build . + - name: Determine and Set Platform Tag, then Tag Wheel + shell: bash + run: | + PLATFORM_TAG=$(python .github/scripts/set_platform_tag.py "${{ matrix.arch }}") + echo "PLATFORM_TAG=$PLATFORM_TAG" + wheel tags --remove --abi-tag=none --python-tag=py3 --platform-tag=$PLATFORM_TAG dist/bitsandbytes-*.whl + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }} + path: dist/bitsandbytes-*.whl + retention-days: 7 + + upload-pre-release-wheels: + name: Create release and upload artifacts + runs-on: ubuntu-latest + if: github.ref_name == 'main' + permissions: + contents: write + needs: + - build-wheels + steps: + - name: Download and rename artifacts + uses: actions/download-artifact@v4 + with: + path: tmp/ + pattern: "bdist_wheel_*" + merge-multiple: true + + - name: Inspect tmp directory after downloading artifacts + run: ls -alFR tmp/ + + - name: Move and rename wheel files with pattern replacement + run: | + mkdir -p wheels/ + + # The whole point of the continuous release is to have a stable download link and the only way to have a PEP 440–compliant wheel name + # is to use a stable placeholder version. Otherwise, pip won't let you install the wheel. The cool thing is that we can now install the + # wheel directly from the GH pre-release which gets updated continuously, e.g. + # `pip install https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl` + STABLE_PLACEHOLDER_VERSION="1.33.7.preview" + + # exclude macos wheels for now + find tmp/ -type f -name '*.whl' ! -name '*macos*' -print0 | while IFS= read -r -d '' wheel; do + wheel_filename=$(basename "$wheel") + + # Strip off the original version + rest=${wheel_filename#bitsandbytes-*-} + new_name="bitsandbytes-${STABLE_PLACEHOLDER_VERSION}-${rest}" + + echo "Renaming $wheel_filename → $new_name" + mv "$wheel" "wheels/${new_name}" + done + + - name: Inspect wheels directory after renaming files + run: ls -alFR wheels/ + + - name: Delete old pre-release (if exists) + run: | + gh release delete continuous-release_main --cleanup-tag -y || true + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Generate pip install commands for release body + run: | + cat > body.md << 'ENDOFMARKDOWN' + ## Latest `main` Wheel Pre-release + + This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch. + + **How to install:** + Pick the correct command for your platform and run it in your terminal: + + ENDOFMARKDOWN + + for whl in wheels/*.whl; do + fname=$(basename "$whl") + url="https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/$fname" + echo "\`\`\`sh" >> body.md + echo "pip install $url" >> body.md + echo "\`\`\`" >> body.md + echo "" >> body.md + done + + cat >> body.md << 'ENDOFMARKDOWN' + > **Note:** + > These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes. + ENDOFMARKDOWN + + # for debugging: + cat body.md + + - name: Create new pre-release and upload artifacts + uses: softprops/action-gh-release@v2.2.1 + with: + files: wheels/*.whl + prerelease: true + name: Latest `main` wheel + body_path: body.md + tag_name: continuous-release_main + make_latest: false + draft: false + target_commitish: ${{ github.sha }} + + audit-wheels: + needs: build-wheels + strategy: + matrix: + os: [ubuntu-22.04, ubuntu-22.04-arm] + include: + - os: ubuntu-22.04 + arch: x86_64 + - os: ubuntu-22.04-arm + arch: aarch64 + runs-on: ${{ matrix.os }} + env: + PIP_DISABLE_PIP_VERSION_CHECK: 1 + steps: + - uses: actions/checkout@v4 + - name: Download wheel + uses: actions/download-artifact@v4 + with: + name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }} + path: wheels/ + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + - run: pip install auditwheel + - run: python ./.github/scripts/auditwheel_show.py wheels/* | tee $GITHUB_STEP_SUMMARY + + publish-wheels: + name: Publish wheels to PyPI + needs: [build-wheels, audit-wheels] + runs-on: ubuntu-latest + if: | + github.repository == 'bitsandbytes-foundation/bitsandbytes' + && github.event_name == 'push' && startsWith(github.ref, 'refs/tags') + environment: + name: release + url: https://pypi.org/p/bitsandbytes + permissions: + id-token: write + steps: + - name: Download distribution artifacts + uses: actions/download-artifact@v4 + with: + path: dist/ + pattern: "bdist_wheel_*" + merge-multiple: true + + - name: Remove macOS wheels + run: rm dist/*macos* + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + print-hash: true From e1435f01776137c3a253228b4234a23535532161 Mon Sep 17 00:00:00 2001 From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com> Date: Mon, 2 Jun 2025 23:57:25 +0530 Subject: [PATCH 2/8] Update python-package.yml --- .github/workflows/python-package.yml | 643 +++++++++++++-------------- 1 file changed, 300 insertions(+), 343 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 10daf0f79..fbaa27d56 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -1,346 +1,303 @@ -name: Python package +name: Python package + +on: + push: {} + pull_request: + branches: [main] + paths: + - ".github/workflows/python-package.yml" + - "bitsandbytes/**" + - "csrc/**" + - "include/**" + - "tests/**" + - "CMakeLists.txt" + - "requirements*.txt" + - "setup.py" + - "pyproject.toml" + release: + types: [published] + workflow_dispatch: {} # Allow manual trigger + workflow_call: {} # Allow triggering from other worfkflows + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + ## + # This job matrix builds the non-CUDA versions of the libraries for all supported platforms. + ## + build-shared-libs: + strategy: + matrix: + include: + - os: ubuntu-22.04 + arch: x86_64 + - os: ubuntu-22.04-arm + arch: aarch64 + - os: windows-latest + arch: x86_64 + - os: macos-latest + arch: arm64 + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Setup MSVC + if: startsWith(matrix.os, 'windows') + uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl + - name: Build C++ + run: bash .github/scripts/build-cpu.sh + env: + build_os: ${{ matrix.os }} + build_arch: ${{ matrix.arch }} + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: shared_library_${{ matrix.os }}_${{ matrix.arch }} + path: output/* + retention-days: 7 + ## + # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64) + ## + build-shared-libs-cuda: + strategy: + fail-fast: false + matrix: + os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest] + include: + - os: ubuntu-22.04 + arch: x86_64 + - os: ubuntu-22.04-arm + arch: aarch64 + - os: windows-latest + arch: x86_64 + cuda_version: + ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + # Windows: We install Cuda on the agent (slow) + - uses: Jimver/cuda-toolkit@v0.2.22 + if: startsWith(matrix.os, 'windows') + id: cuda-toolkit + with: + cuda: ${{ matrix.cuda_version }} + method: "network" + sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]' + linux-local-args: '["--toolkit"]' + use-github-cache: false + - name: Setup MSVC + if: startsWith(matrix.os, 'windows') + uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl + - name: Build C++ + run: bash .github/scripts/build-cuda.sh + env: + build_os: ${{ matrix.os }} + build_arch: ${{ matrix.arch }} + cuda_version: ${{ matrix.cuda_version }} + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }} + path: output/* + retention-days: 7 + + build-wheels: + needs: + - build-shared-libs + - build-shared-libs-cuda + strategy: + matrix: + os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest] + include: + - os: ubuntu-22.04 + arch: x86_64 + - os: ubuntu-22.04-arm + arch: aarch64 + - os: windows-latest + arch: x86_64 + - os: macos-latest + arch: arm64 + # The specific Python version is irrelevant in this context as we are only packaging non-C extension + # code. This ensures compatibility across Python versions, including Python 3.9, as compatibility is + # dictated by the packaged code itself, not the Python version used for packaging. + python-version: ["3.10"] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Download build artifacts + uses: actions/download-artifact@v4 + with: + merge-multiple: true + pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*" + path: output/ + - name: Copy correct platform shared library + shell: bash + run: | + ls -lR output/ + cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + - run: pip install build wheel + - run: python -m build . + - name: Determine and Set Platform Tag, then Tag Wheel + shell: bash + run: | + PLATFORM_TAG=$(python .github/scripts/set_platform_tag.py "${{ matrix.arch }}") + echo "PLATFORM_TAG=$PLATFORM_TAG" + wheel tags --remove --abi-tag=none --python-tag=py3 --platform-tag=$PLATFORM_TAG dist/bitsandbytes-*.whl + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }} + path: dist/bitsandbytes-*.whl + retention-days: 7 + + upload-pre-release-wheels: + name: Create release and upload artifacts + runs-on: ubuntu-latest + if: github.ref_name == 'main' + permissions: + contents: write + needs: + - build-wheels + steps: + - name: Download and rename artifacts + uses: actions/download-artifact@v4 + with: + path: tmp/ + pattern: "bdist_wheel_*" + merge-multiple: true -on: - push: {} - pull_request: - branches: [main] - paths: - - ".github/workflows/python-package.yml" - - "bitsandbytes/**" - - "csrc/**" - - "include/**" - - "tests/**" - - "CMakeLists.txt" - - "requirements*.txt" - - "setup.py" - - "pyproject.toml" - release: - types: [published] - workflow_dispatch: {} # Allow manual trigger - workflow_call: {} # Allow triggering from other worfkflows + - name: Inspect tmp directory after downloading artifacts + run: ls -alFR tmp/ -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true + - name: Move and rename wheel files with pattern replacement + run: | + mkdir -p wheels/ + + # The whole point of the continuous release is to have a stable download link and the only way to have a PEP 440–compliant wheel name + # is to use a stable placeholder version. Otherwise, pip won't let you install the wheel. The cool thing is that we can now install the + # wheel directly from the GH pre-release which gets updated continuously, e.g. + # `pip install https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl` + STABLE_PLACEHOLDER_VERSION="1.33.7.preview" + + # exclude macos wheels for now + find tmp/ -type f -name '*.whl' ! -name '*macos*' -print0 | while IFS= read -r -d '' wheel; do + wheel_filename=$(basename "$wheel") + + # Strip off the original version + rest=${wheel_filename#bitsandbytes-*-} + new_name="bitsandbytes-${STABLE_PLACEHOLDER_VERSION}-${rest}" + + echo "Renaming $wheel_filename → $new_name" + mv "$wheel" "wheels/${new_name}" + done + + - name: Inspect wheels directory after renaming files + run: ls -alFR wheels/ -jobs: - ## - # This job matrix builds the non-CUDA versions of the libraries for all supported platforms. - ## - build-shared-libs: - strategy: - matrix: - include: - - os: ubuntu-22.04 - arch: x86_64 - - os: ubuntu-22.04-arm - arch: aarch64 - - os: windows-latest - arch: x86_64 - - os: macos-latest - arch: arm64 - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v4 - - name: Setup MSVC - if: startsWith(matrix.os, 'windows') - uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl - - name: Build C++ - run: bash .github/scripts/build-cpu.sh - env: - build_os: ${{ matrix.os }} - build_arch: ${{ matrix.arch }} - - name: Upload build artifact - uses: actions/upload-artifact@v4 - with: - name: shared_library_${{ matrix.os }}_${{ matrix.arch }} - path: output/* - retention-days: 7 - ## - # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64) - ## - build-shared-libs-cuda: - strategy: - fail-fast: false - matrix: - os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest] - include: - - os: ubuntu-22.04 - arch: x86_64 - - os: ubuntu-22.04-arm - arch: aarch64 - - os: windows-latest - arch: x86_64 - cuda_version: - ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1"] - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v4 - # Windows: We install Cuda on the agent (slow) - - uses: Jimver/cuda-toolkit@v0.2.22 - if: startsWith(matrix.os, 'windows') - id: cuda-toolkit - with: - cuda: ${{ matrix.cuda_version }} - method: "network" - sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]' - linux-local-args: '["--toolkit"]' - use-github-cache: false - - name: Setup MSVC - if: startsWith(matrix.os, 'windows') - uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl - - name: Build C++ - run: bash .github/scripts/build-cuda.sh - env: - build_os: ${{ matrix.os }} - build_arch: ${{ matrix.arch }} - cuda_version: ${{ matrix.cuda_version }} - - name: Upload build artifact - uses: actions/upload-artifact@v4 - with: - name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }} - path: output/* - retention-days: 7 - build-shared-libs-rocm: - strategy: - matrix: - os: [ubuntu-22.04] - arch: [x86_64] - rocm_version: - ["6.1.2", "6.2.4", "6.3.2"] - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v4 - - name: Set up Docker multiarch - uses: docker/setup-qemu-action@v3 - - name: Clean up disk space - run: | - sudo rm -rf \ - /usr/share/dotnet \ - /opt/ghc \ - "/usr/local/share/boost" \ - "$AGENT_TOOLSDIRECTORY" \ - /opt/hostedtoolcache \ - /opt/google/chrome \ - /opt/microsoft/msedge \ - /opt/microsoft/powershell \ - /opt/pipx \ - /usr/lib/mono \ - /usr/local/julia* \ - /usr/local/lib/android \ - /usr/local/lib/node_modules \ - /usr/local/share/chromium \ - /usr/local/share/powershell \ - /usr/share/swift - - name: Build C++ - run: bash .github/scripts/build-rocm.sh - env: - build_os: ${{ matrix.os }} - build_arch: ${{ matrix.arch }} - rocm_version: ${{ matrix.rocm_version }} - - name: Upload build artifact - uses: actions/upload-artifact@v4 - with: - name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }} - path: output/* - retention-days: 7 - build-wheels: - needs: - - build-shared-libs - - build-shared-libs-cuda - - build-shared-libs-rocm - strategy: - matrix: - os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest] - include: - - os: ubuntu-22.04 - arch: x86_64 - - os: ubuntu-22.04-arm - arch: aarch64 - - os: windows-latest - arch: x86_64 - - os: macos-latest - arch: arm64 - # The specific Python version is irrelevant in this context as we are only packaging non-C extension - # code. This ensures compatibility across Python versions, including Python 3.9, as compatibility is - # dictated by the packaged code itself, not the Python version used for packaging. - python-version: ["3.10"] - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v4 - - name: Download build artifacts - uses: actions/download-artifact@v4 - with: - merge-multiple: true - pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*" - path: output/ - - name: Copy correct platform shared library - shell: bash - run: | - ls -lR output/ - cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - cache: pip - - run: pip install build wheel - - run: python -m build . - - name: Determine and Set Platform Tag, then Tag Wheel - shell: bash - run: | - PLATFORM_TAG=$(python .github/scripts/set_platform_tag.py "${{ matrix.arch }}") - echo "PLATFORM_TAG=$PLATFORM_TAG" - wheel tags --remove --abi-tag=none --python-tag=py3 --platform-tag=$PLATFORM_TAG dist/bitsandbytes-*.whl - - name: Upload build artifact - uses: actions/upload-artifact@v4 - with: - name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }} - path: dist/bitsandbytes-*.whl - retention-days: 7 - - upload-pre-release-wheels: - name: Create release and upload artifacts - runs-on: ubuntu-latest - if: github.ref_name == 'main' - permissions: - contents: write - needs: - - build-wheels - steps: - - name: Download and rename artifacts - uses: actions/download-artifact@v4 - with: - path: tmp/ - pattern: "bdist_wheel_*" - merge-multiple: true - - - name: Inspect tmp directory after downloading artifacts - run: ls -alFR tmp/ - - - name: Move and rename wheel files with pattern replacement - run: | - mkdir -p wheels/ - - # The whole point of the continuous release is to have a stable download link and the only way to have a PEP 440–compliant wheel name - # is to use a stable placeholder version. Otherwise, pip won't let you install the wheel. The cool thing is that we can now install the - # wheel directly from the GH pre-release which gets updated continuously, e.g. - # `pip install https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl` - STABLE_PLACEHOLDER_VERSION="1.33.7.preview" - - # exclude macos wheels for now - find tmp/ -type f -name '*.whl' ! -name '*macos*' -print0 | while IFS= read -r -d '' wheel; do - wheel_filename=$(basename "$wheel") - - # Strip off the original version - rest=${wheel_filename#bitsandbytes-*-} - new_name="bitsandbytes-${STABLE_PLACEHOLDER_VERSION}-${rest}" - - echo "Renaming $wheel_filename → $new_name" - mv "$wheel" "wheels/${new_name}" - done - - - name: Inspect wheels directory after renaming files - run: ls -alFR wheels/ - - - name: Delete old pre-release (if exists) - run: | - gh release delete continuous-release_main --cleanup-tag -y || true - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Generate pip install commands for release body - run: | - cat > body.md << 'ENDOFMARKDOWN' - ## Latest `main` Wheel Pre-release - - This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch. - - **How to install:** - Pick the correct command for your platform and run it in your terminal: - - ENDOFMARKDOWN - - for whl in wheels/*.whl; do - fname=$(basename "$whl") - url="https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/$fname" - echo "\`\`\`sh" >> body.md - echo "pip install $url" >> body.md - echo "\`\`\`" >> body.md - echo "" >> body.md - done - - cat >> body.md << 'ENDOFMARKDOWN' - > **Note:** - > These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes. - ENDOFMARKDOWN - - # for debugging: - cat body.md - - - name: Create new pre-release and upload artifacts - uses: softprops/action-gh-release@v2.2.1 - with: - files: wheels/*.whl - prerelease: true - name: Latest `main` wheel - body_path: body.md - tag_name: continuous-release_main - make_latest: false - draft: false - target_commitish: ${{ github.sha }} - - audit-wheels: - needs: build-wheels - strategy: - matrix: - os: [ubuntu-22.04, ubuntu-22.04-arm] - include: - - os: ubuntu-22.04 - arch: x86_64 - - os: ubuntu-22.04-arm - arch: aarch64 - runs-on: ${{ matrix.os }} - env: - PIP_DISABLE_PIP_VERSION_CHECK: 1 - steps: - - uses: actions/checkout@v4 - - name: Download wheel - uses: actions/download-artifact@v4 - with: - name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }} - path: wheels/ - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - run: pip install auditwheel - - run: python ./.github/scripts/auditwheel_show.py wheels/* | tee $GITHUB_STEP_SUMMARY - - publish-wheels: - name: Publish wheels to PyPI - needs: [build-wheels, audit-wheels] - runs-on: ubuntu-latest - if: | - github.repository == 'bitsandbytes-foundation/bitsandbytes' - && github.event_name == 'push' && startsWith(github.ref, 'refs/tags') - environment: - name: release - url: https://pypi.org/p/bitsandbytes - permissions: - id-token: write - steps: - - name: Download distribution artifacts - uses: actions/download-artifact@v4 - with: - path: dist/ - pattern: "bdist_wheel_*" - merge-multiple: true - - - name: Remove macOS wheels - run: rm dist/*macos* - - - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - print-hash: true + - name: Delete old pre-release (if exists) + run: | + gh release delete continuous-release_main --cleanup-tag -y || true + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Generate pip install commands for release body + run: | + cat > body.md << 'ENDOFMARKDOWN' + ## Latest `main` Wheel Pre-release + + This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch. + + **How to install:** + Pick the correct command for your platform and run it in your terminal: + + ENDOFMARKDOWN + + for whl in wheels/*.whl; do + fname=$(basename "$whl") + url="https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/$fname" + echo "\`\`\`sh" >> body.md + echo "pip install $url" >> body.md + echo "\`\`\`" >> body.md + echo "" >> body.md + done + + cat >> body.md << 'ENDOFMARKDOWN' + > **Note:** + > These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes. + ENDOFMARKDOWN + + # for debugging: + cat body.md + + - name: Create new pre-release and upload artifacts + uses: softprops/action-gh-release@v2.2.1 + with: + files: wheels/*.whl + prerelease: true + name: Latest `main` wheel + body_path: body.md + tag_name: continuous-release_main + make_latest: false + draft: false + target_commitish: ${{ github.sha }} + + audit-wheels: + needs: build-wheels + strategy: + matrix: + os: [ubuntu-22.04, ubuntu-22.04-arm] + include: + - os: ubuntu-22.04 + arch: x86_64 + - os: ubuntu-22.04-arm + arch: aarch64 + runs-on: ${{ matrix.os }} + env: + PIP_DISABLE_PIP_VERSION_CHECK: 1 + steps: + - uses: actions/checkout@v4 + - name: Download wheel + uses: actions/download-artifact@v4 + with: + name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }} + path: wheels/ + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + - run: pip install auditwheel + - run: python ./.github/scripts/auditwheel_show.py wheels/* | tee $GITHUB_STEP_SUMMARY + + publish-wheels: + name: Publish wheels to PyPI + needs: [build-wheels, audit-wheels] + runs-on: ubuntu-latest + if: | + github.repository == 'bitsandbytes-foundation/bitsandbytes' + && github.event_name == 'push' && startsWith(github.ref, 'refs/tags') + environment: + name: release + url: https://pypi.org/p/bitsandbytes + permissions: + id-token: write + steps: + - name: Download distribution artifacts + uses: actions/download-artifact@v4 + with: + path: dist/ + pattern: "bdist_wheel_*" + merge-multiple: true + + - name: Remove macOS wheels + run: rm dist/*macos* + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + print-hash: true From da9a271446295e012cd61263836ab8fea0a06af8 Mon Sep 17 00:00:00 2001 From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com> Date: Tue, 3 Jun 2025 00:06:56 +0530 Subject: [PATCH 3/8] Update python-package.yml --- .github/workflows/python-package.yml | 53 +++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index fbaa27d56..8b0bbb374 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -102,10 +102,55 @@ jobs: path: output/* retention-days: 7 - build-wheels: - needs: - - build-shared-libs - - build-shared-libs-cuda + build-shared-libs-rocm: + strategy: + matrix: + os: [ubuntu-22.04] + arch: [x86_64] + rocm_version: + ["6.1.2", "6.2.4", "6.3.2"] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up Docker multiarch + uses: docker/setup-qemu-action@v3 + - name: Clean up disk space + run: | + sudo rm -rf \ + /usr/share/dotnet \ + /opt/ghc \ + "/usr/local/share/boost" \ + "$AGENT_TOOLSDIRECTORY" \ + /opt/hostedtoolcache \ + /opt/google/chrome \ + /opt/microsoft/msedge \ + /opt/microsoft/powershell \ + /opt/pipx \ + /usr/lib/mono \ + /usr/local/julia* \ + /usr/local/lib/android \ + /usr/local/lib/node_modules \ + /usr/local/share/chromium \ + /usr/local/share/powershell \ + /usr/share/swift + - name: Build C++ + run: bash .github/scripts/build-rocm.sh + env: + build_os: ${{ matrix.os }} + build_arch: ${{ matrix.arch }} + rocm_version: ${{ matrix.rocm_version }} + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }} + path: output/* + retention-days: 7 + + build-wheels: + needs: + - build-shared-libs + - build-shared-libs-cuda + - build-shared-libs-rocm strategy: matrix: os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest] From 08848daddb2ec6bd13f7b5a0720bd6d34988d818 Mon Sep 17 00:00:00 2001 From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com> Date: Tue, 3 Jun 2025 00:12:54 +0530 Subject: [PATCH 4/8] Update python-package.yml --- .github/workflows/python-package.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 8b0bbb374..a65d0f5bb 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -145,12 +145,12 @@ jobs: name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }} path: output/* retention-days: 7 - - build-wheels: - needs: - - build-shared-libs - - build-shared-libs-cuda - - build-shared-libs-rocm + + build-wheels: + needs: + - build-shared-libs + - build-shared-libs-cuda + - build-shared-libs-rocm strategy: matrix: os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest] From 978cba3825e3624bc39d594a2bd01c2444e1af69 Mon Sep 17 00:00:00 2001 From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com> Date: Tue, 3 Jun 2025 01:33:00 +0530 Subject: [PATCH 5/8] Create build-rocm.sh --- .github/scripts/build-rocm.sh | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/scripts/build-rocm.sh diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh new file mode 100644 index 000000000..b508fac69 --- /dev/null +++ b/.github/scripts/build-rocm.sh @@ -0,0 +1,21 @@ +#!/bin/bash +declare build_arch +declare build_os +declare rocm_version + +set -xeuo pipefail +bnb_rocm_arch="gfx90a;gfx942;gfx1100" +if [ "${build_os:0:6}" == ubuntu ]; then + image=rocm/dev-ubuntu-22.04:${rocm_version}-complete + echo "Using image $image" + docker run --rm --platform "linux/$build_arch" -i \ + -w /src -v "$PWD:/src" "$image" sh -c \ + "apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ + && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \ + && cmake --build ." +fi + +output_dir="output/${build_os}/${build_arch}" +mkdir -p "${output_dir}" +(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}") From af6561aec6d7df66f58d4f667e1f1307aef57011 Mon Sep 17 00:00:00 2001 From: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com> Date: Wed, 4 Jun 2025 00:34:30 +0530 Subject: [PATCH 6/8] Update cuda_specs.py --- bitsandbytes/cuda_specs.py | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/bitsandbytes/cuda_specs.py b/bitsandbytes/cuda_specs.py index 61d03083c..bbdf457cc 100644 --- a/bitsandbytes/cuda_specs.py +++ b/bitsandbytes/cuda_specs.py @@ -1,6 +1,6 @@ import dataclasses -import logging -import re +import logging +import re import subprocess from functools import lru_cache from typing import Optional @@ -78,25 +78,25 @@ def get_cuda_specs() -> Optional[CUDASpecs]: return None -def get_rocm_gpu_arch() -> str: - """Get ROCm GPU architecture.""" - logger = logging.getLogger(__name__) - try: - if torch.version.hip: - result = subprocess.run(["rocminfo"], capture_output=True, text=True) - match = re.search(r"Name:\s+gfx([a-zA-Z\d]+)", result.stdout) - if match: - return "gfx" + match.group(1) - else: - return "unknown" - else: - return "unknown" - except Exception as e: - logger.error(f"Could not detect ROCm GPU architecture: {e}") - if torch.cuda.is_available(): - logger.warning( - """ -ROCm GPU architecture detection failed despite ROCm being available. - """, - ) - return "unknown" +def get_rocm_gpu_arch() -> str: + """Get ROCm GPU architecture.""" + logger = logging.getLogger(__name__) + try: + if torch.version.hip: + result = subprocess.run(["rocminfo"], capture_output=True, text=True) + match = re.search(r"Name:\s+gfx([a-zA-Z\d]+)", result.stdout) + if match: + return "gfx" + match.group(1) + else: + return "unknown" + else: + return "unknown" + except Exception as e: + logger.error(f"Could not detect ROCm GPU architecture: {e}") + if torch.cuda.is_available(): + logger.warning( + """ +ROCm GPU architecture detection failed despite ROCm being available. + """, + ) + return "unknown" From 405b4843fe2dffc0ab8059f82a4e3fb399ed10f0 Mon Sep 17 00:00:00 2001 From: MISHANMAUYRA Date: Wed, 4 Jun 2025 00:54:11 +0530 Subject: [PATCH 7/8] Fix trailing whitespace --- .github/workflows/python-package.yml | 96 +++---- bitsandbytes/backends/cuda/ops.py | 36 +-- bitsandbytes/cextension.py | 16 +- bitsandbytes/cuda_specs.py | 2 +- bitsandbytes/diagnostics/cuda.py | 12 +- bitsandbytes/diagnostics/main.py | 3 +- bitsandbytes/functional.py | 10 +- bitsandbytes/nn/modules.py | 4 +- conflicts.diff | 382 +++++++++++++++++++++++++++ csrc/common_hip.cuh | 2 +- csrc/kernels.hip | 26 +- csrc/ops.hip | 10 +- tests/test_cuda_setup_evaluator.py | 2 + tests/test_functional.py | 15 +- tests/test_linear4bit.py | 1 + tests/test_ops.py | 2 +- 16 files changed, 506 insertions(+), 113 deletions(-) create mode 100644 conflicts.diff diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index a65d0f5bb..3673ac608 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -102,49 +102,49 @@ jobs: path: output/* retention-days: 7 - build-shared-libs-rocm: - strategy: - matrix: - os: [ubuntu-22.04] - arch: [x86_64] - rocm_version: - ["6.1.2", "6.2.4", "6.3.2"] - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v4 - - name: Set up Docker multiarch - uses: docker/setup-qemu-action@v3 - - name: Clean up disk space - run: | - sudo rm -rf \ - /usr/share/dotnet \ - /opt/ghc \ - "/usr/local/share/boost" \ - "$AGENT_TOOLSDIRECTORY" \ - /opt/hostedtoolcache \ - /opt/google/chrome \ - /opt/microsoft/msedge \ - /opt/microsoft/powershell \ - /opt/pipx \ - /usr/lib/mono \ - /usr/local/julia* \ - /usr/local/lib/android \ - /usr/local/lib/node_modules \ - /usr/local/share/chromium \ - /usr/local/share/powershell \ - /usr/share/swift - - name: Build C++ - run: bash .github/scripts/build-rocm.sh - env: - build_os: ${{ matrix.os }} - build_arch: ${{ matrix.arch }} - rocm_version: ${{ matrix.rocm_version }} - - name: Upload build artifact - uses: actions/upload-artifact@v4 - with: - name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }} - path: output/* - retention-days: 7 + build-shared-libs-rocm: + strategy: + matrix: + os: [ubuntu-22.04] + arch: [x86_64] + rocm_version: + ["6.1.2", "6.2.4", "6.3.2"] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - name: Set up Docker multiarch + uses: docker/setup-qemu-action@v3 + - name: Clean up disk space + run: | + sudo rm -rf \ + /usr/share/dotnet \ + /opt/ghc \ + "/usr/local/share/boost" \ + "$AGENT_TOOLSDIRECTORY" \ + /opt/hostedtoolcache \ + /opt/google/chrome \ + /opt/microsoft/msedge \ + /opt/microsoft/powershell \ + /opt/pipx \ + /usr/lib/mono \ + /usr/local/julia* \ + /usr/local/lib/android \ + /usr/local/lib/node_modules \ + /usr/local/share/chromium \ + /usr/local/share/powershell \ + /usr/share/swift + - name: Build C++ + run: bash .github/scripts/build-rocm.sh + env: + build_os: ${{ matrix.os }} + build_arch: ${{ matrix.arch }} + rocm_version: ${{ matrix.rocm_version }} + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }} + path: output/* + retention-days: 7 build-wheels: needs: @@ -216,10 +216,10 @@ jobs: path: tmp/ pattern: "bdist_wheel_*" merge-multiple: true - + - name: Inspect tmp directory after downloading artifacts run: ls -alFR tmp/ - + - name: Move and rename wheel files with pattern replacement run: | mkdir -p wheels/ @@ -244,7 +244,7 @@ jobs: - name: Inspect wheels directory after renaming files run: ls -alFR wheels/ - + - name: Delete old pre-release (if exists) run: | gh release delete continuous-release_main --cleanup-tag -y || true @@ -258,7 +258,7 @@ jobs: This pre-release contains the latest development wheels for all supported platforms, rebuilt automatically on every commit to the `main` branch. - **How to install:** + **How to install:** Pick the correct command for your platform and run it in your terminal: ENDOFMARKDOWN @@ -273,7 +273,7 @@ jobs: done cat >> body.md << 'ENDOFMARKDOWN' - > **Note:** + > **Note:** > These wheels are updated automatically with every commit to `main` and become available as soon as the [python-package.yml](.github/workflows/python-package.yml) workflow finishes. ENDOFMARKDOWN diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py index fd7b7b9a2..9089d6fc2 100644 --- a/bitsandbytes/backends/cuda/ops.py +++ b/bitsandbytes/backends/cuda/ops.py @@ -8,7 +8,7 @@ from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr from ..._ops import register_kernel -from ...cextension import lib, HIP_ENVIRONMENT +from ...cextension import HIP_ENVIRONMENT, lib @register_kernel("bitsandbytes::int8_linear_matmul", "cuda") @@ -210,12 +210,12 @@ def _get_col_absmax( @register_kernel("bitsandbytes::quantize_blockwise", "cuda") def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]: torch._check_is_size(blocksize) - - if HIP_ENVIRONMENT: - torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128]) - else: + + if HIP_ENVIRONMENT: + torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128]) + else: torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64]) - + torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}") n = A.numel() @@ -269,11 +269,11 @@ def _( def _dequantize_blockwise_impl( A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor ) -> None: - if HIP_ENVIRONMENT: - torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128]) - else: + if HIP_ENVIRONMENT: + torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128]) + else: torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64]) - + torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}") torch._check( dtype in [torch.float16, torch.bfloat16, torch.float32], @@ -303,11 +303,11 @@ def _dequantize_blockwise_impl( def _( A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype ) -> tuple[torch.Tensor, torch.Tensor]: - if HIP_ENVIRONMENT: - torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128]) - else: + if HIP_ENVIRONMENT: + torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128]) + else: torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64]) - + torch._check(quant_type in ["fp4", "nf4"]) torch._check( A.dtype in [torch.bfloat16, torch.float16, torch.float32], @@ -385,11 +385,11 @@ def _dequantize_4bit_impl( dtype: torch.dtype, out: torch.Tensor, ) -> None: - if HIP_ENVIRONMENT: - torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128]) - else: + if HIP_ENVIRONMENT: + torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128]) + else: torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64]) - + torch._check(quant_type in ["fp4", "nf4"]) torch._check( dtype in [torch.bfloat16, torch.float16, torch.float32], diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index 108aa0c9a..5283df93e 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -81,7 +81,7 @@ def get_available_cuda_binary_versions() -> list[str]: lib_pattern = f"libbitsandbytes_{BNB_BACKEND.lower()}*{DYNAMIC_LIBRARY_SUFFIX}" versions = [] for lib in Path(__file__).parent.glob(lib_pattern): - pattern = r"{}(\d+)".format(BNB_BACKEND.lower()) + pattern = rf"{BNB_BACKEND.lower()}(\d+)" match = re.search(pattern, lib.name) if match: ver_code = int(match.group(1)) @@ -199,18 +199,16 @@ def _format_lib_error_message( ) compile_instructions = ( - ( - "COMPILE FROM SOURCE for CPU-only:\n `cmake -DCOMPUTE_BACKEND=cpu -S . && make`\n\n" - ) if not no_cuda_lib_found - else - ( + ("COMPILE FROM SOURCE for CPU-only:\n `cmake -DCOMPUTE_BACKEND=cpu -S . && make`\n\n") + if not no_cuda_lib_found + else ( "You have two options:\n" "1. COMPILE FROM SOURCE (required if no binary exists):\n" " https://huggingface.co/docs/bitsandbytes/main/en/installation#cuda-compile\n" "2. Use BNB_CUDA_VERSION to specify a DIFFERENT CUDA version from the detected one, which is installed on your machine and matching an available pre-compiled version listed above\n\n" - ) if not HIP_ENVIRONMENT - else - ( + ) + if not HIP_ENVIRONMENT + else ( "You can COMPILE FROM SOURCE as mentioned here:\n" " https://huggingface.co/docs/bitsandbytes/main/en/installation?backend=AMD+ROCm#amd-gpu\n" ) diff --git a/bitsandbytes/cuda_specs.py b/bitsandbytes/cuda_specs.py index bbdf457cc..32563a159 100644 --- a/bitsandbytes/cuda_specs.py +++ b/bitsandbytes/cuda_specs.py @@ -1,8 +1,8 @@ import dataclasses +from functools import lru_cache import logging import re import subprocess -from functools import lru_cache from typing import Optional import torch diff --git a/bitsandbytes/diagnostics/cuda.py b/bitsandbytes/diagnostics/cuda.py index b9de27fd7..b9db101ab 100644 --- a/bitsandbytes/diagnostics/cuda.py +++ b/bitsandbytes/diagnostics/cuda.py @@ -33,11 +33,13 @@ } CUDA_RUNTIME_LIB_PATTERNS = ( - "libamdhip64.so*", -) if HIP_ENVIRONMENT else ( - "cudart64*.dll", # Windows - "libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc. - "nvcuda*.dll", # Windows + ("libamdhip64.so*",) + if HIP_ENVIRONMENT + else ( + "cudart64*.dll", # Windows + "libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc. + "nvcuda*.dll", # Windows + ) ) logger = logging.getLogger(__name__) diff --git a/bitsandbytes/diagnostics/main.py b/bitsandbytes/diagnostics/main.py index 8e2bc2a7b..bf31d7978 100644 --- a/bitsandbytes/diagnostics/main.py +++ b/bitsandbytes/diagnostics/main.py @@ -43,7 +43,8 @@ def main(): print(f"{BNB_BACKEND} specs:{cuda_specs}") if not torch.cuda.is_available(): print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:") - if not HIP_ENVIRONMENT: print(f"- {BNB_BACKEND} driver not installed") + if not HIP_ENVIRONMENT: + print(f"- {BNB_BACKEND} driver not installed") print(f"- {BNB_BACKEND} not installed") print(f"- You have multiple conflicting {BNB_BACKEND} libraries") if cuda_specs: diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 03f6c323d..9b7ce2da9 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -15,7 +15,7 @@ from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict -from .cextension import lib, HIP_ENVIRONMENT +from .cextension import HIP_ENVIRONMENT, lib name2qmap = {} @@ -1007,10 +1007,10 @@ def quantize_4bit( - `torch.Tensor`: The quantized tensor with packed 4-bit values. - [`QuantState`]: The state object used to undo the quantization. """ - + if blocksize is None: blocksize = 64 if not HIP_ENVIRONMENT else 128 - + input_shape = A.shape _out, _absmax = torch.ops.bitsandbytes.quantize_4bit.default( @@ -1114,10 +1114,10 @@ def dequantize_4bit( Returns: `torch.Tensor`: The dequantized tensor. """ - + if blocksize is None: blocksize = 64 if not HIP_ENVIRONMENT else 128 - + if quant_state is None: assert absmax is not None and out is not None diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 2383f2c10..a2facac28 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -222,10 +222,10 @@ def __new__( ) -> "Params4bit": if data is None: data = torch.empty(0) - + if blocksize is None: blocksize = 64 if not HIP_ENVIRONMENT else 128 - + self = torch.Tensor._make_subclass(cls, data, requires_grad) self.blocksize = blocksize self.compress_statistics = compress_statistics diff --git a/conflicts.diff b/conflicts.diff new file mode 100644 index 000000000..cab8c6ea7 --- /dev/null +++ b/conflicts.diff @@ -0,0 +1,382 @@ +diff --cc bitsandbytes/cextension.py +index 108aa0c,b112df2..0000000 +--- a/bitsandbytes/cextension.py ++++ b/bitsandbytes/cextension.py +@@@ -28,17 -28,10 +29,15 @@@ def get_cuda_bnb_library_path(cuda_spec + override_value = os.environ.get("BNB_CUDA_VERSION") + if override_value: + library_name = re.sub(r"cuda\d+", f"cuda{override_value}", library_name, count=1) + + if torch.version.hip: + + raise RuntimeError( + + f"BNB_CUDA_VERSION={override_value} detected for ROCm!! \n" + + f"Clear the variable and retry: export BNB_CUDA_VERSION=\n" + + ) + logger.warning( + f"WARNING: BNB_CUDA_VERSION={override_value} environment variable detected; loading {library_name}.\n" +- "This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n" ++ "This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.\n" + "If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n" +- "If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n" +- "For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: BNBNativeLi + return BNBNativeLibrary(dll) + + + +ROCM_GPU_ARCH = get_rocm_gpu_arch() + + + try: +++<<<<<<< HEAD + + if torch.version.hip: + + HIP_ENVIRONMENT, BNB_BACKEND = True, "ROCm" + + else: + + HIP_ENVIRONMENT, BNB_BACKEND = False, "CUDA" + + +++======= ++ # to support Intel CPU/GPU (XPU) backend ++ import intel_extension_for_pytorch as ipex ++ ++ ipex_cpu = ipex if ipex._C._has_cpu() else None ++ ipex_xpu = ipex if ipex._C._has_xpu() else None ++ except BaseException: ++ ipex_cpu = None ++ ipex_xpu = None ++ ++ ++ try: +++>>>>>>> upstream/main + lib = get_native_library() + except Exception as e: + error_msg = str(e) +diff --cc bitsandbytes/diagnostics/cuda.py +index b9de27f,e763ef2..0000000 +--- a/bitsandbytes/diagnostics/cuda.py ++++ b/bitsandbytes/diagnostics/cuda.py +@@@ -5,8 -5,7 +5,12 @@@ from pathlib import Pat + + import torch + +++<<<<<<< HEAD + +from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path + +from bitsandbytes.consts import NONPYTORCH_DOC_URL +++======= ++ from bitsandbytes.cextension import get_cuda_bnb_library_path +++>>>>>>> upstream/main + from bitsandbytes.cuda_specs import CUDASpecs + from bitsandbytes.diagnostics.utils import print_dedented + +@@@ -146,42 -127,8 +134,38 @@@ def _print_cuda_diagnostics(cuda_specs + """, + ) + +- # TODO: +- # (1) CUDA missing cases (no CUDA installed by CUDA driver (nvidia-smi accessible) +- # (2) Multiple CUDA versions installed +- + + -def print_cuda_runtime_diagnostics() -> None: + +def _print_hip_diagnostics(cuda_specs: CUDASpecs) -> None: + + print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}") + + + + binary_path = get_cuda_bnb_library_path(cuda_specs) + + if not binary_path.exists(): + + print_dedented( + + f""" + + Library not found: {binary_path}. + + Maybe you need to compile it from source? If you compiled from source, check that ROCm version + + in PyTorch Settings matches your ROCm install. If not, reinstall PyTorch for your ROCm version + + and rebuild bitsandbytes. + + """, + + ) + + + + hip_major, hip_minor = cuda_specs.cuda_version_tuple + + if (hip_major, hip_minor) < (6, 1): + + print_dedented( + + """ + + WARNING: bitsandbytes is fully supported only from ROCm 6.1. + + """, + + ) + + + + + +def print_diagnostics(cuda_specs: CUDASpecs) -> None: + + if HIP_ENVIRONMENT: + + _print_hip_diagnostics(cuda_specs) + + else: + + _print_cuda_diagnostics(cuda_specs) + + + + + +def _print_cuda_runtime_diagnostics() -> None: + cudart_paths = list(find_cudart_libraries()) + if not cudart_paths: + print("CUDA SETUP: WARNING! CUDA runtime files not found in any environmental path.") +diff --cc bitsandbytes/diagnostics/main.py +index 8e2bc2a,aa4cb30..0000000 +--- a/bitsandbytes/diagnostics/main.py ++++ b/bitsandbytes/diagnostics/main.py +@@@ -3,12 -5,11 +5,20 @@@ import tracebac + + import torch + +++<<<<<<< HEAD + +from bitsandbytes.cextension import BNB_BACKEND, HIP_ENVIRONMENT + +from bitsandbytes.consts import PACKAGE_GITHUB_URL + +from bitsandbytes.cuda_specs import get_cuda_specs + +from bitsandbytes.diagnostics.cuda import ( + + print_diagnostics, + + print_runtime_diagnostics, +++======= ++ from bitsandbytes import __version__ as bnb_version ++ from bitsandbytes.consts import PACKAGE_GITHUB_URL ++ from bitsandbytes.cuda_specs import get_cuda_specs ++ from bitsandbytes.diagnostics.cuda import ( ++ print_cuda_diagnostics, +++>>>>>>> upstream/main + ) + from bitsandbytes.diagnostics.utils import print_dedented, print_header + +@@@ -28,52 -41,77 +50,122 @@@ def sanity_check() + assert p1 != p2 + + ++ def get_package_version(name: str) -> str: ++ try: ++ version = importlib.metadata.version(name) ++ except importlib.metadata.PackageNotFoundError: ++ version = "not found" ++ return version ++ ++ ++ def show_environment(): ++ """Simple utility to print out environment information.""" ++ ++ print(f"Platform: {platform.platform()}") ++ if platform.system() == "Linux": ++ print(f" libc: {'-'.join(platform.libc_ver())}") ++ ++ print(f"Python: {platform.python_version()}") ++ ++ print(f"PyTorch: {torch.__version__}") ++ print(f" CUDA: {torch.version.cuda or 'N/A'}") ++ print(f" HIP: {torch.version.hip or 'N/A'}") ++ print(f" XPU: {getattr(torch.version, 'xpu', 'N/A') or 'N/A'}") ++ ++ print("Related packages:") ++ for pkg in _RELATED_PACKAGES: ++ version = get_package_version(pkg) ++ print(f" {pkg}: {version}") ++ ++ + def main(): +- print_header("") +- print_header("BUG REPORT INFORMATION") ++ print_header(f"bitsandbytes v{bnb_version}") ++ show_environment() + print_header("") + +- print_header("OTHER") + cuda_specs = get_cuda_specs() +++<<<<<<< HEAD + + if HIP_ENVIRONMENT: + + rocm_specs = f" rocm_version_string='{cuda_specs.cuda_version_string}'," + + rocm_specs += f" rocm_version_tuple={cuda_specs.cuda_version_tuple}" + + print(f"{BNB_BACKEND} specs:{rocm_specs}") + + else: + + print(f"{BNB_BACKEND} specs:{cuda_specs}") + + if not torch.cuda.is_available(): + + print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:") + + if not HIP_ENVIRONMENT: print(f"- {BNB_BACKEND} driver not installed") + + print(f"- {BNB_BACKEND} not installed") + + print(f"- You have multiple conflicting {BNB_BACKEND} libraries") + + if cuda_specs: + + print_diagnostics(cuda_specs) + + print_runtime_diagnostics() + + print_header("") + + print_header("DEBUG INFO END") + + print_header("") + + print(f"Checking that the library is importable and {BNB_BACKEND} is callable...") + + try: + + sanity_check() + + print("SUCCESS!") + + print("Installation was successful!") + + return + + except RuntimeError as e: + + if "not available in CPU-only" in str(e): + + print( + + f"WARNING: {__package__} is currently running as CPU-only!\n" + + "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n" + + f"If you think that this is so erroneously,\nplease report an issue!", + + ) + + else: + + raise e + + except Exception: + + traceback.print_exc() + + print_dedented( + + f""" + + Above we output some debug information. + + Please provide this info when creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose + + WARNING: Please be sure to sanitize sensitive info from the output before posting it. + + """, + + ) + + sys.exit(1) +++======= ++ ++ if cuda_specs: ++ print_cuda_diagnostics(cuda_specs) ++ ++ # TODO: There's a lot of noise in this; needs improvement. ++ # print_cuda_runtime_diagnostics() ++ ++ if not torch.cuda.is_available(): ++ print("PyTorch says CUDA is not available. Possible reasons:") ++ print("1. CUDA driver not installed") ++ print("2. Using a CPU-only PyTorch build") ++ print("3. No GPU detected") ++ ++ else: ++ print("Checking that the library is importable and CUDA is callable...") ++ ++ try: ++ sanity_check() ++ print("SUCCESS!") ++ return ++ except RuntimeError as e: ++ if "not available in CPU-only" in str(e): ++ print( ++ f"WARNING: {__package__} is currently running as CPU-only!\n" ++ "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n" ++ f"If you think that this is so erroneously,\nplease report an issue!", ++ ) ++ else: ++ raise e ++ except Exception: ++ traceback.print_exc() ++ ++ print_dedented( ++ f""" ++ Above we output some debug information. ++ Please provide this info when creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose ++ WARNING: Please be sure to sanitize sensitive info from the output before posting it. ++ """, ++ ) ++ sys.exit(1) +++>>>>>>> upstream/main +diff --cc bitsandbytes/functional.py +index 03f6c32,ffb6668..0000000 +mode 100644,100755..100755 +--- a/bitsandbytes/functional.py ++++ b/bitsandbytes/functional.py +@@@ -13,9 -13,9 +13,13 @@@ import torc + from torch import Tensor + from typing_extensions import deprecated + +- from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict ++ from bitsandbytes.utils import _reverse_4bit_compress_format, pack_dict_to_tensor, unpack_tensor_to_dict + +++<<<<<<< HEAD + +from .cextension import lib, HIP_ENVIRONMENT +++======= ++ from .cextension import ipex_cpu, ipex_xpu, lib +++>>>>>>> upstream/main + + name2qmap = {} + +diff --cc bitsandbytes/nn/modules.py +index 2383f2c,ccd842c..0000000 +--- a/bitsandbytes/nn/modules.py ++++ b/bitsandbytes/nn/modules.py +@@@ -11,8 -11,7 +11,12 @@@ from torch import Tensor, device, dtype + import torch.nn.functional as F + + import bitsandbytes as bnb +++<<<<<<< HEAD + +from bitsandbytes.cextension import HIP_ENVIRONMENT + +from bitsandbytes.functional import QuantState +++======= ++ from bitsandbytes.functional import QuantState, _enable_ipex_fusion, ipex_cpu, ipex_xpu +++>>>>>>> upstream/main + from bitsandbytes.optim import GlobalOptimManager + from bitsandbytes.utils import ( + INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING, +diff --cc tests/test_linear4bit.py +index 1b7a772,b5db2eb..0000000 +--- a/tests/test_linear4bit.py ++++ b/tests/test_linear4bit.py +@@@ -7,8 -8,14 +8,19 @@@ import pytes + import torch + + import bitsandbytes as bnb +++<<<<<<< HEAD + +from bitsandbytes.cextension import HIP_ENVIRONMENT + +from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter, torch_load_from_buffer, torch_save_to_buffer +++======= ++ from tests.helpers import ( ++ TRUE_FALSE, ++ describe_dtype, ++ get_available_devices, ++ id_formatter, ++ torch_load_from_buffer, ++ torch_save_to_buffer, ++ ) +++>>>>>>> upstream/main + + storage = { + "uint8": torch.uint8, +@@@ -183,16 -185,10 +189,10 @@@ def test_linear_serialization(device, q + + @pytest.mark.parametrize("device", get_available_devices()) + @pytest.mark.parametrize("quant_type", ["nf4", "fp4"]) + -@pytest.mark.parametrize("blocksize", [64, 128]) + +@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128]) + @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics")) + def test_copy_param(device, quant_type, blocksize, compress_statistics): +- if device == "cpu": +- if compress_statistics: +- pytest.skip("Currently segfaults on CPU") +- if quant_type == "fp4": +- pytest.xfail("FP4 not supported on CPU") +- +- tensor = torch.linspace(1, blocksize, blocksize) ++ tensor = torch.randn(300, 400) + param = bnb.nn.Params4bit( + data=tensor, + quant_type=quant_type, +@@@ -208,16 -204,10 +208,10 @@@ + + @pytest.mark.parametrize("device", get_available_devices()) + @pytest.mark.parametrize("quant_type", ["nf4", "fp4"]) + -@pytest.mark.parametrize("blocksize", [64, 128]) + +@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128]) + @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics")) + def test_deepcopy_param(device, quant_type, blocksize, compress_statistics): +- if device == "cpu": +- if compress_statistics: +- pytest.skip("Currently segfaults on CPU") +- if quant_type == "fp4": +- pytest.xfail("FP4 not supported on CPU") +- +- tensor = torch.linspace(1, blocksize, blocksize) ++ tensor = torch.randn(300, 400) + param = bnb.nn.Params4bit( + data=tensor, + quant_type=quant_type, +@@@ -240,16 -230,10 +234,10 @@@ + + @pytest.mark.parametrize("device", get_available_devices()) + @pytest.mark.parametrize("quant_type", ["nf4", "fp4"]) + -@pytest.mark.parametrize("blocksize", [64, 128]) + +@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128]) + @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics")) + def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics): +- if device == "cpu": +- if compress_statistics: +- pytest.skip("Currently segfaults on CPU") +- if quant_type == "fp4": +- pytest.xfail("FP4 not supported on CPU") +- +- original_tensor = torch.linspace(1, blocksize, blocksize, dtype=torch.float32) ++ original_tensor = torch.randn(300, 400) + original_param = bnb.nn.Params4bit( + data=original_tensor, + quant_type=quant_type, diff --git a/csrc/common_hip.cuh b/csrc/common_hip.cuh index e7fc4eb81..105179535 100644 --- a/csrc/common_hip.cuh +++ b/csrc/common_hip.cuh @@ -1,6 +1,6 @@ #pragma once -#define BNB_WARP_SIZE warpSize +#define BNB_WARP_SIZE warpSize // These are set based on current BNB support for CDNA 2 & RDNA 3. Update as needed for future archs #define BNB_MAX_THREADS_PER_SM 2048 diff --git a/csrc/kernels.hip b/csrc/kernels.hip index 368788f39..56e1d54db 100644 --- a/csrc/kernels.hip +++ b/csrc/kernels.hip @@ -532,7 +532,7 @@ __global__ void kQuantizeBlockwise(float * code, T * __restrict__ const A, float absmax[i / BLOCK_SIZE] = local_abs_max; } __syncthreads(); - + local_abs_max = smem_absmax_value[0]; if(STOCHASTIC) @@ -610,7 +610,7 @@ __global__ void kDequantizeBlockwise(float *code, unsigned char * A, float * abs valid_items_load = min(TILE_SIZE, n - i); valid_items_store = valid_items_load; } - + // Since blocksize will always be a power-of-2, we avoid more expensive // division by the blocksize and instead use a shift operation. // This is equivalent to (i+threadId.x*NUM_PER_TH)/blocksize. @@ -811,7 +811,7 @@ __global__ void kOptimizer32bit2State(T* g, T* p, LoadFloat(temp_storage.loadf).Load(&(state2[i]), s2_vals, valid_items); __syncthreads(); Load(temp_storage.load).Load(&(p[i]), p_vals, valid_items); - + // Load additional state1 data for AdEMAMix // TODO: Make constexpr after updating min compiler if (OPTIMIZER == ADEMAMIX) { @@ -1607,7 +1607,7 @@ kOptimizerStatic8bit2StateBlockwise( unsigned char c1s[N_PER_TH]; unsigned char c2s[N_PER_TH]; unsigned char c3s[N_PER_TH]; - + T g_vals[N_PER_TH]; T p_vals[N_PER_TH]; typedef hipcub::BlockLoad LoadT; @@ -1712,7 +1712,7 @@ kOptimizerStatic8bit2StateBlockwise( new_local_abs_max1 = fmaxf(new_local_abs_max1, fabsf(s1_vals[j])); new_local_abs_max2 = fmaxf(new_local_abs_max2, fabsf(s2_vals[j])); - + if (OPTIMIZER == ADEMAMIX) { new_local_abs_max3 = fmaxf(new_local_abs_max3, fabsf(s3_vals[j])); } @@ -1776,7 +1776,7 @@ kOptimizerStatic8bit2StateBlockwise( } else { p_vals[j] = (T)(((float)p_vals[j]) + ((step_size*(__fdividef(s1_vals[j],(sqrtf(s2_vals[j])+(correction2*eps))))))); } - + if(weight_decay > 0.0f) p_vals[j] = ((float)p_vals[j])*(1.0f-(lr*weight_decay)); } @@ -2148,27 +2148,27 @@ __global__ void kdequant_mm_int32_fp16( int local_values[ITEMS_PER_THREAD]; half local_output[ITEMS_PER_THREAD]; - + float local_rowStats[ITEMS_PER_THREAD]; float local_colStats[ITEMS_PER_THREAD]; float local_biasValue[ITEMS_PER_THREAD]; typedef hipcub::BlockLoad LoadInt32; __shared__ typename LoadInt32::TempStorage loadint32; - + int row_idx, col_idx; - + #pragma unroll ITEMS_PER_THREAD for(int j = 0; j < ITEMS_PER_THREAD; j++) { row_idx = (block_offset + thread_offset + j) / numCols; col_idx = (block_offset + thread_offset + j) % numCols; - + local_colStats[j] = col_idx >= numCols ? 0.0f : colStats[col_idx]; - local_rowStats[j] = row_idx >= numRows ? 0.0f : rowStats[row_idx]; + local_rowStats[j] = row_idx >= numRows ? 0.0f : rowStats[row_idx]; local_biasValue[j] = ((bias == nullptr) || (col_idx >= numCols)) ? 0.0f : __half2float(bias[col_idx]); } - + // Each block loads THREADS * ITEMS_PER_THREAD values from A int valid_items = block_offset + THREADS * ITEMS_PER_THREAD < n_out ? THREADS * ITEMS_PER_THREAD @@ -2188,7 +2188,7 @@ __global__ void kdequant_mm_int32_fp16( if (outIdx < n_out) { out[outIdx] = local_output[j]; } - } + } } #define DENORM 1.0f/127.0f diff --git a/csrc/ops.hip b/csrc/ops.hip index 4d077d19a..eef616d48 100644 --- a/csrc/ops.hip +++ b/csrc/ops.hip @@ -199,10 +199,10 @@ template void optimizerStatic8bit(T* p, T* g, } } -#define BLOCKSIZE_2STATE 256 -#define NUM_2STATE 1 -#define BLOCKSIZE_1STATE 256 -#define NUM_1STATE 1 +#define BLOCKSIZE_2STATE 256 +#define NUM_2STATE 1 +#define BLOCKSIZE_1STATE 256 +#define NUM_1STATE 1 template void optimizerStatic8bitBlockwise( T* p, @@ -443,7 +443,7 @@ static std::string hipError_to_string(const hipError_t ret) } template int igemmlt( - hipblasLtHandle_t ltHandle, + hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py index 1b2ea85db..3d8b688ee 100644 --- a/tests/test_cuda_setup_evaluator.py +++ b/tests/test_cuda_setup_evaluator.py @@ -12,11 +12,13 @@ def cuda120_spec() -> CUDASpecs: cuda_version_tuple=(12, 0), ) + @pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm") def test_get_cuda_bnb_library_path(monkeypatch, cuda120_spec): monkeypatch.delenv("BNB_CUDA_VERSION", raising=False) assert get_cuda_bnb_library_path(cuda120_spec).stem == "libbitsandbytes_cuda120" + @pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm") def test_get_cuda_bnb_library_path_override(monkeypatch, cuda120_spec, caplog): monkeypatch.setenv("BNB_CUDA_VERSION", "110") diff --git a/tests/test_functional.py b/tests/test_functional.py index 5f5ee488c..a2964c733 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -8,8 +8,8 @@ import torch import bitsandbytes as bnb -from bitsandbytes.cextension import HIP_ENVIRONMENT, ROCM_GPU_ARCH from bitsandbytes import functional as F +from bitsandbytes.cextension import HIP_ENVIRONMENT, ROCM_GPU_ARCH from tests.helpers import ( BOOLEAN_TUPLES, TRUE_FALSE, @@ -92,7 +92,10 @@ class Test8BitBlockwiseQuantizeFunctional: @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype) @pytest.mark.parametrize("nested", TRUE_FALSE, ids=id_formatter("nested")) - @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64] if not HIP_ENVIRONMENT else [4096, 2048, 1024, 512, 256, 128] ) + @pytest.mark.parametrize( + "blocksize", + [4096, 2048, 1024, 512, 256, 128, 64] if not HIP_ENVIRONMENT else [4096, 2048, 1024, 512, 256, 128], + ) @pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed")) def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, signed): iters = 100 @@ -796,6 +799,7 @@ def test_coo_int8_vectorwise_quant(self, device, dim1, dim2): A[:, outlier_cols] = 0 torch.testing.assert_close(A * (idx == 0), A2, rtol=0.05, atol=1.5e-2) + @pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet") @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required") class TestSpMMFunctional: @@ -1106,7 +1110,10 @@ class TestQuantize4BitFunctional: @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype) @pytest.mark.parametrize("quant_type", ["fp4", "nf4"]) - @pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096] if not HIP_ENVIRONMENT else [128, 256, 512, 1024, 2048, 4096]) + @pytest.mark.parametrize( + "blocksize", + [64, 128, 256, 512, 1024, 2048, 4096] if not HIP_ENVIRONMENT else [128, 256, 512, 1024, 2048, 4096], + ) def test_4bit_quant(self, device, dtype, quant_type, blocksize): if device == "cpu" and quant_type != "nf4": pytest.xfail("fp4 quantization is not supported on CPU") @@ -1205,7 +1212,7 @@ def test_bench_4bit_dequant(self, quant_type): # torch.matmul(b, a.t()) # torch.cuda.synchronize() # print((time.time()-t0)/iters*1e6) - + @pytest.mark.skipif( HIP_ENVIRONMENT, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" ) diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index 1b7a7722c..60c163477 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -17,6 +17,7 @@ "float32": torch.float32, } + @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("quant_storage", ["uint8", "float16", "bfloat16", "float32"]) @pytest.mark.parametrize("bias", TRUE_FALSE, ids=id_formatter("bias")) diff --git a/tests/test_ops.py b/tests/test_ops.py index a99d080b3..a433a0c4b 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -4,8 +4,8 @@ import torch import bitsandbytes -from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter from bitsandbytes.cextension import HIP_ENVIRONMENT +from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter class TestLLMInt8Ops: From 93768d07b1b753790a784f1472e5b6b1f9fa5c73 Mon Sep 17 00:00:00 2001 From: MISHANMAUYRA Date: Wed, 4 Jun 2025 01:24:09 +0530 Subject: [PATCH 8/8] Remove conflicts.diff --- conflicts.diff | 382 ------------------------------------------------- 1 file changed, 382 deletions(-) delete mode 100644 conflicts.diff diff --git a/conflicts.diff b/conflicts.diff deleted file mode 100644 index cab8c6ea7..000000000 --- a/conflicts.diff +++ /dev/null @@ -1,382 +0,0 @@ -diff --cc bitsandbytes/cextension.py -index 108aa0c,b112df2..0000000 ---- a/bitsandbytes/cextension.py -+++ b/bitsandbytes/cextension.py -@@@ -28,17 -28,10 +29,15 @@@ def get_cuda_bnb_library_path(cuda_spec - override_value = os.environ.get("BNB_CUDA_VERSION") - if override_value: - library_name = re.sub(r"cuda\d+", f"cuda{override_value}", library_name, count=1) - + if torch.version.hip: - + raise RuntimeError( - + f"BNB_CUDA_VERSION={override_value} detected for ROCm!! \n" - + f"Clear the variable and retry: export BNB_CUDA_VERSION=\n" - + ) - logger.warning( - f"WARNING: BNB_CUDA_VERSION={override_value} environment variable detected; loading {library_name}.\n" -- "This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n" -+ "This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.\n" - "If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n" -- "If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n" -- "For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: BNBNativeLi - return BNBNativeLibrary(dll) - - - +ROCM_GPU_ARCH = get_rocm_gpu_arch() - + - try: -++<<<<<<< HEAD - + if torch.version.hip: - + HIP_ENVIRONMENT, BNB_BACKEND = True, "ROCm" - + else: - + HIP_ENVIRONMENT, BNB_BACKEND = False, "CUDA" - + -++======= -+ # to support Intel CPU/GPU (XPU) backend -+ import intel_extension_for_pytorch as ipex -+ -+ ipex_cpu = ipex if ipex._C._has_cpu() else None -+ ipex_xpu = ipex if ipex._C._has_xpu() else None -+ except BaseException: -+ ipex_cpu = None -+ ipex_xpu = None -+ -+ -+ try: -++>>>>>>> upstream/main - lib = get_native_library() - except Exception as e: - error_msg = str(e) -diff --cc bitsandbytes/diagnostics/cuda.py -index b9de27f,e763ef2..0000000 ---- a/bitsandbytes/diagnostics/cuda.py -+++ b/bitsandbytes/diagnostics/cuda.py -@@@ -5,8 -5,7 +5,12 @@@ from pathlib import Pat - - import torch - -++<<<<<<< HEAD - +from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path - +from bitsandbytes.consts import NONPYTORCH_DOC_URL -++======= -+ from bitsandbytes.cextension import get_cuda_bnb_library_path -++>>>>>>> upstream/main - from bitsandbytes.cuda_specs import CUDASpecs - from bitsandbytes.diagnostics.utils import print_dedented - -@@@ -146,42 -127,8 +134,38 @@@ def _print_cuda_diagnostics(cuda_specs - """, - ) - -- # TODO: -- # (1) CUDA missing cases (no CUDA installed by CUDA driver (nvidia-smi accessible) -- # (2) Multiple CUDA versions installed -- - - -def print_cuda_runtime_diagnostics() -> None: - +def _print_hip_diagnostics(cuda_specs: CUDASpecs) -> None: - + print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}") - + - + binary_path = get_cuda_bnb_library_path(cuda_specs) - + if not binary_path.exists(): - + print_dedented( - + f""" - + Library not found: {binary_path}. - + Maybe you need to compile it from source? If you compiled from source, check that ROCm version - + in PyTorch Settings matches your ROCm install. If not, reinstall PyTorch for your ROCm version - + and rebuild bitsandbytes. - + """, - + ) - + - + hip_major, hip_minor = cuda_specs.cuda_version_tuple - + if (hip_major, hip_minor) < (6, 1): - + print_dedented( - + """ - + WARNING: bitsandbytes is fully supported only from ROCm 6.1. - + """, - + ) - + - + - +def print_diagnostics(cuda_specs: CUDASpecs) -> None: - + if HIP_ENVIRONMENT: - + _print_hip_diagnostics(cuda_specs) - + else: - + _print_cuda_diagnostics(cuda_specs) - + - + - +def _print_cuda_runtime_diagnostics() -> None: - cudart_paths = list(find_cudart_libraries()) - if not cudart_paths: - print("CUDA SETUP: WARNING! CUDA runtime files not found in any environmental path.") -diff --cc bitsandbytes/diagnostics/main.py -index 8e2bc2a,aa4cb30..0000000 ---- a/bitsandbytes/diagnostics/main.py -+++ b/bitsandbytes/diagnostics/main.py -@@@ -3,12 -5,11 +5,20 @@@ import tracebac - - import torch - -++<<<<<<< HEAD - +from bitsandbytes.cextension import BNB_BACKEND, HIP_ENVIRONMENT - +from bitsandbytes.consts import PACKAGE_GITHUB_URL - +from bitsandbytes.cuda_specs import get_cuda_specs - +from bitsandbytes.diagnostics.cuda import ( - + print_diagnostics, - + print_runtime_diagnostics, -++======= -+ from bitsandbytes import __version__ as bnb_version -+ from bitsandbytes.consts import PACKAGE_GITHUB_URL -+ from bitsandbytes.cuda_specs import get_cuda_specs -+ from bitsandbytes.diagnostics.cuda import ( -+ print_cuda_diagnostics, -++>>>>>>> upstream/main - ) - from bitsandbytes.diagnostics.utils import print_dedented, print_header - -@@@ -28,52 -41,77 +50,122 @@@ def sanity_check() - assert p1 != p2 - - -+ def get_package_version(name: str) -> str: -+ try: -+ version = importlib.metadata.version(name) -+ except importlib.metadata.PackageNotFoundError: -+ version = "not found" -+ return version -+ -+ -+ def show_environment(): -+ """Simple utility to print out environment information.""" -+ -+ print(f"Platform: {platform.platform()}") -+ if platform.system() == "Linux": -+ print(f" libc: {'-'.join(platform.libc_ver())}") -+ -+ print(f"Python: {platform.python_version()}") -+ -+ print(f"PyTorch: {torch.__version__}") -+ print(f" CUDA: {torch.version.cuda or 'N/A'}") -+ print(f" HIP: {torch.version.hip or 'N/A'}") -+ print(f" XPU: {getattr(torch.version, 'xpu', 'N/A') or 'N/A'}") -+ -+ print("Related packages:") -+ for pkg in _RELATED_PACKAGES: -+ version = get_package_version(pkg) -+ print(f" {pkg}: {version}") -+ -+ - def main(): -- print_header("") -- print_header("BUG REPORT INFORMATION") -+ print_header(f"bitsandbytes v{bnb_version}") -+ show_environment() - print_header("") - -- print_header("OTHER") - cuda_specs = get_cuda_specs() -++<<<<<<< HEAD - + if HIP_ENVIRONMENT: - + rocm_specs = f" rocm_version_string='{cuda_specs.cuda_version_string}'," - + rocm_specs += f" rocm_version_tuple={cuda_specs.cuda_version_tuple}" - + print(f"{BNB_BACKEND} specs:{rocm_specs}") - + else: - + print(f"{BNB_BACKEND} specs:{cuda_specs}") - + if not torch.cuda.is_available(): - + print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:") - + if not HIP_ENVIRONMENT: print(f"- {BNB_BACKEND} driver not installed") - + print(f"- {BNB_BACKEND} not installed") - + print(f"- You have multiple conflicting {BNB_BACKEND} libraries") - + if cuda_specs: - + print_diagnostics(cuda_specs) - + print_runtime_diagnostics() - + print_header("") - + print_header("DEBUG INFO END") - + print_header("") - + print(f"Checking that the library is importable and {BNB_BACKEND} is callable...") - + try: - + sanity_check() - + print("SUCCESS!") - + print("Installation was successful!") - + return - + except RuntimeError as e: - + if "not available in CPU-only" in str(e): - + print( - + f"WARNING: {__package__} is currently running as CPU-only!\n" - + "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n" - + f"If you think that this is so erroneously,\nplease report an issue!", - + ) - + else: - + raise e - + except Exception: - + traceback.print_exc() - + print_dedented( - + f""" - + Above we output some debug information. - + Please provide this info when creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose - + WARNING: Please be sure to sanitize sensitive info from the output before posting it. - + """, - + ) - + sys.exit(1) -++======= -+ -+ if cuda_specs: -+ print_cuda_diagnostics(cuda_specs) -+ -+ # TODO: There's a lot of noise in this; needs improvement. -+ # print_cuda_runtime_diagnostics() -+ -+ if not torch.cuda.is_available(): -+ print("PyTorch says CUDA is not available. Possible reasons:") -+ print("1. CUDA driver not installed") -+ print("2. Using a CPU-only PyTorch build") -+ print("3. No GPU detected") -+ -+ else: -+ print("Checking that the library is importable and CUDA is callable...") -+ -+ try: -+ sanity_check() -+ print("SUCCESS!") -+ return -+ except RuntimeError as e: -+ if "not available in CPU-only" in str(e): -+ print( -+ f"WARNING: {__package__} is currently running as CPU-only!\n" -+ "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n" -+ f"If you think that this is so erroneously,\nplease report an issue!", -+ ) -+ else: -+ raise e -+ except Exception: -+ traceback.print_exc() -+ -+ print_dedented( -+ f""" -+ Above we output some debug information. -+ Please provide this info when creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose -+ WARNING: Please be sure to sanitize sensitive info from the output before posting it. -+ """, -+ ) -+ sys.exit(1) -++>>>>>>> upstream/main -diff --cc bitsandbytes/functional.py -index 03f6c32,ffb6668..0000000 -mode 100644,100755..100755 ---- a/bitsandbytes/functional.py -+++ b/bitsandbytes/functional.py -@@@ -13,9 -13,9 +13,13 @@@ import torc - from torch import Tensor - from typing_extensions import deprecated - -- from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict -+ from bitsandbytes.utils import _reverse_4bit_compress_format, pack_dict_to_tensor, unpack_tensor_to_dict - -++<<<<<<< HEAD - +from .cextension import lib, HIP_ENVIRONMENT -++======= -+ from .cextension import ipex_cpu, ipex_xpu, lib -++>>>>>>> upstream/main - - name2qmap = {} - -diff --cc bitsandbytes/nn/modules.py -index 2383f2c,ccd842c..0000000 ---- a/bitsandbytes/nn/modules.py -+++ b/bitsandbytes/nn/modules.py -@@@ -11,8 -11,7 +11,12 @@@ from torch import Tensor, device, dtype - import torch.nn.functional as F - - import bitsandbytes as bnb -++<<<<<<< HEAD - +from bitsandbytes.cextension import HIP_ENVIRONMENT - +from bitsandbytes.functional import QuantState -++======= -+ from bitsandbytes.functional import QuantState, _enable_ipex_fusion, ipex_cpu, ipex_xpu -++>>>>>>> upstream/main - from bitsandbytes.optim import GlobalOptimManager - from bitsandbytes.utils import ( - INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING, -diff --cc tests/test_linear4bit.py -index 1b7a772,b5db2eb..0000000 ---- a/tests/test_linear4bit.py -+++ b/tests/test_linear4bit.py -@@@ -7,8 -8,14 +8,19 @@@ import pytes - import torch - - import bitsandbytes as bnb -++<<<<<<< HEAD - +from bitsandbytes.cextension import HIP_ENVIRONMENT - +from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter, torch_load_from_buffer, torch_save_to_buffer -++======= -+ from tests.helpers import ( -+ TRUE_FALSE, -+ describe_dtype, -+ get_available_devices, -+ id_formatter, -+ torch_load_from_buffer, -+ torch_save_to_buffer, -+ ) -++>>>>>>> upstream/main - - storage = { - "uint8": torch.uint8, -@@@ -183,16 -185,10 +189,10 @@@ def test_linear_serialization(device, q - - @pytest.mark.parametrize("device", get_available_devices()) - @pytest.mark.parametrize("quant_type", ["nf4", "fp4"]) - -@pytest.mark.parametrize("blocksize", [64, 128]) - +@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128]) - @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics")) - def test_copy_param(device, quant_type, blocksize, compress_statistics): -- if device == "cpu": -- if compress_statistics: -- pytest.skip("Currently segfaults on CPU") -- if quant_type == "fp4": -- pytest.xfail("FP4 not supported on CPU") -- -- tensor = torch.linspace(1, blocksize, blocksize) -+ tensor = torch.randn(300, 400) - param = bnb.nn.Params4bit( - data=tensor, - quant_type=quant_type, -@@@ -208,16 -204,10 +208,10 @@@ - - @pytest.mark.parametrize("device", get_available_devices()) - @pytest.mark.parametrize("quant_type", ["nf4", "fp4"]) - -@pytest.mark.parametrize("blocksize", [64, 128]) - +@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128]) - @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics")) - def test_deepcopy_param(device, quant_type, blocksize, compress_statistics): -- if device == "cpu": -- if compress_statistics: -- pytest.skip("Currently segfaults on CPU") -- if quant_type == "fp4": -- pytest.xfail("FP4 not supported on CPU") -- -- tensor = torch.linspace(1, blocksize, blocksize) -+ tensor = torch.randn(300, 400) - param = bnb.nn.Params4bit( - data=tensor, - quant_type=quant_type, -@@@ -240,16 -230,10 +234,10 @@@ - - @pytest.mark.parametrize("device", get_available_devices()) - @pytest.mark.parametrize("quant_type", ["nf4", "fp4"]) - -@pytest.mark.parametrize("blocksize", [64, 128]) - +@pytest.mark.parametrize("blocksize", [64, 128] if not HIP_ENVIRONMENT else [128]) - @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics")) - def test_params4bit_real_serialization(device, quant_type, blocksize, compress_statistics): -- if device == "cpu": -- if compress_statistics: -- pytest.skip("Currently segfaults on CPU") -- if quant_type == "fp4": -- pytest.xfail("FP4 not supported on CPU") -- -- original_tensor = torch.linspace(1, blocksize, blocksize, dtype=torch.float32) -+ original_tensor = torch.randn(300, 400) - original_param = bnb.nn.Params4bit( - data=original_tensor, - quant_type=quant_type,