Skip to content

Commit eea8518

Browse files
TimDettmers and claude
committed
Merge feature/qutlass-nvfp4-gemm into QLORA-2
Combines NVMe weight streaming pipeline (QLORA-2) with NVFP4 quantization and CUTLASS/hand-written GEMM dispatch (qutlass). Resolved conflicts in: - _ops.py: Appended NVFP4 op definitions after kbit ops - backends/cuda/ops.py: Appended NVFP4 kernel registrations after kbit kernels - functional.py: Appended NVFP4 quantization functions after kbit functions - nn/modules.py: Preserved LinearKbit + prepare_model_for_kbit_training, then LinearNVFP4 class - .gitignore: Merged both additions Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2 parents 7fa3e4e + 70457ac commit eea8518

73 files changed

Lines changed: 21046 additions & 330 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

.github/workflows/test-runner.yml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ on:
2626
gpu_type:
2727
type: string
2828
default: ""
29-
description: "GPU type for CUDA testing: T4, L40S"
29+
description: "GPU type for CUDA testing: T4, A10, L40S"
3030
# cpu_type currently only affects linux x64 CPU testing to select specific CPU architectures
3131
cpu_type:
3232
type: string
@@ -65,11 +65,14 @@ jobs:
6565
T4)
6666
TEST_RUNNER="bandb-aws-g4dn-4xlarge-plus-use1-public-80"
6767
;;
68+
A10)
69+
TEST_RUNNER="bandb-aws-g5-4xlarge-plus-use1-public-80"
70+
;;
6871
L40S)
6972
TEST_RUNNER="bandb-aws-g6e-4xlarge-plus-use1-public-80"
7073
;;
7174
*)
72-
echo "::error::Must specify gpu_type (T4 or L40S) for linux-x64 cuda backend"
75+
echo "::error::Must specify gpu_type (T4, A10, L40S) for linux-x64 cuda backend"
7376
exit 1
7477
;;
7578
esac
@@ -164,7 +167,7 @@ jobs:
164167
run: bash .github/scripts/build-cuda.sh
165168
env:
166169
cuda_version: ${{ inputs.cuda_version }}
167-
cuda_targets: "75;89"
170+
cuda_targets: "75;80;89"
168171

169172
- name: Upload build artifact
170173
uses: actions/upload-artifact@v4

.github/workflows/tests-nightly.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ jobs:
2020
platform: [linux-x64, linux-aarch64, macos, windows]
2121
# default runners don't have AVX-512 support, but icelake does
2222
cpu_type: ["", icelake]
23-
torch_version: ["2.3.1", "2.8.0", "2.9.1"]
23+
torch_version: ["2.3.1", "2.9.1", "2.10.0"]
2424

2525
exclude:
2626
# aarch64 minimum torch version is 2.5.1
@@ -56,7 +56,7 @@ jobs:
5656
matrix:
5757
# Linux x64 cross-product
5858
platform: [linux-x64]
59-
gpu_type: [T4, L40S]
59+
gpu_type: [T4, A10, L40S]
6060
cuda_version: ["11.8.0", "12.6.3", "12.8.1", "13.0.2"]
6161

6262
include:
@@ -65,13 +65,13 @@ jobs:
6565
torch_version: "2.3.1"
6666
pypi_index: "https://download.pytorch.org/whl/cu118"
6767
- cuda_version: "12.6.3"
68-
torch_version: "2.7.1"
68+
torch_version: "2.8.0"
6969
pypi_index: "https://download.pytorch.org/whl/cu126"
7070
- cuda_version: "12.8.1"
71-
torch_version: "2.8.0"
71+
torch_version: "2.9.1"
7272
pypi_index: "https://download.pytorch.org/whl/cu128"
7373
- cuda_version: "13.0.2"
74-
torch_version: "2.9.1"
74+
torch_version: "2.10.0"
7575
pypi_index: "https://download.pytorch.org/whl/cu130"
7676

7777
# Windows CUDA Tests - T4 GPU (CUDA 11.8 only, multiple torch versions)

.github/workflows/tests-pr.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ jobs:
3131
platform: [linux-x64, linux-aarch64, macos]
3232
# default runners don't have AVX-512 support, but icelake does
3333
cpu_type: ["", icelake]
34-
torch_version: ["2.3.1", "2.9.1"]
34+
torch_version: ["2.3.1", "2.10.0"]
3535

3636
exclude:
3737
# aarch64 minimum torch version is 2.5.1
@@ -64,7 +64,7 @@ jobs:
6464
fail-fast: false
6565
matrix:
6666
platform: [linux-x64]
67-
gpu_type: [T4, L40S]
67+
gpu_type: [T4, A10, L40S]
6868
cuda_version: ["11.8.0", "12.8.1", "13.0.2"]
6969

7070
include:
@@ -73,10 +73,10 @@ jobs:
7373
torch_version: "2.3.1"
7474
pypi_index: "https://download.pytorch.org/whl/cu118"
7575
- cuda_version: "12.8.1"
76-
torch_version: "2.8.0"
76+
torch_version: "2.9.1"
7777
pypi_index: "https://download.pytorch.org/whl/cu128"
7878
- cuda_version: "13.0.2"
79-
torch_version: "2.9.1"
79+
torch_version: "2.10.0"
8080
pypi_index: "https://download.pytorch.org/whl/cu130"
8181

8282
# Windows CUDA test - single configuration

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,3 +160,4 @@ cuda-spec.md
160160
cuda-spec-additions.md
161161
spec.md
162162
spec_details.md
163+
agents/*_issues.json

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[submodule "third_party/cutlass"]
2+
path = third_party/cutlass
3+
url = https://github.com/NVIDIA/cutlass.git

CLAUDE.md

Lines changed: 51 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,64 @@
1-
# Coordinating agent work on GitHub issues
1+
# MANDATORY: Use git worktrees for all branch work
22

3-
To analyze open issues, generate prompts, and launch parallel worker agents, follow `agents/coordinator_guide.md`. This uses the GitHub issue tools in `~/git/lab_tools/github/` — see `agents/github_tools_guide.md` for the bitsandbytes-specific reference.
3+
NEVER work on a fix or feature branch inside the main `~/git/bitsandbytes` checkout. Always create a worktree first:
44

5-
# Parallel sessions
5+
```bash
6+
cd ~/git/bitsandbytes
7+
git worktree add ~/git/bnb-fix-<NUMBER> -b fix/issue-<NUMBER>
8+
cd ~/git/bnb-fix-<NUMBER>
9+
```
10+
11+
This keeps the main checkout clean and allows parallel sessions. If you are already inside a worktree directory, you do not need to create another one.
12+
13+
**Before creating a worktree**, check the worktree registry for existing ones — see the Git Worktrees section in `~/.claude/CLAUDE.md`. Bitsandbytes-specific naming conventions: `agents/worktree_guide.md`. General worktree guide: `~/git/lab_tools/worktree_guide.md`.
14+
15+
# MANDATORY: Check for existing PRs before starting work
616

7-
To work on multiple branches at once, use git worktrees:
17+
Before working on any issue, check whether a PR already exists:
818

919
```bash
10-
git worktree add ../bitsandbytes-<branch-name> -b <branch-name>
11-
cd ../bitsandbytes-<branch-name>
12-
claude
20+
gh pr list --search "issue-number OR keyword" --state open
1321
```
1422

15-
Full guide: `agents/worktree_guide.md`
23+
If a PR exists, review and build on it instead of starting from scratch. Do not create duplicate work.
1624

17-
# Testing
25+
# MANDATORY: Run linting before every pull request
1826

19-
Run the test suite with 4 parallel workers (optimal for any machine):
27+
Before pushing a PR branch, you MUST run the full pre-commit suite. CI will reject PRs that fail any check:
2028

2129
```bash
22-
pytest tests/ -v --tb=short -n 4
30+
pre-commit run --all-files
2331
```
2432

25-
Best practices, benchmark data, and known architecture-specific issues: `agents/testing_guide.md`
33+
This runs ruff, ruff format, typos, trailing-whitespace, clang-format, and all other CI lint hooks. Review and commit any changes it makes. Do NOT run only `ruff check` and `ruff format` — those are just 2 of 10 hooks. Full details: `agents/linting_guide.md`
34+
35+
# Testing: only run relevant tests
36+
37+
Do NOT run the full test suite — it takes 10+ minutes. Instead, run only the tests that cover the code you changed:
38+
39+
```bash
40+
pytest tests/test_relevant_file.py -v --tb=short -k "relevant_test_name"
41+
```
42+
43+
The full suite will be run separately. Best practices and known issues: `agents/testing_guide.md`
44+
45+
# Agent Dispatch (the "Dispatcher" role)
46+
47+
To triage open GitHub issues, generate prompt files, and launch parallel worker agents, read `agents/dispatch_guide.md`. If told "you're the Dispatcher" or "please read the Dispatch Guide," that's what this refers to. The dispatch workflow uses the GitHub issue tools in `agents/` — see `agents/github_tools_guide.md` for the bitsandbytes-specific reference.
48+
49+
# Issue maintenance and triage
50+
51+
To identify and close stale, duplicate, or resolved issues: `agents/issue_maintenance_guide.md`. Common closeable patterns (old CUDA setup, Windows pre-support, third-party app issues, etc.) are cataloged in `agents/issue_patterns.md`.
52+
53+
# Pull request review
54+
55+
When tasked with reviewing a pull request, you MUST read these guides before starting the review:
56+
57+
1. `agents/pr_review_guide.md` — The complete review workflow (classification, checklists, verdict format, and posting instructions). This is the primary guide; follow its steps sequentially.
58+
2. `agents/architecture_guide.md` — Codebase architecture and patterns
59+
3. `agents/code_standards.md` — Code quality expectations
60+
4. `agents/api_surface.md` — Public API catalog (for detecting breaking changes)
61+
5. `agents/downstream_integrations.md` — How Transformers, PEFT, Accelerate, TGI, and vLLM depend on bitsandbytes (for assessing downstream impact)
62+
6. `agents/security_guide.md` — Trust model and security checklist (especially for external contributor PRs)
63+
64+
For CUDA kernel changes, also read `agents/kbit_gemm_context.md`. The PR review guide references all of these at the appropriate steps.

CMakeLists.txt

Lines changed: 130 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,38 @@
1010
# Separate by semicolons, i.e. `-DCOMPUTE_CAPABILITY=89;90;100;120`
1111
# Check your compute capability here: https://developer.nvidia.com/cuda-gpus
1212
# - PTXAS_VERBOSE: Pass the `-v` option to the PTX Assembler
13+
# - ROCM_VERSION: Override the ROCm version shortcode used in the output library name.
14+
# Useful when PyTorch was built against a different ROCm version than the
15+
# system install. For example, `-DROCM_VERSION=70` produces
16+
# libbitsandbytes_rocm70.so even if the system has ROCm 7.2.
1317
cmake_minimum_required(VERSION 3.22.1)
1418

19+
# On Windows with HIP backend, auto-detect compilers from ROCM_PATH before project()
20+
if(WIN32 AND COMPUTE_BACKEND STREQUAL "hip")
21+
if(DEFINED ENV{ROCM_PATH})
22+
set(ROCM_PATH $ENV{ROCM_PATH})
23+
endif()
24+
if(ROCM_PATH AND NOT DEFINED CMAKE_CXX_COMPILER)
25+
set(CMAKE_CXX_COMPILER "${ROCM_PATH}/lib/llvm/bin/clang++.exe")
26+
endif()
27+
if(ROCM_PATH AND NOT DEFINED CMAKE_HIP_COMPILER)
28+
set(CMAKE_HIP_COMPILER "${ROCM_PATH}/lib/llvm/bin/clang++.exe")
29+
endif()
30+
# On Windows, the HIP compiler needs explicit paths to find device libraries.
31+
if(ROCM_PATH)
32+
find_path(ROCM_DEVICE_LIB_PATH
33+
NAMES oclc_abi_version_400.bc ocml.bc
34+
PATHS "${ROCM_PATH}/amdgcn/bitcode"
35+
"${ROCM_PATH}/lib/llvm/amdgcn/bitcode"
36+
NO_DEFAULT_PATH
37+
)
38+
set(CMAKE_HIP_FLAGS "--rocm-path=${ROCM_PATH}")
39+
if(ROCM_DEVICE_LIB_PATH)
40+
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} --rocm-device-lib-path=${ROCM_DEVICE_LIB_PATH}")
41+
endif()
42+
endif()
43+
endif()
44+
1545
project(bitsandbytes LANGUAGES CXX)
1646

1747
# If run without specifying a build type, default to using the Release configuration:
@@ -197,20 +227,80 @@ if(BUILD_CUDA)
197227

198228
list(APPEND SRC_FILES ${CUDA_FILES})
199229

230+
# SM_120a NVFP4 GEMM kernel: requires compute_120a for block-scaled MMA
231+
# Only include if 120 or 121 is in the target architectures
232+
# Check both COMPUTE_CAPABILITY (may have been popped) and _LATEST_CAPABILITY
233+
set(_HAS_SM120 FALSE)
234+
foreach(_cap IN LISTS COMPUTE_CAPABILITY)
235+
if(_cap MATCHES "^12[01]$")
236+
set(_HAS_SM120 TRUE)
237+
endif()
238+
endforeach()
239+
if(_LATEST_CAPABILITY MATCHES "^12[01]$")
240+
set(_HAS_SM120 TRUE)
241+
endif()
242+
if(_HAS_SM120)
243+
# Build as separate OBJECT library with its own CUDA_ARCHITECTURES
244+
# to avoid conflict with the global architecture settings
245+
set(_NVFP4_SM120_SOURCES csrc/kernels_nvfp4_sm120.cu)
246+
247+
# CUTLASS-based NVFP4 GEMM (requires CUDA 12.8+)
248+
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8" AND EXISTS "${CMAKE_SOURCE_DIR}/third_party/cutlass/include")
249+
list(APPEND _NVFP4_SM120_SOURCES
250+
csrc/qutlass/gemm_nvfp4_sm120.cu
251+
csrc/qutlass/scale_reorder.cu
252+
csrc/qutlass/fused_quantize_nv.cu
253+
)
254+
set(_HAS_CUTLASS_NVFP4 TRUE)
255+
message(STATUS "CUTLASS NVFP4 SM_120a GEMM + fused quantize enabled")
256+
else()
257+
set(_HAS_CUTLASS_NVFP4 FALSE)
258+
message(STATUS "CUTLASS NVFP4 GEMM disabled (needs CUDA >= 12.8 and third_party/cutlass)")
259+
endif()
260+
261+
add_library(nvfp4_sm120a OBJECT ${_NVFP4_SM120_SOURCES})
262+
set_target_properties(nvfp4_sm120a PROPERTIES
263+
CUDA_ARCHITECTURES "120a"
264+
POSITION_INDEPENDENT_CODE ON
265+
CUDA_SEPARABLE_COMPILATION OFF
266+
)
267+
target_compile_options(nvfp4_sm120a PRIVATE
268+
$<$<COMPILE_LANGUAGE:CUDA>:--use_fast_math>
269+
)
270+
271+
if(_HAS_CUTLASS_NVFP4)
272+
target_include_directories(nvfp4_sm120a PRIVATE
273+
"${CMAKE_SOURCE_DIR}/third_party/cutlass/include"
274+
"${CMAKE_SOURCE_DIR}/third_party/cutlass/tools/util/include"
275+
"${CMAKE_SOURCE_DIR}/csrc/qutlass/include"
276+
)
277+
target_compile_options(nvfp4_sm120a PRIVATE
278+
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
279+
$<$<COMPILE_LANGUAGE:CUDA>:-std=c++17>
280+
$<$<COMPILE_LANGUAGE:CUDA>:-O3>
281+
$<$<COMPILE_LANGUAGE:CUDA>:-DNDEBUG>
282+
$<$<COMPILE_LANGUAGE:CUDA>:-DQUTLASS_DISABLE_PYBIND>
283+
)
284+
endif()
285+
286+
message(STATUS "NVFP4 SM_120a GEMM kernel enabled")
287+
endif()
288+
200289
string(APPEND BNB_OUTPUT_NAME "_cuda${CUDA_VERSION_SHORT}")
201290
add_compile_definitions(BUILD_CUDA)
202291
elseif(BUILD_HIP)
203-
enable_language(HIP)
204-
message(STATUS "HIP Compiler: ${CMAKE_HIP_COMPILER}")
292+
# Set target architectures before enable_language(HIP), which would otherwise
293+
# auto-detect a single GPU and override the defaults.
205294
if(DEFINED BNB_ROCM_ARCH)
206295
set(CMAKE_HIP_ARCHITECTURES ${BNB_ROCM_ARCH})
207-
else()
208-
if (NOT AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
209-
set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100;gfx1101;gfx1150;gfx1151;gfx1200;gfx1201")
210-
elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
211-
set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
212-
endif()
296+
elseif(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
297+
set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
298+
elseif(NOT CMAKE_HIP_ARCHITECTURES)
299+
set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100;gfx1101;gfx1150;gfx1151;gfx1200;gfx1201")
213300
endif()
301+
302+
enable_language(HIP)
303+
message(STATUS "HIP Compiler: ${CMAKE_HIP_COMPILER}")
214304
message(STATUS "HIP Targets: ${CMAKE_HIP_ARCHITECTURES}")
215305

216306
list(APPEND SRC_FILES ${HIP_FILES})
@@ -222,7 +312,15 @@ elseif(BUILD_HIP)
222312
string(REGEX MATCH "[0-9]+\\.[0-9]+" HIP_VERSION "${HIP_CONFIG_VERSION}")
223313
string(REPLACE "." "" HIP_VERSION_SHORT "${HIP_VERSION}")
224314

225-
string(APPEND BNB_OUTPUT_NAME "${HIP_VERSION_SHORT}")
315+
# Expose a cache variable that the user can set to override the ROCm version in the library name
316+
set(ROCM_VERSION "${HIP_VERSION_SHORT}" CACHE STRING "Expected ROCm Version Shortcode")
317+
318+
message(STATUS "ROCm Version: ${HIP_VERSION_SHORT} (from hipconfig)")
319+
if(NOT ROCM_VERSION STREQUAL "${HIP_VERSION_SHORT}")
320+
message(WARNING "Overriding ROCm version in library name: ${HIP_VERSION_SHORT} -> ${ROCM_VERSION}")
321+
endif()
322+
323+
string(APPEND BNB_OUTPUT_NAME "${ROCM_VERSION}")
226324
add_compile_definitions(__HIP_PLATFORM_AMD__)
227325
add_compile_definitions(__HIP_PLATFORM_HCC__)
228326
add_compile_definitions(BUILD_HIP)
@@ -263,6 +361,8 @@ endif()
263361
if(WIN32)
264362
# Export all symbols
265363
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
364+
# Prevent Windows SDK min/max macros from conflicting with std::min/std::max
365+
add_compile_definitions(NOMINMAX)
266366
endif()
267367

268368
if(MSVC)
@@ -274,6 +374,11 @@ add_library(bitsandbytes SHARED ${SRC_FILES})
274374
target_compile_features(bitsandbytes PUBLIC cxx_std_17)
275375
target_include_directories(bitsandbytes PUBLIC csrc)
276376

377+
# Link NVFP4 SM_120a object library if available
378+
if(TARGET nvfp4_sm120a)
379+
target_sources(bitsandbytes PRIVATE $<TARGET_OBJECTS:nvfp4_sm120a>)
380+
endif()
381+
277382
if (BUILD_CPU)
278383
if (OpenMP_CXX_FOUND)
279384
target_link_libraries(bitsandbytes PRIVATE OpenMP::OpenMP_CXX)
@@ -316,10 +421,11 @@ if(BUILD_CUDA)
316421
)
317422
endif()
318423
if(BUILD_HIP)
319-
if(NOT DEFINED ENV{ROCM_PATH})
320-
set(ROCM_PATH /opt/rocm)
321-
else()
424+
# Determine ROCM_PATH from environment variable, fallback to /opt/rocm on Linux
425+
if(DEFINED ENV{ROCM_PATH})
322426
set(ROCM_PATH $ENV{ROCM_PATH})
427+
else()
428+
set(ROCM_PATH /opt/rocm)
323429
endif()
324430
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
325431
macro(find_package_and_print_version PACKAGE_NAME)
@@ -331,14 +437,23 @@ if(BUILD_HIP)
331437
find_package_and_print_version(hipsparse REQUIRED)
332438

333439
## hacky way of excluding hip::amdhip64 (with it linked many tests unexpectedly fail e.g. adam8bit because of inaccuracies)
334-
set_target_properties(hip::host PROPERTIES INTERFACE_LINK_LIBRARIES "")
335-
set_target_properties(hip-lang::host PROPERTIES INTERFACE_LINK_LIBRARIES "")
336-
set(CMAKE_HIP_IMPLICIT_LINK_LIBRARIES "")
440+
## On Windows, we need to link amdhip64 explicitly
441+
if(NOT WIN32)
442+
set_target_properties(hip::host PROPERTIES INTERFACE_LINK_LIBRARIES "")
443+
set_target_properties(hip-lang::host PROPERTIES INTERFACE_LINK_LIBRARIES "")
444+
set(CMAKE_HIP_IMPLICIT_LINK_LIBRARIES "")
445+
endif()
337446

338447
target_include_directories(bitsandbytes PRIVATE ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/include ${ROCM_PATH}/include /include)
339448
target_link_directories(bitsandbytes PRIVATE ${ROCM_PATH}/lib /lib)
340449
target_link_libraries(bitsandbytes PUBLIC roc::hipblas hip::hiprand roc::hipsparse)
341450

451+
# On Windows, rocblas is not pulled in transitively by roc::hipblas
452+
# and is needed because ops_hip.cuh uses rocblas_handle directly.
453+
if(WIN32)
454+
target_link_libraries(bitsandbytes PUBLIC rocblas)
455+
endif()
456+
342457
target_compile_definitions(bitsandbytes PUBLIC BNB_USE_HIP)
343458
set_source_files_properties(${HIP_FILES} PROPERTIES LANGUAGE HIP)
344459
set_target_properties(bitsandbytes PROPERTIES LINKER_LANGUAGE CXX)

NOTICE.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,5 @@
11
The majority of bitsandbytes is licensed under MIT, however portions of the project are available under separate license terms: PyTorch is licensed under the BSD license.
2+
3+
The NVFP4 GEMM kernel in `csrc/qutlass/` is derived from [QuTLASS](https://github.com/IST-DASLab/qutlass) by Roberto L. Castro (IST Austria), licensed under the Apache License 2.0.
4+
5+
[CUTLASS](https://github.com/NVIDIA/cutlass) by NVIDIA is included as a submodule in `third_party/cutlass/`, licensed under the BSD 3-Clause License.

0 commit comments

Comments
 (0)