Skip to content

Commit 201e561

Browse files
TimDettmers and claude
committed
fix: Remove merge artifacts that broke build (duplicate defs, stale instantiations)
After merging feature/qutlass-nvfp4-gemm into QLORA-2 (c25d7af), three files had duplicate/stale code that prevented compilation: - _ops.py: duplicate torch.library.define for dequantize_nvfp4, cutlass_fused_quantize_nvfp4, scale_to_blocked, gemm_nvfp4 - ops.cu: stale template instantiation block with wrong signatures (missing cudaStream_t, wrong absmax template params), duplicate testMMA, dead kbitGroupedScalarGemv launcher (kernel removed in ac7d6ff) - pythonInterface.cpp: missing BUILD_CUDA guard on training kernel bindings (half/__nv_bfloat16 unavailable to host compiler), duplicate cquantize/cdequantize_blockwise wrappers Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1763839 commit 201e561

File tree

3 files changed

+3
-329
lines changed

3 files changed

+3
-329
lines changed

bitsandbytes/_ops.py

Lines changed: 0 additions & 77 deletions
Original file line number | Diff line number | Diff line change
@@ -1783,20 +1783,6 @@ def _(A: torch.Tensor, tensor_scale: Optional[float] = None) -> tuple[torch.Tens
17831783
return packed, block_scales, ts_out
17841784

17851785

1786-
# NVFP4 dequantization
1787-
torch.library.define(
1788-
"bitsandbytes::dequantize_nvfp4",
1789-
"(Tensor packed, Tensor block_scales, float tensor_scale, int numel, ScalarType dtype) -> Tensor",
1790-
)
1791-
1792-
1793-
@register_fake("bitsandbytes::dequantize_nvfp4")
1794-
def _(
1795-
packed: torch.Tensor, block_scales: torch.Tensor, tensor_scale: float, numel: int, dtype: torch.dtype
1796-
) -> torch.Tensor:
1797-
return torch.empty(numel, dtype=dtype, device=packed.device)
1798-
1799-
18001786
# NVFP4 Hadamard rotation (in-place)
18011787
torch.library.define(
18021788
"bitsandbytes::hadamard_rotate_nvfp4",
@@ -1825,66 +1811,3 @@ def _(A: torch.Tensor, tensor_scale: Optional[float] = None) -> tuple[torch.Tens
18251811
block_scales = torch.empty(n // 16, dtype=torch.uint8, device=A.device)
18261812
ts_out = torch.empty(1, dtype=torch.float32, device=A.device)
18271813
return packed, block_scales, ts_out
1828-
1829-
1830-
# CUTLASS-based fused quantize for NVFP4 (SM_120+)
1831-
# Uses QuTLASS GEMM-as-quantize approach with always-on randomized Hadamard
1832-
# rotation. The rotation is free (baked into the GEMM B operand) and improves
1833-
# quantization quality by spreading outliers across blocks.
1834-
torch.library.define(
1835-
"bitsandbytes::cutlass_fused_quantize_nvfp4",
1836-
"(Tensor A, float tensor_scale) -> (Tensor, Tensor, Tensor)",
1837-
)
1838-
1839-
1840-
@register_fake("bitsandbytes::cutlass_fused_quantize_nvfp4")
1841-
def _(
1842-
A: torch.Tensor,
1843-
tensor_scale: float,
1844-
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
1845-
n = A.numel()
1846-
torch._check(n % 16 == 0, lambda: f"NVFP4 requires numel divisible by 16, got {n}")
1847-
packed = torch.empty(n // 2, dtype=torch.uint8, device=A.device)
1848-
block_scales = torch.empty(n // 16, dtype=torch.uint8, device=A.device)
1849-
ts_out = torch.empty(1, dtype=torch.float32, device=A.device)
1850-
return packed, block_scales, ts_out
1851-
1852-
1853-
# Scale reordering for CUTLASS block-scaled GEMM
1854-
torch.library.define(
1855-
"bitsandbytes::scale_to_blocked",
1856-
"(Tensor scales, int H, int W) -> Tensor",
1857-
)
1858-
1859-
1860-
@register_fake("bitsandbytes::scale_to_blocked")
1861-
def _(scales: torch.Tensor, H: int, W: int) -> torch.Tensor:
1862-
n_row_blocks = (H + 127) // 128
1863-
n_col_blocks = (W + 3) // 4
1864-
out_size = n_row_blocks * n_col_blocks * 128 * 4
1865-
return torch.empty(out_size, dtype=torch.uint8, device=scales.device)
1866-
1867-
1868-
# NVFP4 GEMM (A @ B^T with block-scaled FP4 inputs)
1869-
torch.library.define(
1870-
"bitsandbytes::gemm_nvfp4",
1871-
"(Tensor A_packed, Tensor B_packed, Tensor A_scales, Tensor B_scales, "
1872-
"float A_tensor_scale, float B_tensor_scale, int M, int N, int K) -> Tensor",
1873-
)
1874-
1875-
1876-
@register_fake("bitsandbytes::gemm_nvfp4")
1877-
def _(
1878-
A_packed: torch.Tensor,
1879-
B_packed: torch.Tensor,
1880-
A_scales: torch.Tensor,
1881-
B_scales: torch.Tensor,
1882-
A_tensor_scale: float,
1883-
B_tensor_scale: float,
1884-
M: int,
1885-
N: int,
1886-
K: int,
1887-
) -> torch.Tensor:
1888-
torch._check_is_size(M)
1889-
torch._check_is_size(N)
1890-
torch._check_is_size(K)

csrc/ops.cu

Lines changed: 2 additions & 216 deletions
Original file line number | Diff line number | Diff line change
@@ -5761,222 +5761,8 @@ INSTANTIATE_VQ_SCALAR_GEMV_F32(3, 8)
57615761
INSTANTIATE_VQ_SCALAR_GEMV_F32(3, 10)
57625762
INSTANTIATE_VQ_SCALAR_GEMV_F32(4, 8)
57635763

5764-
// ============================================================================
5765-
// Training Kernels (from QLORA-2 branch)
5766-
// ============================================================================
5767-
5768-
}
5769-
}
5770-
5771-
// ---- Grouped scalar GEMV launcher ----
5772-
template <int K, typename scalar_t>
5773-
void kbitGroupedScalarGemv(
5774-
const scalar_t* A_concat, const unsigned int* B_packed_all, const unsigned char* B_absmax_all,
5775-
const float* codebook, scalar_t* C_concat, const int* expert_offsets, int K_dim, int N, int num_experts
5776-
) {
5777-
constexpr int COLS_PER_BLOCK = 4;
5778-
constexpr int BLOCK_SIZE = 128;
5779-
int n_groups = (N + COLS_PER_BLOCK - 1) / COLS_PER_BLOCK;
5780-
dim3 grid(n_groups, num_experts);
5781-
5782-
kbit_grouped_scalar_gemv<K, 4, scalar_t><<<grid, BLOCK_SIZE>>>(
5783-
A_concat, B_packed_all, B_absmax_all, codebook, C_concat, expert_offsets, K_dim, N, num_experts
5784-
);
5785-
CUDA_CHECK_RETURN(cudaPeekAtLastError());
5786-
}
5787-
5788-
// ---- Debug: Simple MMA test kernel ----
5789-
// Takes fp16 A[16,16] and fp16 B[16,8] (B stored row-major), outputs fp32 C[16,8].
5790-
__global__ void test_mma_kernel(const half* __restrict__ A, const half* __restrict__ B, float* __restrict__ C) {
5791-
int lane_id = threadIdx.x % 32;
5792-
int gid = lane_id / 4;
5793-
int tid = lane_id % 4;
5794-
5795-
// Load A fragment: A is [16,16] row-major
5796-
// m16n8k16 register order (from Turing m16n8k8 decomposition):
5797-
// a[0]: row_lo (gid), k_lo (tid*2..tid*2+1)
5798-
// a[1]: row_hi (gid+8), k_lo (tid*2..tid*2+1)
5799-
// a[2]: row_lo (gid), k_hi (tid*2+8..tid*2+9)
5800-
// a[3]: row_hi (gid+8), k_hi (tid*2+8..tid*2+9)
5801-
uint32_t frag_a[4];
5802-
{
5803-
half2 h_rlo_klo = __halves2half2(A[gid * 16 + tid * 2], A[gid * 16 + tid * 2 + 1]);
5804-
half2 h_rhi_klo = __halves2half2(A[(gid + 8) * 16 + tid * 2], A[(gid + 8) * 16 + tid * 2 + 1]);
5805-
half2 h_rlo_khi = __halves2half2(A[gid * 16 + tid * 2 + 8], A[gid * 16 + tid * 2 + 9]);
5806-
half2 h_rhi_khi = __halves2half2(A[(gid + 8) * 16 + tid * 2 + 8], A[(gid + 8) * 16 + tid * 2 + 9]);
5807-
frag_a[0] = *reinterpret_cast<uint32_t*>(&h_rlo_klo);
5808-
frag_a[1] = *reinterpret_cast<uint32_t*>(&h_rhi_klo);
5809-
frag_a[2] = *reinterpret_cast<uint32_t*>(&h_rlo_khi);
5810-
frag_a[3] = *reinterpret_cast<uint32_t*>(&h_rhi_khi);
5811-
}
5812-
5813-
// Load B fragment: B is [16,8] row-major. MMA B is col-major, so B_col[k,n] = B_row[k,n].
5814-
uint32_t frag_b[2];
5815-
{
5816-
half2 b0 = __halves2half2(B[(tid * 2) * 8 + gid], B[(tid * 2 + 1) * 8 + gid]);
5817-
half2 b1 = __halves2half2(B[(tid * 2 + 8) * 8 + gid], B[(tid * 2 + 9) * 8 + gid]);
5818-
frag_b[0] = *reinterpret_cast<uint32_t*>(&b0);
5819-
frag_b[1] = *reinterpret_cast<uint32_t*>(&b1);
5820-
}
5821-
5822-
float c[4] = {0, 0, 0, 0};
5823-
asm volatile("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
5824-
"{%0, %1, %2, %3}, "
5825-
"{%4, %5, %6, %7}, "
5826-
"{%8, %9}, "
5827-
"{%10, %11, %12, %13};\n"
5828-
: "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3])
5829-
: "r"(frag_a[0]), "r"(frag_a[1]), "r"(frag_a[2]), "r"(frag_a[3]), "r"(frag_b[0]), "r"(frag_b[1]),
5830-
"f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
5831-
5832-
// Write C[16,8] row-major
5833-
C[gid * 8 + tid * 2] = c[0];
5834-
C[gid * 8 + tid * 2 + 1] = c[1];
5835-
C[(gid + 8) * 8 + tid * 2] = c[2];
5836-
C[(gid + 8) * 8 + tid * 2 + 1] = c[3];
5837-
}
5838-
5839-
void testMMA(const half* A, const half* B, float* C) {
5840-
test_mma_kernel<<<1, 32>>>(A, B, C);
5841-
CUDA_CHECK_RETURN(cudaPeekAtLastError());
5842-
}
5843-
5844-
// ---- Template instantiations ----
5845-
5846-
#define INSTANTIATE_KBIT_QUANT(T, K) \
5847-
template void quantizeBlockwise_kbit<T, K>(const float*, const T*, float*, unsigned int*, int);
5848-
5849-
INSTANTIATE_KBIT_QUANT(half, 2)
5850-
INSTANTIATE_KBIT_QUANT(half, 3)
5851-
INSTANTIATE_KBIT_QUANT(half, 4)
5852-
INSTANTIATE_KBIT_QUANT(half, 5)
5853-
INSTANTIATE_KBIT_QUANT(__nv_bfloat16, 2)
5854-
INSTANTIATE_KBIT_QUANT(__nv_bfloat16, 3)
5855-
INSTANTIATE_KBIT_QUANT(__nv_bfloat16, 4)
5856-
INSTANTIATE_KBIT_QUANT(__nv_bfloat16, 5)
5857-
INSTANTIATE_KBIT_QUANT(float, 2)
5858-
INSTANTIATE_KBIT_QUANT(float, 3)
5859-
INSTANTIATE_KBIT_QUANT(float, 4)
5860-
INSTANTIATE_KBIT_QUANT(float, 5)
5861-
5862-
// Dequant instantiations: all output types × absmax types × K values
5863-
#define INSTANTIATE_KBIT_DEQUANT(T, K, ABSMAX_T) \
5864-
template void dequantizeBlockwise_kbit<T, K, ABSMAX_T>( \
5865-
const unsigned int*, const float*, const ABSMAX_T*, T*, int, cudaStream_t \
5866-
);
5867-
5868-
// uint8 E4M4 absmax (default)
5869-
INSTANTIATE_KBIT_DEQUANT(half, 2, unsigned char)
5870-
INSTANTIATE_KBIT_DEQUANT(half, 3, unsigned char)
5871-
INSTANTIATE_KBIT_DEQUANT(half, 4, unsigned char)
5872-
INSTANTIATE_KBIT_DEQUANT(half, 5, unsigned char)
5873-
INSTANTIATE_KBIT_DEQUANT(__nv_bfloat16, 2, unsigned char)
5874-
INSTANTIATE_KBIT_DEQUANT(__nv_bfloat16, 3, unsigned char)
5875-
INSTANTIATE_KBIT_DEQUANT(__nv_bfloat16, 4, unsigned char)
5876-
INSTANTIATE_KBIT_DEQUANT(__nv_bfloat16, 5, unsigned char)
5877-
INSTANTIATE_KBIT_DEQUANT(float, 2, unsigned char)
5878-
INSTANTIATE_KBIT_DEQUANT(float, 3, unsigned char)
5879-
INSTANTIATE_KBIT_DEQUANT(float, 4, unsigned char)
5880-
INSTANTIATE_KBIT_DEQUANT(float, 5, unsigned char)
5881-
5882-
// fp16 absmax (option)
5883-
INSTANTIATE_KBIT_DEQUANT(half, 2, half)
5884-
INSTANTIATE_KBIT_DEQUANT(half, 3, half)
5885-
INSTANTIATE_KBIT_DEQUANT(half, 4, half)
5886-
INSTANTIATE_KBIT_DEQUANT(half, 5, half)
5887-
INSTANTIATE_KBIT_DEQUANT(__nv_bfloat16, 2, half)
5888-
INSTANTIATE_KBIT_DEQUANT(__nv_bfloat16, 3, half)
5889-
INSTANTIATE_KBIT_DEQUANT(__nv_bfloat16, 4, half)
5890-
INSTANTIATE_KBIT_DEQUANT(__nv_bfloat16, 5, half)
5891-
INSTANTIATE_KBIT_DEQUANT(float, 2, half)
5892-
INSTANTIATE_KBIT_DEQUANT(float, 3, half)
5893-
INSTANTIATE_KBIT_DEQUANT(float, 4, half)
5894-
INSTANTIATE_KBIT_DEQUANT(float, 5, half)
5895-
5896-
// Repack instantiations: one per K value
5897-
#define INSTANTIATE_KBIT_REPACK(K) \
5898-
template void repackKbit<K>(const unsigned int*, const float*, unsigned int*, unsigned char*, int, int);
5899-
5900-
INSTANTIATE_KBIT_REPACK(2)
5901-
INSTANTIATE_KBIT_REPACK(3)
5902-
INSTANTIATE_KBIT_REPACK(4)
5903-
INSTANTIATE_KBIT_REPACK(5)
5904-
5905-
// GEMM instantiations: one per K value (fp16 only)
5906-
#define INSTANTIATE_KBIT_GEMM(K) \
5907-
template void kbitGemmMinimal<K>( \
5908-
const half*, const unsigned int*, const unsigned char*, const float*, half*, int, int, int \
5909-
); \
5910-
template void kbitGemmPipelined<K>( \
5911-
const half*, const unsigned int*, const unsigned char*, const float*, half*, int, int, int \
5912-
); \
5913-
template void kbitGemmSplitK<K>( \
5914-
const half*, const unsigned int*, const unsigned char*, const float*, half*, float*, int*, int, int, int, int \
5915-
);
5916-
5917-
INSTANTIATE_KBIT_GEMM(2)
5918-
INSTANTIATE_KBIT_GEMM(3)
5919-
INSTANTIATE_KBIT_GEMM(4)
5920-
INSTANTIATE_KBIT_GEMM(5)
5921-
5922-
// Production kernel instantiations (fp16 and bf16)
5923-
#define INSTANTIATE_KBIT_GEMM_PROD(K) \
5924-
template void kbitGemmProd<K, half>( \
5925-
const half*, const unsigned int*, const unsigned char*, const float*, half*, float*, int*, int, int, int, int \
5926-
); \
5927-
template void kbitGemmProd<K, __nv_bfloat16>( \
5928-
const __nv_bfloat16*, const unsigned int*, const unsigned char*, const float*, __nv_bfloat16*, float*, int*, \
5929-
int, int, int, int \
5930-
);
5931-
5932-
INSTANTIATE_KBIT_GEMM_PROD(2)
5933-
INSTANTIATE_KBIT_GEMM_PROD(3)
5934-
INSTANTIATE_KBIT_GEMM_PROD(4)
5935-
INSTANTIATE_KBIT_GEMM_PROD(5)
5936-
5937-
// Grouped expert GEMM instantiations (fp16 and bf16)
5938-
#define INSTANTIATE_KBIT_GROUPED_GEMM_PROD(K) \
5939-
template void kbitGroupedGemmProd<K, half>( \
5940-
const half*, const unsigned int*, const unsigned char*, const float*, half*, const int*, int, int, int \
5941-
); \
5942-
template void kbitGroupedGemmProd<K, __nv_bfloat16>( \
5943-
const __nv_bfloat16*, const unsigned int*, const unsigned char*, const float*, __nv_bfloat16*, const int*, \
5944-
int, int, int \
5945-
);
5946-
5947-
INSTANTIATE_KBIT_GROUPED_GEMM_PROD(2)
5948-
INSTANTIATE_KBIT_GROUPED_GEMM_PROD(3)
5949-
INSTANTIATE_KBIT_GROUPED_GEMM_PROD(4)
5950-
INSTANTIATE_KBIT_GROUPED_GEMM_PROD(5)
5951-
5952-
// Scalar GEMV instantiations (fp16 and bf16) — flat layout, float32 absmax, C=1
5953-
#define INSTANTIATE_KBIT_SCALAR_GEMV(K) \
5954-
template void kbitScalarGemv<K, half>( \
5955-
const half*, const unsigned int*, const float*, const float*, half*, int, int, int \
5956-
); \
5957-
template void kbitScalarGemv<K, __nv_bfloat16>( \
5958-
const __nv_bfloat16*, const unsigned int*, const float*, const float*, __nv_bfloat16*, int, int, int \
5959-
);
5960-
5961-
INSTANTIATE_KBIT_SCALAR_GEMV(2)
5962-
INSTANTIATE_KBIT_SCALAR_GEMV(3)
5963-
INSTANTIATE_KBIT_SCALAR_GEMV(4)
5964-
INSTANTIATE_KBIT_SCALAR_GEMV(5)
5965-
5966-
// Grouped scalar GEMV instantiations (fp16 and bf16)
5967-
#define INSTANTIATE_KBIT_GROUPED_SCALAR_GEMV(K) \
5968-
template void kbitGroupedScalarGemv<K, half>( \
5969-
const half*, const unsigned int*, const unsigned char*, const float*, half*, const int*, int, int, int \
5970-
); \
5971-
template void kbitGroupedScalarGemv<K, __nv_bfloat16>( \
5972-
const __nv_bfloat16*, const unsigned int*, const unsigned char*, const float*, __nv_bfloat16*, const int*, \
5973-
int, int, int \
5974-
);
5975-
5976-
INSTANTIATE_KBIT_GROUPED_SCALAR_GEMV(2)
5977-
INSTANTIATE_KBIT_GROUPED_SCALAR_GEMV(3)
5978-
INSTANTIATE_KBIT_GROUPED_SCALAR_GEMV(4)
5979-
INSTANTIATE_KBIT_GROUPED_SCALAR_GEMV(5)
5764+
// NOTE: kbitGroupedScalarGemv was removed (grouped MMA covers all MoE shapes).
5765+
// See commit ac7d6ff.
59805766

59815767
// ============================================================================
59825768
// Training Kernels: SwiGLU, RMSNorm, RoPE

csrc/pythonInterface.cpp

Lines changed: 1 addition & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -2569,6 +2569,7 @@ void chadamard_rotate_full_bf16(
25692569
#endif
25702570
}
25712571

2572+
#if BUILD_CUDA || BUILD_HIP
25722573
// ============================================================================
25732574
// Training Kernel Bindings (from QLORA-2 branch)
25742575
// ============================================================================
@@ -2675,42 +2676,6 @@ void ccross_entropy_backward_bf16(
26752676

26762677
extern "C" {
26772678
#if BUILD_CUDA || BUILD_HIP
2678-
void cquantize(float* code, float* A, unsigned char* out, int n) { quantize(code, A, out, n); }
2679-
2680-
void cdequantize(float* code, unsigned char* A, float* out, int n, cudaStream_t stream) {
2681-
dequantize(code, A, out, n, stream);
2682-
}
2683-
2684-
void cdequantize_blockwise_fp16_fp4(
2685-
float* code, unsigned char* A, float* absmax, half* out, int blocksize, const int n, cudaStream_t stream
2686-
) {
2687-
dequantizeBlockwise_fp16_fp4(code, A, absmax, out, blocksize, n, stream);
2688-
}
2689-
2690-
void cdequantize_blockwise_fp16(
2691-
float* code, unsigned char* A, float* absmax, half* out, int blocksize, const int n, cudaStream_t stream
2692-
) {
2693-
dequantizeBlockwise_fp16(code, A, absmax, out, blocksize, n, stream);
2694-
}
2695-
2696-
void cdequantize_blockwise_fp16_nf4(
2697-
float* code, unsigned char* A, float* absmax, half* out, int blocksize, const int n, cudaStream_t stream
2698-
) {
2699-
dequantizeBlockwise_fp16_nf4(code, A, absmax, out, blocksize, n, stream);
2700-
}
2701-
2702-
void cquantize_blockwise_fp16(float* code, half* A, float* absmax, unsigned char* out, int blocksize, const int n) {
2703-
quantizeBlockwise_fp16(code, A, absmax, out, blocksize, n);
2704-
}
2705-
2706-
void cquantize_blockwise_fp16_fp4(float* code, half* A, float* absmax, unsigned char* out, int blocksize, const int n) {
2707-
quantizeBlockwise_fp16_fp4(code, A, absmax, out, blocksize, n);
2708-
}
2709-
2710-
void cquantize_blockwise_fp16_nf4(float* code, half* A, float* absmax, unsigned char* out, int blocksize, const int n) {
2711-
quantizeBlockwise_fp16_nf4(code, A, absmax, out, blocksize, n);
2712-
}
2713-
27142679
// Training kernel extern C wrappers
27152680
void cswiglu_forward_fp16_c(const half* gate, const half* up, half* out, int n) {
27162681
cswiglu_forward_fp16(gate, up, out, n);

0 commit comments

Comments
 (0)