second kernel

MekkCyber · MekkCyber · commit 104e6a8dfb2a · 2025-12-06T20:24:46.000+01:00
diff --git a/bitsandbytes/backends/mps/ops.py b/bitsandbytes/backends/mps/ops.py
@@ -131,6 +131,7 @@ def _dequantize_4bit_native(
         ct.c_int32(blocksize),
         ct.c_int32(out.numel()),
     )
+
     return True
 
 
@@ -163,7 +164,7 @@ def _(
     out = torch.empty(shape, dtype=dtype, device=A.device)
     if _dequantize_4bit_native(A, absmax, blocksize, quant_type, dtype, out):
         return out
-    return _dequantize_4bit_impl(A, absmax, blocksize, quant_type, shape, dtype)
+    return _dequantize_4bit_impl(A, absmax, blocksize, quant_type, shape, dtype)  # fallback when the native path returns a falsy status
 
 
 @register_kernel("bitsandbytes::dequantize_4bit.out", "mps")
@@ -182,7 +183,6 @@ def _(
     torch._check(out.shape == tuple(shape), lambda: f"Expected out.shape == {tuple(shape)}, got {out.shape}")
     torch._check(out.dtype == dtype, lambda: f"Expected out.dtype == {dtype}, got {out.dtype}")
 
-    if not _dequantize_4bit_native(A, absmax, blocksize, quant_type, dtype, out):
-        result = _dequantize_4bit_impl(A, absmax, blocksize, quant_type, shape, dtype)
-        out.copy_(result)
-
+    if not _dequantize_4bit_native(A, absmax, blocksize, quant_type, dtype, out):
+        # Native path returned a falsy status; fill `out` via the reference implementation instead.
+        out.copy_(_dequantize_4bit_impl(A, absmax, blocksize, quant_type, shape, dtype))
diff --git a/bitsandbytes/test_bnb_mac.py b/bitsandbytes/test_bnb_mac.py
diff --git a/csrc/mps_kernels.metal b/csrc/mps_kernels.metal
@@ -85,30 +85,46 @@ inline void dequantize_block(
     uint n,
     uint blocksize,
     uint block_index,
-    constant float* code_table
+    uint thread_idx,
+    uint threadgroup_size,
+    constant float* code_table,
+    threadgroup float& shared_scale
 ) {
-    uint start = block_index * blocksize;
-    if (start >= n) {
+    uint block_start = block_index * blocksize;
+    if (block_start >= n) {
         return;
     }
+    uint block_end = min(block_start + blocksize, n);
+    uint pairs_in_block = (block_end - block_start + 1) >> 1;
 
-    uint end = min(start + blocksize, n);
-    float scale = absmax[block_index];
-    if (scale == 0.0f) {
-        for (uint i = start; i < end; ++i) {
-          output[i] = scalar_t(0.0f);
-        }
-        return;
+    if (thread_idx == 0) {
+        shared_scale = absmax[block_index];
     }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    float scale = shared_scale;
+
+    for (uint pair = thread_idx; pair < pairs_in_block; pair += threadgroup_size) {
+        uint value_index0 = block_start + pair * 2;
+        if (value_index0 >= block_end) {
+            break;
+        }
+
+        uint byte_index0 = value_index0 >> 1;
+        uchar byte_val0 = packed[byte_index0];
+        bool upper0 = ((value_index0 & 1) == 0);
+        uchar nibble0 = upper0 ? ((byte_val0 >> 4) & 0xF) : (byte_val0 & 0xF);
+        float decoded0 = code_table[nibble0] * scale;
+        output[value_index0] = scalar_t(decoded0);
 
-    uint base_byte = start >> 1;
-    for (uint offset = 0; offset < end - start; ++offset) {
-        uint global_index = start + offset;
-        uint byte_index = base_byte + (offset >> 1);
-        uchar byte_val = packed[byte_index];
-        uchar nibble = (offset & 1) == 0 ? (byte_val >> 4) & 0xF : byte_val & 0xF;
-        float decoded = code_table[nibble] * scale;
-        output[global_index] = scalar_t(decoded);
+        uint value_index1 = value_index0 + 1;
+        if (value_index1 < block_end) {
+            uint byte_index1 = value_index1 >> 1;
+            uchar byte_val1 = (byte_index1 == byte_index0) ? byte_val0 : packed[byte_index1];
+            bool upper1 = ((value_index1 & 1) == 0);
+            uchar nibble1 = upper1 ? ((byte_val1 >> 4) & 0xF) : (byte_val1 & 0xF);
+            float decoded1 = code_table[nibble1] * scale;
+            output[value_index1] = scalar_t(decoded1);
+        }
     }
 }
 
@@ -183,13 +199,15 @@ kernel void dequantize_4bit_fp16_fp4(
     constant uint& n [[buffer(3)]],
     constant uint& blocksize [[buffer(4)]],
     constant uint& blocks [[buffer(5)]],
-    uint gid [[thread_position_in_grid]],
-    
+    uint tgid [[threadgroup_position_in_grid]],
+    uint tid [[thread_index_in_threadgroup]],
+    uint threadgroup_size [[threads_per_threadgroup]]
 ) {
-    if (gid >= blocks) {
+    if (tgid >= blocks) {
         return;
     }
-    dequantize_block(packed, absmax, output, n, blocksize, gid, FP4_CODE);
+    threadgroup float shared_scale;
+    dequantize_block(packed, absmax, output, n, blocksize, tgid, tid, threadgroup_size, FP4_CODE, shared_scale);
 }
 
 kernel void dequantize_4bit_fp16_nf4(
@@ -199,12 +217,15 @@ kernel void dequantize_4bit_fp16_nf4(
     constant uint& n [[buffer(3)]],
     constant uint& blocksize [[buffer(4)]],
     constant uint& blocks [[buffer(5)]],
-    uint gid [[thread_position_in_grid]]
+    uint tgid [[threadgroup_position_in_grid]],
+    uint tid [[thread_index_in_threadgroup]],
+    uint threadgroup_size [[threads_per_threadgroup]]
 ) {
-    if (gid >= blocks) {
+    if (tgid >= blocks) {
         return;
     }
-    dequantize_block(packed, absmax, output, n, blocksize, gid, NF4_CODE);
+    threadgroup float shared_scale;
+    dequantize_block(packed, absmax, output, n, blocksize, tgid, tid, threadgroup_size, NF4_CODE, shared_scale);
 }
 
 kernel void dequantize_4bit_fp32_fp4(
@@ -214,12 +235,15 @@ kernel void dequantize_4bit_fp32_fp4(
     constant uint& n [[buffer(3)]],
     constant uint& blocksize [[buffer(4)]],
     constant uint& blocks [[buffer(5)]],
-    uint gid [[thread_position_in_grid]]
+    uint tgid [[threadgroup_position_in_grid]],
+    uint tid [[thread_index_in_threadgroup]],
+    uint threadgroup_size [[threads_per_threadgroup]]
 ) {
-    if (gid >= blocks) {
+    if (tgid >= blocks) {
         return;
     }
-    dequantize_block(packed, absmax, output, n, blocksize, gid, FP4_CODE);
+    threadgroup float shared_scale;
+    dequantize_block(packed, absmax, output, n, blocksize, tgid, tid, threadgroup_size, FP4_CODE, shared_scale);
 }
 
 kernel void dequantize_4bit_fp32_nf4(
@@ -229,10 +253,13 @@ kernel void dequantize_4bit_fp32_nf4(
     constant uint& n [[buffer(3)]],
     constant uint& blocksize [[buffer(4)]],
     constant uint& blocks [[buffer(5)]],
-    uint gid [[thread_position_in_grid]]
+    uint tgid [[threadgroup_position_in_grid]],
+    uint tid [[thread_index_in_threadgroup]],
+    uint threadgroup_size [[threads_per_threadgroup]]
 ) {
-    if (gid >= blocks) {
+    if (tgid >= blocks) {
         return;
     }
-    dequantize_block(packed, absmax, output, n, blocksize, gid, NF4_CODE);
+    threadgroup float shared_scale;
+    dequantize_block(packed, absmax, output, n, blocksize, tgid, tid, threadgroup_size, NF4_CODE, shared_scale);
 }
diff --git a/csrc/mps_ops.mm b/csrc/mps_ops.mm
@@ -5,6 +5,7 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>
+#include <algorithm>
 
 namespace {
 
@@ -167,7 +168,6 @@ static inline void dispatch_dequant_kernel(
     if (n == 0) {
         return;
     }
-
     uint32_t blocks = (n + blocksize - 1) / blocksize;
     TensorView packedView = make_tensor_view(packed, "packed");
     TensorView absmaxView = make_tensor_view(absmax, "absmax");
@@ -184,17 +184,25 @@ static inline void dispatch_dequant_kernel(
     [encoder setBytes:&n length:sizeof(uint32_t) atIndex:3];
     [encoder setBytes:&blocksize length:sizeof(uint32_t) atIndex:4];
     [encoder setBytes:&blocks length:sizeof(uint32_t) atIndex:5];
-    NSUInteger threadsPerThreadgroup = pipeline.threadExecutionWidth;
-    if (threadsPerThreadgroup == 0) {
-        threadsPerThreadgroup = 1;
+
+    NSUInteger maxThreadsPerTG = pipeline.maxTotalThreadsPerThreadgroup;
+    NSUInteger desiredThreads = (blocksize + 1) / 2;
+    if (desiredThreads == 0) {
+        desiredThreads = 1;
     }
+    NSUInteger threadsPerThreadgroup = std::min(maxThreadsPerTG, std::max<NSUInteger>(1, desiredThreads));
+    if (threadsPerThreadgroup < pipeline.threadExecutionWidth) {
+        threadsPerThreadgroup = std::min(pipeline.threadExecutionWidth, maxThreadsPerTG);
+    }
+
+    NSUInteger totalThreads = threadsPerThreadgroup * blocks;
     MTLSize threads = MTLSizeMake(threadsPerThreadgroup, 1, 1);
-    MTLSize grid = MTLSizeMake(blocks, 1, 1);
+    MTLSize grid = MTLSizeMake(totalThreads, 1, 1);
     [encoder dispatchThreads:grid threadsPerThreadgroup:threads];
     [encoder endEncoding];
 
     [commandBuffer commit];
-    [commandBuffer waitUntilCompleted];
+    [commandBuffer waitUntilCompleted];  // block until the kernel has finished so callers never read a partially written output
 }
 
 }  // namespace
diff --git a/script.sh b/script.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Runs ./test_bnb_mac.py with a configurable Python interpreter.
+# Override the interpreter with: PYTHON_PATH=/path/to/python ./script.sh
+
+PYTHON_PATH="${PYTHON_PATH:-python3}"
+"$PYTHON_PATH" ./test_bnb_mac.py
diff --git a/test_bnb_mac.py b/test_bnb_mac.py
@@ -9,14 +9,14 @@
 outputs = model.generate(**inputs, max_new_tokens=20)
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))  # or whatever entry function you have
 
-import torch
-import bitsandbytes as bnb
-A = torch.randn(2048, device='mps', dtype=torch.float16)
-q, absmax = torch.ops.bitsandbytes.quantize_4bit(A, 64, 'nf4', torch.uint8)
-print('q.shape:', q.shape, q.dtype)
-print('absmax.shape:', absmax.shape, absmax.dtype)
-B = torch.ops.bitsandbytes.dequantize_4bit(q, absmax, 64, 'nf4', A.shape, A.dtype)
-print('ok', float((A-B).abs().max()))
+# import torch
+# import bitsandbytes as bnb
+# A = torch.randn(2048, device='mps', dtype=torch.float16)
+# q, absmax = torch.ops.bitsandbytes.quantize_4bit(A, 64, 'nf4', torch.uint8)
+# print('q.shape:', q.shape, q.dtype)
+# print('absmax.shape:', absmax.shape, absmax.dtype)
+# B = torch.ops.bitsandbytes.dequantize_4bit(q, absmax, 64, 'nf4', A.shape, A.dtype)
+# print('ok', float((A-B).abs().max()))
 
 # import torch, bitsandbytes as bnb
 
@@ -52,4 +52,19 @@
 # print("q_mps[:8]:", q.view(-1)[:8].cpu())
 # print("q_cpu[:8]:", q_cpu.view(-1)[:8])
 # print("absmax_mps[:4]:", absmax[:4].cpu())
-# print("absmax_cpu[:4]:", absmax_cpu[:4])
+# print("absmax_cpu[:4]:", absmax_cpu[:4])
+
+# import torch, bitsandbytes as bnb, time
+
+# torch.manual_seed(0)
+# A = torch.randn(4096 * 4096, device="mps", dtype=torch.float16)
+# blocksize = 64
+
+# q, absmax = torch.ops.bitsandbytes.quantize_4bit(A, blocksize, "nf4", torch.uint8)
+
+# torch.mps.synchronize()
+# t0 = time.perf_counter()
+# torch.ops.bitsandbytes.dequantize_4bit(q, absmax, blocksize, "nf4", A.shape, A.dtype)
+# torch.mps.synchronize()
+# dt = time.perf_counter() - t0
+# print(f"Dequant time: {dt*1000:.2f} ms for {A.numel()/1e6:.1f}M elements")