Skip to content

Commit 3cbaf74

Browse files
TimDettmers and claude
committed
Remove unnecessary E4M4 conversion in dequant, add dequant overhead benchmark
- Add float32 absmax support to dequantize_kbit CUDA kernel (template instantiations + C wrappers), removing the Python-side E4M4 conversion that launched ~15 PyTorch kernels per call. Dequant goes from ~800us to ~30us for large shapes (gateup/down) and ~5us for small (KV). - Add bench_dequant.sh/py: measures dequant kernel time via ncu and fp16 matmul via CUDA events, reports speed ratio (fp16 / total) per shape × k × M. Dequant scales linearly with element count and k. - Update bench_ncu.sh with model-level summary tables and grouped kernel support - Document dequant benchmark in kbit-kernel-spec.md Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 826bc00 commit 3cbaf74

File tree

10 files changed

+700
-95
lines changed

10 files changed

+700
-95
lines changed

benchmarks/bench_dequant.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
"""Dequant + cuBLAS overhead analysis.

Measures dequantize_kbit GPU kernel time per shape×k (via ncu or --use-events),
fp16 matmul time per shape×M, and computes the overhead ratio.

Usage:
    # Recommended: ncu for dequant (accurate), CUDA events for matmul
    bash benchmarks/bench_dequant.sh

    # Quick (CUDA events only, includes ~35us dispatch overhead on dequant):
    python benchmarks/bench_dequant.py --use-events

Env: M_VALS (default "4,8,16,32,64,128,256,512,1024,2048,4096")
     DEQUANT_CSV: comma-separated dequant times injected by bench_dequant.sh
                  (order: k=2 × 5 shapes, k=3 × 5, k=4 × 5, k=5 × 5)
"""
import os, sys, argparse

# Make a source checkout importable whether we run from the repo root or
# from benchmarks/ — prefer the first directory containing bitsandbytes/.
for p in [".", ".."]:
    if os.path.isdir(os.path.join(p, "bitsandbytes")):
        sys.path.insert(0, os.path.abspath(p))
        break

import torch
import bitsandbytes  # noqa: E402
from bitsandbytes.functional import create_normal_float_codebook  # noqa: E402

parser = argparse.ArgumentParser()
parser.add_argument("--use-events", action="store_true",
                    help="Use CUDA events for dequant timing (includes dispatch overhead)")
args = parser.parse_args()

# (name, K, N) weight shapes drawn from the target model's layers.
shapes = [
    ("gateup", 2048, 5120),
    ("down", 5120, 2048),
    ("Q", 2048, 4096),
    ("O", 4096, 2048),
    ("KV", 2048, 512),
]
k_bits_list = [2, 3, 4, 5]
m_vals = [int(x) for x in os.environ.get(
    "M_VALS", "4,8,16,32,64,128,256,512,1024,2048,4096").split(",")]

dev = torch.device("cuda")
start_ev = torch.cuda.Event(enable_timing=True)
end_ev = torch.cuda.Event(enable_timing=True)
WARMUP = 50
ITERS = 200

# --- Dequant times ---
# Keyed by (shape_name, k); filled either from ncu-measured values injected
# via DEQUANT_CSV, or measured here with CUDA events as a fallback.
dequant_us = {}
dequant_env = os.environ.get("DEQUANT_CSV", "")
if dequant_env:
    # Injected by bench_dequant.sh (ncu-measured)
    # Order: k=2 × 5 shapes, k=3 × 5, k=4 × 5, k=5 × 5
    vals = [float(x) for x in dequant_env.split(",")]
    expected = len(k_bits_list) * len(shapes)
    if len(vals) != expected:
        # A truncated/garbled CSV would otherwise crash with a bare
        # IndexError or silently mispair times with shapes — fail loudly.
        print(f"ERROR: DEQUANT_CSV has {len(vals)} values, expected {expected} "
              f"({len(k_bits_list)} k values × {len(shapes)} shapes)", file=sys.stderr)
        sys.exit(1)
    i = 0
    for k in k_bits_list:
        for name, _, _ in shapes:
            dequant_us[(name, k)] = vals[i]
            i += 1
elif args.use_events:
    # Fallback: CUDA events (includes ~35us dispatch overhead)
    for k in k_bits_list:
        codebook = create_normal_float_codebook(k, device=dev)
        for name, K_dim, N in shapes:
            n_elements = K_dim * N
            W = torch.randn(n_elements, device=dev, dtype=torch.float32)
            packed, absmax = torch.ops.bitsandbytes.quantize_kbit(W, codebook, k)
            for _ in range(WARMUP):
                torch.ops.bitsandbytes.dequantize_kbit(
                    packed, codebook, absmax, k, n_elements, torch.float16)
            torch.cuda.synchronize()
            start_ev.record()
            for _ in range(ITERS):
                torch.ops.bitsandbytes.dequantize_kbit(
                    packed, codebook, absmax, k, n_elements, torch.float16)
            end_ev.record()
            torch.cuda.synchronize()
            # elapsed_time is in ms; convert to us per iteration.
            dequant_us[(name, k)] = start_ev.elapsed_time(end_ev) * 1000 / ITERS
else:
    print("ERROR: Run via bench_dequant.sh (ncu) or with --use-events", file=sys.stderr)
    sys.exit(1)

# --- Print dequant times ---
print("=== Dequant kernel time (us) ===")
print(f"{'shape':<8}", end="")
for k in k_bits_list:
    print(f" {'k='+str(k):>8}", end="")
print()
print("---")
for name, _, _ in shapes:
    print(f"{name:<8}", end="")
    for k in k_bits_list:
        print(f" {dequant_us[(name, k)]:>8.1f}", end="")
    print()
print()

# --- Measure fp16 matmul time per shape×M ---
matmul_us = {}
for name, K_dim, N in shapes:
    W = torch.randn(K_dim, N, dtype=torch.float16, device=dev)
    for M in m_vals:
        A = torch.randn(M, K_dim, dtype=torch.float16, device=dev)
        out = torch.empty(M, N, dtype=torch.float16, device=dev)
        for _ in range(WARMUP):
            torch.mm(A, W, out=out)
        torch.cuda.synchronize()
        start_ev.record()
        for _ in range(ITERS):
            torch.mm(A, W, out=out)
        end_ev.record()
        torch.cuda.synchronize()
        matmul_us[(name, M)] = start_ev.elapsed_time(end_ev) * 1000 / ITERS

# --- Print combined table per k ---
# "speed" is the fraction of the dequant+matmul total spent in the matmul,
# i.e. how close the quantized path gets to pure fp16 throughput.
for k in k_bits_list:
    print(f"=== k={k}: dequant + fp16 matmul overhead ===")
    print(f"{'shape':<8} {'M':>6} {'fp16 (us)':>10} {'dequant (us)':>13} {'total (us)':>11} {'speed':>7}")
    print("-" * 60)
    for name, K_dim, N in shapes:
        d = dequant_us[(name, k)]
        for M in m_vals:
            mm = matmul_us[(name, M)]
            total = d + mm
            speed = mm / total
            print(f"{name:<8} {M:>6} {mm:>10.1f} {d:>13.1f} {total:>11.1f} {speed:>7.2f}")
        print()
    print()

benchmarks/bench_dequant.sh

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
#!/bin/bash
# Dequant + cuBLAS overhead analysis.
# Uses ncu for accurate dequant kernel timing, CUDA events for matmul.
#
# Usage:
#   bash benchmarks/bench_dequant.sh
#   M_VALS=16,32,64,128,256 bash benchmarks/bench_dequant.sh
#
# pipefail: without it a failed ncu inside the pipeline below would yield an
# empty DEQUANT_CSV and a confusing downstream error instead of failing here.
set -e
set -o pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# Phase 1: measure dequant kernel times via ncu (all shapes × all k)
echo "Measuring dequant kernel times via ncu..."
DEQUANT_CSV=$(ncu --kernel-name "kDequantizeBlockwise_kbit_vec" \
    --metrics gpu__time_duration.avg \
    python3 -c "
import sys, torch; sys.path.insert(0, '.')
import bitsandbytes
from bitsandbytes.functional import create_normal_float_codebook
shapes = [('gateup',2048,5120),('down',5120,2048),('Q',2048,4096),('O',4096,2048),('KV',2048,512)]
dev = torch.device('cuda')
for k in [2,3,4,5]:
    codebook = create_normal_float_codebook(k, device=dev)
    for name, K, N in shapes:
        n = K * N
        W = torch.randn(n, device=dev)
        packed, absmax = torch.ops.bitsandbytes.quantize_kbit(W, codebook, k)
        torch.cuda.synchronize()
        for _ in range(3):
            torch.ops.bitsandbytes.dequantize_kbit(packed, codebook, absmax, k, n, torch.float16)
        torch.cuda.synchronize()
        torch.ops.bitsandbytes.dequantize_kbit(packed, codebook, absmax, k, n, torch.float16)
        torch.cuda.synchronize()
" 2>&1 | grep "gpu__time_duration" | awk '{print $NF}' | \
python3 -c "
import sys
vals = [float(l.strip()) for l in sys.stdin]
# 4 launches per (k, shape): 3 warmup + 1 profiled, take last
result = []
for i in range(0, len(vals), 4):
    result.append(vals[i+3])
# Output: k=2 × 5 shapes, k=3 × 5, k=4 × 5, k=5 × 5
print(','.join(f'{v:.2f}' for v in result))
")

# Sanity check: refuse to hand an empty measurement set to phase 2.
if [ -z "$DEQUANT_CSV" ]; then
    echo "ERROR: ncu produced no dequant timings (is ncu installed and is a GPU available?)" >&2
    exit 1
fi

echo "Dequant kernel times (ncu): $DEQUANT_CSV"
echo ""

# Phase 2: run the Python script with injected dequant times
DEQUANT_CSV="$DEQUANT_CSV" python3 "$SCRIPT_DIR/bench_dequant.py"

benchmarks/bench_fp16.py

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,71 @@
"""cuBLAS fp16 baseline — CUDA event timing, pre-allocated I/O.

Benchmarks dense matmul (torch.mm) and batched MoE matmul (torch.bmm).

Env: M_VALS (default "1,2,3,4,8"), NUM_EXPERTS (default "8")
"""
import os, torch

dense_shapes = [
    ("gateup", 2048, 5120),
    ("down", 5120, 2048),
    ("Q", 2048, 4096),
    ("O", 4096, 2048),
    ("KV", 2048, 512),
]
moe_shapes = [
    ("moe_gu", 2048, 512),
    ("moe_dn", 512, 2048),
]

m_vals = [int(x) for x in os.environ.get("M_VALS", "1,2,3,4,8").split(",")]
NUM_EXPERTS = int(os.environ.get("NUM_EXPERTS", "8"))
dev = torch.device("cuda")
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

WARMUP = 50
ITERS = 200


def _time_us(op):
    """Average GPU time of op() in microseconds.

    WARMUP untimed calls, then ITERS calls bracketed by CUDA events.
    Shared by the dense and MoE loops so the harness exists in one place.
    """
    for _ in range(WARMUP):
        op()
    torch.cuda.synchronize()
    start.record()
    for _ in range(ITERS):
        op()
    end.record()
    torch.cuda.synchronize()
    # elapsed_time is in ms; convert to us per iteration.
    return start.elapsed_time(end) * 1000 / ITERS


# --- Dense layers (torch.mm) ---
print(f"{'shape':<8} {'M':>2} {'avg_us':>10}")
print("---")

for name, K, N in dense_shapes:
    W = torch.randn(K, N, dtype=torch.float16, device=dev)
    for M in m_vals:
        A = torch.randn(M, K, dtype=torch.float16, device=dev)
        out = torch.empty(M, N, dtype=torch.float16, device=dev)
        us = _time_us(lambda: torch.mm(A, W, out=out))
        print(f"{name:<8} {M:>2} {us:>10.2f}")

# --- MoE layers (torch.bmm) ---
print()
print(f"{'shape':<8} {'M':>2} {'nexp':>4} {'avg_us':>10}")
print("---")

for name, K, N in moe_shapes:
    # Weight: [num_experts, K, N] — each expert has its own weight matrix
    W_batch = torch.randn(NUM_EXPERTS, K, N, dtype=torch.float16, device=dev)
    for M in m_vals:
        # A: [num_experts, M, K] — M tokens per expert
        A_batch = torch.randn(NUM_EXPERTS, M, K, dtype=torch.float16, device=dev)
        out = torch.empty(NUM_EXPERTS, M, N, dtype=torch.float16, device=dev)
        us = _time_us(lambda: torch.bmm(A_batch, W_batch, out=out))
        print(f"{name:<8} {M:>2} {NUM_EXPERTS:>4} {us:>10.2f}")

benchmarks/bench_ncu.sh

Lines changed: 70 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,51 @@
11
#!/bin/bash
2-
# Full kernel benchmark: MMA + scalar (ncu) + cuBLAS fp16 (CUDA events).
2+
# Full kernel benchmark: MMA + scalar + grouped (ncu) + cuBLAS fp16 (CUDA events).
3+
# Then computes end-to-end model summary for Qwen3-Coder-Next 70B.
34
#
45
# Usage:
5-
# bash benchmarks/bench_ncu.sh # default M=1,2,3,4,8
6-
# M_VALS=3,4 bash benchmarks/bench_ncu.sh # custom M values
6+
# bash benchmarks/bench_ncu.sh # default M=1..8
7+
# M_VALS=1,4 bash benchmarks/bench_ncu.sh # custom M values
78
#
8-
# Output: three tables (MMA, scalar, cuBLAS fp16) with avg kernel time
9-
# in microseconds for each shape × k × M combination.
9+
# Output: raw kernel tables, then one summary table per M value showing
10+
# all kernels side by side for every (shape, k) combination.
1011
#
11-
# Runtime: ~30-60 seconds depending on M_VALS count.
12+
# Runtime: ~2-4 minutes for M=1..8.
1213
set -e
1314

1415
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
15-
export M_VALS="${M_VALS:-1,2,3,4,8}"
16+
RESULTS_DIR="$SCRIPT_DIR/.bench_results"
17+
mkdir -p "$RESULTS_DIR"
18+
19+
export M_VALS="${M_VALS:-1,2,3,4,5,6,7,8}"
20+
export NUM_EXPERTS="${NUM_EXPERTS:-8}"
1621
WARMUP=5
1722
PROFILED=5
1823

24+
# Compute M subsets: scalar/grouped only support M<=4
25+
SCALAR_M=$(python3 -c "print(','.join(str(m) for m in [int(x) for x in '$M_VALS'.split(',')] if m <= 4))")
26+
ALL_M="$M_VALS"
27+
1928
echo "START: $(date)"
20-
echo "M values: $M_VALS"
21-
22-
for KERNEL in mma scalar; do
23-
if [ "$KERNEL" = "mma" ]; then
24-
KNAME="kbit_gemm_prod"
25-
echo ""
26-
echo "=== MMA kernel ==="
27-
else
28-
KNAME="kbit_scalar_gemv"
29-
echo ""
30-
echo "=== Scalar GEMV ==="
31-
fi
32-
printf "%-8s %2s %2s %10s\n" "shape" "k" "M" "avg_us"
33-
echo "---"
34-
35-
KERNEL=$KERNEL M_VALS=$M_VALS ncu --kernel-name "$KNAME" --metrics gpu__time_duration.avg \
29+
echo "M values: $M_VALS (scalar/grouped: $SCALAR_M)"
30+
echo "MoE experts: $NUM_EXPERTS"
31+
32+
# Helper: run ncu and parse output for a kernel
33+
run_ncu_bench() {
34+
local KTYPE="$1" # mma, scalar, grouped
35+
local KNAME="$2" # ncu kernel name filter
36+
local SHAPES="$3" # Python list literal for shape names
37+
local MVALS="$4" # M values to use
38+
39+
KERNEL=$KTYPE M_VALS=$MVALS NUM_EXPERTS=$NUM_EXPERTS \
40+
ncu --kernel-name "$KNAME" --metrics gpu__time_duration.avg \
3641
python "$SCRIPT_DIR/ncu_driver.py" 2>/dev/null | \
3742
grep "gpu__time_duration.avg" | awk '{print $NF}' | \
3843
python3 -c "
39-
import os, sys
44+
import sys
4045
vals = [float(l.strip()) for l in sys.stdin]
41-
shapes = ['gateup','down','Q','O','KV']
46+
shapes = $SHAPES
4247
kbits = [2,3,4,5]
43-
mvals = [int(x) for x in os.environ['M_VALS'].split(',')]
48+
mvals = [int(x) for x in '$MVALS'.split(',')]
4449
W, P = $WARMUP, $PROFILED
4550
i = 0
4651
for s in shapes:
@@ -51,12 +56,47 @@ for s in shapes:
5156
print(f'{s:<8} {k:>2} {m:>2} {avg:>10.2f}')
5257
i += W + P
5358
"
54-
done
59+
}
60+
61+
# ---- MMA kernel (all M values) ----
62+
echo ""
63+
echo "=== MMA kernel ==="
64+
printf "%-8s %2s %2s %10s\n" "shape" "k" "M" "avg_us"
65+
echo "---"
66+
run_ncu_bench mma "kbit_gemm_prod" "['gateup','down','Q','O','KV']" "$ALL_M" | tee "$RESULTS_DIR/mma.txt"
67+
68+
# ---- Scalar GEMV (M<=4 only) ----
69+
echo ""
70+
echo "=== Scalar GEMV (M<=4) ==="
71+
printf "%-8s %2s %2s %10s\n" "shape" "k" "M" "avg_us"
72+
echo "---"
73+
if [ -n "$SCALAR_M" ]; then
74+
run_ncu_bench scalar "kbit_scalar_gemv" "['gateup','down','Q','O','KV']" "$SCALAR_M" | tee "$RESULTS_DIR/scalar.txt"
75+
else
76+
echo "(no M<=4 values requested)" | tee "$RESULTS_DIR/scalar.txt"
77+
fi
78+
79+
# ---- Grouped expert kernel (M<=4 only) ----
80+
echo ""
81+
echo "=== Grouped scalar GEMV (${NUM_EXPERTS} experts, M<=4) ==="
82+
printf "%-8s %2s %2s %10s\n" "shape" "k" "M" "avg_us"
83+
echo "---"
84+
if [ -n "$SCALAR_M" ]; then
85+
run_ncu_bench grouped "kbit_grouped_scalar_gemv" "['moe_gu','moe_dn']" "$SCALAR_M" | tee "$RESULTS_DIR/grouped.txt"
86+
else
87+
echo "(no M<=4 values requested)" | tee "$RESULTS_DIR/grouped.txt"
88+
fi
89+
90+
# ---- cuBLAS fp16 baselines (CUDA events, all M values) ----
91+
echo ""
92+
echo "=== cuBLAS fp16 (dense mm + MoE bmm) ==="
93+
M_VALS=$ALL_M NUM_EXPERTS=$NUM_EXPERTS python "$SCRIPT_DIR/bench_fp16.py" 2>/dev/null | \
94+
tee "$RESULTS_DIR/cublas.txt"
5595

56-
# cuBLAS fp16 (CUDA events — ncu can't reliably filter cuBLAS kernels)
96+
# ---- Model-level summary ----
5797
echo ""
58-
echo "=== cuBLAS fp16 ==="
59-
M_VALS=$M_VALS python "$SCRIPT_DIR/bench_fp16.py" 2>/dev/null
98+
echo "=== Qwen3-Coder-Next 70B: weight matmul summary ==="
99+
python3 "$SCRIPT_DIR/model_summary.py" "$RESULTS_DIR"
60100

61101
echo ""
62102
echo "END: $(date)"

0 commit comments

Comments
 (0)