Skip to content

Commit 381f611

Browse files
TimDettmersclaude
and committed
Add Python wrapper for cscale_to_blocked_batched (batched scale swizzle)
Exposes the existing C kernel for batched per-expert scale reordering from row-major to CUTLASS block-scaled layout. Adds op definition, registered kernel, and convenience function that computes metadata from expert_offsets. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent be06982 commit 381f611

File tree

3 files changed

+104
-0
lines changed

3 files changed

+104
-0
lines changed

bitsandbytes/_ops.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,29 @@ def _(scales: torch.Tensor, H: int, W: int) -> torch.Tensor:
505505
return torch.empty(out_size, dtype=torch.uint8, device=scales.device)
506506

507507

# Batched scale reordering for MoE: row-major → per-expert swizzled
# Schema note: all sizing metadata (W, num_experts, max_row_blocks,
# total_out_bytes) is passed as plain ints so the fake kernel below can
# compute the output allocation without touching device data.
torch.library.define(
    "bitsandbytes::scale_to_blocked_batched",
    "(Tensor scales_rowmajor, Tensor expert_row_offsets, Tensor expert_M, "
    "Tensor expert_out_offsets, int W, int num_experts, int max_row_blocks, "
    "int total_out_bytes) -> Tensor",
)


@register_fake("bitsandbytes::scale_to_blocked_batched")
def _(
    scales_rowmajor: torch.Tensor,
    expert_row_offsets: torch.Tensor,
    expert_M: torch.Tensor,
    expert_out_offsets: torch.Tensor,
    W: int,
    num_experts: int,
    max_row_blocks: int,
    total_out_bytes: int,
) -> torch.Tensor:
    # Fake (meta) kernel: only the output's size/dtype/device matter here.
    # Returns a flat uint8 buffer of the caller-computed total size, matching
    # the allocation done by the real CUDA kernel registration.
    return torch.empty(total_out_bytes, dtype=torch.uint8, device=scales_rowmajor.device)
508531
# Inverse scale reordering: CUTLASS block-scaled layout → row-major
509532
torch.library.define(
510533
"bitsandbytes::scale_from_blocked",

bitsandbytes/backends/cuda/ops.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1023,6 +1023,38 @@ def _(blocked_scales: torch.Tensor, H: int, W: int) -> torch.Tensor:
10231023
return out
10241024

10251025

@register_kernel("bitsandbytes::scale_to_blocked_batched", "cuda")
def _(
    scales_rowmajor: torch.Tensor,
    expert_row_offsets: torch.Tensor,
    expert_M: torch.Tensor,
    expert_out_offsets: torch.Tensor,
    W: int,
    num_experts: int,
    max_row_blocks: int,
    total_out_bytes: int,
) -> torch.Tensor:
    """Batched scale swizzle: row-major → per-expert CUTLASS block-scaled layout.

    Input: concatenated row-major scales from quantize_nvfp4_raw.
    Output: contiguous buffer with independently swizzled per-expert blocks.
    """
    # zeros (not empty) is deliberate: the C kernel receives no total size, so
    # any padding bytes between/after experts it does not write must already be
    # zero. NOTE(review): inferred from the zero-fill choice here — confirm
    # against the cscale_to_blocked_batched kernel's write pattern.
    out = torch.zeros(total_out_bytes, dtype=torch.uint8, device=scales_rowmajor.device)
    # Ensure the kernel launches on the device owning the input tensor.
    with _cuda_device_of(scales_rowmajor):
        # Argument order is fixed by the C ABI of cscale_to_blocked_batched.
        # total_out_bytes is intentionally not forwarded: the kernel derives
        # each expert's extent from expert_out_offsets / max_row_blocks.
        lib.cscale_to_blocked_batched(
            get_ptr(scales_rowmajor),
            get_ptr(out),
            get_ptr(expert_row_offsets),
            get_ptr(expert_M),
            get_ptr(expert_out_offsets),
            ct.c_int(W),
            ct.c_int(num_experts),
            ct.c_int(max_row_blocks),
            _get_tensor_stream(scales_rowmajor),
        )
    return out
10261058
# Hand-written NVFP4 GEMM (SM_120+)
10271059
#
10281060
# Uses mma.sync.aligned.block_scale instructions for small-M decode.

bitsandbytes/functional.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1231,6 +1231,55 @@ def quantize_nvfp4_raw(
12311231
return packed, block_scales
12321232

12331233

def scale_to_blocked_batched(
    scales_rowmajor: torch.Tensor,
    expert_offsets: torch.Tensor,
    max_M: int,
    K: int,
    num_experts: int,
) -> torch.Tensor:
    """Swizzle concatenated row-major scales into per-expert CUTLASS layout.

    Computes the per-expert metadata (row offsets, row counts, output byte
    offsets) expected by ``bitsandbytes::scale_to_blocked_batched`` and
    dispatches to the registered kernel.

    Args:
        scales_rowmajor: Concatenated row-major block scales
            [total_tokens * K/16] (uint8).
        expert_offsets: Cumulative token offsets [num_experts + 1]
            (int32, device).
        max_M: Max tokens per expert (padded to 128 alignment).
        K: Hidden dimension; must be a multiple of 16 (one scale per
            16-element block).
        num_experts: Number of experts.

    Returns:
        Contiguous uint8 buffer with per-expert swizzled scales for batched
        GEMM. Every expert occupies the same extent
        (ceil(max_M/128) * ceil(W/4) * 512 bytes), so expert ``i`` starts at
        ``i * per_expert_bytes``.

    Raises:
        ValueError: If ``K`` is not a multiple of 16.
    """
    if K % 16 != 0:
        raise ValueError(f"K must be a multiple of 16, got {K}")
    W = K // 16  # scale columns: one uint8 scale per 16-element block
    n_col_blocks = (W + 3) // 4

    # The input is [total_tokens, W] row-major, so an expert's first scale row
    # is exactly its first token: token offsets double as row offsets, and the
    # per-expert row count is the per-expert token count.
    expert_row_offsets = expert_offsets[:-1].to(torch.int32)
    expert_M_dev = (expert_offsets[1:] - expert_offsets[:-1]).to(torch.int32)

    # All experts are padded to the same max_M, so each gets an identical
    # output extent and the output offsets form an arithmetic progression.
    max_row_blocks = (max_M + 127) // 128
    per_expert_bytes = max_row_blocks * n_col_blocks * 512
    expert_out_offsets = (
        torch.arange(num_experts, dtype=torch.int32, device=scales_rowmajor.device)
        * per_expert_bytes
    )
    total_out_bytes = num_experts * per_expert_bytes

    return torch.ops.bitsandbytes.scale_to_blocked_batched(
        scales_rowmajor,
        expert_row_offsets,
        expert_M_dev,
        expert_out_offsets,
        W,
        num_experts,
        max_row_blocks,
        total_out_bytes,
    )
12341283
def dequantize_nvfp4(
12351284
packed_data: torch.Tensor,
12361285
quant_state: NVFP4QuantState,

0 commit comments

Comments
 (0)