Remove dead SM_100 grouped GEMM code, superseded by batched MoE GEMM

TimDettmers · claude · TimDettmers · commit cec86d79405a · 2026-03-09T12:27:30.000-04:00
The grouped GEMM SM_100 path was never reachable in practice — _forward_batched
routes SM_100 to the batched kernel, and the fused variant had linking issues.
Removes the C source, CMake entry, and Python wiring (cache, dispatch, SM_100
branch). SM_120 grouped kernel path is untouched.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -300,7 +300,6 @@ if(BUILD_CUDA)
         if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8" AND EXISTS "${CMAKE_SOURCE_DIR}/third_party/cutlass/include")
             set(_NVFP4_SM100_SOURCES
                 csrc/qutlass/gemm_nvfp4_sm100.cu
-                csrc/qutlass/gemm_nvfp4_grouped_sm100.cu
                 csrc/qutlass/gemm_nvfp4_moe_sm100.cu
             )
 
diff --git a/bitsandbytes/backends/cuda/ops.py b/bitsandbytes/backends/cuda/ops.py
@@ -1150,129 +1150,6 @@ def _gemm_nvfp4_grouped_raw(
     )
 
 
-# Cached state for grouped SM_100 GEMM
-_grouped_restype_set = False
-
-# Cached buffers for the fused C dispatch (keyed by (N, K, num_experts),
-# sized for worst-case routing so the cache always hits after first call)
-_grouped_fused_cache: Optional[dict] = None
-
-
-def _get_fused_buffers(
-    total_tokens: int, N: int, K: int, num_experts: int, device: torch.device,
-) -> dict:
-    """Get or grow cached device buffers for the fused C dispatch.
-
-    Buffers are sized for worst-case token routing (all tokens to one expert),
-    keyed on (N, K, num_experts). Grows if total_tokens exceeds the cached size.
-    """
-    global _grouped_fused_cache, _grouped_restype_set
-
-    if not _grouped_restype_set:
-        lib.cgemm_nvfp4_grouped_sm100_meta_size.restype = ct.c_size_t
-        lib.cgemm_nvfp4_grouped_sm100_workspace_size.restype = ct.c_size_t
-        _grouped_restype_set = True
-
-    if (_grouped_fused_cache is not None
-            and _grouped_fused_cache["N"] == N
-            and _grouped_fused_cache["K"] == K
-            and _grouped_fused_cache["num_experts"] == num_experts
-            and _grouped_fused_cache["max_tokens"] >= total_tokens):
-        return _grouped_fused_cache
-
-    scale_W = K // 16
-    n_col_blocks = (scale_W + 3) // 4
-
-    # Worst-case SFA output: each expert adds at most 1 extra 128-row block
-    max_row_blocks = (total_tokens + 127) // 128 + num_experts
-    sfa_out_bytes = max_row_blocks * n_col_blocks * 512
-
-    sfa_swizzle_out = torch.empty(max(sfa_out_bytes, 1), dtype=torch.uint8, device=device)
-    sfa_swizzle_meta = torch.empty(3 * num_experts * 4, dtype=torch.uint8, device=device)
-
-    meta_size = lib.cgemm_nvfp4_grouped_sm100_meta_size(ct.c_int(num_experts))
-    gemm_meta_buf = torch.empty(meta_size, dtype=torch.uint8, device=device)
-
-    # Worst-case workspace: all tokens routed to a single expert
-    M_arr = (ct.c_int * num_experts)(*([0] * num_experts))
-    M_arr[0] = total_tokens
-    ws_size = lib.cgemm_nvfp4_grouped_sm100_workspace_size(
-        M_arr, ct.c_int(N), ct.c_int(K), ct.c_int(num_experts),
-    )
-    workspace_buf = torch.empty(max(ws_size, 1), dtype=torch.uint8, device=device)
-
-    _grouped_fused_cache = {
-        "N": N, "K": K, "num_experts": num_experts, "max_tokens": total_tokens,
-        "sfa_swizzle_out": sfa_swizzle_out,
-        "sfa_swizzle_meta": sfa_swizzle_meta,
-        "gemm_meta_buf": gemm_meta_buf,
-        "workspace_buf": workspace_buf,
-        "ws_size": ws_size,
-    }
-    return _grouped_fused_cache
-
-
-def _gemm_nvfp4_grouped_sm100(
-    A_concat: torch.Tensor,
-    B_all: torch.Tensor,
-    SFA_rowmajor: torch.Tensor,
-    SFB_all: torch.Tensor,
-    offsets_host: tuple[int, ...],
-    A_tensor_scale: float,
-    B_tensor_scale: float,
-    N: int,
-    K: int,
-    num_experts: int,
-) -> torch.Tensor:
-    """SM_100 grouped NVFP4 GEMM using fused C dispatch.
-
-    Single ctypes call handles SFA swizzle + CUTLASS grouped GEMM.
-    All metadata computation and pointer building happens in C.
-    Python only allocates output and passes pre-cached buffers.
-
-    SFB_all is already per-expert swizzled (each expert was independently
-    quantized by quantize_nvfp4, which swizzles each expert's scales
-    separately). No conversion needed.
-
-    offsets_host: host-side tuple of cumulative token offsets (num_experts + 1 ints).
-    """
-    device = A_concat.device
-    total_tokens = offsets_host[-1]
-
-    # Get or grow cached buffers (keyed on N, K, num_experts — always hits
-    # after first call unless total_tokens grows)
-    buf = _get_fused_buffers(total_tokens, N, K, num_experts, device)
-
-    # Output (BF16 — CUTLASS accumulates in FP32, epilogue outputs BF16)
-    D_concat = torch.empty(total_tokens, N, dtype=torch.bfloat16, device=device)
-
-    # Build host offsets ctypes array (per-call, ~1μs for 9 ints)
-    host_offsets_arr = (ct.c_int * (num_experts + 1))(*offsets_host)
-
-    # Single fused C call: SFA swizzle + metadata build + GEMM launch
-    # SFB_all is passed directly — already per-expert swizzled from quantize_nvfp4
-    lib.cgemm_nvfp4_grouped_sm100_fused(
-        get_ptr(A_concat),
-        get_ptr(B_all),
-        get_ptr(SFA_rowmajor),
-        get_ptr(SFB_all),
-        get_ptr(D_concat),
-        host_offsets_arr,
-        ct.c_int(N),
-        ct.c_int(K),
-        ct.c_int(num_experts),
-        ct.c_float(A_tensor_scale * B_tensor_scale),
-        get_ptr(buf["sfa_swizzle_out"]),
-        get_ptr(buf["sfa_swizzle_meta"]),
-        get_ptr(buf["gemm_meta_buf"]),
-        get_ptr(buf["workspace_buf"]),
-        ct.c_size_t(buf["ws_size"]),
-        _get_tensor_stream(A_concat),
-    )
-
-    return D_concat
-
-
 @register_kernel("bitsandbytes::gemm_nvfp4_grouped", "cuda")
 def _(
     A_concat: torch.Tensor,
@@ -1293,17 +1170,6 @@ def _(
     SFB_all: per-expert swizzled weight scales (each expert independently swizzled
              by quantize_nvfp4, then concatenated).
     """
-    # SM_100 (datacenter Blackwell): use CUTLASS grouped GEMM
-    major, _ = torch.cuda.get_device_capability(A_concat.device)
-    if major == 10 and hasattr(lib, "cgemm_nvfp4_grouped_cutlass_sm100"):
-        # Convert device offsets to host tuple (cheap for small arrays,
-        # but callers should migrate to passing host offsets directly)
-        offsets_host = tuple(expert_offsets.tolist())
-        return _gemm_nvfp4_grouped_sm100(
-            A_concat, B_all, SFA_rowmajor, SFB_all, offsets_host,
-            A_tensor_scale, B_tensor_scale, N, K, num_experts,
-        )
-
     # SM_120 (consumer Blackwell): use hand-written grouped kernel
     # SM_120 expects globally-swizzled SFA, so swizzle the row-major input
     total_tokens = A_concat.numel() // (K // 2)
diff --git a/csrc/qutlass/gemm_nvfp4_grouped_sm100.cu b/csrc/qutlass/gemm_nvfp4_grouped_sm100.cu

Original file line number	Diff line number	Diff line change
`@@ -300,7 +300,6 @@ if(BUILD_CUDA)`
`300`	`300`	`if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8" AND EXISTS "${CMAKE_SOURCE_DIR}/third_party/cutlass/include")`
`301`	`301`	`set(_NVFP4_SM100_SOURCES`
`302`	`302`	`csrc/qutlass/gemm_nvfp4_sm100.cu`
`303`		`- csrc/qutlass/gemm_nvfp4_grouped_sm100.cu`
`304`	`303`	`csrc/qutlass/gemm_nvfp4_moe_sm100.cu`
`305`	`304`	`)`
`306`	`305`