perf: Restore N_TILES=4, use launch_bounds(256,4) for occupancy

TimDettmers · claude · TimDettmers · commit 940ac12099b0 · 2026-02-22T17:36:41.000-05:00
Target 4 blocks/SM to force compiler to reduce register count.
N_TILES_PER_WARP=4 gives better compute/load ratio (4 MMA per A load).
Block tile m32×n128 with 8 warps (2×4).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/csrc/kernels_nvfp4_sm120.cu b/csrc/kernels_nvfp4_sm120.cu
@@ -87,15 +87,14 @@ __device__ __forceinline__ uint32_t pack_8_nibbles_slow(const unsigned char* dat
 // ============================================================================
 
 // N-tiles per warp: each warp computes m16 x (N_TILES_PER_WARP * 8)
-#define N_TILES_PER_WARP 2
+#define N_TILES_PER_WARP 4
 // Block config: M_WARPS x N_WARPS warps per block
-// M_WARPS groups along M (each m16), N_WARPS groups along N (each handles N_TILES_PER_WARP n8-tiles)
-#define M_WARPS 4
+#define M_WARPS 2
 #define N_WARPS 4
-#define WARPS_PER_BLOCK (M_WARPS * N_WARPS) // 16
+#define WARPS_PER_BLOCK (M_WARPS * N_WARPS) // 8
 
-// 512 threads, target 2 blocks/SM for good occupancy
-__global__ __launch_bounds__(WARPS_PER_BLOCK * 32, 2) void kGemmNVFP4_opt(
+// 256 threads, target 4 blocks/SM (__launch_bounds__ caps regs at ~64/thread on a 64K-reg SM; -maxrregcount is ignored when launch bounds are set)
+__global__ __launch_bounds__(WARPS_PER_BLOCK * 32, 4) void kGemmNVFP4_opt(
     const unsigned char* __restrict__ A,   // M x K/2 packed FP4 (row-major)
     const unsigned char* __restrict__ B,   // N x K/2 packed FP4 (B transposed, row-major)
     const unsigned char* __restrict__ SFA, // M x K/16 UE4M3 scales