ggml-cuda: aggressive Hopper FA config for D=64 (nthreads=256, nbatch_fa=256)

liparetejas · liparetejas · commit 3c667e4f598a · 2026-02-23T10:23:44.000-06:00
diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -81,10 +81,11 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
     // We also relax the restriction on nstages for ncols2=1 (handled in get_nstages).
 
     // D=64
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64,  8, 128, 2, 128,  32,  32,  32, 2, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64, 16, 128, 2, 128,  32,  32,  32, 2, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64, 32, 128, 2, 128,  32,  32,  32, 2, true);
-    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64, 64, 128, 2, 128,  32,  32,  32, 2, true);
+    // Aggressive Hopper tuning: nthreads=256 (8 warps), occupancy=1 (full SM), nbatch_fa=256 (max tile), except the last case (third argument 64), which uses nbatch_fa=128.
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64,  8, 256, 1, 256,  32,  32,  32, 2, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64, 16, 256, 1, 256,  32,  32,  32, 2, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64, 32, 256, 1, 256,  32,  32,  32, 2, true);
+    GGML_CUDA_FATTN_MMA_CONFIG_CASE( 64,  64, 64, 256, 1, 128,  32,  32,  32, 2, true);
 
     // D=80
     GGML_CUDA_FATTN_MMA_CONFIG_CASE( 80,  80,  8, 128, 2, 128,  40,  40,  40, 2, true);