[CUDA] Heuristics for Hopper QMM (ml-explore#3173)

zcbenz · web-flow · commit 0c8107ce8a18 · 2026-02-27T09:22:10.000+09:00
diff --git a/.github/actions/build-linux/action.yml b/.github/actions/build-linux/action.yml
@@ -21,7 +21,7 @@ runs:
       run: |
         if ${{ startsWith(inputs.toolkit, 'cuda') && runner.arch == 'arm64' }} ; then
           # There is no GPU in arm64 runner, use a common arch.
-          CMAKE_ARGS="$CMAKE_ARGS -DMLX_CUDA_ARCHITECTURES=90a"
+          CMAKE_ARGS="$CMAKE_ARGS -DMLX_CUDA_ARCHITECTURES=80"
           # Can not build tests and stubs when the built executables can not run.
           CMAKE_ARGS="$CMAKE_ARGS -DMLX_BUILD_TESTS=OFF -DMLX_BUILD_PYTHON_STUBS=OFF"
         fi
diff --git a/mlx/backend/cuda/CMakeLists.txt b/mlx/backend/cuda/CMakeLists.txt
@@ -56,7 +56,6 @@ target_sources(
           ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/quantized/affine_quantize.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/quantized/fp_quantize.cu
-          ${CMAKE_CURRENT_SOURCE_DIR}/quantized/qmm_sm90.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/quantized/qmv.cu
           ${CMAKE_CURRENT_SOURCE_DIR}/quantized/quantized.cpp
           ${CMAKE_CURRENT_SOURCE_DIR}/quantized/qqmm.cpp
@@ -65,6 +64,7 @@ target_sources(
           ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)
 
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/binary)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/quantized/qmm)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/unary)
 
 # fp4 is not available on < 12.8
@@ -145,12 +145,11 @@ if(NOT DEFINED MLX_CUDA_ARCHITECTURES)
     COMMAND __nvcc_device_query
     OUTPUT_VARIABLE MLX_CUDA_ARCHITECTURES
     OUTPUT_STRIP_TRAILING_WHITESPACE)
-  set(UPGRADABLE_ARCHITECTURES "90;100;121")
   if(MLX_CUDA_ARCHITECTURES STREQUAL "")
     message(
       FATAL_ERROR
         "Can not get native CUDA arch, must set MLX_CUDA_ARCHITECTURES")
-  elseif(MLX_CUDA_ARCHITECTURES IN_LIST UPGRADABLE_ARCHITECTURES)
+  elseif(MLX_CUDA_ARCHITECTURES GREATER_EQUAL 90)
     # Use arch-specific compute capability whenever possible.
     set(MLX_CUDA_ARCHITECTURES "${MLX_CUDA_ARCHITECTURES}a")
   endif()
@@ -159,6 +158,11 @@ message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}")
 set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES
                                      "${MLX_CUDA_ARCHITECTURES}")
 
+if(("90a" IN_LIST MLX_CUDA_ARCHITECTURES) OR ("90a-real" IN_LIST
+                                              MLX_CUDA_ARCHITECTURES))
+  target_compile_definitions(mlx PRIVATE MLX_CUDA_SM90A_ENABLED)
+endif()
+
 # Search CUDA libs from installed python packages.
 if(WIN32)
   # Resolve paths of unfound DLL at runtime.
diff --git a/mlx/backend/cuda/device.cpp b/mlx/backend/cuda/device.cpp
@@ -281,8 +281,8 @@ void CommandEncoder::add_kernel_node_raw(
     config.blockDim = block_dim;
     config.dynamicSmemBytes = smem_bytes;
     config.stream = stream();
+    cudaLaunchAttribute attr = {};
     if (use_cluster) {
-      cudaLaunchAttribute attr;
       attr.id = cudaLaunchAttributeClusterDimension;
       attr.val.clusterDim.x = cluster_dim.x;
       attr.val.clusterDim.y = cluster_dim.y;
@@ -332,16 +332,16 @@ void CommandEncoder::add_kernel_node_raw(
     config.blockDimZ = block_dim.z;
     config.sharedMemBytes = smem_bytes;
     config.hStream = stream();
+    CUlaunchAttribute attr = {};
     if (use_cluster) {
-      CUlaunchAttribute attr = {};
       attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
       attr.value.clusterDim.x = cluster_dim.x;
       attr.value.clusterDim.y = cluster_dim.y;
       attr.value.clusterDim.z = cluster_dim.z;
       config.attrs = &attr;
       config.numAttrs = 1;
-      CHECK_CUDA_ERROR(cuLaunchKernelEx(&config, func, params, nullptr));
     }
+    CHECK_CUDA_ERROR(cuLaunchKernelEx(&config, func, params, nullptr));
     return;
   }
 
diff --git a/mlx/backend/cuda/quantized/qmm/CMakeLists.txt b/mlx/backend/cuda/quantized/qmm/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Host-side entry point that selects a kernel configuration at runtime.
+target_sources(mlx PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/qmm.cpp)
+
+# One translation unit per kernel configuration; the file name encodes the
+# tile shape (m<M>_n<N>) followed by the cluster size along M (m<C>).
+target_sources(
+  mlx
+  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/qmm_impl_sm90_m128_n16_m1.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/qmm_impl_sm90_m128_n32_m1.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/qmm_impl_sm90_m128_n64_m2.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/qmm_impl_sm90_m128_n128_m2.cu
+          ${CMAKE_CURRENT_SOURCE_DIR}/qmm_impl_sm90_m128_n256_m2.cu)
diff --git a/mlx/backend/cuda/quantized/qmm/qmm.cpp b/mlx/backend/cuda/quantized/qmm/qmm.cpp
@@ -0,0 +1,73 @@
+// Copyright © 2026 Apple Inc.
+
+#include "mlx/backend/cuda/quantized/qmm/qmm.h"
+
+#include <cute/tensor.hpp>
+
+#include <stdexcept>
+
+namespace mlx::core {
+
+#if defined(MLX_CUDA_SM90A_ENABLED)
+// Declared here and defined in the qmm_impl_sm90_*.cu files, each of which
+// explicitly instantiates one <TileShape, ClusterShape> combination. Every
+// combination dispatched below therefore needs a matching instantiation.
+template <typename TileShape, typename ClusterShape>
+void qmm_impl_sm90(
+    const array& x,
+    const array& w,
+    const array& scales,
+    const array& biases,
+    array& out,
+    int bits,
+    int group_size,
+    cu::CommandEncoder& encoder,
+    Stream s);
+#endif // defined(MLX_CUDA_SM90A_ENABLED)
+
+// Runs the Hopper (SM90A) quantized matmul, picking the kernel configuration
+// from the number of output rows m = out.shape(-2).
+//
+// The second tile extent is the smallest of {16, 32, 64, 128, 256} that is
+// >= m (256 for larger m). The two smallest tiles run with a cluster of 1
+// along the first dimension, the others with a cluster of 2.
+//
+// Throws std::runtime_error when built without MLX_CUDA_SM90A_ENABLED.
+void qmm_sm90(
+    const array& x,
+    const array& w,
+    const array& scales,
+    const array& biases,
+    array& out,
+    int bits,
+    int group_size,
+    cu::CommandEncoder& encoder,
+    Stream s) {
+#if defined(MLX_CUDA_SM90A_ENABLED)
+  // Forwards all arguments to the instantiation for the given configuration.
+  auto dispatch = [&]<int tile_m, int tile_n, int cluster_m>() {
+    using cute::Int;
+    using TileShapeMN = cute::Shape<Int<tile_m>, Int<tile_n>>;
+    using ClusterShape = cute::Shape<Int<cluster_m>, Int<1>, Int<1>>;
+    qmm_impl_sm90<TileShapeMN, ClusterShape>(
+        x, w, scales, biases, out, bits, group_size, encoder, s);
+  };
+  int m = out.shape(-2);
+  if (m <= 16) {
+    dispatch.template operator()<128, 16, 1>();
+  } else if (m <= 32) {
+    dispatch.template operator()<128, 32, 1>();
+  } else if (m <= 64) {
+    dispatch.template operator()<128, 64, 2>();
+  } else if (m <= 128) {
+    dispatch.template operator()<128, 128, 2>();
+  } else {
+    dispatch.template operator()<128, 256, 2>();
+  }
+#else
+  throw std::runtime_error(
+      "[quantized_matmul] Hopper-only kernel is not available.");
+#endif // defined(MLX_CUDA_SM90A_ENABLED)
+}
+
+} // namespace mlx::core
diff --git a/mlx/backend/cuda/quantized/qmm/qmm.h b/mlx/backend/cuda/quantized/qmm/qmm.h
@@ -3,7 +3,6 @@
 #pragma once
 
 #include "mlx/backend/cuda/device.h"
-#include "mlx/primitives.h"
 
 #include <optional>
 
@@ -13,11 +12,10 @@ void qmm_sm90(
     const array& x,
     const array& w,
     const array& scales,
-    const std::optional<array>& biases,
+    const array& biases,
     array& out,
     int bits,
     int group_size,
-    QuantizationMode mode,
     cu::CommandEncoder& encoder,
     Stream s);
 
diff --git a/mlx/backend/cuda/quantized/qmm/qmm_impl_sm90.cuh b/mlx/backend/cuda/quantized/qmm/qmm_impl_sm90.cuh
@@ -1,7 +1,6 @@
 // Copyright © 2026 Apple Inc.
 
 #include "mlx/backend/cuda/cutlass_utils.cuh"
-#include "mlx/backend/cuda/quantized/qmm.h"
 #include "mlx/backend/cuda/quantized/quantized_utils.h"
 #include "mlx/backend/gpu/copy.h"
 #include "mlx/dtype_utils.h"
@@ -13,10 +12,20 @@
 #include <cutlass/gemm/device/gemm_universal_adapter.h>
 #include <cutlass/gemm/kernel/gemm_universal.hpp>
 
+#if defined(MLX_CUDA_SM90A_ENABLED)
+
 // We can't put kernel code in mlx::core due to name conflicts of "Shape".
 namespace cutlass_gemm {
 
-template <typename GroupSize, typename Element, typename Quant, typename F>
+using namespace cute;
+
+template <
+    typename TileShapeMN = Shape<_128, _16>,
+    typename ClusterShape = Shape<_1, _1, _1>,
+    typename Element,
+    typename Quant,
+    typename GroupSize,
+    typename F>
 void qmm_sm90(
     const Element* A,
     const Quant* B,
@@ -29,9 +38,6 @@ void qmm_sm90(
     int64_t l,
     GroupSize group_size,
     F&& launch_kernel) {
-#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
-  using namespace cute;
-
   constexpr int kAlignmentA = 128 / sizeof_bits<Element>::value;
   constexpr int kAlignmentB = 128 / sizeof_bits<Quant>::value;
   constexpr int kTileShapeK =
@@ -40,8 +46,7 @@ void qmm_sm90(
 
   using Arch = cutlass::arch::Sm90;
   using Accumulator = float;
-  using TileShape = Shape<_128, _16, Int<kTileShapeK>>;
-  using ClusterShape = Shape<_1, _1, _1>;
+  using TileShape = decltype(append(TileShapeMN{}, Int<kTileShapeK>{}));
 
   using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
       Arch,
@@ -66,7 +71,7 @@ void qmm_sm90(
       Arch,
       cutlass::arch::OpClassTensorOp,
       // ElementA:
-      cute::tuple<Quant, Element, Element>,
+      tuple<Quant, Element, Element>,
       cutlass::layout::RowMajor,
       kAlignmentB,
       // ElementB:
@@ -101,16 +106,14 @@ void qmm_sm90(
 
   auto* kernel = &cutlass::device_kernel<GemmKernel>;
   void* kernel_params[] = {const_cast<Gemm::Params*>(&gemm.params())};
+  auto cluster = ClusterShape{};
   launch_kernel(
       reinterpret_cast<void*>(kernel),
       gemm.get_grid_shape(gemm.params()),
       GemmKernel::get_block_shape(),
+      {get<0>(cluster), get<1>(cluster), get<2>(cluster)},
       GemmKernel::SharedStorageSize,
       kernel_params);
-#else
-  throw std::runtime_error(
-      "[quantized_matmul] Hopper-only kernel is not available.");
-#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
 }
 
 } // namespace cutlass_gemm
@@ -167,29 +170,29 @@ inline void dispatch_groups(int group_size, const char* tag, F&& f) {
   }
 }
 
-void qmm_sm90(
-    const array& x_,
+template <typename TileShapeMN, typename ClusterShape>
+void qmm_impl_sm90(
+    const array& x,
     const array& w,
     const array& scales_,
-    const std::optional<array>& biases_,
+    const array& biases_,
     array& out,
     int bits,
     int group_size,
-    QuantizationMode mode,
     cu::CommandEncoder& encoder,
     Stream s) {
-  if ((mode != QuantizationMode::Affine) || !biases_) {
-    throw std::runtime_error("qmm_sm90 NYI");
-  }
-
   const char* tag = "[quantized_matmul]";
   int m = out.shape(-2);
   int n = out.shape(-1);
-  int k = x_.shape(-1);
+  int k = x.shape(-1);
   int l = out.size() / (m * n);
   if (k % 64 != 0) {
     throw std::runtime_error(fmt::format("{} K must be multiples of 64.", tag));
   }
+  if (!x.flags().row_contiguous) {
+    throw std::runtime_error(
+        fmt::format("{} Activations must be row contiguous.", tag));
+  }
   if (!w.flags().row_contiguous) {
     throw std::runtime_error(
         fmt::format("{} Weights must be row contiguous.", tag));
@@ -198,16 +201,14 @@ void qmm_sm90(
     throw std::runtime_error(
         fmt::format("{} Scales must be row contiguous.", tag));
   }
-  if (!biases_->flags().row_contiguous) {
+  if (!biases_.flags().row_contiguous) {
     throw std::runtime_error(
         fmt::format("{} Biases must be row contiguous.", tag));
   }
 
-  // TODO: Support column-major x.
-  array x = ensure_row_contiguous(x_, encoder, s);
   // FIXME: Copy happens for every call.
   array scales = transpose_last_2_dims(scales_, encoder, s);
-  array biases = transpose_last_2_dims(*biases_, encoder, s);
+  array biases = transpose_last_2_dims(biases_, encoder, s);
 
   dispatch_element_types(out.dtype(), tag, [&]<typename Element>() {
     dispatch_quant_types(bits, tag, [&]<typename Quant>() {
@@ -231,14 +232,40 @@ void qmm_sm90(
             [&](auto* kernel,
                 dim3 num_blocks,
                 dim3 block_dims,
+                dim3 cluster_shape,
                 uint32_t smem_bytes,
                 void** args) {
-              encoder.add_kernel_node(
-                  kernel, num_blocks, block_dims, smem_bytes, args);
+              encoder.add_kernel_node_raw(
+                  kernel,
+                  num_blocks,
+                  block_dims,
+                  cluster_shape,
+                  smem_bytes,
+                  args);
             });
       });
     });
   });
 }
 
 } // namespace mlx::core
+// Explicit instantiation of qmm_impl_sm90<TileShapeMN, ClusterShape>.
+#define QMM_SM90_GPU(TileShapeMN, ClusterShape)           \
+  namespace mlx::core {                                   \
+  template void qmm_impl_sm90<TileShapeMN, ClusterShape>( \
+      const array& x,                                     \
+      const array& w,                                     \
+      const array& scales,                                \
+      const array& biases,                                \
+      array& out,                                         \
+      int bits,                                           \
+      int group_size,                                     \
+      cu::CommandEncoder& encoder,                        \
+      Stream s);                                          \
+  }
+
+#else
+// Without MLX_CUDA_SM90A_ENABLED the macro expands to nothing.
+#define QMM_SM90_GPU(TileShapeMN, ClusterShape)
+
+#endif // defined(MLX_CUDA_SM90A_ENABLED)
diff --git a/mlx/backend/cuda/quantized/qmm/qmm_impl_sm90_m128_n128_m2.cu b/mlx/backend/cuda/quantized/qmm/qmm_impl_sm90_m128_n128_m2.cu
@@ -0,0 +1,10 @@
+// Copyright © 2026 Apple Inc.
+
+#include "mlx/backend/cuda/quantized/qmm/qmm_impl_sm90.cuh"
+
+using namespace cute;
+// Aliases keep template-argument commas out of the macro invocation.
+using TileShapeMN = Shape<_128, _128>;
+using ClusterShape = Shape<_2, _1, _1>;
+// Explicit instantiation; a no-op unless MLX_CUDA_SM90A_ENABLED is defined.
+QMM_SM90_GPU(TileShapeMN, ClusterShape)
diff --git a/mlx/backend/cuda/quantized/qmm/qmm_impl_sm90_m128_n16_m1.cu b/mlx/backend/cuda/quantized/qmm/qmm_impl_sm90_m128_n16_m1.cu
@@ -0,0 +1,10 @@
+// Copyright © 2026 Apple Inc.
+
+#include "mlx/backend/cuda/quantized/qmm/qmm_impl_sm90.cuh"
+
+using namespace cute;
+// Aliases keep template-argument commas out of the macro invocation.
+using TileShapeMN = Shape<_128, _16>;
+using ClusterShape = Shape<_1, _1, _1>;
+// Explicit instantiation; a no-op unless MLX_CUDA_SM90A_ENABLED is defined.
+QMM_SM90_GPU(TileShapeMN, ClusterShape)
diff --git a/mlx/backend/cuda/quantized/qmm/qmm_impl_sm90_m128_n256_m2.cu b/mlx/backend/cuda/quantized/qmm/qmm_impl_sm90_m128_n256_m2.cu
@@ -0,0 +1,10 @@
+// Copyright © 2026 Apple Inc.
+
+#include "mlx/backend/cuda/quantized/qmm/qmm_impl_sm90.cuh"
+
+using namespace cute;
+// Aliases keep template-argument commas out of the macro invocation.
+using TileShapeMN = Shape<_128, _256>;
+using ClusterShape = Shape<_2, _1, _1>;
+// Explicit instantiation; a no-op unless MLX_CUDA_SM90A_ENABLED is defined.
+QMM_SM90_GPU(TileShapeMN, ClusterShape)
diff --git a/mlx/backend/cuda/quantized/qmm/qmm_impl_sm90_m128_n32_m1.cu b/mlx/backend/cuda/quantized/qmm/qmm_impl_sm90_m128_n32_m1.cu
@@ -0,0 +1,10 @@
+// Copyright © 2026 Apple Inc.
+
+#include "mlx/backend/cuda/quantized/qmm/qmm_impl_sm90.cuh"
+
+using namespace cute;
+// Aliases keep template-argument commas out of the macro invocation.
+using TileShapeMN = Shape<_128, _32>;
+using ClusterShape = Shape<_1, _1, _1>;
+// Explicit instantiation; a no-op unless MLX_CUDA_SM90A_ENABLED is defined.
+QMM_SM90_GPU(TileShapeMN, ClusterShape)
diff --git a/mlx/backend/cuda/quantized/qmm/qmm_impl_sm90_m128_n64_m2.cu b/mlx/backend/cuda/quantized/qmm/qmm_impl_sm90_m128_n64_m2.cu
diff --git a/mlx/backend/cuda/quantized/quantized.cpp b/mlx/backend/cuda/quantized/quantized.cpp