ml-explore
diff --git a/‎mlx/backend/cpu/quantized.cpp‎
Lines changed: 4 additions & 0 deletions b/‎mlx/backend/cpu/quantized.cpp‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎mlx/backend/cuda/quantized/qqmm.cpp‎
Lines changed: 72 additions & 14 deletions b/‎mlx/backend/cuda/quantized/qqmm.cpp‎
Lines changed: 72 additions & 14 deletions
diff --git a/‎mlx/backend/metal/quantized.cpp‎
Lines changed: 4 additions & 0 deletions b/‎mlx/backend/metal/quantized.cpp‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎mlx/backend/no_cpu/primitives.cpp‎
Lines changed: 1 addition & 0 deletions b/‎mlx/backend/no_cpu/primitives.cpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎mlx/backend/no_gpu/primitives.cpp‎
Lines changed: 1 addition & 0 deletions b/‎mlx/backend/no_gpu/primitives.cpp‎
Lines changed: 1 addition & 0 deletions
@@ -1359,4 +1359,8 @@ void QQMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {
   }
 }
 
+// CPU evaluation stub for GatherQQMM. Nothing is computed: every call
+// unconditionally throws std::runtime_error, so neither argument is read.
+void GatherQQMM::eval_cpu(
+    const std::vector<array>& /* inputs */,
+    array& /* out */) {
+  throw std::runtime_error("[GatherQQMM] NYI");
+}
+
 } // namespace mlx::core
@@ -21,7 +21,7 @@ std::tuple<array, array> quantize_input(
     QuantizationMode mode,
     int bits,
     int group_size,
-    std::optional<array> global_scale = std::nullopt) {
+    std::optional<array> global_scale) {
   const array x = ensure_contiguous(input, encoder, s);
 
   // Compute output shapes
@@ -54,7 +54,7 @@ std::tuple<array, array> quantize_input(
 
 array quantize_dequantize_input(
     const array& x_pre,
-    const std::optional<array>& global_scale_x,
+    const std::optional<array>& global_scale,
     int bits,
     int group_size,
     cu::CommandEncoder& encoder,
@@ -69,7 +69,7 @@ array quantize_dequantize_input(
   if (!donate_x) {
     encoder.add_temporary(xhat);
   }
-  fp_quantize_dequantize(x, xhat, group_size, bits, global_scale_x, encoder, s);
+  fp_quantize_dequantize(x, xhat, group_size, bits, global_scale, encoder, s);
   return xhat;
 }
 
@@ -99,24 +99,21 @@ void QQMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
 
   const array& x_pre = inputs[0];
   const array& w_pre = inputs[1];
-  const array& scales_w_pre = inputs[2];
 
   out.set_data(cu::malloc_async(out.nbytes(), encoder));
 
   // - 2 inputs: x, w (non-quantized w)
   // - 3 inputs: x, w, scales_w (quantized w)
   bool w_quantized = (w_pre.dtype() == uint32);
   int base_size = w_quantized ? 3 : 2;
-  assert(
-      inputs.size() == base_size ||
-      (mode_ == QuantizationMode::Nvfp4 && inputs.size() == base_size + 2));
-
   // For nvfp4, global scales are optional but must be both present or both
   // absent If present, they add 2 more inputs (global_scale_x, global_scale_w)
   bool has_global_scales =
-      mode_ == QuantizationMode::Nvfp4 && inputs.size() > base_size;
-  std::optional<array> global_scale_x = std::nullopt;
-  std::optional<array> global_scale_w = std::nullopt;
+      mode_ == QuantizationMode::Nvfp4 && inputs.size() == base_size + 2;
+  assert(inputs.size() == base_size || has_global_scales);
+
+  std::optional<array> global_scale_x;
+  std::optional<array> global_scale_w;
   if (has_global_scales) {
     global_scale_x = inputs[inputs.size() - 2];
     global_scale_w = inputs[inputs.size() - 1];
@@ -128,12 +125,14 @@ void QQMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
             w_pre, encoder, s, mode_, bits_, group_size_, global_scale_w)
       : std::make_tuple(
             ensure_contiguous(w_pre, encoder, s),
-            ensure_contiguous(scales_w_pre, encoder, s));
+            ensure_contiguous(inputs[2], encoder, s));
 
   // Reroute to qmm when: no support in cuBLAS, or doing GEMV.
+  bool can_use_cublas =
+      (mode_ == QuantizationMode::Nvfp4 || mode_ == QuantizationMode::Mxfp8) &&
+      (device.compute_capability_major() >= 10);
   int M = x_pre.shape(-2);
-  bool use_qmm = (device.compute_capability_major() < 10) || (M == 1);
-  use_qmm = true;
+  bool use_qmm = (!can_use_cublas) || (M == 1);
 
   if (use_qmm) {
     array x = quantize_dequantize_input(
@@ -207,4 +206,63 @@ void QQMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
       scalars);
 }
 
+// Evaluates GatherQQMM on the GPU.
+//
+// Input layout, as read below:
+//   inputs[0]  x (activation)
+//   inputs[1]  w (weights)
+//   inputs[2]  lhs_indices
+//   inputs[3]  rhs_indices
+//   inputs[4]  scales_w, only when w is already quantized (uint32 dtype)
+//   last two   global_scale_x, global_scale_w (optional, nvfp4 mode only)
+//
+// The weights are quantized here unless they arrive quantized, the activation
+// goes through quantize_dequantize_input, and the product is always computed
+// by qmm_naive into `out`.
+void GatherQQMM::eval_gpu(const std::vector<array>& inputs, array& out) {
+  // Label the profiling range with this primitive's own name so that traces
+  // do not attribute the work to QQMatmul.
+  nvtx3::scoped_range r("GatherQQMM::eval_gpu");
+
+  auto& s = stream();
+  auto& encoder = cu::get_command_encoder(s);
+
+  const array& x_pre = inputs[0];
+  const array& w_pre = inputs[1];
+  const array& lhs_indices = ensure_row_contiguous(inputs[2], encoder, s);
+  const array& rhs_indices = ensure_row_contiguous(inputs[3], encoder, s);
+
+  out.set_data(cu::malloc_async(out.nbytes(), encoder));
+
+  // - 4 inputs: x, w, lhs_indices, rhs_indices (non-quantized w)
+  // - 5 inputs: x, w, lhs_indices, rhs_indices, scales_w (quantized w)
+  bool w_quantized = (w_pre.dtype() == uint32);
+  int base_size = w_quantized ? 5 : 4;
+  // For nvfp4, global scales are optional but must be both present or both
+  // absent. If present, they add 2 more inputs (global_scale_x,
+  // global_scale_w).
+  bool has_global_scales =
+      mode_ == QuantizationMode::Nvfp4 && inputs.size() == base_size + 2;
+  assert(inputs.size() == base_size || has_global_scales);
+
+  std::optional<array> global_scale_x;
+  std::optional<array> global_scale_w;
+  if (has_global_scales) {
+    global_scale_x = inputs[inputs.size() - 2];
+    global_scale_w = inputs[inputs.size() - 1];
+  }
+
+  // Quantize weights, or take them (and their scales) as given when they are
+  // already quantized.
+  auto [w_q, scales_w] = !w_quantized
+      ? quantize_input(
+            w_pre, encoder, s, mode_, bits_, group_size_, global_scale_w)
+      : std::make_tuple(
+            ensure_contiguous(w_pre, encoder, s),
+            ensure_contiguous(inputs[4], encoder, s));
+
+  // Quantize activation.
+  array x = quantize_dequantize_input(
+      x_pre, global_scale_x, bits_, group_size_, encoder, s);
+
+  // Reroute to qmm.
+  qmm_naive(
+      x,
+      w_q,
+      scales_w,
+      std::nullopt,
+      global_scale_w,
+      lhs_indices,
+      rhs_indices,
+      out,
+      true, // transpose
+      bits_,
+      group_size_,
+      mode_,
+      encoder);
+}
+
 } // namespace mlx::core
@@ -1667,6 +1667,10 @@ void QQMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
   }
 }
 
+// GPU evaluation stub for GatherQQMM on this backend. Nothing is computed:
+// every call unconditionally throws std::runtime_error, so neither argument
+// is read.
+void GatherQQMM::eval_gpu(
+    const std::vector<array>& /* inputs */,
+    array& /* out */) {
+  throw std::runtime_error("[GatherQQMM] NYI");
+}
+
 void fast::Quantize::eval_gpu(
     const std::vector<array>& inputs,
     std::vector<array>& outputs) {
 
@@ -71,6 +71,7 @@ NO_CPU(Gather)
 NO_CPU(GatherAxis)
 NO_CPU(GatherMM)
 NO_CPU(GatherQMM)
+NO_CPU(GatherQQMM)
 NO_CPU(Greater)
 NO_CPU(GreaterEqual)
 NO_CPU(Hadamard)
 
@@ -98,6 +98,7 @@ NO_GPU(Gather)
 NO_GPU(GatherAxis)
 NO_GPU(GatherMM)
 NO_GPU(GatherQMM)
+NO_GPU(GatherQQMM)
 NO_GPU(Greater)
 NO_GPU(GreaterEqual)
 NO_GPU(Hadamard)
Original file line number	Diff line number	Diff line change
`@@ -1359,4 +1359,8 @@ void QQMatmul::eval_cpu(const std::vector<array>& inputs, array& out) {`
`1359`	`1359`	`}`
`1360`	`1360`	`}`
`1361`	`1361`
	`1362`	`+void GatherQQMM::eval_cpu(const std::vector<array>& inputs, array& out) {`
	`1363`	`+ throw std::runtime_error("[GatherQQMM] NYI");`
	`1364`	`+}`
	`1365`	`+`
`1362`	`1366`	`} // namespace mlx::core`
Original file line number	Diff line number	Diff line change
`@@ -1667,6 +1667,10 @@ void QQMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {`
`1667`	`1667`	`}`
`1668`	`1668`	`}`
`1669`	`1669`
	`1670`	`+void GatherQQMM::eval_gpu(const std::vector<array>& inputs, array& out) {`
	`1671`	`+ throw std::runtime_error("[GatherQQMM] NYI");`
	`1672`	`+}`
	`1673`	`+`
`1670`	`1674`	`void fast::Quantize::eval_gpu(`
`1671`	`1675`	`const std::vector<array>& inputs,`
`1672`	`1676`	`std::vector<array>& outputs) {`