Skip to content

Commit 4e42445

Browse files
committed
Fix SDPA vmap with GQA/MQA shapes (n_heads != n_kv_heads)
The ScaledDotProductAttention primitive relied on Custom::vmap which re-vmapped the fallback lambda. That lambda captured n_q_heads and n_kv_heads at creation time, causing shape mismatches (SIGSEGV/hang) when vmap changed the array dimensions. Add a dedicated vmap override that merges the vmap axis into the batch dimension and re-invokes scaled_dot_product_attention, which recomputes head counts from actual shapes. Falls back to Custom::vmap for sinks.
1 parent b98831a commit 4e42445

3 files changed

Lines changed: 191 additions & 0 deletions

File tree

mlx/fast.cpp

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,83 @@ std::pair<std::vector<array>, std::vector<int>> Custom::vmap(
5050
return {outputs, out_axes};
5151
}
5252

53+
std::pair<std::vector<array>, std::vector<int>> ScaledDotProductAttention::vmap(
54+
const std::vector<array>& inputs,
55+
const std::vector<int>& axes) {
56+
auto s = stream();
57+
58+
// Sinks require 1-D input; fall back to generic vmap for that case.
59+
if (has_sinks_) {
60+
return Custom::vmap(inputs, axes);
61+
}
62+
63+
// Determine vmap size from the first mapped input.
64+
int vmap_size = -1;
65+
for (int i = 0; i < static_cast<int>(axes.size()); ++i) {
66+
if (axes[i] != -1) {
67+
vmap_size = inputs[i].shape(axes[i]);
68+
break;
69+
}
70+
}
71+
72+
auto prepare = [&s, vmap_size](const array& x, int ax) -> array {
73+
if (ax == -1) {
74+
return repeat(expand_dims(x, 0, s), vmap_size, 0, s);
75+
}
76+
if (ax != 0) {
77+
return moveaxis(x, ax, 0, s);
78+
}
79+
return x;
80+
};
81+
82+
auto q = prepare(inputs[0], axes[0]);
83+
auto k = prepare(inputs[1], axes[1]);
84+
auto v = prepare(inputs[2], axes[2]);
85+
86+
// [V, B, H, L, D] -> [V*B, H, L, D]
87+
auto merge_batch = [&s, vmap_size](const array& x) -> array {
88+
auto shape = x.shape();
89+
Shape new_shape = {vmap_size * shape[1]};
90+
new_shape.insert(new_shape.end(), shape.begin() + 2, shape.end());
91+
return reshape(x, std::move(new_shape), s);
92+
};
93+
94+
q = merge_batch(q);
95+
k = merge_batch(k);
96+
v = merge_batch(v);
97+
98+
std::optional<array> mask_arr;
99+
bool has_arr_mask = !do_causal_ && inputs.size() > 3;
100+
if (has_arr_mask) {
101+
mask_arr = merge_batch(prepare(inputs[3], axes[3]));
102+
}
103+
std::string mask_mode = do_causal_ ? "causal" : has_arr_mask ? "array" : "";
104+
105+
auto out = scaled_dot_product_attention(
106+
q, k, v, scale_, mask_mode, mask_arr, std::nullopt, s);
107+
108+
// [V*B, H, L, D] -> [V, B, H, L, D]
109+
auto split_batch = [&s, vmap_size](const array& x) -> array {
110+
auto shape = x.shape();
111+
Shape new_shape = {vmap_size, shape[0] / vmap_size};
112+
new_shape.insert(new_shape.end(), shape.begin() + 1, shape.end());
113+
return reshape(x, std::move(new_shape), s);
114+
};
115+
116+
out = split_batch(out);
117+
118+
// The re-invoked SDPA may produce a logsumexp sibling when training.
119+
if (output_logsumexp_) {
120+
assert(
121+
!out.siblings().empty() &&
122+
"vmap'd SDPA expected logsumexp sibling output");
123+
auto lse = split_batch(out.siblings()[0]);
124+
return {{out, lse}, {0, 0}};
125+
}
126+
127+
return {{out}, {0}};
128+
}
129+
53130
array rms_norm(
54131
const array& x,
55132
const std::optional<array>& weight,

mlx/fast_primitives.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,10 @@ class ScaledDotProductAttention : public Custom {
244244
const std::vector<int>& argnums,
245245
const std::vector<array>& outputs) override;
246246

247+
// Vmap override: merges the vmap axis into the batch dimension and
// re-invokes scaled_dot_product_attention so head counts are recomputed
// from actual shapes; falls back to Custom::vmap when sinks are present.
std::pair<std::vector<array>, std::vector<int>> vmap(
248+
const std::vector<array>& inputs,
249+
const std::vector<int>& axes) override;
250+
247251
bool is_equivalent(const Primitive& other) const override;
248252

249253
DEFINE_NAME(ScaledDotProductAttention);

python/tests/test_fast_sdpa.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,116 @@ def test_sdpa_sliced(self):
642642
tolerance = {"rtol": 1e-2, "atol": 1e-2}
643643
self.assertTrue(mx.allclose(ref, out, **tolerance))
644644

645+
def test_sdpa_vmap_gqa(self):
    """vmap over SDPA with grouped/multi-query head layouts.

    Regression test for https://github.com/ml-explore/mlx/issues/3383:
    n_heads != n_kv_heads used to crash or hang under vmap.
    """
    head_dim = 64
    seq_len = 4
    scale = 1.0 / math.sqrt(head_dim)

    # (n_q_heads, n_kv_heads): MHA baseline, then GQA/MQA layouts,
    # then GQA with a larger head ratio.
    for n_q, n_kv in [(4, 4), (4, 2), (4, 1), (8, 2)]:
        with self.subTest(n_q_heads=n_q, n_kv_heads=n_kv):
            batch = 2
            q = mx.random.normal((batch, n_q, seq_len, head_dim))
            k = mx.random.normal((batch, n_kv, seq_len, head_dim))
            v = mx.random.normal((batch, n_kv, seq_len, head_dim))
            mx.eval(q, k, v)

            def f(qi, ki, vi):
                return mx.fast.scaled_dot_product_attention(
                    qi[None], ki[None], vi[None], scale=scale
                )[0]

            # Reference: apply f to each batch element separately.
            ref = mx.stack([f(q[i], k[i], v[i]) for i in range(batch)])

            # vmap version must match the per-element reference.
            out = mx.vmap(f)(q, k, v)
            mx.eval(out)

            self.assertListEqual(list(ref.shape), list(out.shape))
            self.assertTrue(
                mx.allclose(ref, out, atol=1e-5, rtol=1e-3),
                f"vmap output mismatch for n_q={n_q}, n_kv={n_kv}",
            )
690+
691+
def test_sdpa_vmap_gqa_grad(self):
    """vmap(grad) over SDPA with grouped/multi-query head layouts.

    Regression test for https://github.com/ml-explore/mlx/issues/3383
    """
    head_dim = 64
    seq_len = 4
    scale = 1.0 / math.sqrt(head_dim)

    for n_q, n_kv in [(4, 4), (4, 2), (4, 1)]:
        with self.subTest(n_q_heads=n_q, n_kv_heads=n_kv):
            batch = 2
            q = mx.random.normal((batch, n_q, seq_len, head_dim))
            k = mx.random.normal((batch, n_kv, seq_len, head_dim))
            v = mx.random.normal((batch, n_kv, seq_len, head_dim))
            mx.eval(q, k, v)

            def loss(qi, ki, vi):
                attn = mx.fast.scaled_dot_product_attention(
                    qi[None], ki[None], vi[None], scale=scale
                )
                return mx.mean(attn)

            out = mx.vmap(mx.grad(loss))(q, k, v)
            mx.eval(out)

            # Gradient w.r.t. q must carry q's shape.
            self.assertEqual(out.shape, q.shape)
721+
722+
def test_sdpa_vmap_gqa_with_mask(self):
    """vmap over causal-masked SDPA with GQA/MQA head layouts.

    Regression test for https://github.com/ml-explore/mlx/issues/3383
    """
    head_dim = 64
    seq_len = 8
    scale = 1.0 / math.sqrt(head_dim)

    for n_q, n_kv in [(4, 2), (4, 1)]:
        with self.subTest(n_q_heads=n_q, n_kv_heads=n_kv):
            batch = 2
            q = mx.random.normal((batch, n_q, seq_len, head_dim))
            k = mx.random.normal((batch, n_kv, seq_len, head_dim))
            v = mx.random.normal((batch, n_kv, seq_len, head_dim))
            mx.eval(q, k, v)

            def f(qi, ki, vi):
                return mx.fast.scaled_dot_product_attention(
                    qi[None], ki[None], vi[None], scale=scale, mask="causal"
                )[0]

            # Per-element reference versus the vmapped call.
            ref = mx.stack([f(q[i], k[i], v[i]) for i in range(batch)])

            out = mx.vmap(f)(q, k, v)
            mx.eval(out)

            self.assertListEqual(list(ref.shape), list(out.shape))
            self.assertTrue(
                mx.allclose(ref, out, atol=1e-5, rtol=1e-3),
                f"vmap+causal mismatch for n_q={n_q}, n_kv={n_kv}",
            )
754+
645755

646756
if __name__ == "__main__":
647757
mlx_tests.MLXTestRunner(failfast=True)

0 commit comments

Comments
 (0)