NripeshN
diff --git a/‎.github/actions/build-cuda-release/action.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/actions/build-cuda-release/action.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/actions/build-linux/action.yml‎
Lines changed: 2 additions & 1 deletion b/‎.github/actions/build-linux/action.yml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎.github/actions/build-macos/action.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/actions/build-macos/action.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/actions/test-linux/action.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/actions/test-linux/action.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/nightly.yml‎
Lines changed: 3 additions & 3 deletions b/‎.github/workflows/nightly.yml‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎.github/workflows/release.yml‎
Lines changed: 10 additions & 10 deletions b/‎.github/workflows/release.yml‎
Lines changed: 10 additions & 10 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 5 additions & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎benchmarks/python/block_masked_mm_bench.py‎
Lines changed: 193 additions & 0 deletions b/‎benchmarks/python/block_masked_mm_bench.py‎
Lines changed: 193 additions & 0 deletions
@@ -20,7 +20,7 @@ runs:
       run: |
         pip install auditwheel build patchelf setuptools
         python setup.py clean --all
-        MLX_BUILD_STAGE=2 python -m build -w
+        MLX_DISABLE_SM90A_KERNELS=1 MLX_BUILD_STAGE=2 python -m build -w
 
         auditwheel repair dist/mlx_cuda*.whl \
           --plat manylinux_2_35_${{ inputs.arch }} \
 
@@ -9,6 +9,7 @@ inputs:
 runs:
   using: "composite"
   steps:
+
     - name: Install Python package
       id: python_build
       shell: sh
@@ -20,7 +21,7 @@ runs:
       run: |
         if ${{ startsWith(inputs.toolkit, 'cuda') && runner.arch == 'arm64' }} ; then
           # There is no GPU in arm64 runner, use a common arch.
-          CMAKE_ARGS="$CMAKE_ARGS -DMLX_CUDA_ARCHITECTURES=90a"
+          CMAKE_ARGS="$CMAKE_ARGS -DMLX_CUDA_ARCHITECTURES=80"
           # Can not build tests and stubs when the built executables can not run.
           CMAKE_ARGS="$CMAKE_ARGS -DMLX_BUILD_TESTS=OFF -DMLX_BUILD_PYTHON_STUBS=OFF"
         fi
 
@@ -12,12 +12,12 @@ runs:
       run: |
         pip install --upgrade pip
         pip install cmake setuptools typing_extensions
-        pip install -e . -v
+        pip install -e ".[dev]" -v
 
     - name: Install tests dependencies
       shell: bash -l {0}
       run: |
-        pip install numpy torch tensorflow
+        pip install tensorflow
 
     - name: Run Python tests
       shell: bash -l {0}
 
@@ -65,5 +65,5 @@ runs:
         DEVICE: gpu
       run: |
         echo "::group::CPP tests - GPU"
-        ./build/tests/tests -sfe="*fft_tests.cpp,*linalg_tests.cpp"
+        ./build/tests/tests -sfe="*linalg_tests.cpp"
         echo "::endgroup::"
@@ -23,14 +23,14 @@ jobs:
           build-backend: ${{ matrix.python-version == '3.10' }}
           arch: "x86_64"
       - name: Upload mlx artifacts
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v7
         with:
           name: linux-wheels-${{ matrix.python_version }}
           path: wheelhouse/mlx-*.whl
           retention-days: 7
       - name: Upload mlx-cpu artifacts
         if: matrix.python_version == '3.10'
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v7
         with:
           name: mlx-cpu
           path: wheelhouse/mlx_cpu-*.whl
@@ -97,7 +97,7 @@ jobs:
           toolkit: 'cuda-12.9'
           arch: 'x86_64'
       - name: Upload artifacts
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v7
         with:
           name: mlx-cuda
           path: wheelhouse/mlx_cuda_*.whl
 
@@ -64,15 +64,15 @@ jobs:
           build-backend: ${{ matrix.python_version == '3.10' }}
           arch: ${{ matrix.arch }}
       - name: Upload MLX artifacts
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v7
         with:
           overwrite: true
           name: linux-wheels-${{ matrix.python_version }}-${{ matrix.arch }}
           path: wheelhouse/mlx-*.whl
           if-no-files-found: error
       - name: Upload CPU artifacts
         if: matrix.python_version == '3.10'
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v7
         with:
           overwrite: true
           name: mlx-cpu-${{ matrix.arch }}
@@ -116,15 +116,15 @@ jobs:
           macos-target: 26.0
           build-backend: ${{ matrix.python-version == '3.10' }}
       - name: Upload MLX artifacts
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v7
         with:
           overwrite: true
           name: mac-wheels-${{ matrix.python-version }}
           path: dist/mlx-*.whl
           if-no-files-found: error
       - name: Upload Metal artifacts
         if: matrix.python-version == '3.10'
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v7
         with:
           overwrite: true
           name: mlx-metal
@@ -152,7 +152,7 @@ jobs:
         with:
           arch: ${{ matrix.arch }}
       - name: Upload artifacts
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v7
         with:
           overwrite: true
           name: mlx-${{ matrix.toolkit }}-${{ matrix.arch }}
@@ -169,12 +169,12 @@ jobs:
       name: ${{ inputs.dry_run && 'dry-run' || 'pypi' }}
       url: https://pypi.org/p/mlx
     steps:
-      - uses: actions/download-artifact@v7
+      - uses: actions/download-artifact@v8
         with:
           pattern: linux-wheels-*
           merge-multiple: true
           path: dist
-      - uses: actions/download-artifact@v7
+      - uses: actions/download-artifact@v8
         with:
           pattern: mac-wheels-*
           merge-multiple: true
@@ -197,7 +197,7 @@ jobs:
       name: ${{ inputs.dry_run && 'dry-run' || 'pypi' }}
       url: https://pypi.org/p/mlx-cuda
     steps:
-      - uses: actions/download-artifact@v7
+      - uses: actions/download-artifact@v8
         with:
           pattern: mlx-cuda-*
           merge-multiple: true
@@ -220,7 +220,7 @@ jobs:
       name: ${{ inputs.dry_run && 'dry-run' || 'pypi' }}
       url: https://pypi.org/p/mlx-cpu
     steps:
-      - uses: actions/download-artifact@v7
+      - uses: actions/download-artifact@v8
         with:
           pattern: mlx-cpu-*
           merge-multiple: true
@@ -243,7 +243,7 @@ jobs:
       name: ${{ inputs.dry_run && 'dry-run' || 'pypi' }}
       url: https://pypi.org/p/mlx-metal
     steps:
-      - uses: actions/download-artifact@v7
+      - uses: actions/download-artifact@v8
         with:
           name: mlx-metal
           path: dist
 
@@ -157,6 +157,10 @@ if(MLX_BUILD_CUDA)
   enable_language(CUDA)
   find_package(CUDAToolkit REQUIRED)
   find_package(CUDNN REQUIRED)
+  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL "13.1" AND CUDAToolkit_VERSION
+                                                          VERSION_LESS "13.2")
+    message(FATAL_ERROR "CUDA Toolkit 13.1 is not supported.")
+  endif()
 endif()
 
 if(MLX_BUILD_ROCM)
@@ -369,7 +373,7 @@ else()
   FetchContent_Declare(
     fmt
     GIT_REPOSITORY https://github.com/fmtlib/fmt.git
-    GIT_TAG 10.2.1
+    GIT_TAG 12.1.0
     EXCLUDE_FROM_ALL)
   FetchContent_MakeAvailable(fmt)
 endif()
 
@@ -0,0 +1,193 @@
+# Copyright © 2025 Apple Inc.
+
+import argparse
+import time
+
+import mlx.core as mx
+import numpy as np
+
+# Maps the --dtype command-line names to the corresponding MLX dtypes.
+MLX_DTYPES = {
+    "float16": mx.float16,
+    "bfloat16": mx.bfloat16,
+    "float32": mx.float32,
+}
+
+
+def parse_cases(cases):
+    parsed = []
+    for spec in cases.split(","):
+        parts = spec.split("x")
+        m, n, k, bs = int(parts[0]), int(parts[1]), int(parts[2]), int(parts[3])
+        sparsity = float(parts[4]) if len(parts) > 4 else 0.5
+        parsed.append((m, n, k, bs, sparsity))
+    return parsed
+
+
+def make_masks(m, n, k, block_size, sparsity, rng):
+    """Create block masks with given sparsity (fraction of blocks zeroed)."""
+    tm = (m + block_size - 1) // block_size
+    tn = (n + block_size - 1) // block_size
+    tk = (k + block_size - 1) // block_size
+
+    lhs_mask = (rng.random((tm, tk)) >= sparsity).astype(np.bool_)
+    rhs_mask = (rng.random((tk, tn)) >= sparsity).astype(np.bool_)
+    out_mask = (rng.random((tm, tn)) >= sparsity).astype(np.bool_)
+    return lhs_mask, rhs_mask, out_mask
+
+
+def mlx_naive_block_masked_mm(a, b, block_size, out_mask, lhs_mask, rhs_mask):
+    """MLX naive: expand masks and use regular matmul."""
+    M, K = a.shape[-2], a.shape[-1]
+    N = b.shape[-1]
+
+    def expand(mask, rows, cols):
+        e = mx.repeat(mx.repeat(mask, block_size, axis=-2), block_size, axis=-1)
+        return e[..., :rows, :cols]
+
+    a_masked = a * expand(lhs_mask, M, K)
+    b_masked = b * expand(rhs_mask, K, N)
+    c = a_masked @ b_masked
+    c = c * expand(out_mask, M, N)
+    return c
+
+
+def bench_mlx(fn, warmup, iters):
+    for _ in range(warmup):
+        y = fn()
+        mx.eval(y)
+    mx.synchronize()
+
+    start = time.perf_counter()
+    for _ in range(iters):
+        y = fn()
+        mx.eval(y)
+    mx.synchronize()
+    return (time.perf_counter() - start) * 1e3 / iters
+
+
+def print_table(headers, rows):
+    widths = [len(h) for h in headers]
+    for row in rows:
+        for i, cell in enumerate(row):
+            widths[i] = max(widths[i], len(cell))
+
+    def fmt_row(row):
+        return (
+            "| "
+            + " | ".join(f"{cell:<{widths[i]}}" for i, cell in enumerate(row))
+            + " |"
+        )
+
+    sep = "|-" + "-|-".join("-" * w for w in widths) + "-|"
+    print(fmt_row(headers))
+    print(sep)
+    for row in rows:
+        print(fmt_row(row))
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Benchmark block_masked_mm vs naive expand+matmul"
+    )
+    parser.add_argument(
+        "--cases",
+        default=(
+            "256x256x256x32x0.5,"
+            "512x512x512x32x0.5,"
+            "1024x1024x1024x32x0.5,"
+            "1024x1024x1024x64x0.5,"
+            "2048x2048x2048x64x0.5,"
+            "256x256x256x32x0.0,"
+            "1024x1024x1024x32x0.0,"
+            "1024x1024x1024x32x0.9"
+        ),
+        help="Comma-separated MxNxKxBSxSparsity list. Sparsity=fraction of blocks zeroed.",
+    )
+    parser.add_argument(
+        "--dtype",
+        default="float32",
+        choices=["float16", "bfloat16", "float32"],
+    )
+    parser.add_argument("--warmup", type=int, default=10)
+    parser.add_argument("--iters", type=int, default=50)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--no-check", action="store_true")
+    args = parser.parse_args()
+
+    mlx_dtype = MLX_DTYPES[args.dtype]
+
+    print(f"dtype={args.dtype}  warmup={args.warmup}  iters={args.iters}")
+
+    headers = [
+        "Case (MxNxKxBS)",
+        "Sparsity",
+        "MLX ms",
+        "Naive ms",
+        "Speedup",
+    ]
+    if not args.no_check:
+        headers.append("Max err")
+    rows = []
+
+    cases = parse_cases(args.cases)
+    for idx, (m, n, k, bs, sparsity) in enumerate(cases):
+        rng = np.random.default_rng(args.seed + idx)
+        a_np = rng.standard_normal((m, k)).astype(np.float32)
+        b_np = rng.standard_normal((k, n)).astype(np.float32)
+        lhs_mask_np, rhs_mask_np, out_mask_np = make_masks(m, n, k, bs, sparsity, rng)
+
+        a_mx = mx.array(a_np, dtype=mlx_dtype)
+        b_mx = mx.array(b_np, dtype=mlx_dtype)
+        lhs_mask_mx = mx.array(lhs_mask_np)
+        rhs_mask_mx = mx.array(rhs_mask_np)
+        out_mask_mx = mx.array(out_mask_np)
+        mx.eval(a_mx, b_mx, lhs_mask_mx, rhs_mask_mx, out_mask_mx)
+
+        # Correctness check: block_masked_mm vs naive expand+matmul
+        err_str = ""
+        if not args.no_check:
+            y_op = mx.block_masked_mm(
+                a_mx, b_mx, bs, out_mask_mx, lhs_mask_mx, rhs_mask_mx
+            )
+            y_naive = mlx_naive_block_masked_mm(
+                a_mx, b_mx, bs, out_mask_mx, lhs_mask_mx, rhs_mask_mx
+            )
+            mx.eval(y_op, y_naive)
+            err = float(mx.max(mx.abs(y_op - y_naive)).item())
+            err_str = f"{err:.2e}"
+
+        # Benchmark
+        t_mlx = bench_mlx(
+            lambda: mx.block_masked_mm(
+                a_mx, b_mx, bs, out_mask_mx, lhs_mask_mx, rhs_mask_mx
+            ),
+            args.warmup,
+            args.iters,
+        )
+        t_naive = bench_mlx(
+            lambda: mlx_naive_block_masked_mm(
+                a_mx, b_mx, bs, out_mask_mx, lhs_mask_mx, rhs_mask_mx
+            ),
+            args.warmup,
+            args.iters,
+        )
+        speedup = f"{t_naive / t_mlx:.2f}x" if t_mlx > 0 else "-"
+
+        row = [
+            f"{m}x{n}x{k}x{bs}",
+            f"{sparsity:.0%}",
+            f"{t_mlx:.3f}",
+            f"{t_naive:.3f}",
+            speedup,
+        ]
+        if not args.no_check:
+            row.append(err_str)
+        rows.append(row)
+
+    print_table(headers, rows)
+    if not args.no_check:
+        print("err: max|block_masked_mm - naive_expand_matmul|")
+
+
+# Run the benchmark only when this file is executed as a script.
+if __name__ == "__main__":
+    main()