Merge branch 'main' of github.com:ml-explore/mlx into rocm-support-fixes

goniz · goniz · commit 879a200ce812 · 2026-03-03T17:25:22.000+02:00
diff --git a/.github/actions/build-cuda-release/action.yml b/.github/actions/build-cuda-release/action.yml
@@ -20,7 +20,7 @@ runs:
       run: |
         pip install auditwheel build patchelf setuptools
         python setup.py clean --all
-        MLX_BUILD_STAGE=2 python -m build -w
+        MLX_DISABLE_SM90A_KERNELS=1 MLX_BUILD_STAGE=2 python -m build -w
 
         auditwheel repair dist/mlx_cuda*.whl \
           --plat manylinux_2_35_${{ inputs.arch }} \
diff --git a/docs/src/python/nn.rst b/docs/src/python/nn.rst
@@ -175,6 +175,7 @@ In detail:
    value_and_grad
    quantize
    average_gradients
+   fsdp_apply_gradients
 
 .. toctree::
 
diff --git a/mlx/backend/cuda/CMakeLists.txt b/mlx/backend/cuda/CMakeLists.txt
@@ -158,8 +158,10 @@ message(STATUS "CUDA architectures: ${MLX_CUDA_ARCHITECTURES}")
 set_target_properties(mlx PROPERTIES CUDA_ARCHITECTURES
                                      "${MLX_CUDA_ARCHITECTURES}")
 
-if(("90a" IN_LIST MLX_CUDA_ARCHITECTURES) OR ("90a-real" IN_LIST
-                                              MLX_CUDA_ARCHITECTURES))
+# Skip Hopper-only kernels if MLX_DISABLE_SM90A_KERNELS is defined or sm90a is not targeted.
+if(NOT DEFINED ENV{MLX_DISABLE_SM90A_KERNELS}
+   AND (("90a" IN_LIST MLX_CUDA_ARCHITECTURES) OR ("90a-real" IN_LIST
+                                                   MLX_CUDA_ARCHITECTURES)))
   target_compile_definitions(mlx PRIVATE MLX_CUDA_SM90A_ENABLED)
 endif()
 
diff --git a/mlx/backend/metal/device.cpp b/mlx/backend/metal/device.cpp
@@ -323,9 +323,18 @@ Device::Device() {
   auto pool = new_scoped_memory_pool();
   device_ = load_device();
   default_library_ = load_default_library(device_);
-  arch_ = std::string(device_->architecture()->name()->utf8String());
-  int ag_tens = arch_[arch_.size() - 3] - '0';
-  int ag_ones = arch_[arch_.size() - 2] - '0';
+  arch_ = env::metal_gpu_arch();
+  if (arch_.empty()) {
+    arch_ = std::string(device_->architecture()->name()->utf8String());
+  }
+  int ag_tens = 0;
+  int ag_ones = 0;
+  if (arch_.size() >= 3) {
+    ag_tens = arch_[arch_.size() - 3] - '0';
+    ag_ones = arch_[arch_.size() - 2] - '0';
+    ag_tens = (ag_tens < 10 && ag_tens >= 0) ? ag_tens : 0;
+    ag_ones = (ag_ones < 10 && ag_ones >= 0) ? ag_ones : 0;
+  }
   arch_gen_ = ag_tens * 10 + ag_ones;
   auto arch = arch_.back();
   switch (arch) {
diff --git a/mlx/backend/metal/device_info.cpp b/mlx/backend/metal/device_info.cpp
@@ -21,9 +21,10 @@ device_info(int device_index) {
   auto init_device_info = []()
       -> std::unordered_map<std::string, std::variant<std::string, size_t>> {
     auto pool = metal::new_scoped_memory_pool();
-    auto raw_device = metal::device(mlx::core::Device::gpu).mtl_device();
+    auto& device = metal::device(mlx::core::Device::gpu);
+    auto raw_device = device.mtl_device();
     auto name = std::string(raw_device->name()->utf8String());
-    auto arch = std::string(raw_device->architecture()->name()->utf8String());
+    auto arch = device.get_architecture();
 
     size_t memsize = 0;
     size_t length = sizeof(memsize);
diff --git a/mlx/backend/metal/quantized.cpp b/mlx/backend/metal/quantized.cpp
@@ -82,10 +82,9 @@ inline array ensure_row_contiguous_matrix(
 }
 
 inline int get_qmv_batch_limit(int D, int O, metal::Device& d) {
-  auto arch = d.get_architecture();
-  auto arch_size = arch.back();
-  auto arch_gen = arch.substr(arch.size() - 3, 2);
-  if (arch_gen == "13" || arch_gen == "14") {
+  auto arch_size = d.get_architecture().back();
+  auto arch_gen = d.get_architecture_gen();
+  if (arch_gen == 13 || arch_gen == 14) {
     switch (arch_size) {
       case 'd':
         if (D <= 2048 && O <= 2048) {
diff --git a/mlx/utils.cpp b/mlx/utils.cpp
@@ -258,6 +258,14 @@ int get_var(const char* name, int default_value) {
   }
 }
 
+std::string get_var(const char* name, const char* default_value) {
+  if (const char* buff_str = std::getenv(name)) {
+    return buff_str;
+  } else {
+    return default_value;
+  }
+}
+
 } // namespace env
 
 template <typename T>
diff --git a/mlx/utils.h b/mlx/utils.h
@@ -136,6 +136,7 @@ inline int next_power_of_2(int n) {
 namespace env {
 
 int get_var(const char* name, int default_value);
+std::string get_var(const char* name, const char* default_value);
 
 inline int bfs_max_width() {
   static int bfs_max_width_ = get_var("MLX_BFS_MAX_WIDTH", 20);
@@ -169,6 +170,11 @@ inline int nccl_timeout(int default_value) {
   return nccl_timeout;
 }
 
+inline const std::string& metal_gpu_arch() {
+  static std::string gpu_arch_ = get_var("MLX_METAL_GPU_ARCH", "");
+  return gpu_arch_;
+}
+
 } // namespace env
 
 } // namespace mlx::core
diff --git a/mlx/version.h b/mlx/version.h
@@ -5,8 +5,8 @@
 #include "mlx/api.h"
 
 #define MLX_VERSION_MAJOR 0
-#define MLX_VERSION_MINOR 30
-#define MLX_VERSION_PATCH 7
+#define MLX_VERSION_MINOR 31
+#define MLX_VERSION_PATCH 1
 #define MLX_VERSION_NUMERIC \
   (100000 * MLX_VERSION_MAJOR + 1000 * MLX_VERSION_MINOR + MLX_VERSION_PATCH)
 
diff --git a/python/mlx/nn/__init__.py b/python/mlx/nn/__init__.py
@@ -2,4 +2,8 @@
 
 from mlx.nn import init, losses
 from mlx.nn.layers import *
-from mlx.nn.utils import average_gradients, value_and_grad
+from mlx.nn.utils import (
+    average_gradients,
+    fsdp_apply_gradients,
+    value_and_grad,
+)
diff --git a/python/mlx/nn/utils.py b/python/mlx/nn/utils.py
@@ -5,7 +5,7 @@
 
 import mlx.core as mx
 
-from ..utils import tree_flatten, tree_map, tree_unflatten
+from ..utils import tree_flatten, tree_map, tree_reduce, tree_unflatten
 from .layers.base import Module
 
 
@@ -71,6 +71,31 @@ def wrapped_checkpointed_fn(*args, **kwargs):
     return wrapped_checkpointed_fn
 
 
+def _extract_info(flat):
+    keys = [k for k, _ in flat]
+    shapes = [g.shape for _, g in flat]
+    sizes = [g.size for _, g in flat]
+    dtypes = [g.dtype for _, g in flat]
+    return keys, shapes, sizes, dtypes
+
+
+def _group_by_size(keys, sizes, itemsize, communication_size):
+    grad_groups = []
+    grad_group = []
+    grad_group_size = 0
+    for i in range(len(keys)):
+        grad_group.append(i)
+        grad_group_size += sizes[i] * itemsize
+        if grad_group_size >= communication_size:
+            grad_groups.append(grad_group)
+            grad_group = []
+            grad_group_size = 0
+    if grad_group:
+        grad_groups.append(grad_group)
+        grad_group = []
+    return grad_groups
+
+
 def average_gradients(
     gradients: Any,
     group: Optional[mx.distributed.Group] = None,
@@ -95,7 +120,7 @@ def average_gradients(
         communication_type (Optional[mlx.core.Dtype]): If provided cast to this
             type before performing the communication. Typically cast to a
             smaller float to reduce the communication size. Default: ``None``.
-        communication_stream (Optional[mlx.core.Stream]): The stream to usse
+        communication_stream (Optional[mlx.core.Stream]): The stream to use
             for the communication. If unspecified the default communication
             stream is used which can vary by back-end. Default: ``None``.
     """
@@ -119,10 +144,7 @@ def _average(x):
             return gradients
 
         # Extract some info for the gradient
-        keys = [k for k, _ in flat_grads]
-        shapes = [v.shape for _, v in flat_grads]
-        sizes = [v.size for _, v in flat_grads]
-        dtypes = [v.dtype for _, v in flat_grads]
+        keys, shapes, sizes, dtypes = _extract_info(flat_grads)
 
         # We can't group them if they have mixed types
         if not all(dt == dtypes[0] for dt in dtypes):
@@ -134,19 +156,7 @@ def _average(x):
         )
 
         # Gather the gradients in groups that are just above or equal to all_reduce_size
-        grad_groups = []
-        grad_group = []
-        grad_group_size = 0
-        for i in range(len(keys)):
-            grad_group.append(i)
-            grad_group_size += sizes[i] * itemsize
-            if grad_group_size >= all_reduce_size:
-                grad_groups.append(grad_group)
-                grad_group = []
-                grad_group_size = 0
-        if grad_group:
-            grad_groups.append(grad_group)
-            grad_group = []
+        grad_groups = _group_by_size(keys, sizes, itemsize, all_reduce_size)
 
         # Concatenate-reduce-split
         new_flat_grads = []
@@ -163,3 +173,153 @@ def _average(x):
             )
 
         return tree_unflatten(new_flat_grads)
+
+
+def _clip_grads_fsdp(grads_slice, max_norm):
+    local_norm_sq = tree_reduce(lambda acc, g: acc + g.square().sum(), grads_slice, 0.0)
+    global_norm_sq = mx.distributed.all_sum(local_norm_sq)
+    grad_norm = mx.sqrt(global_norm_sq)
+    normalizer = mx.minimum(max_norm / (grad_norm + 1e-6), 1.0)
+    grads_slice = tree_map(lambda g: g * normalizer, grads_slice)
+
+    return grads_slice, grad_norm
+
+
+def fsdp_apply_gradients(
+    gradients,
+    parameters,
+    optimizer,
+    group=None,
+    communication_size=32 * 1024**2,
+    communication_type=None,
+    communication_stream=None,
+    max_norm=None,
+):
+    """Perform a distributed optimizer step by sharding gradients and optimizer states across ranks.
+
+    This helper function performs the following steps:
+    1. Reduce-scatter the gradients across ranks so each rank gets a shard of the averaged gradients.
+    2. Optionally clip the sharded gradients by global norm.
+    3. Apply the optimizer update on the local parameter slice using the sharded gradients.
+    4. All-gather the updated parameter slices from all ranks to reconstruct the full parameters tree.
+
+    This is similar to PyTorch's FSDP with ``reshard_after_forward=False``.
+
+    Args:
+        gradients (Any): The Python tree containing the full gradients (it should
+            have the same structure as ``parameters``). Each gradient's first
+            dimension must be divisible by the world size.
+        parameters (Any): The Python tree containing the full parameters (it should
+            have the same structure across processes). Each parameter's first
+            dimension must be divisible by the world size.
+        optimizer: Optimizer with an ``apply_gradients`` method.
+        group (Optional[mlx.core.distributed.Group]): The group of processes for
+            communication. If ``None``, the global group is used.
+            Default: ``None``.
+        communication_size (int): Group arrays until their size in bytes reaches
+            or exceeds this number. Perform one communication step per group.
+            If less than or equal to 0, array grouping is disabled. Default: ``32MiB``.
+        communication_type (Optional[mlx.core.Dtype]): If provided cast to this
+            type before performing the communication. Typically cast to a
+            smaller float to reduce the communication size. Default: ``None``.
+        communication_stream (Optional[mlx.core.Stream]): The stream to use
+            for the communication. If unspecified the default communication
+            stream is used which can vary by back-end. Default: ``None``.
+        max_norm (Optional[float]): If provided, clip gradients to this
+            maximum global norm before applying the optimizer update.
+            Default: ``None``.
+
+    Returns:
+        If ``max_norm`` is ``None``, returns the updated full-parameter tree.
+        Otherwise returns ``(parameters, grad_norm)``, where ``grad_norm`` is
+        the global gradient norm before clipping.
+
+    Example:
+
+        >>> optimizer = optim.SGD(learning_rate=0.01)
+        >>> # Without gradient clipping
+        >>> updated_params = fsdp_apply_gradients(grads, params, optimizer)
+        >>> model.update(updated_params)
+        >>>
+        >>> # With gradient clipping
+        >>> updated_params, grad_norm = fsdp_apply_gradients(
+        ...     grads, params, optimizer, max_norm=1.0
+        ... )
+        >>> model.update(updated_params)
+    """
+    group = group or mx.distributed.init()
+    N = group.size()
+    rank = group.rank()
+
+    if N == 1:
+        if max_norm is not None:
+            gradients, grad_norm = _clip_grads_fsdp(gradients, max_norm)
+            return optimizer.apply_gradients(gradients, parameters), grad_norm
+        return optimizer.apply_gradients(gradients, parameters)
+
+    flat_grads = tree_flatten(gradients)
+    flat_params = tree_flatten(parameters)
+
+    def _sum_scatter(x):
+        dt = x.dtype
+        x = x.astype(communication_type) if communication_type is not None else x
+        return (
+            mx.distributed.sum_scatter(
+                x, group=group, stream=communication_stream
+            ).astype(dt)
+            / N
+        )
+
+    def _all_gather(x):
+        dt = x.dtype
+        x = x.astype(communication_type) if communication_type is not None else x
+        return mx.distributed.all_gather(
+            x, group=group, stream=communication_stream
+        ).astype(dt)
+
+    keys, shapes, sizes, dtypes = _extract_info(flat_grads)
+    itemsize = dtypes[0].size
+
+    groups = _group_by_size(keys, sizes, itemsize, communication_size)
+
+    # reduce-scatter gradients, shard parameters
+    grad_slices = {}
+    param_slices = {}
+    for group_idx, arr_group in enumerate(groups):
+        big_grad = mx.concatenate(
+            [flat_grads[i][1].reshape(N, -1) for i in arr_group], axis=1
+        )
+        grad_slices[group_idx] = _sum_scatter(big_grad)
+        big_param = mx.concatenate(
+            [flat_params[i][1].reshape(N, -1) for i in arr_group], axis=1
+        )
+        param_slices[group_idx] = big_param[rank]
+
+    # clip gradients if needed
+    grad_norm = None
+    if max_norm is not None:
+        grad_slices, grad_norm = _clip_grads_fsdp(grad_slices, max_norm)
+
+    # optimizer step
+    updated_param_slices = optimizer.apply_gradients(grad_slices, param_slices)
+
+    # all-gather and reconstruct
+    new_flat = []
+    for group_idx, arr_group in enumerate(groups):
+        big_gathered = _all_gather(updated_param_slices[group_idx].reshape(1, -1))
+
+        split_sizes = [sizes[i] // N for i in arr_group]
+        split_indices = []
+        acc = 0
+        for s in split_sizes:
+            acc += s
+            split_indices.append(acc)
+
+        parts = mx.split(big_gathered, split_indices[:-1], axis=1)
+        for idx_in_group, i in enumerate(arr_group):
+            new_flat.append((keys[i], parts[idx_in_group].reshape(shapes[i])))
+
+    result = tree_unflatten(new_flat)
+    if max_norm is not None:
+        return result, grad_norm
+    return result
diff --git a/python/src/load.cpp b/python/src/load.cpp
diff --git a/python/tests/nccl_test_distributed.py b/python/tests/nccl_test_distributed.py

Original file line number	Diff line number	Diff line change
`@@ -258,6 +258,14 @@ int get_var(const char* name, int default_value) {`
`258`	`258`	`}`
`259`	`259`	`}`
`260`	`260`
	`261`	`+std::string get_var(const char* name, const char* default_value) {`
	`262`	`+ if (const char* buff_str = std::getenv(name)) {`
	`263`	`+ return buff_str;`
	`264`	`+ } else {`
	`265`	`+ return default_value;`
	`266`	`+ }`
	`267`	`+}`
	`268`	`+`
`261`	`269`	`} // namespace env`
`262`	`270`
`263`	`271`	`template <typename T>`