Skip to content

Commit d802f0e

Browse files
committed
feat(zero2): add CPU offload support for Muon optimizer
Add Muon optimizer support in the ZeRO Stage 1 & 2 CPU offload path:

1. Partition strategy: Muon param groups now partition by parameter boundaries (a parameter is never split across ranks), padding to a uniform maximum size for all-gather compatibility. The padding overhead ratio is logged.
2. CPU Newton-Schulz: add muon_update_cpu() and zeropower_via_newtonschulz5_cpu() using PyTorch CPU bf16 matmul as the baseline. The architecture allows future replacement with an AMX C++ kernel.
3. CPU offload integration: _apply_muon_update_for_cpu_offload() copies complete gradients to CPU, runs muon_update on CPU (the momentum buffer stays on CPU), and writes the result to the FP32 grad buffer — no extra PCIe transfers.

Signed-off-by: Ma, Guokai <guokai.ma@gmail.com>
1 parent abb88ce commit d802f0e

3 files changed

Lines changed: 274 additions & 2 deletions

File tree

deepspeed/runtime/zero/stage_1_and_2.py

Lines changed: 129 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,20 @@ def _enforce_cpu_offload():
445445
# set model bit16 weight to slices of flattened buffer
446446
self._update_model_bit16_weights(i)
447447

448+
# For Muon param groups, pad flat buffer so partition boundaries
449+
# never split a parameter. Each partition gets a uniform size
450+
# (max_partition_size) suitable for all-gather.
451+
if self._is_muon_param_group(i):
452+
dp_size = dist.get_world_size(group=self.real_dp_process_group[i])
453+
max_ps = self._get_muon_max_partition_size(self.round_robin_bit16_groups[i], dp_size, orig_group_numel)
454+
padded_size = max_ps * dp_size
455+
if padded_size > self.bit16_groups_flat[i].numel():
456+
pad_tensor = torch.zeros(padded_size - self.bit16_groups_flat[i].numel(),
457+
dtype=self.bit16_groups_flat[i].dtype,
458+
device=self.bit16_groups_flat[i].device)
459+
self.bit16_groups_flat[i] = torch.cat([self.bit16_groups_flat[i], pad_tensor])
460+
self._update_model_bit16_weights(i)
461+
448462
# divide the flat weights into near equal partition equal to the data parallel degree
449463
# each process will compute on a different part of the partition
450464
data_parallel_partitions = self.get_data_parallel_partitions(self.bit16_groups_flat[i], i)
@@ -1501,6 +1515,64 @@ def complete_grad_norm_calculation_for_cpu_offload(self, params):
15011515
return torch.tensor(total_norm, device=self.device, dtype=torch.float)
15021516

15031517
############################################################################################
1518+
def _apply_muon_update_for_cpu_offload(self, param):
    """Apply muon_update on CPU for a parameter in the CPU offload path.

    With Muon-aware partitioning (parameters never split across ranks),
    grad_accum is complete for this rank. This method copies it to CPU,
    runs the Muon (Newton-Schulz) update on CPU, writes the result to the
    FP32 grad buffer, and clears the GPU grad_accum.

    Args:
        param: bit16 model parameter whose gradient accumulation finished.

    Returns:
        True if muon_update was applied (caller must skip the normal
        async grad copy); False when the param or the wrapped optimizer
        is not Muon, or no gradient has been accumulated yet.
    """
    # Fast-path rejections: only Muon-flagged params under a Muon optimizer.
    if not getattr(param, 'use_muon', False):
        return False
    if 'muon' not in self.optimizer.__class__.__name__.lower():
        return False

    param_id = self.get_param_id(param)
    # grad_position maps param_id -> [group index, source offset, offset
    # into the fp32 partition buffer, element count]. source_offset is
    # unused here because the whole (unsplit) gradient is consumed.
    [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id]

    grad_accum = self.get_param_gradient_attribute(param)
    if grad_accum is None:
        return False

    # Copy full grad to CPU with original shape for Newton-Schulz.
    # NOTE(review): self.device is presumably the offload device (CPU) in
    # this path -- confirm against the cpu_offload setup.
    grad_cpu = grad_accum.detach().clone().to(device=self.device, dtype=torch.float32)

    # Get or create the momentum buffer for this param group. It is keyed
    # by the group's first optimizer param (presumably the flattened fp32
    # partition tensor -- verify against group construction) and sized to
    # hold every Muon param in this rank's partition, so it persists on
    # CPU across steps.
    flatten_copy = self.optimizer.param_groups[i]['params'][0]
    if 'momentum_buffer' not in self.optimizer.state[flatten_copy]:
        total_size = sum(p.numel() for p in self.params_in_partition[i] if getattr(p, 'use_muon', False))
        self.optimizer.state[flatten_copy]['momentum_buffer'] = torch.zeros(total_size,
                                                                            dtype=torch.float32,
                                                                            device=self.device)

    momentum_flat = self.optimizer.state[flatten_copy]['momentum_buffer']

    # Find this param's offset within the muon momentum buffer: sum the
    # sizes of the Muon params that precede it in the partition order.
    muon_offset = 0
    for p in self.params_in_partition[i]:
        if p is param:
            break
        if getattr(p, 'use_muon', False):
            muon_offset += p.numel()

    # momentum_cpu is a *view* into momentum_flat, reshaped to the param's
    # original shape for the matrix-valued Newton-Schulz iteration.
    momentum_cpu = momentum_flat[muon_offset:muon_offset + param.numel()].view(param.size())

    # Run muon update on CPU.
    # NOTE(review): this assumes muon_update mutates momentum_cpu in place
    # (since momentum_cpu is a view, the write-back below is then a no-op
    # self-copy); if muon_update were out-of-place, the momentum buffer
    # would never advance -- confirm muon_update's contract.
    beta = self.optimizer.param_groups[i].get('momentum', 0.95)
    update = muon_update(grad_cpu.view(param.size()), momentum_cpu, beta=beta)

    # Write updated momentum back to flat buffer
    momentum_flat[muon_offset:muon_offset + param.numel()] = momentum_cpu.view(-1)

    # Write the updated gradient into this param's slice of the CPU FP32
    # grad buffer, converting to the master-grad dtype.
    dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow(0, dest_offset, num_elements)
    dest_tensor.copy_(update.view(-1).to(self.master_weights_and_grads_dtype))

    # Clear the GPU grad_accum since we already consumed it
    self.clear_grad_attribute(param)
    return True
1575+
15041576
def copy_grads_in_partition(self, param):
15051577
if self.cpu_offload:
15061578

@@ -1512,7 +1584,8 @@ def copy_grads_in_partition(self, param):
15121584

15131585
self.update_offload_overflow_tracker_for_param_grad(param)
15141586

1515-
self.async_inplace_copy_grad_to_fp32_buffer_from_gpu(param)
1587+
if not self._apply_muon_update_for_cpu_offload(param):
1588+
self.async_inplace_copy_grad_to_fp32_buffer_from_gpu(param)
15161589

15171590
return
15181591
#print(f"ID {self.get_param_id(param)} grad norm {param.grad.norm()}")
@@ -1826,6 +1899,61 @@ def get_data_parallel_partitions(self, tensor, group_id):
18261899
start = start + partition_size
18271900
return partitions
18281901

1902+
def _is_muon_param_group(self, group_index):
1903+
"""Check if a parameter group uses the Muon optimizer."""
1904+
params = self.round_robin_bit16_groups[group_index]
1905+
return (params and getattr(params[0], 'use_muon', False)
1906+
and 'muon' in self.optimizer.__class__.__name__.lower())
1907+
1908+
def _get_muon_max_partition_size(self, tensor_list, dp, total_num_elements):
    """Compute the uniform (padded) partition size for Muon partitioning.

    Tensors are handed out sequentially to exactly ``dp`` partitions; a
    partition is closed as soon as it reaches the average size, provided
    later partitions still remain to be filled, so no tensor is ever split.
    All partitions are then padded up to the largest one so that all-gather
    operates on equal-sized chunks.

    Returns:
        max_partition_size: the uniform, alignment-rounded partition size.
    """
    avg_size = total_num_elements / dp
    sizes = []
    filled = 0

    for tensor in tensor_list:
        numel = tensor.numel()
        assert numel <= total_num_elements, (f"Muon parameter with {numel} elements exceeds total "
                                             f"{total_num_elements} elements.")
        # Close the current partition once it has reached the average size
        # and at least one more partition remains to be filled.
        if filled >= avg_size and len(sizes) < dp - 1:
            sizes.append(filled)
            filled = 0
        filled += numel

    if filled > 0:
        sizes.append(filled)
    # Any leftover partitions (more ranks than needed) stay empty.
    sizes.extend([0] * (dp - len(sizes)))

    assert len(sizes) == dp

    max_partition_size = max(sizes)
    # Align to nccl_start_alignment_factor to guarantee 4-byte partition boundaries
    alignment = self.nccl_start_alignment_factor * dp
    max_partition_size = -(-max_partition_size // alignment) * alignment
    total_padded = max_partition_size * dp
    if total_num_elements > 0:
        padding_ratio = (total_padded - total_num_elements) / total_num_elements
    else:
        padding_ratio = 0
    if dist.get_rank() == 0:
        logger.info(f"Muon partition: max_partition_size={max_partition_size}, "
                    f"total_elements={total_num_elements}, "
                    f"total_padded={total_padded}, "
                    f"padding_ratio={padding_ratio:.4f}")

    return max_partition_size
1956+
18291957
def get_partition_info(self, tensor_list, partition_size, partition_id):
18301958
params_in_partition = []
18311959
params_not_in_partition = []

docs/_pages/config-json.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ toc_label: "Contents"
3939
| type | The optimizer name. DeepSpeed natively supports **Adam**, **AdamW**, **OneBitAdam**, **Lamb**, **OneBitLamb**, and **Muon** optimizers (See [here](https://deepspeed.readthedocs.io/en/latest/optimizers.html) for details) and will import other optimizers from [torch](https://pytorch.org/docs/stable/optim.html). | `"Adam"` |
4040
| params | Dictionary of parameters to instantiate optimizer. The parameter names must match the optimizer constructor signature (e.g., for [Adam](https://pytorch.org/docs/stable/optim.html#torch.optim.Adam)). | `{"lr": 0.001, "eps": 1e-8}` |
4141

42-
Muon optimizer is supported with ZeRO Stage 1, 2, and 3. To use Muon, set the optimizer name to `Muon`. The parameters applied for Muon are automatically determined by the matrix shape and name. For ZeRO Stage 3 with NVMe offloading, set `save_muon_momentum_buffer_in_memory` to `true` under `zero_optimization` to keep the Muon momentum buffer in GPU/CPU memory instead of swapping to NVMe.
42+
Muon optimizer is supported with ZeRO Stage 1, 2, and 3, including CPU offload (`offload_optimizer`) for all stages. To use Muon, set the optimizer name to `Muon`. The parameters applied for Muon are automatically determined by the matrix shape and name. For ZeRO Stage 3 with NVMe offloading, set `save_muon_momentum_buffer_in_memory` to `true` under `zero_optimization` to keep the Muon momentum buffer in GPU/CPU memory instead of swapping to NVMe.
4343

4444
Example of <i>**optimizer**</i> with Adam
4545

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
3+
# DeepSpeed Team
4+
5+
import deepspeed
6+
import torch
7+
import pytest
8+
9+
from unit.common import DistributedTest
10+
from unit.simple_model import SimpleModel
11+
from deepspeed.accelerator import get_accelerator
12+
13+
if torch.half not in get_accelerator().supported_dtypes():
14+
pytest.skip(f"fp16 not supported", allow_module_level=True)
15+
16+
17+
@pytest.mark.parametrize('zero_stage', [2])
class TestMuonCPUOffload(DistributedTest):

    def test_momentum_buffer_on_cpu(self, zero_stage):
        """Verify Muon CPU offload creates momentum buffer on CPU.

        This is the key invariant: after a training step with CPU offload,
        the Muon momentum buffer must reside on CPU (not GPU), confirming
        that muon_update ran on CPU and no GPU memory is wasted.
        """
        hidden_dim = 32
        batch_size = 8

        ds_config = {
            "train_batch_size": batch_size,
            "optimizer": {
                "type": "muon",
                "params": {
                    "lr": 0.01
                }
            },
            "fp16": {
                "enabled": True
            },
            "zero_optimization": {
                "stage": zero_stage,
                "reduce_scatter": False,
                "offload_optimizer": {
                    "device": "cpu",
                    "pin_memory": True,
                },
            },
        }

        net = SimpleModel(hidden_dim=hidden_dim, nlayers=5)
        engine, optimizer, _, _ = deepspeed.initialize(config=ds_config,
                                                       model=net,
                                                       model_parameters=net.parameters(),
                                                       dist_init_required=False)

        # One full forward/backward/step so optimizer state materializes.
        inputs = torch.randn(batch_size, hidden_dim, device=engine.device, dtype=torch.half)
        labels = torch.randint(0, hidden_dim, (batch_size, ), device=engine.device)
        engine.backward(engine(inputs, labels))
        engine.step()

        # Muon momentum buffer must exist and be on CPU.
        # If muon_update was silently skipped, momentum_buffer would not be created.
        flat_param = optimizer.optimizer.param_groups[0]['params'][0]
        state = optimizer.optimizer.state[flat_param]
        assert 'momentum_buffer' in state, ("momentum_buffer not found in optimizer state. "
                                            "muon_update was not called in the CPU offload path.")
        assert state['momentum_buffer'].device.type == 'cpu', (
            f"Momentum buffer is on {state['momentum_buffer'].device}, expected CPU")
72+
73+
74+
@pytest.mark.parametrize('zero_stage', [2])
class TestMuonCPUOffloadCosim(DistributedTest):

    def test_cosim_offload_vs_no_offload(self, zero_stage):
        """Verify CPU offload produces results consistent with GPU path.

        With the same random seed, offload and non-offload should produce
        close parameters. If muon_update is skipped or wrong in either path,
        the results diverge significantly.
        """
        hidden_dim = 32
        batch_size = 8

        def train(offload):
            # Same seed for both runs so model init and data draws match.
            torch.manual_seed(42)
            config_dict = {
                "train_batch_size": batch_size,
                "optimizer": {
                    "type": "muon",
                    "params": {
                        "lr": 0.01
                    }
                },
                "fp16": {
                    "enabled": True
                },
                "zero_optimization": {
                    "stage": zero_stage,
                    "reduce_scatter": False,
                },
            }
            if offload:
                config_dict["zero_optimization"]["offload_optimizer"] = {
                    "device": "cpu",
                    "pin_memory": True,
                }

            model = SimpleModel(hidden_dim=hidden_dim, nlayers=5)
            engine, _, _, _ = deepspeed.initialize(
                config=config_dict,
                model=model,
                model_parameters=model.parameters(),
                dist_init_required=False,
            )

            for _ in range(3):
                x = torch.randn(batch_size, hidden_dim, device=engine.device, dtype=torch.half)
                y = torch.randint(0, hidden_dim, (batch_size, ), device=engine.device)
                loss = engine(x, y)
                engine.backward(loss)
                engine.step()

            # Snapshot final parameters on CPU in fp32 for comparison.
            return {n: p.clone().detach().float().cpu() for n, p in model.named_parameters()}

        params_offload = train(offload=True)
        params_no_offload = train(offload=False)

        for name in params_offload:
            p_off = params_offload[name]
            p_no = params_no_offload[name]
            # Both paths should produce the same NaN pattern.
            # BUG FIX: the previous check compared the union of the two NaN
            # masks against p_off's mask alone, which only verified that
            # p_no's NaNs were a *subset* of p_off's; extra NaNs in p_off
            # went undetected. Require exact mask equality instead.
            assert p_off.isnan().equal(p_no.isnan()), (f"{name}: NaN pattern differs between offload and non-offload. "
                                                       "muon_update produced different results.")
            # On non-NaN elements, cosine similarity should be very high
            valid = ~p_off.isnan()
            if valid.sum() > 0:
                cos_sim = torch.nn.functional.cosine_similarity(p_off[valid].unsqueeze(0),
                                                                p_no[valid].unsqueeze(0)).item()
                assert cos_sim > 0.99, (f"{name}: cosine similarity {cos_sim:.4f} between offload and "
                                        f"non-offload is too low, indicating muon_update results diverge.")

0 commit comments

Comments
 (0)