Reapply "fix: update 1 file reformatted."

Copilot · nathon-lee · commit be87a3cd10a3 · 2026-03-07T02:46:58.000Z
This reverts commit b90aee5.

Signed-off-by: nathon-lee <leejianwoo@gmail.com>
diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -284,18 +284,11 @@ def _enforce_cpu_offload():
 
         self.low_precision_master_weights_and_grads = self.master_weights_and_grads_dtype != torch.float32
 
-        # Check for Muon optimizer usage
-        self.uses_muon = any(getattr(param, 'use_muon', False) for group in self.optimizer.param_groups for param in group['params'])
-        
         if self.reduce_scatter and self.partition_gradients:
             valid_reduce_scatter_dtypes = (torch.float16, torch.bfloat16, torch.float32)
             assert self.communication_data_type in valid_reduce_scatter_dtypes, f"{self.zero_stage_string} supports {valid_reduce_scatter_dtypes} communication_data_type with reduce scatter enabled. Got: '{self.communication_data_type}'"
             assert self.gradient_predivide_factor == 1.0, f"gradient_predivide_factor != 1.0 is not yet supported with {self.zero_stage_string} with reduce scatter enabled"
             assert self.postscale_gradients, f"pre-scale gradients is not yet supported with {self.zero_stage_string} with reduce scatter enabled"
-            
-        # Check for Muon optimizer compatibility with reduce_scatter (applies to both ZeRO-1 and ZeRO-2)
-        if self.reduce_scatter and self.uses_muon:
-            assert False, f"{self.zero_stage_string} with reduce_scatter=True is incompatible with Muon optimizer. Please disable reduce_scatter or use a different optimizer."
 
         # param flattened by groups
         self.bit16_groups = []
@@ -1224,9 +1217,7 @@ def average_tensor(self, tensor: torch.Tensor, communication_data_type: torch.dt
             stream = get_accelerator().current_stream()
 
         with get_accelerator().stream(stream):
-            # Check if current configuration requires full all-reduce
-            if not self.reduce_scatter or any(self.group_uses_muon):
-                # Force full all-reduce for Muon parameters or when reduce_scatter is disabled
+            if not self.reduce_scatter:
                 self.gradient_reduction_w_predivide(tensor, communication_data_type)
                 return