feat(optimizer): align Muon scaling options with Megatron-Core and add optimizer reference doc

HT-Yuan · HT-Yuan · commit 5c545e6c42fb · 2026-05-08T14:00:48.000+08:00
diff --git a/areal/api/cli_args.py b/areal/api/cli_args.py
@@ -7,7 +7,7 @@
 from dataclasses import asdict, dataclass, field, fields
 from enum import Enum
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypeVar
+from typing import TYPE_CHECKING, Any, ClassVar, TypeVar
 
 import uvloop
 import yaml
@@ -348,8 +348,10 @@ class OptimizerConfig:
     lr: float = field(
         default=1e-3,
         metadata={
-            "help": "Learning rate. When type='muon', this is the Muon lr for >=2D params "
-            "(typical value: ~0.02). The AdamW backend lr is controlled by muon_backend_lr."
+            "help": "Learning rate. When type='muon', this is shared by both the Muon sub-optimizer "
+            "(>=2D params) and the AdamW backend (<2D params). Pair "
+            "muon_scale_mode='spectral' with muon_extra_scale_factor=0.2 (Moonlight-style) to "
+            "make Muon's update RMS match AdamW so a single lr works for both."
         },
     )
     weight_decay: float = field(
@@ -434,26 +436,31 @@ class OptimizerConfig:
             "Mirrors Megatron-Core OptimizerConfig.muon_num_ns_steps."
         },
     )
-    muon_scale_mode: Literal["rms", "spectral"] = field(
-        default="rms",
+    muon_scale_mode: str = field(
+        default="spectral",
         metadata={
-            "help": "Update-scaling mode for Muon. 'rms' (Moonlight-style) scales the update so its "
-            "RMS matches Adam, allowing a single lr for all parameters (see https://arxiv.org/abs/2502.16982). "
-            "'spectral' uses the Keller Jordan max(1, m/n)^0.5 spectral scaling. "
-            "Only effective when type='muon'. Mirrors Megatron-Core OptimizerConfig.muon_scale_mode.",
-            "choices": ["rms", "spectral"],
+            "help": "Muon update scaling mode (final scale = mode_factor * muon_extra_scale_factor). "
+            "Only used when type='muon'. Mirrors Megatron-Core OptimizerConfig.muon_scale_mode.",
+            "choices": ["spectral", "unit_rms_norm", "shape_scaling"],
         },
     )
-    muon_backend_lr: float | None = field(
-        default=None,
+    muon_extra_scale_factor: float = field(
+        default=1.0,
         metadata={
-            "help": "Learning rate for the AdamW backend optimizer in Muon (handles <2D params: "
-            "biases, norms, embeddings). Typical value: ~3e-4. If None, falls back to the main lr "
-            "with a warning (since Muon lr is typically ~100x larger). "
-            "Only effective when type='muon'."
+            "help": "Extra multiplier on top of muon_scale_mode. Use 0.2 with "
+            "muon_scale_mode='spectral' for Moonlight-style RMS-matched scaling. "
+            "Only used when type='muon'. Mirrors Megatron-Core OptimizerConfig.muon_extra_scale_factor."
         },
     )
 
+    def __post_init__(self):
+        """Validate optimizer configuration."""
+        valid_muon_scale_modes = {"spectral", "unit_rms_norm", "shape_scaling"}
+        if self.muon_scale_mode not in valid_muon_scale_modes:
+            raise ValueError(
+                f"muon_scale_mode must be one of {valid_muon_scale_modes}, got {self.muon_scale_mode!r}. "
+            )
+
 
 @dataclass
 class FSDPWrapPolicy:
diff --git a/areal/engine/fsdp_engine.py b/areal/engine/fsdp_engine.py
@@ -1136,31 +1136,22 @@ def _create_optimizer(self, ft_spec: FinetuneSpec) -> None:
                     muon_params.append(p)
                 else:
                     backend_params.append(p)
-            if self.optimizer_config.muon_backend_lr is not None:
-                backend_lr = self.optimizer_config.muon_backend_lr
-            else:
-                backend_lr = lr
-                self.logger.warning(
-                    "muon_backend_lr is not set; falling back to main lr (%.2e) for AdamW backend. "
-                    "Typical Muon setups use a much smaller backend lr (e.g. 3e-4). "
-                    "Set muon_backend_lr explicitly to suppress this warning.",
-                    lr,
-                )
             self.optimizer = MuonOptimizer(
                 [
                     dict(
                         params=muon_params,
                         lr=lr,
                         momentum=self.optimizer_config.muon_momentum,
                         weight_decay=weight_decay,
-                        rms_scale=self.optimizer_config.muon_scale_mode == "rms",
+                        scale_mode=self.optimizer_config.muon_scale_mode,
+                        extra_scale_factor=self.optimizer_config.muon_extra_scale_factor,
                         nesterov=self.optimizer_config.muon_use_nesterov,
                         ns_steps=self.optimizer_config.muon_num_ns_steps,
                         use_muon=True,
                     ),
                     dict(
                         params=backend_params,
-                        lr=backend_lr,
+                        lr=lr,
                         betas=(beta1, beta2),
                         eps=eps,
                         weight_decay=weight_decay,
diff --git a/areal/engine/fsdp_utils/muon.py b/areal/engine/fsdp_utils/muon.py
@@ -80,14 +80,44 @@ def apply_momentum(
     return update
 
 
-def apply_scaling(grad: Tensor, rms_scale: bool = False) -> Tensor:
-    """Post-NS scaling: either Moonlight RMS or Keller Jordan max(1, m/n)^0.5."""
-    if rms_scale:
-        # https://github.com/MoonshotAI/Moonlight/blob/5afcb6911077e7f182d05865fe90d9f39abcbcbd/examples/toy_train.py#L146
-        grad *= 0.2 * math.sqrt(max(grad.shape[1], grad.shape[0]))
+def apply_scaling(
+    grad: Tensor,
+    mode: str = "spectral",
+    extra_scale_factor: float = 1.0,
+) -> Tensor:
+    """Post-Newton-Schulz update scaling.
+
+    Naming aligned with Megatron-Core / emerging_optimizers (NVIDIA-NeMo).
+
+    Final scale = scale_factor(mode) * extra_scale_factor, where:
+        - 'spectral'      : sqrt(max(m, n))
+          Kimi/Moonlight (arXiv:2502.16982); emerging_optimizers default.
+        - 'unit_rms_norm' : sqrt(m / n)
+          Scion (arXiv:2502.07529) / Bernstein
+          (https://jeremybernste.in/writing/deriving-muon).
+        - 'shape_scaling' : max(1, m / n)**0.5
+          Keller Jordan original (https://kellerjordan.github.io/posts/muon).
+
+    Set extra_scale_factor=0.2 with mode='spectral' to reproduce the legacy
+    Moonlight setting, 0.2 * sqrt(max(m, n)) (see
+    https://github.com/MoonshotAI/Moonlight/blob/5afcb6911077e7f182d05865fe90d9f39abcbcbd/examples/toy_train.py#L146),
+    which approximately matches AdamW's update RMS so a single lr works for
+    both Muon and the AdamW backend.
+    """
+    m = grad.size(-2)
+    n = grad.size(-1)
+    if mode == "spectral":
+        scale = math.sqrt(max(m, n))
+    elif mode == "unit_rms_norm":
+        scale = math.sqrt(m / n)
+    elif mode == "shape_scaling":
+        scale = max(1, m / n) ** 0.5
     else:
-        # https://github.com/KellerJordan/Muon/blob/f90a42b28e00b8d9d2d05865fe90d9f39abcbcbd/muon.py#L40
-        grad *= max(1, grad.size(-2) / grad.size(-1)) ** 0.5
+        raise ValueError(
+            f"Invalid muon_scale_mode {mode!r}. Valid: "
+            "{'spectral', 'unit_rms_norm', 'shape_scaling'}."
+        )
+    grad *= scale * extra_scale_factor
     return grad
 
 
@@ -194,7 +224,11 @@ def finish(self):
         else:
             scatter(grad.to_local(), None, src=dest_rank, group=pg, async_op=False)
 
-        update = apply_scaling(grad, self.group["rms_scale"])
+        update = apply_scaling(
+            grad,
+            self.group["scale_mode"],
+            self.group["extra_scale_factor"],
+        )
 
         self.param.mul_(1 - self.group["lr"] * self.group["weight_decay"])
         self.param.add_(update.reshape(self.param.shape), alpha=-self.group["lr"])
@@ -272,7 +306,11 @@ def finish(self):
         new_local = distribute_tensor(g_full, mesh, placements).to_local()
         grad.to_local().copy_(new_local)
 
-        update = apply_scaling(grad, self.group["rms_scale"])
+        update = apply_scaling(
+            grad,
+            self.group["scale_mode"],
+            self.group["extra_scale_factor"],
+        )
 
         self.param.mul_(1 - self.group["lr"] * self.group["weight_decay"])
         self.param.add_(update.reshape(self.param.shape), alpha=-self.group["lr"])
@@ -309,7 +347,11 @@ def start(self):
         )
         update = zeropower_via_newtonschulz5(update, self.group["ns_steps"])
         update = update.to(self.param.grad.dtype)
-        update = apply_scaling(update, self.group["rms_scale"])
+        update = apply_scaling(
+            update,
+            self.group["scale_mode"],
+            self.group["extra_scale_factor"],
+        )
         self.param.mul_(1 - self.group["lr"] * self.group["weight_decay"])
         self.param.add_(update.reshape(self.param.shape), alpha=-self.group["lr"])
 
@@ -330,7 +372,8 @@ class Muon(torch.optim.Optimizer):
 
     Notable changes:
         - DTensor/FSDP2 native: uses gather/scatter for distributed NS instead of DDP.
-        - ``rms_scale`` argument following the Moonlight paper (https://arxiv.org/abs/2502.16982).
+        - ``scale_mode`` / ``extra_scale_factor`` arguments aligned with Megatron-Core /
+          emerging_optimizers (NVIDIA-NeMo). See :func:`apply_scaling` for details.
 
     Example::
 
@@ -340,7 +383,7 @@ class Muon(torch.optim.Optimizer):
         ])
 
     Param group args (``use_muon=True``):
-        lr, momentum, weight_decay, rms_scale, nesterov, ns_steps
+        lr, momentum, weight_decay, scale_mode, extra_scale_factor, nesterov, ns_steps
 
     Param group args (``use_muon=False``):
         lr, betas, eps, weight_decay
@@ -353,7 +396,8 @@ def __init__(self, param_groups):
                 group.setdefault("lr", 0.02)
                 group.setdefault("momentum", 0.95)
                 group.setdefault("weight_decay", 0)
-                group.setdefault("rms_scale", True)
+                group.setdefault("scale_mode", "spectral")
+                group.setdefault("extra_scale_factor", 1.0)
                 group.setdefault("nesterov", True)
                 group.setdefault("ns_steps", 5)
                 assert set(group.keys()) == {
@@ -362,7 +406,8 @@ def __init__(self, param_groups):
                     "momentum",
                     "weight_decay",
                     "use_muon",
-                    "rms_scale",
+                    "scale_mode",
+                    "extra_scale_factor",
                     "nesterov",
                     "ns_steps",
                 }
diff --git a/areal/engine/megatron_engine.py b/areal/engine/megatron_engine.py
@@ -1327,14 +1327,15 @@ def _create_optimizer(self, ft_spec: FinetuneSpec) -> None:
         # Forward Muon-specific hyperparameters onto Megatron-Core's OptimizerConfig.
         # AReaL's muon_* fields are 1:1 aligned with Megatron-Core >= 0.17, so no
         # translation is required. Fields not exposed by AReaL (muon_coefficient_type,
-        # muon_split_qkv, muon_tp_mode, muon_extra_scale_factor, muon_fp32_matmul_prec)
-        # keep their Megatron defaults.
+        # muon_split_qkv, muon_tp_mode, muon_fp32_matmul_prec) keep their Megatron
+        # defaults.
         if self.optimizer_config.type == "muon":
             muon_passthrough_fields = (
                 "muon_momentum",
                 "muon_use_nesterov",
                 "muon_num_ns_steps",
                 "muon_scale_mode",
+                "muon_extra_scale_factor",
             )
             for attr in muon_passthrough_fields:
                 if hasattr(mcore_opt_config, attr):
diff --git a/areal/experimental/engine/archon_utils.py b/areal/experimental/engine/archon_utils.py
@@ -60,10 +60,6 @@ def create_optimizer(
             eps=eps,
             fused=True,
         )
-    elif optimizer_config.type == "muon":
-        raise NotImplementedError(
-            "Muon optimizer is not yet supported under ArchonEngine. "
-        )
     elif optimizer_config.type == "sgd":
         return torch.optim.SGD(
             params,
diff --git a/docs/en/_toc.yml b/docs/en/_toc.yml
@@ -40,6 +40,7 @@ parts:
     - file: algorithms/prox_approx
   - caption: Reference
     chapters:
+    - file: reference/optimizer
     - file: reference/checkpointing
     - file: reference/metrics_tracking
     - file: reference/alloc_mode
diff --git a/docs/en/reference/optimizer.md b/docs/en/reference/optimizer.md
@@ -0,0 +1,96 @@
+(section-optimizer-guide)=
+
+# Optimizer Configuration Guide
+
+AReaL supports multiple optimizer types, configurable via the `optimizer.type` field.
+This document covers the support matrix across training backends and the implementation
+differences of the Muon optimizer.
+
+## Supported Optimizer Types
+
+| Type        | Description                                                                                        |
+| ----------- | -------------------------------------------------------------------------------------------------- |
+| `adam`      | AdamW optimizer (default)                                                                          |
+| `adam_bf16` | BF16-precision AdamW, reduces optimizer state memory                                               |
+| `sgd`       | Standard SGD                                                                                       |
+| `muon`      | Muon optimizer: Newton-Schulz orthogonalized updates for ≥2D params, AdamW backend for \<2D params |
+
+## Engine Support Matrix
+
+| Optimizer   |      FSDP Engine       |        Megatron Engine         |    Archon Engine     |
+| ----------- | :--------------------: | :----------------------------: | :------------------: |
+| `adam`      |           ✅           |               ✅               |          ✅          |
+| `adam_bf16` | ✅ (AnyPrecisionAdamW) | ✅ (precision-aware optimizer) |          ❌          |
+| `sgd`       |           ✅           |               ✅               |          ✅          |
+| `muon`      |           ✅           |   ✅ (Megatron-Core ≥ 0.17)    | ❌ (not implemented) |
+
+### Notes
+
+- **FSDP Engine**: `adam_bf16` uses `AnyPrecisionAdamW`, storing momentum and variance
+  in BF16.
+- **Megatron Engine**: `adam_bf16` requires the model dtype to be bfloat16; it is
+  auto-converted to `adam` with the precision-aware optimizer enabled.
+- **Archon Engine**: Currently only supports `adam` and `sgd`. Muon support is under
+  development.
+
+## Muon Optimizer
+
+### Overview
+
+Muon (MomentUm Orthogonalized by Newton-Schulz) is an optimizer that applies approximate
+orthogonalization to the gradient momentum via Newton-Schulz iteration. The core idea is
+to replace each weight matrix's momentum update with a nearby semi-orthogonal matrix, so
+all singular directions of the update have comparable scale, which speeds up convergence.
+
+### Reference Implementations and Papers
+
+| Resource                                 | Link                                               |
+| ---------------------------------------- | -------------------------------------------------- |
+| Original implementation (Keller Jordan)  | https://github.com/KellerJordan/Muon               |
+| Moonlight paper (RMS scaling)            | https://arxiv.org/abs/2502.16982                   |
+| AReaL FSDP implementation                | `areal/engine/fsdp_utils/muon.py`                  |
+| Emerging-Optimizers (Megatron-Core Muon) | https://github.com/NVIDIA-NeMo/Emerging-Optimizers |
+
+### FSDP vs Megatron Implementation Differences
+
+The FSDP Engine and Megatron Engine differ significantly in how they partition
+parameters for Muon:
+
+| Dimension                         | FSDP Engine                                                       | Megatron Engine                                                                       |
+| --------------------------------- | ----------------------------------------------------------------- | ------------------------------------------------------------------------------------- |
+| **Muon parameter scope**          | **All** ≥2D parameters (including embedding weight matrices)      | **Linear layer weights**                                                              |
+| **AdamW backend parameters**      | All \<2D parameters (bias, LayerNorm weight/bias)                 | Embeddings, biases, norms, and non-Linear 2D parameters                               |
+| **Distributed NS implementation** | DTensor gather/scatter (FSDP2 native)                             | TP-aware `TensorParallelMuon` (distributed Newton-Schulz over TP communication group) |
+| **TP + EP support**               | TP + FSDP 2D mesh ✅; TP + EP + FSDP 3D mesh ❌ (not implemented) | Full TP / EP / PP support                                                             |
+
+### Configuration Example
+
+```yaml
+optimizer:
+  type: muon
+  lr: 2e-3                    # Shared lr (Muon and AdamW backend)
+  muon_momentum: 0.95
+  muon_use_nesterov: true
+  muon_num_ns_steps: 5
+  muon_scale_mode: spectral      # spectral / unit_rms_norm / shape_scaling
+  muon_extra_scale_factor: 0.2   # 0.2 + spectral = Moonlight-style RMS-matched scaling
+  weight_decay: 0.05
+  beta1: 0.9                  # AdamW backend params
+  beta2: 0.95
+  eps: 1e-5
+  lr_scheduler_type: cosine
+  warmup_steps_proportion: 0.03
+```
+
+### Configuration Parameters
+
+| Parameter                 | Type  | Default            | Description                                                                                                                                                                                                        |
+| ------------------------- | ----- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `lr`                      | float | 0.001              | Shared learning rate for both Muon (≥2D params) and AdamW backend (\<2D params). A single lr works well when pairing `muon_scale_mode=spectral` with `muon_extra_scale_factor=0.2` (Moonlight-style)               |
+| `muon_momentum`           | float | 0.95               | Muon momentum coefficient                                                                                                                                                                                          |
+| `muon_use_nesterov`       | bool  | true               | Whether to use Nesterov momentum                                                                                                                                                                                   |
+| `muon_num_ns_steps`       | int   | 5                  | Number of Newton-Schulz iteration steps                                                                                                                                                                            |
+| `muon_scale_mode`         | str   | "spectral"         | Update scaling mode. `spectral`: `sqrt(max(m, n))` (Kimi/Moonlight, emerging_optimizers default). `unit_rms_norm`: `sqrt(m / n)` (Scion / Bernstein). `shape_scaling`: `max(1, m/n)**0.5` (Keller Jordan original) |
+| `muon_extra_scale_factor` | float | 1.0                | Extra multiplicative scale; final scale = `scale_factor(mode) * muon_extra_scale_factor`. Use `0.2` with `spectral` to reproduce Moonlight-style RMS-matched scaling                                               |
+| `weight_decay`            | float | 0.01               | Weight decay, applied to both Muon and AdamW backend                                                                                                                                                               |
+| `beta1` / `beta2` / `eps` | float | 0.9 / 0.999 / 1e-8 | AdamW backend hyperparameters                                                                                                                                                                                      |
diff --git a/docs/zh/_toc.yml b/docs/zh/_toc.yml
@@ -40,6 +40,7 @@ parts:
     - file: algorithms/prox_approx
   - caption: 参考
     chapters:
+    - file: reference/optimizer
     - file: reference/checkpointing
     - file: reference/metrics_tracking
     - file: reference/alloc_mode
diff --git a/docs/zh/reference/optimizer.md b/docs/zh/reference/optimizer.md