Update muon_consistent_rms handling: remove manual None/float preprocessing in pyconfig

shuningjin · shuningjin · commit 73adf2a9f24c · 2026-04-22T07:56:33.000Z
diff --git a/src/maxtext/configs/pyconfig.py b/src/maxtext/configs/pyconfig.py
@@ -256,16 +256,6 @@ def _prepare_for_pydantic(raw_keys: dict[str, Any]) -> dict[str, Any]:
           Please pass tokenizer_path in your command if this is not intended."
         )
 
-    # Preprocess muon_consistent_rms to be None or float
-    if key == "muon_consistent_rms":
-      if value in ["None", "none"]:
-        new_value = None
-      else:
-        try:
-          new_value = float(value)
-        except ValueError as e:
-          raise ValueError("muon_consistent_rms should be None or float") from e
-
     pydantic_kwargs[key] = new_value
 
   return pydantic_kwargs
diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py
@@ -1343,7 +1343,7 @@ class Muon(BaseModel):
       0,
       description="Strength of the weight decay regularization. This is multiplied with the learning rate.",
   )
-  muon_consistent_rms: None | float = Field(
+  muon_consistent_rms: float | None = Field(
       None,
       description="If None, apply width scaling to updates. If float, apply consistent rms scaling (recommend 0.2).",
   )
diff --git a/src/maxtext/optimizers/optimizers.py b/src/maxtext/optimizers/optimizers.py
@@ -197,6 +197,8 @@ def get_optimizer(config, learning_rate_schedule, model=None):
       muon_weight_dimension_numbers = get_muon_weight_dimension_numbers(model, config)
     else:
       raise ValueError("Please specify model to extract muon dimension number.")
+    # TODO(shuningjin): remove this temporary debug print of muon_consistent_rms.
+    print(f"DEBUG: {config.muon_consistent_rms}, {type(config.muon_consistent_rms)}")
     muon_kwargs = {
         # Shared parameters: "nesterov" uses default
         "learning_rate": learning_rate_schedule,

Original file line number	Diff line number	Diff line change
`@@ -1343,7 +1343,7 @@ class Muon(BaseModel):`
`1343`	`1343`	`0,`
`1344`	`1344`	`description="Strength of the weight decay regularization. This is multiplied with the learning rate.",`
`1345`	`1345`	`)`
`1346`		`- muon_consistent_rms: None \| float = Field(`
	`1346`	`+ muon_consistent_rms: float \| None = Field(`
`1347`	`1347`	`None,`
`1348`	`1348`	`description="If None, apply width scaling to updates. If float, apply consistent rms scaling (recommend 0.2).",`
`1349`	`1349`	`)`