
Commit b04a695

chore: update ChaosGrad default lr to 1e-4 across implementation, docs, and tests
1 parent 4fea20c · commit b04a695

5 files changed · 15 additions & 15 deletions


CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -203,7 +203,7 @@ trainer = OdyssNetTrainer(model, lr=3e-4)

 # ChaosGrad: optional zero-hyperparameter optimizer (pass as custom optimizer)
 from odyssnet import ChaosGrad
-opt = ChaosGrad(ChaosGrad.classify_params(model), lr=1e-3)
+opt = ChaosGrad(ChaosGrad.classify_params(model), lr=1e-4)
 trainer = OdyssNetTrainer(model, optimizer=opt)
 ```

docs/LIBRARY.md

Lines changed: 2 additions & 2 deletions
@@ -223,7 +223,7 @@ trainer = OdyssNetTrainer(model, optimizer=torch.optim.AdamW(model.parameters(),

 # ChaosGrad — zero-hyperparameter optimizer (optional, see ChaosGrad section below)
 from odyssnet import ChaosGrad
-opt = ChaosGrad(ChaosGrad.classify_params(model), lr=1e-3)
+opt = ChaosGrad(ChaosGrad.classify_params(model), lr=1e-4)
 trainer = OdyssNetTrainer(model, optimizer=opt)
 ```

@@ -512,7 +512,7 @@ from odyssnet import OdyssNet, OdyssNetTrainer, ChaosGrad
 model = OdyssNet(num_neurons=32, input_ids=[0], output_ids=[31], device='cuda')

 # Classify parameters for group-specific meta-adaptation
-opt = ChaosGrad(ChaosGrad.classify_params(model), lr=1e-3)
+opt = ChaosGrad(ChaosGrad.classify_params(model), lr=1e-4)
 trainer = OdyssNetTrainer(model, optimizer=opt, device='cuda')

 for epoch in range(100):

odyssnet/training/chaos_optimizer.py

Lines changed: 4 additions & 4 deletions
@@ -9,7 +9,7 @@
 from odyssnet.training.chaos_optimizer import ChaosGrad

 model = OdyssNet(num_neurons=32, input_ids=[0], output_ids=[31])
-opt = ChaosGrad(ChaosGrad.classify_params(model), lr=1e-3)
+opt = ChaosGrad(ChaosGrad.classify_params(model), lr=1e-4)
 trainer = OdyssNetTrainer(model, optimizer=opt)

 Algorithm (v3 improvements over the removed v2.2):
@@ -38,14 +38,14 @@ class ChaosGrad(torch.optim.Optimizer):
     - per_param_alpha : gradient-centralization gate

     The single user-facing parameter ``lr`` (genesis learning rate,
-    default 1e-3) is a mathematical starting point, not a dial to tune.
+    default 1e-4) is a mathematical starting point, not a dial to tune.

     Args:
         params: Iterable of parameters **or** a list of classified param-group
             dicts returned by :meth:`classify_params`. Providing classified
             groups enables group-specific decay seeding, per-group beta
             equilibria, and the Hebbian bypass rule.
-        lr (float): Genesis learning rate. Default: ``1e-3``.
+        lr (float): Genesis learning rate. Default: ``1e-4``.
     """

     # ------------------------------------------------------------------ #
@@ -133,7 +133,7 @@ class ChaosGrad(torch.optim.Optimizer):
     # Construction                                                       #
     # ------------------------------------------------------------------ #

-    def __init__(self, params, lr: float = 1e-3) -> None:
+    def __init__(self, params, lr: float = 1e-4) -> None:
         if lr <= 0:
             raise ValueError(f"Genesis learning rate must be > 0, got {lr}")
         defaults = dict(

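For context on what the new default actually buys, here is a minimal usage sketch of the constructor paths described in the docstring above. It assumes only the OdyssNet/ChaosGrad API already visible in this diff; it is not taken verbatim from the repository.

    # Minimal sketch, assuming the API shown in the diff above.
    from odyssnet import OdyssNet, OdyssNetTrainer
    from odyssnet.training.chaos_optimizer import ChaosGrad

    model = OdyssNet(num_neurons=32, input_ids=[0], output_ids=[31])

    # Preferred path: classified param groups enable group-specific decay seeding,
    # per-group beta equilibria, and the Hebbian bypass rule.
    opt = ChaosGrad(ChaosGrad.classify_params(model))  # lr now defaults to 1e-4

    # Also accepted: a plain parameter iterable (lightweight treatment, no bypass rule).
    opt_plain = ChaosGrad(model.parameters(), lr=1e-4)

    trainer = OdyssNetTrainer(model, optimizer=opt)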
tests/training/test_chaos_optimizer.py

Lines changed: 3 additions & 3 deletions
@@ -42,7 +42,7 @@ def _model(n=8, in_ids=None, out_ids=None, **kwargs):
                     device="cpu", **kwargs)


-def _opt(model, lr=1e-3):
+def _opt(model, lr=1e-4):
     return ChaosGrad(ChaosGrad.classify_params(model), lr=lr)


@@ -148,7 +148,7 @@ def test_gates_beta_equil_is_0_85(self):

     def test_plain_params_accepted(self):
         m = _model()
-        opt = ChaosGrad(m.parameters(), lr=1e-3)
+        opt = ChaosGrad(m.parameters(), lr=1e-4)
         assert opt is not None


@@ -183,7 +183,7 @@ def test_W_diagonal_stays_zero(self):

     def test_sparse_gradient_raises(self):
         embed = torch.nn.Embedding(10, 4, sparse=True)
-        opt = ChaosGrad(embed.parameters(), lr=1e-3)
+        opt = ChaosGrad(embed.parameters(), lr=1e-4)
         idx = torch.tensor([0, 2])
         out = embed(idx).sum()
         out.backward()

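The test_sparse_gradient_raises change above exercises ChaosGrad's rejection of sparse gradients. A hedged reproduction sketch follows; the exact exception type is not visible in the diff, so the except clause below is deliberately broad.

    import torch
    from odyssnet.training.chaos_optimizer import ChaosGrad

    embed = torch.nn.Embedding(10, 4, sparse=True)   # sparse=True yields sparse gradients
    opt = ChaosGrad(embed.parameters(), lr=1e-4)

    out = embed(torch.tensor([0, 2])).sum()
    out.backward()                                   # embed.weight.grad is now a sparse tensor

    try:
        opt.step()                                   # expected to raise, per test_sparse_gradient_raises
    except Exception as err:                         # exception type is an assumption
        print(f"sparse gradient rejected: {err!r}")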
tests/training/test_chaos_optimizer_extra.py

Lines changed: 5 additions & 5 deletions
@@ -29,7 +29,7 @@ def _model(n=8, **kwargs):
                     device='cpu', **kwargs)


-def _opt(model, lr=1e-3):
+def _opt(model, lr=1e-4):
     return ChaosGrad(ChaosGrad.classify_params(model), lr=lr)


@@ -126,11 +126,11 @@ def test_lr_group_override_propagates(self):
         from group, so the new value must take effect on the next step.
         """
         m = _model()
-        opt = _opt(m, lr=1e-3)
+        opt = _opt(m, lr=1e-4)
         _one_step_raw(opt, m)

         for pg in opt.param_groups:
-            pg['lr'] = 5e-4
+            pg['lr'] = 5e-5

         W_before = m.W.data.clone()
         _one_step_raw(opt, m)
@@ -279,15 +279,15 @@ def test_gradient_checkpointing_compatible(self):
     def test_plain_params_no_crash(self):
         """ChaosGrad with plain model.parameters() (no classify_params) must train."""
         m = _model()
-        opt = ChaosGrad(m.parameters(), lr=1e-3)
+        opt = ChaosGrad(m.parameters(), lr=1e-4)
         t = OdyssNetTrainer(m, optimizer=opt)
         loss = _step(t, n=3)
         assert math.isfinite(loss)

     def test_plain_params_hebbian_no_bypass(self):
         """Without classify_params, hebb params get lightweight treatment (no crash)."""
         m = _model(hebb_type='global')
-        opt = ChaosGrad(m.parameters(), lr=1e-3)
+        opt = ChaosGrad(m.parameters(), lr=1e-4)
         t = OdyssNetTrainer(m, optimizer=opt)
         loss = _step(t, n=5)
         assert math.isfinite(loss)

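test_lr_group_override_propagates documents a useful property: ChaosGrad re-reads lr from each param group on every step, so rewriting pg['lr'] takes effect on the very next step. A small sketch of that mechanism follows; the helper name and decay factor are illustrative, not part of the library, and lr is intended as a genesis value rather than a dial to tune.

    from odyssnet import OdyssNet, OdyssNetTrainer
    from odyssnet.training.chaos_optimizer import ChaosGrad

    model = OdyssNet(num_neurons=32, input_ids=[0], output_ids=[31])
    opt = ChaosGrad(ChaosGrad.classify_params(model), lr=1e-4)
    trainer = OdyssNetTrainer(model, optimizer=opt)

    def scale_genesis_lr(optimizer, factor):
        """Hypothetical helper: rescale lr in-place; ChaosGrad picks it up on its next step."""
        for pg in optimizer.param_groups:
            pg['lr'] = pg['lr'] * factor

    scale_genesis_lr(opt, 0.5)   # 1e-4 -> 5e-5, mirroring the override exercised by the test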
0 commit comments
