Commit daa75fa

Make ChaosGrad more robust

1 parent 6def69c commit daa75fa

2 files changed: 391 additions & 25 deletions

File tree:
  odyssnet/training/chaos_optimizer.py (226 additions & 25 deletions)
@@ -79,6 +79,8 @@ class ChaosGrad(torch.optim.Optimizer):
     _FRUST_THRESH = 0.75
     _FRUST_NOISE = 0.01
     _FRUST_META_RESET = 0.30
+    _FRUST_IMPROVE_TOL = 1e-4
+    _FRUST_SCALE_FLOOR = 1e-10

     # Cold-start helpers
     _GENESIS_SCALAR = 1e-6
@@ -311,6 +313,150 @@ def _row_mode(self, p: torch.Tensor) -> bool:
         """True when this parameter uses per-row meta-params."""
         return self._meta_resolution == 'row' and p.dim() >= 2

+    @staticmethod
+    def _finite_scalar(x, fallback: float) -> float:
+        """Convert value to a finite float; otherwise return fallback."""
+        try:
+            if torch.is_tensor(x):
+                x = float(x.float().mean().item())
+            else:
+                x = float(x)
+        except Exception:
+            return fallback
+        if not math.isfinite(x):
+            return fallback
+        return x
+
+    def _state_requires_reinit(self, state: dict, g_f: torch.Tensor) -> bool:
+        """Return True when state is missing/corrupt and must cold-start."""
+        required = (
+            'step', 'init_lr', 'grad_ema', 'momentum',
+            'per_param_lr', 'per_param_beta',
+            'per_param_decay', 'per_param_alpha', 'v2',
+        )
+        if any(k not in state for k in required):
+            return True
+
+        init_lr = self._finite_scalar(state.get('init_lr', 0.0), 0.0)
+        if init_lr <= 0.0:
+            return True
+
+        for key in ('grad_ema', 'momentum', 'v2'):
+            t = state.get(key)
+            if not torch.is_tensor(t) or t.shape != g_f.shape:
+                return True
+            if not bool(torch.isfinite(t).all().item()):
+                return True
+
+        return False
+
+    def _sanitize_meta_params(self, state: dict, p: torch.Tensor, group: dict) -> None:
+        """Keep adaptive meta-params finite, in-range, and shape-consistent."""
+        row_mode = self._row_mode(p)
+        rows = p.shape[0] if p.dim() >= 2 else 1
+
+        is_hebbian = group.get('is_hebbian', False)
+        beta_equil = self._finite_scalar(group.get('beta_equil', 0.90), 0.90)
+        init_decay = 0.0 if is_hebbian else self._finite_scalar(group.get('init_decay', 0.0), 0.0)
+
+        init_lr = self._finite_scalar(state.get('init_lr', self._LR_MIN), self._LR_MIN)
+        init_lr = max(self._LR_MIN, min(self._LR_MAX, init_lr))
+        state['init_lr'] = init_lr
+
+        def _sanitize_scalar(v, lo, hi, fallback):
+            x = self._finite_scalar(v, fallback)
+            if lo is not None:
+                x = max(lo, x)
+            if hi is not None:
+                x = min(hi, x)
+            return x
+
+        def _sanitize_row(v, lo, hi, fallback):
+            if torch.is_tensor(v):
+                t = v.to(device=p.device, dtype=torch.float32)
+                if t.shape != (rows,):
+                    if t.numel() == 1:
+                        t = t.reshape(1).expand(rows).clone()
+                    else:
+                        flat = t.reshape(-1)
+                        if flat.numel() >= rows:
+                            t = flat[:rows].clone()
+                        else:
+                            t = torch.full((rows,), fallback, dtype=torch.float32, device=p.device)
+                            t[:flat.numel()] = flat
+            else:
+                t = torch.full(
+                    (rows,),
+                    self._finite_scalar(v, fallback),
+                    dtype=torch.float32,
+                    device=p.device,
+                )
+
+            pos_fill = hi if hi is not None else fallback
+            neg_fill = lo if lo is not None else fallback
+            t = torch.nan_to_num(t, nan=fallback, posinf=pos_fill, neginf=neg_fill)
+            if lo is not None or hi is not None:
+                t.clamp_(
+                    lo if lo is not None else -float('inf'),
+                    hi if hi is not None else float('inf'),
+                )
+            return t
+
+        if row_mode:
+            state['per_param_lr'] = _sanitize_row(
+                state.get('per_param_lr', init_lr),
+                self._LR_MIN,
+                self._LR_MAX,
+                init_lr,
+            )
+            state['per_param_beta'] = _sanitize_row(
+                state.get('per_param_beta', beta_equil),
+                self._BETA_MIN,
+                self._BETA_MAX,
+                beta_equil,
+            )
+            if is_hebbian:
+                state['per_param_decay'] = torch.zeros((rows,), dtype=torch.float32, device=p.device)
+            else:
+                state['per_param_decay'] = _sanitize_row(
+                    state.get('per_param_decay', init_decay),
+                    0.0,
+                    self._DECAY_MAX,
+                    init_decay,
+                )
+            state['per_param_alpha'] = _sanitize_row(
+                state.get('per_param_alpha', 0.5),
+                0.0,
+                1.0,
+                0.5,
+            )
+            return
+
+        state['per_param_lr'] = _sanitize_scalar(
+            state.get('per_param_lr', init_lr),
+            self._LR_MIN,
+            self._LR_MAX,
+            init_lr,
+        )
+        state['per_param_beta'] = _sanitize_scalar(
+            state.get('per_param_beta', beta_equil),
+            self._BETA_MIN,
+            self._BETA_MAX,
+            beta_equil,
+        )
+        state['per_param_decay'] = 0.0 if is_hebbian else _sanitize_scalar(
+            state.get('per_param_decay', init_decay),
+            0.0,
+            self._DECAY_MAX,
+            init_decay,
+        )
+        state['per_param_alpha'] = _sanitize_scalar(
+            state.get('per_param_alpha', 0.5),
+            0.0,
+            1.0,
+            0.5,
+        )
+
     def _init_param_state(
         self,
         p: torch.Tensor,
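
Note: a minimal standalone sketch of the _finite_scalar contract introduced above, with illustrative inputs (the helper name and asserts here are for demonstration only, not part of the commit):

    import math
    import torch

    def finite_scalar(x, fallback):
        # Tensors collapse to the mean of their float view; anything
        # unconvertible or non-finite returns the fallback instead.
        try:
            x = float(x.float().mean().item()) if torch.is_tensor(x) else float(x)
        except Exception:
            return fallback
        return x if math.isfinite(x) else fallback

    assert finite_scalar(torch.tensor([1.0, 3.0]), 0.0) == 2.0  # tensor -> mean
    assert finite_scalar(float('nan'), 0.5) == 0.5              # non-finite -> fallback
    assert finite_scalar('oops', 0.1) == 0.1                    # unconvertible -> fallback
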
@@ -554,6 +700,11 @@ def step(self, closure=None):
         with torch.enable_grad():
             loss = closure()

+        if not math.isfinite(self._frustration):
+            self._frustration = 0.0
+        if not math.isfinite(self._best_loss):
+            self._best_loss = float('inf')
+
         burst_now = (self._frustration > self._FRUST_THRESH) or self._force_plateau_escape
         self._force_plateau_escape = False

@@ -577,33 +728,42 @@ def step(self, closure=None):
                 )

                 g_f = p.grad.float().detach()
+                if not bool(torch.isfinite(g_f).all().item()):
+                    # Do not let NaN/Inf gradients poison state/weights.
+                    self.reset_param_state(p)
+                    continue
+
+                # Respect hard architectural constraints in all optimizer
+                # pathways (signals, moments, preconditioner).
+                g_signal = g_f
+                if is_core and g_signal.dim() == 2 and g_signal.shape[0] == g_signal.shape[1]:
+                    g_signal = g_signal.clone()
+                    g_signal.fill_diagonal_(0.0)

                 # ---- Cold start ----
                 if not self.state[p]:
-                    self.state[p] = self._init_param_state(p, g_f, group)
+                    self.state[p] = self._init_param_state(p, g_signal, group)

                 state = self.state[p]

-                # Neurogenesis/legacy checkpoints can occasionally produce
-                # partial state payloads. Re-seed missing essentials so step
-                # logic remains total and shape-safe.
-                required = (
-                    'step', 'init_lr', 'grad_ema', 'momentum',
-                    'per_param_lr', 'per_param_beta',
-                    'per_param_decay', 'per_param_alpha', 'v2',
-                )
-                if any(k not in state for k in required):
-                    self.state[p] = self._init_param_state(p, g_f, group)
+                # Re-seed invalid state payloads so the update remains stable.
+                if self._state_requires_reinit(state, g_signal):
+                    self.state[p] = self._init_param_state(p, g_signal, group)
                     state = self.state[p]

+                # Sanitize before meta-update math so corrupt payloads cannot
+                # explode or shape-break the update equations.
+                self._sanitize_meta_params(state, p, group)
+
                 state['step'] += 1
                 step = state['step']

                 # ---- Hypergradient signals ----
-                sigs = self._compute_signals(g_f, state, p)
+                sigs = self._compute_signals(g_signal, state, p)

                 # ---- Meta-parameter update ----
                 self._update_meta_params(state, sigs, group)
+                self._sanitize_meta_params(state, p, group)

                 per_lr = state['per_param_lr']
                 per_beta = state['per_param_beta']
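
Note: the gradient guard and the chaos-core diagonal constraint added above, sketched in isolation (a square parameter is assumed; in the commit a non-finite gradient triggers reset_param_state and skips the update):

    import torch

    # Guard: one non-finite entry disqualifies the whole gradient.
    g_bad = torch.randn(4, 4)
    g_bad[1, 2] = float('inf')
    assert not torch.isfinite(g_bad).all()

    # Constraint: the diagonal of a square chaos-core matrix is
    # architecturally zero, so every derived signal must keep it zero.
    g = torch.randn(4, 4)
    g_signal = g.clone()
    g_signal.fill_diagonal_(0.0)
    assert g_signal.diagonal().abs().sum() == 0.0
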
@@ -614,7 +774,7 @@ def step(self, closure=None):

                 # In row mode per_lr / per_beta are (rows,) tensors. Reshape
                 # them for broadcasting across the full parameter shape.
-                row_mode = torch.is_tensor(per_lr)
+                row_mode = self._row_mode(p)
                 if row_mode:
                     view_shape = (-1,) + (1,) * (p.dim() - 1)
                     per_lr_b = per_lr.view(view_shape)
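
Note: the row-mode broadcast trick used above, as a self-contained sketch (shapes are illustrative):

    import torch

    p = torch.randn(4, 3, 2)           # parameter with 4 rows
    per_lr = torch.full((4,), 1e-3)    # one learning rate per row

    # (rows,) -> (rows, 1, 1) so the meta-param broadcasts over p.
    view_shape = (-1,) + (1,) * (p.dim() - 1)
    per_lr_b = per_lr.view(view_shape)
    assert (per_lr_b * p).shape == p.shape
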
@@ -637,19 +797,19 @@ def step(self, closure=None):

                 # ---- Update grad EMA (for next step's signals) ----
                 state['grad_ema'].mul_(self._SIGNAL_ALPHA).add_(
-                    g_f, alpha=1.0 - self._SIGNAL_ALPHA
+                    g_signal, alpha=1.0 - self._SIGNAL_ALPHA
                 )

                 # ---- Gradient centralization ----
-                g_proc = g_f
-                if g_f.dim() >= 2:
-                    dims = tuple(range(1, g_f.dim()))
-                    g_mean = g_f.mean(dim=dims, keepdim=True)
+                g_proc = g_signal
+                if g_signal.dim() >= 2:
+                    dims = tuple(range(1, g_signal.dim()))
+                    g_mean = g_signal.mean(dim=dims, keepdim=True)
                     if row_mode and torch.is_tensor(per_alpha):
                         if torch.any(per_alpha > 1e-3):
-                            g_proc = g_f - per_alpha_b * g_mean
+                            g_proc = g_signal - per_alpha_b * g_mean
                     elif per_alpha > 1e-3:
-                        g_proc = g_f - per_alpha * g_mean
+                        g_proc = g_signal - per_alpha * g_mean

                 # ---- Zero diagonal on chaos core gradient ----
                 if is_core and g_proc.dim() == 2 and g_proc.shape[0] == g_proc.shape[1]:
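
Note: gradient centralization, as wired above, subtracts the per-row mean from each row. A sketch of the fully centralized case (alpha = 1 for clarity; the commit blends by per_param_alpha instead):

    import torch

    g_signal = torch.randn(3, 5)
    dims = tuple(range(1, g_signal.dim()))          # all dims but the first
    g_mean = g_signal.mean(dim=dims, keepdim=True)  # one mean per row
    g_proc = g_signal - g_mean

    # Each row of the processed gradient is now (numerically) zero-mean.
    assert torch.allclose(g_proc.mean(dim=dims), torch.zeros(3), atol=1e-6)
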
@@ -664,6 +824,10 @@ def step(self, closure=None):
                 else:
                     v.mul_(per_beta).add_(g_proc, alpha=1.0 - per_beta)

+                # Keep optimizer state in the same constrained subspace as W.
+                if is_core and v.dim() == 2 and v.shape[0] == v.shape[1]:
+                    v.fill_diagonal_(0.0)
+
                 # ---- Frustration burst ----
                 # Noise is scaled to the current momentum RMS so that the
                 # perturbation is always a comparable fraction of "how fast we
@@ -686,6 +850,9 @@ def step(self, closure=None):
                             noise = noise - per_alpha * noise_mean
                     v.add_(noise)

+                    if is_core and v.dim() == 2 and v.shape[0] == v.shape[1]:
+                        v.fill_diagonal_(0.0)
+
                     if burst_type == 'full':
                         beta_equil = group.get('beta_equil', 0.90)
                         if row_mode:
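
Note: one plausible reading of the momentum-RMS noise scaling that the comments above describe; the exact scaling code sits outside this hunk, so the formula below is an assumption (_FRUST_NOISE value copied from the constants hunk):

    import torch

    FRUST_NOISE = 0.01

    v = torch.randn(4, 4) * 0.05                   # momentum buffer
    rms = v.pow(2).mean().sqrt().clamp_min(1e-12)  # current update speed

    # The kick stays a fixed fraction of how fast we are already moving.
    noise = torch.randn_like(v) * (FRUST_NOISE * rms)
    v.add_(noise)
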
@@ -715,7 +882,11 @@ def step(self, closure=None):

                 # ---- Elementwise second-moment (Adam-style) ----
                 v2 = state['v2']
-                v2.mul_(self._BETA2).addcmul_(g_f, g_f, value=1.0 - self._BETA2)
+                # Keep denominator dynamics consistent with the transformed
+                # gradient that drives momentum (centralization included).
+                v2.mul_(self._BETA2).addcmul_(g_proc, g_proc, value=1.0 - self._BETA2)
+                if is_core and v2.dim() == 2 and v2.shape[0] == v2.shape[1]:
+                    v2.fill_diagonal_(0.0)
                 bias_corr_v2 = max(1.0 - self._BETA2 ** step, self._EPS)
                 denom = (v2 / bias_corr_v2).sqrt().add_(self._EPS)

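
Note: the second-moment denominator follows the familiar Adam recipe, now fed with g_proc instead of the raw gradient. A condensed sketch with placeholder constants (the _BETA2 / _EPS values below are illustrative, not read from this diff):

    import torch

    BETA2, EPS = 0.999, 1e-8
    g_proc = torch.randn(4, 4)
    v2 = torch.zeros_like(g_proc)

    for step in range(1, 4):
        # EMA of squared (transformed) gradients, Adam-style.
        v2.mul_(BETA2).addcmul_(g_proc, g_proc, value=1.0 - BETA2)
        # Bias correction keeps early-step denominators from collapsing.
        bias_corr = max(1.0 - BETA2 ** step, EPS)
        denom = (v2 / bias_corr).sqrt().add_(EPS)
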
@@ -761,18 +932,48 @@ def report_loss(self, loss_value: float) -> None:

        Call this once per optimizer step (the trainer does this automatically
        when ChaosGrad is detected as the active optimizer).
+
+        The frustration signal is sign-agnostic and scale-robust:
+        - meaningful relative improvement -> 0.0 signal
+        - exact plateau / regression -> 1.0 signal
+        - sub-threshold improvement -> linearly reduced signal
        """
        loss = float(loss_value)
-        if loss < self._best_loss * 0.9999:
-            self._best_loss = loss
-            frustration_signal = 0.0
+
+        if not math.isfinite(self._frustration):
+            self._frustration = 0.0
+
+        if not math.isfinite(loss):
+            # Treat NaN/Inf losses as maximal stagnation signal without
+            # corrupting best_loss / frustration with NaN values.
+            self._frustration = (
+                self._frustration * self._FRUST_DECAY
+                + 1.0 * (1.0 - self._FRUST_DECAY)
+            )
+            self._frustration = max(0.0, min(1.0, self._frustration))
+            return
+
+        if not math.isfinite(self._best_loss):
+            self._best_loss = loss
+            frustration_signal = 0.0
        else:
-            frustration_signal = min(1.0, loss / max(self._best_loss, 1e-10))
+            scale = max(abs(self._best_loss), abs(loss), self._FRUST_SCALE_FLOOR)
+            rel_improvement = (self._best_loss - loss) / scale
+
+            if rel_improvement > self._FRUST_IMPROVE_TOL:
+                self._best_loss = loss
+                frustration_signal = 0.0
+            else:
+                # 1.0 for plateau/regression, smoothly reduced by small gains
+                # that do not pass the "new best" threshold.
+                sub_thr_gain = max(0.0, min(1.0, rel_improvement / self._FRUST_IMPROVE_TOL))
+                frustration_signal = 1.0 - sub_thr_gain

        self._frustration = (
            self._frustration * self._FRUST_DECAY
            + frustration_signal * (1.0 - self._FRUST_DECAY)
        )
+        self._frustration = max(0.0, min(1.0, self._frustration))

    def trigger_plateau_escape(self) -> None:
        """Manually trigger a frustration burst on the next :meth:`step` call."""
