|
| 1 | +# %% |
| 2 | + |
| 3 | +import numpy as np |
| 4 | +from jax import jit, numpy as jnp, random, nn, lax |
| 5 | +from functools import partial |
| 6 | +import time |
| 7 | + |
| 8 | + |
def step_update(param, update, phi_old, lr, mu, time_step):
    """
    Performs a single Nesterov accelerated gradient (NAG) step for one parameter tensor.

    The dynamics for any parameter tensor are:

    | phi   = param - update * lr
    | param = phi + (phi - phi_previous) * mu, where mu = 0 iff t <= 1 (first iteration)

    Args:
        param: parameter tensor to change/adjust

        update: update tensor to be applied to the parameter tensor (must be
            the same shape as ``param``)

        phi_old: previous friction/momentum variable

        lr: global step size applied to the update

        mu: friction/momentum control factor

        time_step: current time t, i.e. iteration step/call to this NAG update

    Returns:
        adjusted parameter tensor (same shape as ``param``), adjusted
        momentum/friction variable
    """
    phi = param - update * lr  ## phantom gradient adjustment step
    ## momentum is gated off on the first iteration (t <= 1)
    momentum_gate = mu * (time_step > 1.)
    next_param = phi + (phi - phi_old) * momentum_gate  ## NAG-step
    return next_param, phi
| 38 | + |
@jit
def nag_step(opt_params, theta, updates, eta=0.01, mu=0.9): ## apply adjustment to theta
    """
    Implements Nesterov's accelerated gradient (NAG) algorithm as a decoupled
    update rule given adjustments produced by a credit assignment
    algorithm/process.

    Args:
        opt_params: (ArrayLike) parameters of the optimization algorithm, i.e.
            the (phi, time_step) pair produced by ``nag_init`` / a prior call

        theta: (ArrayLike) the weights of the neural network

        updates: (ArrayLike) the updates of the neural network (same structure
            as ``theta``)

        eta: (float, optional) step size coefficient for NAG update (Default: 0.01)

        mu: (float, optional) friction/momentum control factor. (Default: 0.9)

    Returns:
        ArrayLike: opt_params. New opt params, ArrayLike: theta. The updated weights
    """
    phi, time_step = opt_params
    time_step = time_step + 1  ## advance iteration count; momentum is off while t <= 1
    new_theta = []
    new_phi = []
    ## apply the NAG update leaf-by-leaf across the parallel parameter lists
    for theta_i, update_i, phi_i in zip(theta, updates, phi):
        px_i, phi_i = step_update(theta_i, update_i, phi_i, eta, mu, time_step)
        new_theta.append(px_i)
        new_phi.append(phi_i)
    return (new_phi, time_step), new_theta
| 68 | + |
@jit
def nag_init(theta):
    """
    Builds the initial NAG optimizer state for a list of parameter tensors.

    Args:
        theta: list of parameter tensors the optimizer will adjust

    Returns:
        tuple (phi, time_step): zero tensors matching each shape in ``theta``
        (the initial momentum/friction variables) and a scalar iteration
        counter starting at 0
    """
    time_step = jnp.asarray(0.0)
    phi = [jnp.zeros(leaf.shape) for leaf in theta]
    return phi, time_step
| 74 | + |
if __name__ == '__main__':
    ## quick smoke test: run two NAG steps over a pair of toy weight vectors
    weights = [jnp.asarray([3.0, 3.0]), jnp.asarray([3.0, 3.0])]
    updates = [jnp.asarray([3.0, 3.0]), jnp.asarray([3.0, 3.0])]
    opt_params = nag_init(weights)
    for step in range(2):
        if step > 0:
            print("##################")
        opt_params, theta = nag_step(opt_params, weights, updates)
        print(f"opt_params: {opt_params}, theta: {theta}")
        weights = theta  ## feed updated weights back in for the next step
0 commit comments