Switch to Optax's simplified AdEMAMix implementation

davidtweedle · davidtweedle · commit 532e5f0ac26c · 2026-04-10T10:41:26.000-04:00
diff --git a/submissions/self_tuning/ademamix/submission.py b/submissions/self_tuning/ademamix/submission.py
@@ -1,39 +1,24 @@
-"""
-Forked from apple's ademamix jax implementation:
-https://github.com/apple/ml-ademamix
-for the purposes of submitting to the algoperf benchmark.
-| Adapted from optax's implementation of AdamW:
-| https://github.com/google-deepmind/optax/blob/b75644809f2f68fc11f42d4395a5753e11e92e80/optax/_src/alias.py#L548#L675
-"""
+"""AlgoPerf AdEMAMix submission built on Optax."""
 import functools
 from typing import (
         Any,
-        Callable,
         Dict,
         Iterator,
         List,
-        NamedTuple,
         Optional,
         Tuple,
-        Union,
         )
 
-import chex
 import jax
-from jax import tree_util as jtu
 import jax.numpy as jnp
 import optax
 from flax import jax_utils
-from flax import traverse_util as tu
-from jax import lax
-
-from optax._src import transform, combine, base, numerics, utils
-from optax import tree_utils as otu
 
 from algoperf import spec, jax_sharding_utils
 
 
 HPARAMS = {
+        'ademamix_variant': 'simplified',
         'alpha': 8.0,
         'alpha_start': 0,
         'warmup': 10,
@@ -83,106 +68,44 @@ def warmup_fn(step):
     schedule_fn = optax.join_schedules(schedules=[warmup_fn, constant_fn], boundaries=[warmup])
     return schedule_fn
 
-
-class ScaleByAdemamixState(NamedTuple):
-  """State for the AdEMAMix algorithm."""
-  count: chex.Array  
-  count_m2: chex.Array  
-  m1: optax.Updates
-  m2: optax.Updates
-  nu: optax.Updates
-
-
-def ademamix(lr, b1=0.9, b2=0.999, b3=0.9999, alpha=5.0, b3_scheduler=None, alpha_scheduler=None,
-             eps=1e-8, eps_root=0.0, weight_decay=0.0, mask=None):
-  r"""AdEMAMix.
-
-    Args:
-        lr: A global scaling factor, either fixed or evolving along
-            iterations with a scheduler, see :func:`optax.scale_by_learning_rate`.
-        b1: Exponential decay rate to track the fast EMA.
-        b2: Exponential decay rate to track the second moment of past gradients.
-        b3: Exponential decay rate to track the slow EMA.
-        alpha: Mixing coeficient use for the linear combination of the fast and slow EMAs.
-        b3_scheduler: an optional scheduler function, given a timestep, returns the 
-            value of b3. Use `beta3_scheduler(b3,b1,T_b3)` to follow the AdEMAMix paper. 
-        alpha_scheduler: an optional scheduler function, given a timestep, returns the 
-            value of alpha. Use `alpha_scheduler(alpha,0,T_alpha)` to follow the 
-            AdEMAMix paper. 
-        eps: A small constant applied to denominator outside of the square root
-            (as in the Adam paper) to avoid dividing by zero when rescaling.
-        eps_root: A small constant applied to denominator inside the square root (as
-            in RMSProp), to avoid dividing by zero when rescaling. This is needed for
-            instance when computing (meta-)gradients through Adam.
-        mu_dtype: Optional `dtype` to be used for the first order accumulator; if
-            `None` then the `dtype` is inferred from `params` and `updates`.
-        weight_decay: Strength of the weight decay regularization. Note that this
-            weight decay is multiplied with the learning rate. This is consistent
-            with other frameworks such as PyTorch, but different from
-            (Loshchilov et al, 2019) where the weight decay is only multiplied with
-            the "schedule multiplier", but not the base learning rate.
-        mask: A tree with same structure as (or a prefix of) the params PyTree,
-            or a Callable that returns such a pytree given the params/updates.
-            The leaves should be booleans, `True` for leaves/subtrees you want to
-            apply the weight decay to, and `False` for those you want to skip. Note
-            that the Adam gradient transformations are applied to all parameters.
-
-    Returns:
-        The corresponding `GradientTransformation`.
-  """
-  return combine.chain(
-    scale_by_ademamix(b1, b2, b3, alpha, b3_scheduler, alpha_scheduler, eps, eps_root),
-    transform.add_decayed_weights(weight_decay, mask),
-    transform.scale_by_learning_rate(lr),
-  )
-
-
-def scale_by_ademamix(b1, b2, b3, alpha, b3_scheduler, alpha_scheduler, eps=1e-8, eps_root=0.0):
-
-  def init_fn(params):
-    m1 = jax.tree.map(jnp.zeros_like, params)   # fast EMA
-    m2 = jax.tree.map(jnp.zeros_like, params)   # slow EMA
-    nu = jax.tree.map(jnp.zeros_like, params)   # second moment estimate
-    return ScaleByAdemamixState(count=jnp.zeros([], jnp.int32), count_m2=jnp.zeros([], jnp.int32), m1=m1, m2=m2, nu=nu)
-
-  def update_fn(updates, state, params=None):
-    del params
-    c_b3 = b3_scheduler(state.count_m2) if b3_scheduler is not None else b3
-    c_alpha = alpha_scheduler(state.count_m2) if alpha_scheduler is not None else alpha
-    m1 = _update_moment(updates, state.m1, b1, 1) # m1 = b1 * m1 + (1-b1) * updates
-    m2 = _update_moment(updates, state.m2, c_b3, 1)
-    nu = _update_moment(updates, state.nu, b2, 2)
-    count = state.count + jnp.array(1, dtype=jnp.int32)
-    # count_inc = numerics.safe_int32_increment(state.count)
-    count_m2 = state.count_m2 + jnp.array(1, dtype=jnp.int32)
-    # count_m2_inc = numerics.safe_int32_increment(state.count_m2)
-    m1_hat = _bias_correction(m1, b1, count)
-    nu_hat = _bias_correction(nu, b2, count)
-    updates = jax.tree.map(lambda m1_, m2_, v_: (m1_+c_alpha*m2_)/(jnp.sqrt(v_+eps_root)+eps), m1_hat, m2, nu_hat)
-    return updates, ScaleByAdemamixState(count=count, count_m2=count_m2, m1=m1, m2=m2, nu=nu)
-
-  return base.GradientTransformation(init_fn, update_fn)
-
-
-def _update_moment(updates, moments, decay, order):
-  """Compute the exponential moving average of the `order`-th moment."""
-  return jax.tree.map(
-      lambda g, t: (1 - decay) * (g ** order) + decay * t, updates, moments)
-
-
-
-def _bias_correction(moment, decay, count):
-  """Performs bias correction. It becomes a no-op as count goes to infinity."""
-  # The conversion to the data type of the moment ensures that bfloat16 remains
-  # bfloat16 in the optimizer state. This conversion has to be done after
-  # `bias_correction_` is calculated as calculating `decay**count` in low
-  # precision can result in it being rounded to 1 and subsequently a
-  # "division by zero" error.
-  bias_correction_ = 1 - decay**count
-
-  # Perform division in the original precision.
-  return jax.tree.map(
-      lambda t: t / bias_correction_.astype(t.dtype), moment)
+def build_ademamix_optimizer(  # Return the optax.contrib AdEMAMix optimizer selected by `variant`.
+    lr,  # forwarded to Optax as `learning_rate`
+    variant='simplified',  # 'simplified' or 'full'; any other value raises ValueError
+    b1=0.9,
+    b2=0.999,
+    b3=0.9999,  # forwarded only by the 'full' variant
+    alpha=5.0,
+    b3_scheduler=None,  # if not None, passed as `b3` in place of the constant ('full' variant only)
+    alpha_scheduler=None,  # if not None, passed as `alpha` in place of the constant (both variants)
+    eps=1e-8,
+    eps_root=0.0,
+    weight_decay=0.0,
+    mask=None,
+):
+  if variant == 'simplified':
+    return optax.contrib.simplified_ademamix(  # note: b3 and b3_scheduler are not forwarded in this branch
+        learning_rate=lr,
+        b1=b1,
+        b2=b2,
+        alpha=alpha_scheduler if alpha_scheduler is not None else alpha,  # scheduler takes precedence over the constant
+        eps=eps,
+        eps_root=eps_root,
+        weight_decay=weight_decay,
+        mask=mask,
+    )
+  if variant == 'full':
+    return optax.contrib.ademamix(
+        learning_rate=lr,
+        b1=b1,
+        b2=b2,
+        b3=b3_scheduler if b3_scheduler is not None else b3,  # scheduler takes precedence over the constant
+        alpha=alpha_scheduler if alpha_scheduler is not None else alpha,  # scheduler takes precedence over the constant
+        eps=eps,
+        eps_root=eps_root,
+        weight_decay=weight_decay,
+        mask=mask,
+    )
+  raise ValueError(f'Unsupported ademamix variant: {variant}')  # reached only when variant matched neither branch
 
 
 def train_step(workload,
@@ -410,21 +333,26 @@ def init_optimizer_state(
     b2 = HPARAMS['b2']
     b3 = HPARAMS['b3']
     alpha = HPARAMS['alpha']
+    variant = HPARAMS['ademamix_variant']
     warmup = HPARAMS['warmup']
     T = workload.step_hint
     f_b3 = beta3_scheduler(b3, beta_start=b1, warmup=T)
     f_a = alpha_scheduler(alpha, alpha_start=0, warmup=T)
     f_lr = lr_scheduler(lr, warmup, T)
     weight_decay = HPARAMS['weight_decay']
-    opt_init_fn, opt_update_fn = ademamix(lr=f_lr,
-                                          b1=b1,
-                                          b2=b2,
-                                          b3=b3,
-                                          alpha=alpha,
-                                          b3_scheduler=f_b3,
-                                          alpha_scheduler=f_a,
-                                          weight_decay=weight_decay
-                                          )
+    optimizer = build_ademamix_optimizer(
+        lr=f_lr,
+        variant=variant,
+        b1=b1,
+        b2=b2,
+        b3=b3,
+        alpha=alpha,
+        b3_scheduler=f_b3,
+        alpha_scheduler=f_a,
+        weight_decay=weight_decay,
+    )
+    opt_init_fn = optimizer.init
+    opt_update_fn = optimizer.update
     optimizer_state = opt_init_fn(params_zeros_like)
     return optimizer_state, opt_update_fn
 
@@ -438,14 +366,17 @@ def f(x): return jnp.sum(x ** 2)  # simple quadratic function
     f_a = alpha_scheduler(alpha, alpha_start=0, warmup=10)
     f_b3 = beta3_scheduler(b3, beta_start=b1, warmup=10)
 
-    solver = ademamix(lr=0.01, 
-                      b1=b1, 
-                      b2=b2, 
-                      b3=b3, 
-                      alpha=alpha, 
-                      b3_scheduler=f_b3, 
-                      alpha_scheduler=f_a,
-                      weight_decay=0.01)
+    solver = build_ademamix_optimizer(
+        lr=0.01,
+        variant='full',
+        b1=b1,
+        b2=b2,
+        b3=b3,
+        alpha=alpha,
+        b3_scheduler=f_b3,
+        alpha_scheduler=f_a,
+        weight_decay=0.01,
+    )
     
     params = jnp.array([1., 2., 3.])
     print('Objective function: {:.2f}'.format(f(params)))