
Commit 86e20ea

NNX post-train fixes: unpack MultimodalInput for NNX decoder; support scalar LR in adam_pax
- models.py: the NNX Transformer was passing `multimodal_input=MultimodalInput(...)` to NNXDecoder, which expects the individual keyword arguments (image_embeddings, image_masks, audio_embeddings, audio_masks, bidirectional_mask). Unpack the object at the call site.
- optimizers.py: adam_pax called `learning_rate_fn(count)` unconditionally, which fails when `optax.inject_hyperparams` passes a pre-evaluated scalar instead of a callable schedule. Add a `callable()` guard to handle both cases.
1 parent 9895925 commit 86e20ea

2 files changed

Lines changed: 8 additions & 2 deletions


src/maxtext/models/models.py

Lines changed: 5 additions & 1 deletion
@@ -517,7 +517,11 @@ def __call__(
         previous_chunk=previous_chunk,
         slot=slot,
         page_state=page_state,
-        multimodal_input=multimodal_input,
+        image_embeddings=multimodal_input.image_embeddings if multimodal_input is not None else None,
+        image_masks=multimodal_input.image_masks if multimodal_input is not None else None,
+        audio_embeddings=multimodal_input.audio_embeddings if multimodal_input is not None else None,
+        audio_masks=multimodal_input.audio_masks if multimodal_input is not None else None,
+        bidirectional_mask=multimodal_input.bidirectional_mask if multimodal_input is not None else None,
         kv_caches=kv_caches,
         attention_metadata=attention_metadata,
         deepstack_visual_embeds=deepstack_visual_embeds,

src/maxtext/optimizers/optimizers.py

Lines changed: 3 additions & 1 deletion
@@ -336,7 +336,9 @@ def _update_momentum(update, mu, nu):
     else:
       updates = jax.tree_util.tree_map(lambda x, v: x + weight_decay * v, updates, params)

-    step_size = -1.0 * learning_rate_fn(count)
+    # learning_rate_fn may be a callable schedule or a scalar (e.g. when wrapped
+    # by optax.inject_hyperparams, it is passed as a pre-evaluated scalar).
+    step_size = -1.0 * (learning_rate_fn(count) if callable(learning_rate_fn) else learning_rate_fn)
     # Finally, fold in step size.
     updates = jax.tree_util.tree_map(lambda x: step_size * x, updates)

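The `callable()` guard can be exercised on its own. A minimal sketch, outside MaxText and optax: `step_size_for` and `schedule` are hypothetical names here, used only to show that the same code path handles both a schedule (a function of the step count) and a pre-evaluated scalar, which is what `optax.inject_hyperparams` substitutes in.

```python
def step_size_for(learning_rate_fn, count):
    # Mirrors the adam_pax fix: evaluate a callable schedule at the current
    # step, or use a plain scalar learning rate as-is.
    lr = learning_rate_fn(count) if callable(learning_rate_fn) else learning_rate_fn
    return -1.0 * lr

# A toy linear-decay schedule (hypothetical, for illustration only).
def schedule(count):
    return 1e-3 * (1.0 - count / 100.0)

step_size_for(schedule, 50)  # schedule case: evaluated at the step
step_size_for(5e-4, 50)      # scalar case: used directly
```

Without the guard, the scalar case raises `TypeError: 'float' object is not callable`, which is exactly the failure mode the commit fixes.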
