Add custom VJP over repeat scan

NuojCheng · NuojCheng · commit 2176bcbc131b · 2026-03-27T17:37:19.000Z
diff --git a/src/maxtext/layers/pipeline.py b/src/maxtext/layers/pipeline.py
@@ -1389,8 +1389,6 @@ def run_iteration_scannable(model, loop_state, bsw):
     # base scannable function used twice for real and bubble runs
     base_scannable = functools.partial(
         pipeline_utils.create_rematerialized_pipeline_stage,
-        model=self,
-        run_iteration_scannable=run_iteration_scannable,
         deterministic=deterministic,
         model_mode=model_mode,
         logical_partition_spec=logical_partition_spec,
diff --git a/src/maxtext/utils/pipeline_utils.py b/src/maxtext/utils/pipeline_utils.py
@@ -19,6 +19,7 @@
 from jax.sharding import PartitionSpec as P
 from flax import linen as nn
 from flax.linen.spmd import LogicallyPartitioned
+import jax.numpy as jnp
 
 
 def get_mesh_axis_dim_indices(physical_partition_spec, axis_name="fsdp"):
@@ -248,8 +249,6 @@ def run_pipeline_microbatches_custom_bwd(residuals, g_final_state):
 
 
 def create_rematerialized_pipeline_stage(
-    model,
-    run_iteration_scannable,
     length,
     deterministic,
     model_mode,
@@ -285,39 +284,61 @@ def create_rematerialized_pipeline_stage(
     the updated `loop_state`.
   """
 
-  def execute_pipeline_stage(model, loop_state_and_bsw):
-    loop_state, w_curr = loop_state_and_bsw
-    # # Retrieve the specific weights needed for this pipeline chunk
-    # bsw = model.both_weight_prefetching(pipeline_weights, physical_partition_spec, loop_state["loop_iteration"])
-    w_next = jax.remat(
-        model.one_weight_prefetching,
-        static_argnums=(1,),
-        policy=jax.checkpoint_policies.nothing_saveable,
-    )(
-        pipeline_weights,
-        physical_partition_spec,
-        loop_state["loop_iteration"],
-    )
-    bsw = (w_curr, w_next)
+  def execute_pipeline_stage_outer(model, loop_state_and_bsw):
+
     scan_microbatches_fn = create_gradient_accumulation_scan(
         model=model,
         length=length,
         deterministic=deterministic,
         model_mode=model_mode,
         logical_partition_spec=logical_partition_spec,
     )
-    loop_state, bsw = scan_microbatches_fn(loop_state, bsw, positions, segment_ids)
-    w_curr, w_next = bsw
-    del w_curr
-    return (loop_state, w_next), None
-
-  return execute_pipeline_stage
-
-  # return nn.remat(
-  #     execute_pipeline_stage,
-  #     prevent_cse=not model.config.scan_pipeline_iterations,
-  #     policy=model.get_pipeline_remat_policy(),
-  # )
+
+    remat_weight_prefetching = model.one_weight_prefetching
+
+    @jax.custom_vjp
+    def execute_pipeline_stage(loop_state_and_bsw, pipeline_weights):
+      return execute_pipeline_stage_custom_fwd(loop_state_and_bsw, pipeline_weights)[0]
+
+    def execute_pipeline_stage_custom_fwd(loop_state_and_bsw, pipeline_weights):
+      loop_state, w_curr = loop_state_and_bsw
+      # Retrieve the specific weights needed for this pipeline chunk
+      w_next = remat_weight_prefetching(
+          pipeline_weights,
+          physical_partition_spec,
+          loop_state["loop_iteration"],
+      )
+      bsw = (w_curr, w_next)
+
+      (loop_state, bsw), scan_fn_vjp = jax.vjp(scan_microbatches_fn, loop_state, bsw, positions, segment_ids)
+      p_remat_weight_prefetching = functools.partial(
+          remat_weight_prefetching,
+          physical_partition_spec=physical_partition_spec,
+          loop_iteration=loop_state["loop_iteration"],
+      )
+      remat_weight_prefetching_t = jax.linear_transpose(
+          p_remat_weight_prefetching,
+          pipeline_weights,
+      )
+      w_curr, w_next = bsw
+      del w_curr
+      return (loop_state, w_next), (scan_fn_vjp, remat_weight_prefetching_t)
+
+    def execute_pipeline_stage_custom_bwd(residuals, g_outputs):
+      g_loop_state, g_w_next = g_outputs
+      scan_fn_vjp, remat_weight_prefetching_t = residuals
+      g_w_curr = jax.tree.map(jnp.zeros_like, g_w_next)
+      g_bsw = (g_w_curr, g_w_next)
+      g_loop_state, g_bsw, _, _ = scan_fn_vjp((g_loop_state, g_bsw))
+      g_w_curr, g_w_next = g_bsw
+      (g_pipeline_weights,) = remat_weight_prefetching_t(g_w_next)
+      return (g_loop_state, g_w_curr), g_pipeline_weights
+
+    execute_pipeline_stage.defvjp(execute_pipeline_stage_custom_fwd, execute_pipeline_stage_custom_bwd)
+
+    return execute_pipeline_stage(loop_state_and_bsw, pipeline_weights), None
+
+  return execute_pipeline_stage_outer
 
 
 def create_flax_pipeline_scan(pipeline_stage_fn, length):