Skip to content

Commit 0efb56e

Browse files
committed
add another layer of custom vjp
1 parent 5cf5c32 commit 0efb56e

3 files changed

Lines changed: 203 additions & 206 deletions

File tree

src/maxtext/layers/moe.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1118,6 +1118,7 @@ def wrapper(x, logits, pre_bias_logits, w0, w1, wo, w0_bias, w1_bias, wo_bias, r
11181118
pre_bias_logits,
11191119
self.config.use_custom_sort_vjp,
11201120
roll_to_expert_id=num_experts_per_shard * expert_shard_id,
1121+
rngs=rngs,
11211122
)
11221123

11231124
# Filter down to the group sizes that apply to only the experts in the

src/maxtext/layers/pipeline.py

Lines changed: 97 additions & 142 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515
"""Pipeline layer wrapping a decoder layer(s). Supports circular pipelining"""
1616

1717
from typing import Any
18+
import functools
1819

19-
import numpy as np
2020
from maxtext.utils import pipeline_utils
2121

2222
from jax import numpy as jnp
@@ -469,11 +469,8 @@ def permute_output_micro_per_stage_dim(self, output):
469469
# The first real output (microbatch 0) takes a certain amount of loop iterations to finish and be pushed to
470470
# state_io - it will land on a different index of state_io depending on the number of iterations.
471471
microbatch_0_idx = self.iterations_to_complete_first_microbatch() % self.microbatches_per_stage
472-
permutation = (
473-
np.arange(self.microbatches_per_stage) + microbatch_0_idx
474-
) % self.microbatches_per_stage # permute so the value in land_idx is moved into idx 0, and (land_idx + 1) appear
475-
# in idx 1, etc
476-
output = output[:, permutation]
472+
output = jnp.roll(output, shift=-microbatch_0_idx, axis=1)
473+
output = self._maybe_shard_with_logical(output, self.state_io_logical)
477474
return output
478475

479476
def get_current_stage_weights(
@@ -583,6 +580,77 @@ def bsw_all_gather_over_fsdp(self, weights, physical_partition_spec, loop_iterat
583580
bsw_1 = self.from_all_variables_to_bsw(weights, loop_iteration + 1, physical_partition_spec)
584581
return jax.ad_checkpoint.checkpoint_name((bsw_0, bsw_1), "bsw")
585582

583+
def _run_initialization(
584+
self,
585+
example_inputs,
586+
example_segmentation,
587+
example_position,
588+
segment_idx,
589+
position_idx,
590+
deterministic,
591+
model_mode,
592+
):
593+
"""Runs the initialization sequence mapping layers appropriately based on pipeline settings."""
594+
vmap_func = self.get_vmap_func_for_init()
595+
596+
if self.config.num_pipeline_repeats > 1:
597+
# To shard the weights on initialization for the circular pipeline we create weights of
598+
# shape [num_repeat, num_stages, ...] (e.g. [num_repeat, num_stages, embed, mlp]) and shard the num_stages axis.
599+
# We wrap the main stage vmap with a num_repeat vmap to generate this axis only for parameter initialization.
600+
vmap_func = nn.vmap(
601+
vmap_func,
602+
in_axes=(0, segment_idx, position_idx, None, None),
603+
variable_axes={
604+
"params": 0,
605+
"_overwrite_with_gradient": 0,
606+
"non_trainable": 0,
607+
"hyper_params": 0,
608+
},
609+
split_rngs={"params": True, "dropout": self.config.enable_dropout},
610+
metadata_params={
611+
nn.PARTITION_NAME: "circular_repeats",
612+
"sub_weight_split_dims_mapping": (None,),
613+
"is_initializing": True,
614+
"x_times": self.config.num_pipeline_repeats,
615+
"optimizer_dims_mapping": None,
616+
},
617+
)
618+
619+
example_inputs = jax.lax.broadcast(example_inputs, [self.config.num_pipeline_repeats])
620+
example_segmentation = (
621+
jax.lax.broadcast(example_segmentation, [self.config.num_pipeline_repeats])
622+
if example_segmentation is not None
623+
else None
624+
)
625+
example_position = (
626+
jax.lax.broadcast(example_position, [self.config.num_pipeline_repeats])
627+
if example_position is not None
628+
else None
629+
)
630+
631+
# We only need to run one set of stages to initialize the variables, instead of looping over all microbatches for
632+
# the full total_iterations.
633+
example_inputs = self._maybe_shard_with_logical(example_inputs, (None, None, None, None))
634+
stage_outputs = vmap_func(
635+
self.layers, example_inputs, example_segmentation, example_position, deterministic, model_mode
636+
)
637+
if self.config.scan_layers:
638+
stage_outputs = stage_outputs[0]
639+
640+
# We return something of the correct shape (global_batch, sequence, embed) by reshaping a single stages output
641+
# which has shape [pipeline_microbatch_size, sequence, embed]
642+
if self.config.num_pipeline_repeats > 1:
643+
stage_outputs = stage_outputs[0] # Remove extra dimension created for the circular vmap
644+
broadcasted_stage_outpus = jax.lax.broadcast(
645+
stage_outputs[0], [self.config.micro_batch_size_to_train_on // self.pipeline_microbatch_size]
646+
)
647+
648+
return jnp.reshape(
649+
broadcasted_stage_outpus,
650+
[self.config.micro_batch_size_to_train_on, self.config.max_target_length, self.config.emb_dim],
651+
out_sharding=self.output_sharding,
652+
)
653+
586654
def get_vmap_func_for_init(self):
587655
"""This vmap func is used to initialize the weights only on init."""
588656

@@ -815,63 +883,8 @@ def __call__(
815883
bubble_iterations = self.forwarding_delay * (self.num_stages - 1)
816884

817885
if self.is_initializing():
818-
vmap_func = self.get_vmap_func_for_init()
819-
820-
if self.config.num_pipeline_repeats > 1:
821-
# To shard the weights on initialization for the circular pipeline we create weights of
822-
# shape [num_repeat, num_stages, ...] (e.g. [num_repeat, num_stages, embed, mlp]) and shard the num_stages axis.
823-
# We wrap the main stage vmap with a num_repeat vmap to generate this axis only for parameter initialization.
824-
vmap_func = nn.vmap(
825-
vmap_func,
826-
in_axes=(0, segment_idx, position_idx, None, None),
827-
variable_axes={
828-
"params": 0,
829-
"_overwrite_with_gradient": 0,
830-
"non_trainable": 0,
831-
"hyper_params": 0,
832-
},
833-
split_rngs={"params": True, "dropout": self.config.enable_dropout},
834-
metadata_params={
835-
nn.PARTITION_NAME: "circular_repeats",
836-
"sub_weight_split_dims_mapping": (None,),
837-
"is_initializing": True,
838-
"x_times": self.config.num_pipeline_repeats,
839-
"optimizer_dims_mapping": None,
840-
},
841-
)
842-
843-
example_inputs = jax.lax.broadcast(example_inputs, [self.config.num_pipeline_repeats])
844-
example_segmentation = (
845-
jax.lax.broadcast(example_segmentation, [self.config.num_pipeline_repeats])
846-
if example_segmentation is not None
847-
else None
848-
)
849-
example_position = (
850-
jax.lax.broadcast(example_position, [self.config.num_pipeline_repeats])
851-
if example_position is not None
852-
else None
853-
)
854-
# We only need to run one set of stages to initialize the variables, instead of looping over all microbatches for
855-
# the full total_iterations.
856-
example_inputs = self._maybe_shard_with_logical(example_inputs, (None, None, None, None))
857-
stage_outputs = vmap_func(
858-
self.layers, example_inputs, example_segmentation, example_position, deterministic, model_mode
859-
)
860-
if self.config.scan_layers:
861-
stage_outputs = stage_outputs[0]
862-
863-
# We return something of the correct shape (global_batch, sequence, embed) by reshaping a single stages output
864-
# which has shape [pipeline_microbatch_size, sequence, embed]
865-
if self.config.num_pipeline_repeats > 1:
866-
stage_outputs = stage_outputs[0] # Remove extra dimension created for the circular vmap
867-
broadcasted_stage_outpus = jax.lax.broadcast(
868-
stage_outputs[0], [self.config.micro_batch_size_to_train_on // self.pipeline_microbatch_size]
869-
)
870-
871-
return jnp.reshape(
872-
broadcasted_stage_outpus,
873-
[self.config.micro_batch_size_to_train_on, self.config.max_target_length, self.config.emb_dim],
874-
out_sharding=self.output_sharding,
886+
return self._run_initialization(
887+
example_inputs, example_segmentation, example_position, segment_idx, position_idx, deterministic, model_mode
875888
)
876889

877890
logical_partition_spec = pipeline_utils.get_logical_spec_repeats_removed(logical_partition_spec)
@@ -898,95 +911,37 @@ def run_iteration_scannable(model, loop_state):
898911
policy=self.get_pipeline_remat_policy(),
899912
)
900913

901-
def run_one_repeat_scannable(model, loop_state):
902-
loop_state["bsw"] = model.bsw_all_gather_over_fsdp(
903-
loop_state["weights"], physical_partition_spec, loop_state["loop_iteration"]
904-
)
905-
906-
if model.config.scan_pipeline_iterations:
907-
run_one_repeat_scanned_custom = pipeline_utils.create_scanned_function(
908-
model=model,
909-
run_iteration_scannable=run_iteration_scannable,
910-
length=model.config.num_pipeline_microbatches,
911-
variable_axes={
912-
"summaries": 0,
913-
"aux_loss": 0,
914-
"intermediates": 0,
915-
"hyper_params": 0,
916-
},
917-
split_rngs={"random": True},
918-
deterministic=deterministic,
919-
model_mode=model_mode,
920-
logical_partition_spec=logical_partition_spec,
921-
)
922-
loop_state = run_one_repeat_scanned_custom(loop_state, positions, segment_ids)
923-
else:
924-
for _ in range(model.config.num_pipeline_microbatches):
925-
loop_state, _ = run_iteration_scannable(model, loop_state)
926-
return loop_state, None
927-
928-
run_one_repeat_scannable = nn.remat(
929-
run_one_repeat_scannable,
930-
prevent_cse=not self.config.scan_pipeline_iterations,
931-
policy=self.get_pipeline_remat_policy(),
914+
base_scannable = functools.partial(
915+
pipeline_utils.create_run_scannable,
916+
model=self,
917+
run_iteration_scannable=run_iteration_scannable,
918+
deterministic=deterministic,
919+
model_mode=model_mode,
920+
logical_partition_spec=logical_partition_spec,
921+
physical_partition_spec=physical_partition_spec,
922+
positions=positions,
923+
segment_ids=segment_ids,
932924
)
933925

934-
def run_bubbles_scannable(model, loop_state):
935-
loop_state["bsw"] = model.bsw_all_gather_over_fsdp(
936-
loop_state["weights"], physical_partition_spec, loop_state["loop_iteration"]
937-
)
938-
939-
if model.config.scan_pipeline_iterations:
940-
run_bubbles_scanned_custom = pipeline_utils.create_scanned_function(
941-
model=model,
942-
run_iteration_scannable=run_iteration_scannable,
943-
length=bubble_iterations,
944-
variable_axes={
945-
"summaries": 0,
946-
"aux_loss": 0,
947-
"intermediates": 0,
948-
"hyper_params": 0,
949-
},
950-
split_rngs={"random": True},
951-
deterministic=deterministic,
952-
model_mode=model_mode,
953-
logical_partition_spec=logical_partition_spec,
954-
)
955-
loop_state = run_bubbles_scanned_custom(loop_state, positions, segment_ids)
956-
else:
957-
for _ in range(model.config.num_pipeline_microbatches):
958-
loop_state, _ = run_iteration_scannable(model, loop_state)
959-
return loop_state, None
926+
run_one_repeat_scannable = base_scannable(
927+
length=self.config.num_pipeline_microbatches,
928+
)
960929

961-
run_bubbles_scannable = nn.remat(
962-
run_bubbles_scannable,
963-
prevent_cse=not self.config.scan_pipeline_iterations,
964-
policy=self.get_pipeline_remat_policy(),
930+
run_bubbles_scannable = base_scannable(
931+
length=bubble_iterations,
965932
)
966933

967934
def run_all_iterations(model, loop_state):
968935
if self.config.scan_pipeline_repeats:
969-
run_repeats_scanned = nn.scan(
970-
run_one_repeat_scannable,
971-
variable_axes={
972-
"summaries": 0,
973-
"aux_loss": 0,
974-
"intermediates": 0,
975-
"hyper_params": 0,
976-
},
977-
split_rngs={"random": True},
936+
run_repeats_scanned = pipeline_utils.create_run_repeats_scanned(
937+
run_scannable=run_one_repeat_scannable,
938+
model=model,
978939
length=model.config.num_pipeline_repeats,
979940
)
980941

981-
run_bubbles_scanned = nn.scan(
982-
run_bubbles_scannable,
983-
variable_axes={
984-
"summaries": 0,
985-
"aux_loss": 0,
986-
"intermediates": 0,
987-
"hyper_params": 0,
988-
},
989-
split_rngs={"random": True},
942+
run_bubbles_scanned = pipeline_utils.create_run_repeats_scanned(
943+
run_scannable=run_bubbles_scannable,
944+
model=model,
990945
length=1,
991946
)
992947
loop_state, _ = run_repeats_scanned(model, loop_state)

0 commit comments

Comments (0)