Skip to content

Commit a1f6db3

Browse files
committed
add cse remat
1 parent a949bfc commit a1f6db3

4 files changed

Lines changed: 90 additions & 36 deletions

File tree

src/maxtext/configs/base.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -957,7 +957,7 @@ xprof_e2e_enable_fw_power_level_event: False
957957
xprof_e2e_enable_fw_thermal_event: False
958958
profile_power_events: False # Set to True to enable TPU-specific power/thermal profiling events. Defaults to False to avoid breaking GPU xplane tracing.
959959

960-
log_config: False # Prints the config (after defaults have been set by pyconfig logic)
960+
log_config: True # Prints the config (after defaults have been set by pyconfig logic)
961961
debug_sharding: False # Prints model weights sharding info
962962

963963
# Checkpoint Structured logging
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# This logical rule is designed to optimize pipeline parallelism for large-scale jobs.
16+
# Key changes include removing expert weight sharding on the `q_lora` dimension, which
17+
# is relatively small (e.g., 512 for DeepSeek), and limiting sharding strategies when
18+
# EP x FSDP > 512.
19+
#
20+
# The `data` axis is preserved for two reasons: first, the pipeline stage acts as a
21+
# data parallel (DP) domain externally, making the `data` axis a necessary reference;
22+
# second, it may be required for DCN communication.
23+
#
24+
# Finally, the `context` axis is used to add fractional batch size support.
25+
mesh_axes: ['data', 'stage', 'fsdp', 'context', 'expert']
26+
data_sharding: [['data', 'stage', 'fsdp', 'context', 'expert']]
27+
logical_axis_rules: [
28+
['activation_batch', ['data', 'fsdp', 'expert']],
29+
['activation_batch_moe', ['data', 'fsdp', 'expert']],
30+
['activation_batch_no_exp', ['data', 'fsdp']],
31+
['activation_batch_no_exp_moe', ['data', 'fsdp']],
32+
['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'expert']],
33+
['activation_embed_and_logits_batch_sequence', ['data', 'stage', 'fsdp', 'context', 'expert']],
34+
['activation_length', ['context', 'expert']],
35+
['activation_attn_length', ['context', 'expert']],
36+
['activation_attn_length_no_exp', ['context']],
37+
['activation_length_no_exp', ['context']],
38+
['activation_length_no_exp_moe', ['context']],
39+
['activation_norm_length', ['context']],
40+
['activation_norm_length_moe', ['context']],
41+
['activation_q_length', ['context', 'expert']],
42+
['activation_q_length_no_exp', ['context']],
43+
['prefill_activation_length', ['context']],
44+
['prefill_activation_norm_length', ['context']],
45+
['activation_prefill_kv_batch', ['data', 'fsdp', 'expert']],
46+
['activation_kv_batch', ['data', 'fsdp', 'expert']],
47+
['activation_kv_batch_no_exp', ['data', 'fsdp']],
48+
['activation_vocab', ['context']],
49+
['activation_stage', 'stage'],
50+
['activation_exp', ['expert']],
51+
['decode_batch', ['data', 'fsdp', 'expert']],
52+
['embed', ['fsdp', 'context', 'expert']],
53+
['embed_no_exp', ['fsdp', 'context']],
54+
['embed_moe', ['fsdp', 'context', 'expert']],
55+
['embed_no_exp_moe', ['fsdp', 'context']],
56+
['q_lora', ['fsdp']],
57+
['kv_lora', ['fsdp']],
58+
['layers', 'stage'],
59+
['exp', 'expert'],
60+
['exp_with_fsdp', 'fsdp'],
61+
]

src/maxtext/layers/pipeline.py

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1173,7 +1173,7 @@ def from_repeat_weights_to_bsw(
11731173
self,
11741174
repeat_weights,
11751175
physical_partition_spec,
1176-
axes_to_gather=("fsdp", "fsdp_transpose", "expert"), # three major FSDP-like axes
1176+
axes_to_gather=("fsdp", "fsdp_transpose", "context", "expert"), # four major FSDP-like axes
11771177
use_shardmap=False, # using shardmap produces additional reduce-scatter in backward pass
11781178
):
11791179
"""Executes the FSDP-like all-gathers to fully materialize a block of weights for the BSW."""
@@ -1351,7 +1351,6 @@ def __call__(
13511351
segment_idx = None
13521352

13531353
loop_state, bsw = self.init_states(inputs)
1354-
weights = self.layers.variables
13551354
physical_partition_spec = logical_to_mesh(
13561355
logical_partition_spec, mesh=self.mesh, rules=self.config.logical_axis_rules
13571356
)
@@ -1388,41 +1387,34 @@ def run_iteration_scannable(model, loop_state, bsw):
13881387

13891388
# base scannable function used twice for real and bubble runs
13901389
base_scannable = functools.partial(
1391-
pipeline_utils.create_rematerialized_pipeline_stage,
1390+
pipeline_utils.create_pipeline_stage,
13921391
deterministic=deterministic,
13931392
model_mode=model_mode,
13941393
logical_partition_spec=logical_partition_spec,
13951394
physical_partition_spec=physical_partition_spec,
13961395
positions=positions,
13971396
segment_ids=segment_ids,
1398-
pipeline_weights=weights,
13991397
)
14001398

14011399
run_one_repeat_scannable = base_scannable(length=self.config.num_pipeline_microbatches)
1402-
# run_one_repeat_scannable = nn.remat(
1403-
# run_one_repeat_scannable,
1404-
# prevent_cse=True,
1405-
# policy=self.get_pipeline_remat_policy()
1406-
# )
14071400
run_bubbles_scannable = base_scannable(length=bubble_iterations)
1408-
# run_bubbles_scannable = nn.remat(
1409-
# run_bubbles_scannable,
1410-
# prevent_cse=True,
1411-
# policy=self.get_pipeline_remat_policy()
1412-
# )
14131401

14141402
run_repeats_scanned = pipeline_utils.create_flax_pipeline_scan(
14151403
pipeline_stage_fn=run_one_repeat_scannable,
14161404
length=self.config.num_pipeline_repeats,
1405+
remat_policy=self.get_pipeline_remat_policy(),
14171406
use_scan=self.config.scan_pipeline_repeats,
14181407
)
14191408
run_bubbles_scanned = pipeline_utils.create_flax_pipeline_scan(
14201409
pipeline_stage_fn=run_bubbles_scannable,
14211410
length=1,
1411+
remat_policy=self.get_pipeline_remat_policy(),
14221412
use_scan=self.config.scan_pipeline_repeats,
14231413
)
1424-
(loop_state, w_curr), _ = run_repeats_scanned(self, (loop_state, bsw[0]))
1425-
(loop_state, _), _ = run_bubbles_scanned(self, (loop_state, w_curr))
1414+
initial_carry_repeats = (loop_state, bsw[0], self.layers.variables)
1415+
(loop_state, w_curr, pipeline_weights), _ = run_repeats_scanned(self, initial_carry_repeats)
1416+
initial_carry_bubbles = (loop_state, w_curr, pipeline_weights)
1417+
(loop_state, _, pipeline_weights), _ = run_bubbles_scanned(self, initial_carry_bubbles)
14261418

14271419
final_output = self.realign_output_microbatches(loop_state["state_io"])
14281420
final_output = jnp.reshape(

src/maxtext/utils/pipeline_utils.py

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -248,24 +248,21 @@ def run_pipeline_microbatches_custom_bwd(residuals, g_final_state):
248248
return run_pipeline_microbatches_custom
249249

250250

251-
def create_rematerialized_pipeline_stage(
251+
def create_pipeline_stage(
252252
length,
253253
deterministic,
254254
model_mode,
255255
logical_partition_spec,
256256
physical_partition_spec,
257257
positions,
258258
segment_ids,
259-
pipeline_weights,
260259
):
261-
"""Builds a memory-checkpointed execution block for a single pipeline stage.
260+
"""Builds an execution block for a single pipeline stage.
262261
263262
This function prepares the state for a specific chunk of pipeline execution by:
264263
1. Prefetching the required weights for the current stage/loop iteration.
265264
2. Executing `length` microbatches using either a memory-efficient `jax.lax.scan`
266265
(if `scan_pipeline_iterations` is True) or an unrolled Python `for` loop.
267-
3. Wrapping the entire stage block in `flax.linen.remat` to discard and recompute
268-
activations during the backward pass based on the model's policy.
269266
270267
Args:
271268
length: The number of microbatches to process in this stage.
@@ -275,14 +272,15 @@ def create_rematerialized_pipeline_stage(
275272
physical_partition_spec: Rules for physical device mesh mappings (used in prefetching).
276273
positions: Position IDs for the sequence.
277274
segment_ids: Segment/Attention routing IDs for the sequence.
278-
pipeline_weights: The fully gathered pipeline weights explicitly passed via closure.
279275
280276
Returns:
281-
A function decorated with `nn.remat` that takes `(model, loop_state)` and returns
282-
the updated `loop_state`.
277+
A function that takes `(model, carry)`, where `carry` is
278+
`(loop_state, w_curr, pipeline_weights)`, and returns the updated carry tuple.
283279
"""
284280

285-
def execute_pipeline_stage_outer(model, loop_state_and_bsw):
281+
def execute_pipeline_stage_outer(model, carry):
282+
283+
loop_state, w_curr, pipeline_weights = carry
286284

287285
scan_microbatches_fn = create_gradient_accumulation_scan(
288286
model=model,
@@ -295,12 +293,11 @@ def execute_pipeline_stage_outer(model, loop_state_and_bsw):
295293
remat_weight_prefetching = model.one_weight_prefetching
296294

297295
@jax.custom_vjp
298-
def execute_pipeline_stage(loop_state_and_bsw, pipeline_weights):
299-
return execute_pipeline_stage_custom_fwd(loop_state_and_bsw, pipeline_weights)[0]
296+
def execute_pipeline_stage(loop_state, w_curr, pipeline_weights):
297+
return execute_pipeline_stage_custom_fwd(loop_state, w_curr, pipeline_weights)[0]
300298

301-
def execute_pipeline_stage_custom_fwd(loop_state_and_bsw, pipeline_weights):
302-
loop_state, w_curr = loop_state_and_bsw
303-
# # Retrieve the specific weights needed for this pipeline chunk
299+
def execute_pipeline_stage_custom_fwd(loop_state, w_curr, pipeline_weights):
300+
# Retrieve the specific weights needed for this pipeline chunk
304301
w_next = remat_weight_prefetching(
305302
pipeline_weights,
306303
physical_partition_spec,
@@ -328,17 +325,17 @@ def execute_pipeline_stage_custom_bwd(residuals, g_outputs):
328325
g_loop_state, g_bsw, _, _ = scan_fn_vjp((g_loop_state, g_bsw))
329326
g_w_curr, g_w_next = g_bsw
330327
(g_pipeline_weights,) = remat_weight_prefetching_t(g_w_next)
331-
return (g_loop_state, g_w_curr), g_pipeline_weights
328+
return g_loop_state, g_w_curr, g_pipeline_weights
332329

333330
execute_pipeline_stage.defvjp(execute_pipeline_stage_custom_fwd, execute_pipeline_stage_custom_bwd)
334331

335-
return execute_pipeline_stage(loop_state_and_bsw, pipeline_weights), None
332+
return (*execute_pipeline_stage(loop_state, w_curr, pipeline_weights), pipeline_weights), None
336333

337334
return execute_pipeline_stage_outer
338335

339336

340-
def create_flax_pipeline_scan(pipeline_stage_fn, length, use_scan=True):
341-
"""Wraps the pipeline stage execution in a `flax.linen.scan`.
337+
def create_flax_pipeline_scan(pipeline_stage_fn, length, remat_policy, use_scan=True):
338+
"""Wraps the pipeline stage execution in a `flax.linen.scan` and `flax.linen.remat`.
342339
343340
This lifts the pipeline stage function so it can be repeated sequentially over
344341
the specified length. It safely handles Flax-specific state collections, ensuring
@@ -348,6 +345,7 @@ def create_flax_pipeline_scan(pipeline_stage_fn, length, use_scan=True):
348345
Args:
349346
pipeline_stage_fn: The function representing a single pipeline stage
350347
(usually created by `create_pipeline_stage`).
348+
remat_policy: Rematerialization policy applied to each pipeline stage via `nn.remat`.
351349
length: The total number of pipeline stages/repeats to scan over.
352350
use_scan: Either scan over repeats or unroll the scan.
353351
@@ -356,7 +354,10 @@ def create_flax_pipeline_scan(pipeline_stage_fn, length, use_scan=True):
356354
"""
357355
unroll_length = 1 if use_scan else length
358356
return nn.scan(
359-
pipeline_stage_fn,
357+
nn.remat(
358+
pipeline_stage_fn,
359+
policy=remat_policy,
360+
),
360361
variable_axes={
361362
"summaries": 0,
362363
"aux_loss": 0,

0 commit comments

Comments
 (0)