Skip to content

Commit 0963f05

Browse files
Merge pull request #3071 from AI-Hypercomputer:chengnuojin-pp-separate-weights
PiperOrigin-RevId: 882320546
2 parents 0fe1adf + 721bb5a commit 0963f05

9 files changed

Lines changed: 2234 additions & 409 deletions

File tree

src/maxtext/configs/base.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,6 @@ pipeline_parallel_layers: -1 # Pipeline only this number of layers - for the rem
275275
# PP degree divides the number of layers.
276276
# By default (when set to -1) we pipeline all of the decoder layers.
277277

278-
279278
# num_pipeline_microbatches must be a multiple of the number of pipeline stages. By default it is set to the number of stages.
280279
# Note the microbatch_size is given by global_batch_size / num_pipeline_microbatches, where global_batch_size = per_device_batch_size * num_devices
281280
num_pipeline_microbatches: -1
@@ -288,8 +287,9 @@ pipeline_fsdp_ag_once: False # If set to true then all gather all of the weights
288287
# to only one stage's worth, however we only execute one all-gather and reduce across per repeat, as opposed
289288
# to every microbatch. This is similar to zero-1 sharding, since we also don't need to all gather the FSDP weights in the backward pass.
290289
# An alternative to setting this to true may be to replace any FSDP with DP and use optimizer offloading if necessary.
291-
# A more optimal behavior is to all-gather at the start of each repeat, which would ideally get the best of both worlds -
292-
# a small amount of memory and time, however this has proven hard to implement in SPMD, see b/364386697 for more.
290+
pipeline_fsdp_ag_per_repeat: False
291+
# Pipeline weight prefetching per repeat is an advanced SPMD pipeline parallelism improvement technique.
292+
# When enabled, it prefetches necessary weight gathering ahead of microbatched computation, thereby reducing collectives.
293293

294294
# There are two loops for PP:
295295
# 1) Outer loop over microbatches (pipeline iterations)
@@ -299,6 +299,7 @@ pipeline_fsdp_ag_once: False # If set to true then all gather all of the weights
299299
# It may be useful to do the reverse when the layers_per_stage is very large.
300300
# The below settings only have effect when using pipeline parallelism.
301301
scan_pipeline_iterations: True
302+
scan_pipeline_repeats: True
302303
scan_layers_per_stage: False
303304
set_remat_policy_on_pipeline_iterations: True
304305
set_remat_policy_on_layers_per_stage: False

src/maxtext/configs/models/deepseek3-671b-2dfsdp.yml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,19 +56,21 @@ rope_truncate: True
5656
rope_attention_scaling: False
5757

5858
override_logical_axis_rules: True
59-
mesh_axes: ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']
60-
data_sharding: [['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']]
59+
mesh_axes: ['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']
60+
data_sharding: [['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']]
6161
logical_axis_rules: [
6262
['activation_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
63-
['activation_embed_and_logits_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
63+
['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
6464
['activation_kv_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert', 'context']],
6565
['activation_embed_and_logits_batch', ['data', 'fsdp', 'fsdp_transpose', 'expert']],
6666
['activation_norm_length', ['context']],
6767
['activation_heads', []],
68+
['activation_stage', 'stage'],
6869
['embed', ['fsdp']],
6970
['embed_no_exp', ['fsdp']],
7071
['q_lora', ['fsdp']],
7172
['kv_lora', ['fsdp']],
73+
['layers', 'stage'],
7274
['q_lora_up_proj', ['fsdp_transpose', 'expert']],
7375
['kv_lora_up_proj', ['fsdp_transpose', 'expert']],
7476
['q_heads', ['fsdp_transpose', 'expert']],

src/maxtext/configs/types.py

Lines changed: 54 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -842,6 +842,9 @@ class IciParallelism(BaseModel):
842842
class PipelineParallelism(BaseModel):
843843
"""Configuration for pipeline parallelism."""
844844

845+
pipeline_fsdp_ag_per_repeat: bool = Field(
846+
False, description="Enable weight prefetching for circular pipeline parallelism."
847+
)
845848
num_layers_per_pipeline_stage: int = Field(1, description="Number of layers to place on each pipeline stage.")
846849
num_pipeline_repeats: int = Field(
847850
-1,
@@ -857,6 +860,7 @@ class PipelineParallelism(BaseModel):
857860
)
858861
pipeline_fsdp_ag_once: bool = Field(False, description="If True, all-gather FSDP weights once per pipeline repeat.")
859862
scan_pipeline_iterations: bool = Field(True, description="Use jax.lax.scan over pipeline iterations.")
863+
scan_pipeline_repeats: bool = Field(True, description="Use jax.lax.scan over pipeline repeats.")
860864
scan_layers_per_stage: bool = Field(False, description="Use jax.lax.scan over layers within a stage.")
861865
set_remat_policy_on_pipeline_iterations: bool = Field(True, description="Set remat policy on the pipeline scan.")
862866
set_remat_policy_on_layers_per_stage: bool = Field(False, description="Set remat policy on the inner layer scan.")
@@ -2250,6 +2254,17 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
22502254
)
22512255
self.num_pipeline_repeats = num_pipeline_repeats
22522256

2257+
if self.pipeline_fsdp_ag_per_repeat:
2258+
assert self.num_pipeline_repeats > 1, "Pipeline weight prefetching only supports circular pipeline."
2259+
assert (
2260+
self.num_layers_per_pipeline_stage == 1
2261+
), "Pipeline weight prefetching currently only supports one layer per pipeline stage."
2262+
assert (
2263+
not self.pipeline_delay_activation_forwarding
2264+
), "Pipeline weight prefetching does not support pipeline delay."
2265+
assert not self.quantization, "Quantization is currently not supported for pipeline prefetching."
2266+
assert not self.scan_layers_per_stage, "Pipeline weight prefetching currently does not support scan."
2267+
22532268
assert (num_stages * self.num_pipeline_repeats * self.num_layers_per_pipeline_stage) == (
22542269
self.pipeline_parallel_layers
22552270
), (
@@ -2539,78 +2554,45 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
25392554
raise ValueError("`share_kv_projections` is not compatible with `attention_type='mla'`.")
25402555

25412556
# I. FINAL TYPE CONVERSIONS AND DERIVED LISTS
2542-
# Create the ici_parallelism and dcn_parallelism lists for legacy compatibility.
2543-
if self.using_pipeline_parallelism and self.mesh_axes and self.mesh_axes[0] == "stage":
2544-
self.ici_parallelism = [
2545-
self.ici_diloco_parallelism,
2546-
self.ici_pipeline_parallelism,
2547-
self.ici_data_parallelism,
2548-
self.ici_fsdp_parallelism,
2549-
self.ici_fsdp_transpose_parallelism,
2550-
self.ici_sequence_parallelism,
2551-
self.ici_context_parallelism,
2552-
self.ici_context_autoregressive_parallelism,
2553-
self.ici_tensor_parallelism,
2554-
self.ici_tensor_transpose_parallelism,
2555-
self.ici_tensor_sequence_parallelism,
2556-
self.ici_expert_parallelism,
2557-
self.ici_autoregressive_parallelism,
2558-
]
2559-
self.dcn_parallelism = [
2560-
self.dcn_diloco_parallelism,
2561-
self.dcn_pipeline_parallelism,
2562-
self.dcn_data_parallelism,
2563-
self.dcn_fsdp_parallelism,
2564-
self.dcn_fsdp_transpose_parallelism,
2565-
self.dcn_sequence_parallelism,
2566-
self.dcn_context_parallelism,
2567-
self.dcn_context_autoregressive_parallelism,
2568-
self.dcn_tensor_parallelism,
2569-
self.dcn_tensor_transpose_parallelism,
2570-
self.dcn_tensor_sequence_parallelism,
2571-
self.dcn_expert_parallelism,
2572-
self.dcn_autoregressive_parallelism,
2573-
]
2574-
else:
2575-
ici_map = {
2576-
"diloco": self.ici_diloco_parallelism,
2577-
"data": self.ici_data_parallelism,
2578-
"stage": self.ici_pipeline_parallelism,
2579-
"fsdp": self.ici_fsdp_parallelism,
2580-
"fsdp_transpose": self.ici_fsdp_transpose_parallelism,
2581-
"sequence": self.ici_sequence_parallelism,
2582-
"context": self.ici_context_parallelism,
2583-
"context_autoregressive": self.ici_context_autoregressive_parallelism,
2584-
"tensor": self.ici_tensor_parallelism,
2585-
"tensor_transpose": self.ici_tensor_transpose_parallelism,
2586-
"tensor_sequence": self.ici_tensor_sequence_parallelism,
2587-
"model": self.ici_tensor_parallelism,
2588-
"expert": self.ici_expert_parallelism,
2589-
"autoregressive": self.ici_autoregressive_parallelism,
2590-
"attn_dp": 1, # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
2591-
"attn_dp_expert": 1, # initialized to 1, vLLM will auto calculate this value based on EP
2592-
}
2593-
self.ici_parallelism = [ici_map[axis] for axis in self.mesh_axes]
2594-
2595-
dcn_map = {
2596-
"diloco": self.dcn_diloco_parallelism,
2597-
"data": self.dcn_data_parallelism,
2598-
"stage": self.dcn_pipeline_parallelism,
2599-
"fsdp": self.dcn_fsdp_parallelism,
2600-
"fsdp_transpose": self.dcn_fsdp_transpose_parallelism,
2601-
"sequence": self.dcn_sequence_parallelism,
2602-
"context": self.dcn_context_parallelism,
2603-
"context_autoregressive": self.dcn_context_autoregressive_parallelism,
2604-
"tensor": self.dcn_tensor_parallelism,
2605-
"tensor_transpose": self.dcn_tensor_transpose_parallelism,
2606-
"tensor_sequence": self.dcn_tensor_sequence_parallelism,
2607-
"model": self.dcn_tensor_parallelism,
2608-
"expert": self.dcn_expert_parallelism,
2609-
"autoregressive": self.dcn_autoregressive_parallelism,
2610-
"attn_dp": 1, # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
2611-
"attn_dp_expert": 1, # initialized to 1, vLLM will auto calculate this value based on EP
2612-
}
2613-
self.dcn_parallelism = [dcn_map[axis] for axis in self.mesh_axes]
2557+
ici_map = {
2558+
"diloco": self.ici_diloco_parallelism,
2559+
"data": self.ici_data_parallelism,
2560+
"stage": self.ici_pipeline_parallelism,
2561+
"fsdp": self.ici_fsdp_parallelism,
2562+
"fsdp_transpose": self.ici_fsdp_transpose_parallelism,
2563+
"sequence": self.ici_sequence_parallelism,
2564+
"context": self.ici_context_parallelism,
2565+
"context_autoregressive": self.ici_context_autoregressive_parallelism,
2566+
"tensor": self.ici_tensor_parallelism,
2567+
"tensor_transpose": self.ici_tensor_transpose_parallelism,
2568+
"tensor_sequence": self.ici_tensor_sequence_parallelism,
2569+
"model": self.ici_tensor_parallelism,
2570+
"expert": self.ici_expert_parallelism,
2571+
"autoregressive": self.ici_autoregressive_parallelism,
2572+
"attn_dp": 1, # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
2573+
"attn_dp_expert": 1, # initialized to 1, vLLM will auto calculate this value based on EP
2574+
}
2575+
self.ici_parallelism = [ici_map[axis] for axis in self.mesh_axes]
2576+
2577+
dcn_map = {
2578+
"diloco": self.dcn_diloco_parallelism,
2579+
"data": self.dcn_data_parallelism,
2580+
"stage": self.dcn_pipeline_parallelism,
2581+
"fsdp": self.dcn_fsdp_parallelism,
2582+
"fsdp_transpose": self.dcn_fsdp_transpose_parallelism,
2583+
"sequence": self.dcn_sequence_parallelism,
2584+
"context": self.dcn_context_parallelism,
2585+
"context_autoregressive": self.dcn_context_autoregressive_parallelism,
2586+
"tensor": self.dcn_tensor_parallelism,
2587+
"tensor_transpose": self.dcn_tensor_transpose_parallelism,
2588+
"tensor_sequence": self.dcn_tensor_sequence_parallelism,
2589+
"model": self.dcn_tensor_parallelism,
2590+
"expert": self.dcn_expert_parallelism,
2591+
"autoregressive": self.dcn_autoregressive_parallelism,
2592+
"attn_dp": 1, # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
2593+
"attn_dp_expert": 1, # initialized to 1, vLLM will auto calculate this value based on EP
2594+
}
2595+
self.dcn_parallelism = [dcn_map[axis] for axis in self.mesh_axes]
26142596

26152597
# Diloco params
26162598
self.num_diloco_replicas = int(self.ici_diloco_parallelism * self.dcn_diloco_parallelism)

src/maxtext/layers/decoders.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -307,7 +307,7 @@ def setup(self):
307307
if self.config.using_pipeline_parallelism:
308308
pipeline_stage_module = self.get_pipeline_stage_module(self.decoder_layer)
309309
remat_policy = self.get_remat_policy()
310-
self.pipeline_module = pipeline.Pipeline(
310+
self.pipeline_module = pipeline.create_pipeline(
311311
config=self.config, mesh=self.mesh, layers=pipeline_stage_module, remat_policy=remat_policy
312312
)
313313

@@ -794,12 +794,11 @@ def __call__(
794794
model_mode,
795795
)
796796
if cfg.using_pipeline_parallelism:
797-
if cfg.pipeline_fsdp_ag_once:
798-
logical_partition_spec = self.pipeline_module.get_weight_sharding(
799-
y, decoder_segment_ids, decoder_positions, deterministic, model_mode
800-
)
801-
else:
802-
logical_partition_spec = None # This partition spec is only used for the fsdp_ag_once feature.
797+
logical_partition_spec = (
798+
self.pipeline_module.get_weight_sharding(y, decoder_segment_ids, decoder_positions, deterministic, model_mode)
799+
if cfg.pipeline_fsdp_ag_once or cfg.pipeline_fsdp_ag_per_repeat
800+
else None
801+
)
803802
if cfg.decoder_block == DecoderBlockType.DEEPSEEK:
804803
assert len(RemattedBlockLayers) == 2, "Scanned layers must have a length of 2 using deepseek."
805804
dense_layer = RemattedBlockLayers[0]

0 commit comments

Comments
 (0)