Commit 721bb5a

refactor pr
1 parent c231424 commit 721bb5a

8 files changed

Lines changed: 384 additions & 333 deletions

src/maxtext/configs/base.yml

Lines changed: 3 additions & 6 deletions
@@ -275,10 +275,6 @@ pipeline_parallel_layers: -1 # Pipeline only this number of layers - for the rem
 # PP degree divides the number of layers.
 # By default (when set to -1) we pipeline all of the decoder layers.
 
-# Pipeline weight prefetching is an advanced SPMD pipeline parallelism improvement technique
-# When enabled, it prefetches necessary weight gathering ahead of microbatched computation, therefore reducing collectives
-use_pipeline_weight_prefetching: False
-
 # num_pipeline_microbatches must be a multiple of the number of pipeline stages. By default it is set to the number of stages.
 # Note the microbatch_size is given by global_batch_size / num_pipeline_microbatches, where global_batch_size = per_device_batch_size * num_devices
 num_pipeline_microbatches: -1
@@ -291,8 +287,9 @@ pipeline_fsdp_ag_once: False # If set to true then all gather all of the weights
 # to only one stage's worth, however we only execute one all-gather and reduce across per repeat, as opposed
 # to every microbatch. This is similar to zero-1 sharding, since we also don't need to all gather the FSDP weights in the backward pass.
 # An alternative to setting this to true may be to replace any FSDP with DP and use optimizer offloading if necessary.
-# A more optimal behavior is to all-gather at the start of each repeat, which would ideally get the best of both worlds -
-# a small amount of memory and time, however this has proven hard to implement in SPMD, see b/364386697 for more.
+pipeline_fsdp_ag_per_repeat: False
+# Pipeline weight prefetching per repeat is an advanced SPMD pipeline parallelism improvement technique
+# When enabled, it prefetches necessary weight gathering ahead of microbatched computation, therefore reducing collectives
 
 # There are two loops for PP:
 # 1) Outer loop over microbatches (pipeline iterations)
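
The batching arithmetic described in the comments above can be sanity-checked with a small standalone sketch. This is plain illustrative Python, not MaxText code; the helper name and example numbers are made up.

# Illustrative helper mirroring the comments above; not part of MaxText.
def pipeline_batch_shapes(per_device_batch_size, num_devices, num_pipeline_stages,
                          num_pipeline_microbatches=-1):
  """Returns (global_batch_size, num_pipeline_microbatches, microbatch_size)."""
  global_batch_size = per_device_batch_size * num_devices
  if num_pipeline_microbatches == -1:
    # By default the number of microbatches equals the number of stages.
    num_pipeline_microbatches = num_pipeline_stages
  # num_pipeline_microbatches must be a multiple of the number of pipeline stages.
  assert num_pipeline_microbatches % num_pipeline_stages == 0
  assert global_batch_size % num_pipeline_microbatches == 0
  return global_batch_size, num_pipeline_microbatches, global_batch_size // num_pipeline_microbatches

# Example: 2 per-device batch * 16 devices = 32 global, 4 stages -> 4 microbatches of 8.
print(pipeline_batch_shapes(2, 16, 4))  # (32, 4, 8)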

src/maxtext/configs/types.py

Lines changed: 4 additions & 4 deletions
@@ -840,7 +840,7 @@ class IciParallelism(BaseModel):
 class PipelineParallelism(BaseModel):
   """Configuration for pipeline parallelism."""
 
-  use_pipeline_weight_prefetching: bool = Field(
+  pipeline_fsdp_ag_per_repeat: bool = Field(
       False, description="Enable weight prefetching for circular pipeline parallelism."
   )
   num_layers_per_pipeline_stage: int = Field(1, description="Number of layers to place on each pipeline stage.")
@@ -2240,7 +2240,7 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
     )
     self.num_pipeline_repeats = num_pipeline_repeats
 
-    if self.use_pipeline_weight_prefetching:
+    if self.pipeline_fsdp_ag_per_repeat:
       assert self.num_pipeline_repeats > 1, "Pipeline weight prefetching only supports circular pipeline."
       assert (
           self.num_layers_per_pipeline_stage == 1
@@ -2556,7 +2556,7 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
         "expert": self.ici_expert_parallelism,
         "autoregressive": self.ici_autoregressive_parallelism,
         "attn_dp": 1, # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
-        "attn_dp_expert": 1, # initialized to 1, vLLM will auto calculate this value based on EP
+        "attn_dp_expert": 1, # initialized to 1, vLLM will auto calculate this value based on EP
     }
     self.ici_parallelism = [ici_map[axis] for axis in self.mesh_axes]
@@ -2576,7 +2576,7 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
         "expert": self.dcn_expert_parallelism,
         "autoregressive": self.dcn_autoregressive_parallelism,
         "attn_dp": 1, # initialized to 1, vLLM will auto calculate this value based on TP and num_kv_heads
-        "attn_dp_expert": 1, # initialized to 1, vLLM will auto calculate this value based on EP
+        "attn_dp_expert": 1, # initialized to 1, vLLM will auto calculate this value based on EP
     }
     self.dcn_parallelism = [dcn_map[axis] for axis in self.mesh_axes]
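
The constraint asserted in the second hunk can be restated as a standalone check. This is a simplified sketch whose names only mirror the config fields; it is not the actual types.py implementation.

# Simplified sketch of the check in the hunk above; illustration only.
def check_pipeline_fsdp_ag_per_repeat(pipeline_fsdp_ag_per_repeat: bool,
                                      num_pipeline_repeats: int,
                                      num_layers_per_pipeline_stage: int) -> None:
  if not pipeline_fsdp_ag_per_repeat:
    return
  # Per-repeat weight gathering requires a circular pipeline (more than one repeat).
  assert num_pipeline_repeats > 1, "Pipeline weight prefetching only supports circular pipeline."
  # And, per the hunk above, exactly one layer per pipeline stage.
  assert num_layers_per_pipeline_stage == 1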

src/maxtext/layers/decoders.py

Lines changed: 1 addition & 1 deletion
@@ -796,7 +796,7 @@ def __call__(
     if cfg.using_pipeline_parallelism:
       logical_partition_spec = (
           self.pipeline_module.get_weight_sharding(y, decoder_segment_ids, decoder_positions, deterministic, model_mode)
-          if cfg.pipeline_fsdp_ag_once or cfg.use_pipeline_weight_prefetching
+          if cfg.pipeline_fsdp_ag_once or cfg.pipeline_fsdp_ag_per_repeat
           else None
       )
       if cfg.decoder_block == DecoderBlockType.DEEPSEEK:

src/maxtext/layers/pipeline.py

Lines changed: 195 additions & 139 deletions
Large diffs are not rendered by default.
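
The bulk of the refactor lives in this file. Based only on the base.yml comments above, the intent of pipeline_fsdp_ag_per_repeat can be sketched as a control-flow outline; the helpers (all_gather_weights, run_microbatch) are hypothetical stand-ins, and the real code is an SPMD scan that differs in detail.

# Conceptual sketch: gather FSDP-sharded weights once per repeat and prefetch
# the next repeat's gather ahead of the microbatched computation, instead of
# re-gathering for every microbatch. Hypothetical helpers, not MaxText APIs.
def circular_pipeline_per_repeat_ag(weights_per_repeat, microbatches,
                                    all_gather_weights, run_microbatch):
  outputs = []
  gathered = all_gather_weights(weights_per_repeat[0])  # prefetch first repeat's weights
  for r in range(len(weights_per_repeat)):
    # Issue the next repeat's gather early so it can overlap with compute:
    # one collective per repeat rather than one per microbatch.
    next_gathered = (all_gather_weights(weights_per_repeat[r + 1])
                     if r + 1 < len(weights_per_repeat) else None)
    for mb in microbatches:
      outputs.append(run_microbatch(gathered, mb))
    gathered = next_gathered
  return outputs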

src/maxtext/models/deepseek_batchsplit.py

Lines changed: 1 addition & 1 deletion
@@ -809,7 +809,7 @@ def gmm(
         group_sizes,
         representative_value=max_utils.generate_representative_group_sizes(inputs.shape[0], kernel.shape[0]),
     )
-    if config.use_qwix_quantization or (config.using_pipeline_parallelism and config.use_pipeline_weight_prefetching):
+    if config.use_qwix_quantization:
      output = megablox.gmm(
          lhs=inputs,
          rhs=kernel,