Offload all checkpointed residuals to pinned host memory and remove unused mesh axes.

Kevin Wang · Google-ML-Automation · commit 24a3c79bee3d · 2026-04-21T14:00:42.000-07:00
PiperOrigin-RevId: 903412342
diff --git a/src/maxtext/configs/models/deepseek3-671b-batchsplit.yml b/src/maxtext/configs/models/deepseek3-671b-batchsplit.yml
@@ -59,30 +59,27 @@ use_batch_split_schedule: True
 shard_mode: "explicit"
 remove_size_one_mesh_axis_from_type: False
 override_logical_axis_rules: True
-mesh_axes: ['diloco', 'data', 'stage', 'fsdp', 'expert', 'context']
-data_sharding: [['data', 'stage', 'fsdp', 'expert', 'context']]
+mesh_axes: ['data', 'fsdp', 'expert', 'context']
+data_sharding: [['data', 'fsdp', 'expert', 'context']]
 logical_axis_rules: [
     ['activation_batch', ['data', 'fsdp', 'expert', 'context']],
     ['activation_batch_moe', ['data', 'fsdp', 'expert', 'context']],
-    ['activation_embed_and_logits_batch', ['data', 'stage', 'fsdp', 'expert', 'context']],
+    ['activation_embed_and_logits_batch', ['data', 'fsdp', 'expert', 'context']],
     ['activation_kv_batch', ['data', 'fsdp', 'expert', 'context']],
     ['activation_norm_length', []],
     ['activation_norm_length_moe', []],
     ['activation_heads', []],
-    ['activation_stage', 'stage'],
     ['embed', ['fsdp']],
     ['embed_moe', ['fsdp']],
     ['embed_no_exp', ['fsdp']],
     ['embed_no_exp_moe', ['fsdp']],
     ['q_lora', ['fsdp']],
     ['kv_lora', ['fsdp']],
-    ['layers', 'stage'],
     ['q_lora_up_proj', []],
     ['kv_lora_up_proj', []],
     ['q_heads', []],
     ['kv_heads', []],
     ['heads', []],
     ['mlp', []],
     ['expert_only', ['expert']],
-    ['diloco', 'diloco'],
 ]
diff --git a/src/maxtext/models/deepseek_batchsplit.py b/src/maxtext/models/deepseek_batchsplit.py
@@ -714,7 +714,7 @@ def process_layer_scannable(carry, layer_idx, group_id):
           pairwise_swap_and_negate_mask=yarn_mask,
       )
       # Offload to host memory.
-      for residual_name in ("mlpwi_0", "mlpwi_1"):
+      for residual_name in ("mlpwi_0", "mlpwi_1", "attn_out", "layer_inputs"):
         r = res.pop(residual_name)
         r = jax.tree.map(lambda x: jax.device_put(x, jax.typeof(x).sharding.with_memory_kind("pinned_host")), r)
         res[residual_name] = r
@@ -736,7 +736,7 @@ def process_layer_scannable(carry, layer_idx, group_id):
         pairwise_swap_and_negate_mask=yarn_mask,
     )
     # Offload first layer residuals to host memory.
-    for residual_name in ("mlpwi_0", "mlpwi_1"):
+    for residual_name in ("mlpwi_0", "mlpwi_1", "attn_out", "layer_inputs"):
       r = first_res.pop(residual_name)
       r = jax.tree.map(lambda x: jax.device_put(x, jax.typeof(x).sharding.with_memory_kind("pinned_host")), r)
       first_res[residual_name] = r
@@ -829,7 +829,7 @@ def process_layer_bwd_scannable(carry, res_and_layer_idx, group_id):
       next_next_ws_grad = all_reduce_ws_grad_dcn(next_next_ws_grad, mesh)
       all_layer_ws_grad = insert_layer_ws_grad(all_layer_ws_grad, next_next_ws_grad, layer_idx + 2, cfg.param_scan_axis)
       # Get residuals from host.
-      for residual_name in ("mlpwi_0", "mlpwi_1"):
+      for residual_name in ("mlpwi_0", "mlpwi_1", "attn_out", "layer_inputs"):
         r = res.pop(residual_name)
         r = jax.tree.map(lambda x: jax.device_put(x, jax.typeof(x).sharding.with_memory_kind("device")), r)
         res[residual_name] = r
@@ -890,7 +890,7 @@ def process_layer_bwd_scannable(carry, res_and_layer_idx, group_id):
       prev_prev_ws = gather_weights(extract_layer_weights(all_weights, num_layers - 3, cfg.param_scan_axis), mesh)
       ws_grad = reduce_scatter_ws_grad(ws_grad, mesh)
     # Get residuals from host.
-    for residual_name in ("mlpwi_0", "mlpwi_1"):
+    for residual_name in ("mlpwi_0", "mlpwi_1", "attn_out", "layer_inputs"):
       r = last_last_res.pop(residual_name)
       r = jax.tree.map(lambda x: jax.device_put(x, jax.typeof(x).sharding.with_memory_kind("device")), r)
       last_last_res[residual_name] = r
@@ -931,7 +931,7 @@ def process_layer_bwd_scannable(carry, res_and_layer_idx, group_id):
     third_ws_grad = all_reduce_ws_grad_dcn(third_ws_grad, mesh)
     all_layer_ws_grad = insert_layer_ws_grad(all_layer_ws_grad, third_ws_grad, 2, cfg.param_scan_axis)
     # Get residuals from host.
-    for residual_name in ("mlpwi_0", "mlpwi_1"):
+    for residual_name in ("mlpwi_0", "mlpwi_1", "attn_out", "layer_inputs"):
       r = first_res.pop(residual_name)
       r = jax.tree.map(lambda x: jax.device_put(x, jax.typeof(x).sharding.with_memory_kind("device")), r)
       first_res[residual_name] = r