Merge pull request #4034 from AI-Hypercomputer:with_mesh_upgrade

Google-ML-Automation · Google-ML-Automation · commit 5d45f2ea049e · 2026-06-05T11:46:57.000-07:00
PiperOrigin-RevId: 927407231
diff --git a/src/maxtext/examples/sft_llama3_demo_gpu.ipynb b/src/maxtext/examples/sft_llama3_demo_gpu.ipynb
@@ -547,7 +547,7 @@
         "    positions = jnp.arange(seq_len)[None, :]\n",
         "    attention_mask = jnp.tril(jnp.ones((seq_len, seq_len), dtype=jnp.bool_))[None, :]\n",
         "\n",
-        "    with mesh:\n",
+        "    with jax.set_mesh(mesh):\n",
         "        output = model(tokens, positions, None, attention_mask)\n",
         "        logits = output[0] if isinstance(output, tuple) else output\n",
         "\n",
diff --git a/src/maxtext/trainers/post_train/distillation/save_top_k_teacher_logits.py b/src/maxtext/trainers/post_train/distillation/save_top_k_teacher_logits.py
@@ -185,7 +185,7 @@ def generate_and_save_data(config, local_args):
 
   multihost_utils.sync_global_devices("start_generation_loop")
 
-  with mesh:
+  with jax.set_mesh(mesh):
     if jax.process_index() == 0:
       max_logging.log(f"Starting Distributed Top-K generation loop for {config.steps - start_step} steps...")
 
diff --git a/src/maxtext/trainers/post_train/distillation/train_distill.py b/src/maxtext/trainers/post_train/distillation/train_distill.py
@@ -686,7 +686,7 @@ def train_distill(
 
   # Hardware Execution (Safe Context)
   max_logging.log("Applying logical axis rules for model initialization and training...")
-  with mesh, nn_partitioning.axis_rules(student_config.logical_axis_rules):
+  with jax.set_mesh(mesh), nn_partitioning.axis_rules(student_config.logical_axis_rules):
     # 2. Load Models
     if is_offline:
       max_logging.log("Offline Distillation: Skipping Teacher Model loading.")
diff --git a/src/maxtext/trainers/post_train/dpo/train_dpo.py b/src/maxtext/trainers/post_train/dpo/train_dpo.py
@@ -159,7 +159,7 @@ def setup_trainer_state(mt_config, goodput_recorder=None):
 
 def train_model(mt_config: MaxTextConfig, trainer, mesh):
   """Runs the DPO training loop in Tunix."""
-  with mesh, nn_partitioning.axis_rules(mt_config.logical_axis_rules):
+  with jax.set_mesh(mesh), nn_partitioning.axis_rules(mt_config.logical_axis_rules):
     trainer.train(trainer.data_hooks.train_data_iterator, trainer.data_hooks.eval_data_iterator)
   return trainer
 
diff --git a/src/maxtext/trainers/post_train/sft/train_sft.py b/src/maxtext/trainers/post_train/sft/train_sft.py
@@ -263,7 +263,7 @@ def setup_trainer_state(mt_config, goodput_recorder=None):
 
 def train_model(mt_config, trainer, mesh):
   """Runs the SFT training loop in Tunix."""
-  with mesh, nn_partitioning.axis_rules(mt_config.logical_axis_rules):
+  with jax.set_mesh(mesh), nn_partitioning.axis_rules(mt_config.logical_axis_rules):
     trainer.train(
         trainer.data_hooks.train_data_iterator,
         trainer.data_hooks.eval_data_iterator,
diff --git a/src/maxtext/utils/lora_utils.py b/src/maxtext/utils/lora_utils.py
@@ -539,7 +539,7 @@ def apply_lora_to_model(
   )
 
   if mesh is not None:
-    with mesh, nn_partitioning.axis_rules(mt_config.logical_axis_rules):
+    with jax.set_mesh(mesh), nn_partitioning.axis_rules(mt_config.logical_axis_rules):
       graph_def, state = nnx.split(lora_model)
 
       # We handle explicit replication for LoRA to ensure safety and efficiency.
diff --git a/tests/integration/deepseek_scan_engram_test.py b/tests/integration/deepseek_scan_engram_test.py
@@ -145,7 +145,7 @@ def batch_decode(self, token_ids, *args, **kwargs):
 
     shared_embedding = DummyEmbedding(emb_dim=config.emb_dim)
 
-    with mesh, jax.disable_jit():
+    with jax.set_mesh(mesh), jax.disable_jit():
       variables = decoder.init(
           {"params": jax.random.PRNGKey(0), "dropout": jax.random.PRNGKey(1), "aqt": jax.random.PRNGKey(2)},
           shared_embedding=shared_embedding,
diff --git a/tests/integration/diloco_test.py b/tests/integration/diloco_test.py
@@ -79,7 +79,7 @@ def test_diloco_training_simulation_with_mesh(self):
         ]
     )
 
-    with mesh:
+    with jax.set_mesh(mesh):
       tx = optax.sgd(learning_rate=0.1)
       rngs = nnx.Rngs(params=jax.random.key(seed=42))
       model = SimpleNNXModel(rngs=rngs)
diff --git a/tests/utils/attention_test_util.py b/tests/utils/attention_test_util.py
@@ -196,15 +196,18 @@ def forward_with_context_expert_parallelism(
         "inputs_segmentation": decoder_segment_ids,
         "inputs_position": decoder_positions,
     }
-    with mesh_cp:
+    # jax.set_mesh requires all sharding constraints inside the block to reference devices in the context mesh.
+    with jax.set_mesh(mesh_cp):
+      replicated = NamedSharding(mesh_cp, P())
+      replicated_batch = {k: jax.device_put(v, replicated) for k, v in batch.items()}
       reordered_batch = maxtext_utils.get_reorder_callable(
           context_parallel_size, ShardMode.AUTO, hardware=cfg_cp.hardware
-      )(batch)
+      )(replicated_batch)
     lnx = reordered_batch["inputs"]
     decoder_segment_ids = reordered_batch["inputs_segmentation"]
     decoder_positions = reordered_batch["inputs_position"]
   # apply attention with sharding
-  with mesh_cp, nn_partitioning.axis_rules(cfg_cp.logical_axis_rules):
+  with jax.set_mesh(mesh_cp), nn_partitioning.axis_rules(cfg_cp.logical_axis_rules):
     batch_axis = "activation_batch"
     length_axis = "activation_length"
     lnx_spec = nn_partitioning.logical_to_mesh_axes(

Original file line number	Diff line number	Diff line change
`@@ -539,7 +539,7 @@ def apply_lora_to_model(`
`539`	`539`	`  )`
`540`	`540`
`541`	`541`	`  if mesh is not None:`
`542`		`-    with mesh, nn_partitioning.axis_rules(mt_config.logical_axis_rules):`
	`542`	`+    with jax.set_mesh(mesh), nn_partitioning.axis_rules(mt_config.logical_axis_rules):`
`543`	`543`	`      graph_def, state = nnx.split(lora_model)`
`544`	`544`
`545`	`545`	`      # We handle explicit replication for LoRA to ensure safety and efficiency.`
Original file line number	Diff line number	Diff line change
`@@ -79,7 +79,7 @@ def test_diloco_training_simulation_with_mesh(self):`
`79`	`79`	`        ]`
`80`	`80`	`    )`
`81`	`81`
`82`		`-    with mesh:`
	`82`	`+    with jax.set_mesh(mesh):`
`83`	`83`	`      tx = optax.sgd(learning_rate=0.1)`
`84`	`84`	`      rngs = nnx.Rngs(params=jax.random.key(seed=42))`
`85`	`85`	`      model = SimpleNNXModel(rngs=rngs)`