Remove the global mesh context; SparsecoreEmbed now reads the mesh from its own `mesh` field

ajkv-google · ajkv-google · commit 72bb9d554883 · 2026-01-15T23:38:33.000Z
diff --git a/recml/core/training/mesh_context.py b/recml/core/training/mesh_context.py
diff --git a/recml/core/training/partitioning.py b/recml/core/training/partitioning.py
@@ -22,8 +22,6 @@
 import flax.linen as nn
 import jax
 import numpy as np
-from recml.core.training import mesh_context
-
 
 PyTree = Any
 State = Any
@@ -112,7 +110,6 @@ def partition_init(
   ) -> CreateStateFn:
     with self.mesh:
       if abstract_batch is not None:
-        mesh_context.set_global_mesh(self.mesh)
         abstract_state = jax.eval_shape(init_fn, abstract_batch)
         specs = nn.get_partition_spec(abstract_state)
         self.state_sharding = jax.tree.map(
@@ -135,7 +132,6 @@ def partition_step(self, fn: StepFn, *, training: bool = False) -> StepFn:
       jit_kws["donate_argnums"] = (1,)
 
     with self.mesh:
-      mesh_context.set_global_mesh(self.mesh)
       step_fn = jax.jit(
           fn,
           in_shardings=(self.data_sharding, self.state_sharding),
@@ -195,7 +191,6 @@ def __init__(
     if axis_sizes[0] == -1:
       axis_sizes[0] = len(devices) // math.prod(axis_sizes[1:])
 
-    # self.mesh = jax.make_mesh(axis_sizes, axis_names, devices=devices)
     self.mesh = jax.sharding.Mesh(devices, axis_names)
     self.rules = rules
     self.aot_compile = aot_compile
@@ -235,7 +230,6 @@ def partition_init(
       )
 
     with self.mesh:
-      mesh_context.set_global_mesh(self.mesh)
       abstract_state = jax.eval_shape(init_fn, abstract_batch)
       specs = nn.get_partition_spec(abstract_state)
 
@@ -268,7 +262,6 @@ def partition_step(self, fn: StepFn, *, training: bool = False) -> StepFn:
 
 
     with self.mesh:
-      mesh_context.set_global_mesh(self.mesh)
       step_fn = jax.jit(
           fn,
           in_shardings=(self.data_sharding, self.state_sharding),
diff --git a/recml/examples/train_hstu_jax.py b/recml/examples/train_hstu_jax.py
@@ -184,14 +184,6 @@ def _compute_metrics(self, loss, logits, targets):
         targets = jnp.squeeze(targets)
         metrics = {"loss": clu_metrics.Average.from_model_output(loss)}
 
-        # def get_acc(k):
-        #     _, top_k_indices = jax.nn.top_k(logits, k)
-        #     correct = jnp.sum(top_k_indices == targets[:, None], axis=-1)
-        #     return jnp.mean(correct)
-
-        # metrics["HR_10"] = clu_metrics.Average.from_model_output(get_acc(10))
-        # metrics["HR_50"] = clu_metrics.Average.from_model_output(get_acc(50))
-        # metrics["HR_200"] = clu_metrics.Average.from_model_output(get_acc(200))
         return metrics
 
 def experiment() -> fdl.Config[recml.Experiment]:
diff --git a/recml/layers/linen/sparsecore.py b/recml/layers/linen/sparsecore.py
@@ -28,9 +28,6 @@
 from recml.core.ops import embedding_ops
 import tensorflow as tf
 
-from recml.core.training import mesh_context
-
-
 with epy.lazy_imports():
   # pylint: disable=g-import-not-at-top
   from jax_tpu_embedding.sparsecore.lib.flax.linen import embed
@@ -177,7 +174,6 @@ def __call__(self, inputs: Mapping[str, jax.Array]) -> jax.Array:
       dataclasses.field(default=lambda n, bs: bs)
   )
 
-  # Optional device information.
   local_device_count: int = dataclasses.field(
       default_factory=jax.local_device_count
   )
@@ -369,19 +365,9 @@ class SparsecoreEmbed(nn.Module):
   """
 
   sparsecore_config: SparsecoreConfig
-  mesh: jax.sharding.Mesh | jax.sharding.AbstractMesh | None = None
-
-  def get_mesh(self) -> jax.sharding.Mesh:
-    # Try to get the mesh from our custom global context
-    mesh = mesh_context.get_global_mesh()
-
-    if mesh is None:
-      raise ValueError(
-          "No global mesh found. Make sure to call "
-          "`partitioning.partition_init` (which sets the mesh) "
-          "before initializing SparseCore."
-      )
-    return mesh
+  mesh: jax.sharding.Mesh = dataclasses.field(
+      default_factory=lambda: jax.sharding.Mesh(jax.devices(), ('batch',))
+  )
 
   def get_sharding_axis(
       self, mesh: jax.sharding.Mesh | jax.sharding.AbstractMesh
@@ -391,25 +377,25 @@ def get_sharding_axis(
     return self.sparsecore_config.sharding_axis
 
   def setup(self):
-    mesh = self.get_mesh()
-    sharding_axis_name = self.get_sharding_axis(mesh)
 
+    sharding_axis_name = self.get_sharding_axis(self.mesh)
+    
     initializer = functools.partial(
         embedding.init_embedding_variables,
         table_specs=embedding.get_table_specs(
             self.sparsecore_config.feature_specs
         ),
         global_sharding=jax.sharding.NamedSharding(
-            mesh, jax.sharding.PartitionSpec(sharding_axis_name, None)
+            self.mesh, jax.sharding.PartitionSpec(sharding_axis_name, None)
         ),
         num_sparsecore_per_device=self.sparsecore_config.num_sc_per_device,
         # We need to by-pass the mesh check to allow using an abstract mesh.
-        bypass_mesh_check=isinstance(mesh, jax.sharding.AbstractMesh),
+        bypass_mesh_check=isinstance(self.mesh, jax.sharding.AbstractMesh),
     )
     self.embedding_table = self.param(
         name=EMBEDDING_PARAM_NAME,
         init_fn=embed.with_sparsecore_layout(
-            initializer, (sharding_axis_name,), mesh  # type: ignore
+            initializer, (sharding_axis_name,), self.mesh  # type: ignore
         ),
     )
 
@@ -426,12 +412,13 @@ def __call__(
     Returns:
       The activations structure with the same structure as specs.
     """
-    mesh = self.get_mesh()
-    sharding_axis_name = self.get_sharding_axis(mesh)
+    # mesh = self.get_mesh()
+    sharding_axis_name = self.get_sharding_axis(self.mesh)
+    
     activations = embedding_ops.sparsecore_lookup(
         embedding_ops.SparsecoreParams(
             feature_specs=self.sparsecore_config.feature_specs,
-            mesh=mesh,
+            mesh=self.mesh,
             data_axes=(sharding_axis_name,),
             embedding_axes=(sharding_axis_name, None),
             sharding_strategy=self.sparsecore_config.sharding_strategy,