Skip to content

Commit a8f1147

Browse files
author
Sharon Yu
committed
print out logical axes
1 parent 1137c42 commit a8f1147

6 files changed

Lines changed: 153 additions & 14 deletions

File tree

src/MaxText/max_utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1032,3 +1032,17 @@ def transformer_engine_context():
10321032
yield
10331033
except (ImportError, AttributeError):
10341034
yield
1035+
1036+
1037+
def print_mesh_axes_info(mesh: jax.sharding.Mesh):
  """Logs every mesh axis and its size on one comma-separated line."""
  if not mesh.shape:
    max_logging.info("Mesh Axes: (Empty Mesh)")
    return

  # mesh.shape maps axis name -> axis size; render as "name: size" pairs.
  axes_summary = ", ".join(f"{name}: {size}" for name, size in mesh.shape.items())
  max_logging.info(f"Mesh Axes: ({axes_summary})")

src/MaxText/maxtext_utils.py

Lines changed: 104 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
import functools
1919
import pickle
2020

21+
from collections import defaultdict
22+
2123
from flax import linen as nn
2224
from flax.linen import partitioning as nn_partitioning
2325
from flax.training import train_state
@@ -26,6 +28,7 @@
2628

2729
from jax.experimental import mesh_utils
2830
from jax.experimental.serialize_executable import deserialize_and_load
31+
from jax.sharding import PartitionSpec as P
2932

3033
import jax
3134
import jax.numpy as jnp
@@ -1204,12 +1207,109 @@ def schedule(step):
12041207
return optax.join_schedules(pieces, boundaries)
12051208

12061209

1207-
def print_state_mesh_shardings_params(
    state, state_sharding, state_logical_annotations, mesh, logical_axis_rules
):
  """Print per-parameter sharding info: path, shape, logical axes, physical spec.

  For every leaf in ``state.params``, logs the parameter path, its abstract
  shape, the logical axis names recovered from ``logical_axis_rules``, and the
  physical ``PartitionSpec`` (with size-1 mesh axes removed).

  Args:
    state: Train state whose ``params`` pytree holds the parameters.
    state_sharding: Pytree (matching ``state.params``) of ``NamedSharding``s.
    state_logical_annotations: Pytree (matching ``state.params``) of logical
      ``PartitionSpec`` annotations.
    mesh: Device mesh used to simplify the physical specs.
    logical_axis_rules: Mapping (dict or iterable of ``(name, axes)`` pairs)
      from a logical axis name to its candidate physical mesh axes.
  """
  if (
      not hasattr(state, "params")
      or not hasattr(state_sharding, "params")
      or not hasattr(state_logical_annotations, "params")
  ):
    max_logging.warning(
        "Warning: 'params' attribute missing in one of the inputs to "
        "print_state_mesh_shardings_params."
    )
    return

  leaves_params, _ = jax.tree_util.tree_flatten_with_path(state.params)
  leaves_sharding, _ = jax.tree_util.tree_flatten_with_path(state_sharding.params)
  leaves_rule_values, _ = jax.tree_util.tree_flatten_with_path(
      state_logical_annotations.params
  )

  if not len(leaves_params) == len(leaves_sharding) == len(leaves_rule_values):
    max_logging.warning(
        "Warning: Parameter tree structure mismatch between state, shardings,"
        " and logical annotations."
    )
    return

  # Build a reverse map (potential physical axes tuple -> list of logical names)
  rule_value_to_semantic = defaultdict(list)
  if logical_axis_rules:
    rules_iter = (
        logical_axis_rules.items()
        if isinstance(logical_axis_rules, dict)
        else logical_axis_rules
    )
    for name, potentials in rules_iter:
      # name: LHS, for example 'embed' or 'activation_batch'.
      # potentials: RHS, for example 'data', 'model', None, ['data', 'model'].
      if potentials is None:
        key = (None,)
      elif isinstance(potentials, (list, tuple)):
        key = tuple(potentials)
      else:
        # Covers strings and any other scalar rule value.
        key = (potentials,)
      rule_value_to_semantic[key].append(name)

  # Defined once, outside the per-parameter loop (it is loop-invariant).
  def get_semantic_names(rule_val_item):
    """Maps one PartitionSpec entry back to the logical name(s) that produce it."""
    if rule_val_item is None:
      key = (None,)
    elif isinstance(rule_val_item, str):
      key = (rule_val_item,)
    elif isinstance(rule_val_item, tuple):
      key = rule_val_item
    else:
      return f"'{str(rule_val_item)}'"

    names = rule_value_to_semantic.get(key)
    if names:
      return "{" + " | ".join(sorted(set(names))) + "}"
    # Show the raw rule value if it is not covered by any rule.
    return f"'{str(key)}'"

  # Header for the entire block.
  max_logging.info("Parameter Path")
  max_logging.info("Shape")
  max_logging.info("Logical Axes")
  max_logging.info("Physical PartitionSpec")
  max_logging.info("-" * 120)

  for (path, leaf_val), (_, leaf_sharding), (_, leaf_rule_value) in zip(
      leaves_params, leaves_sharding, leaves_rule_values
  ):
    path_str = "/".join(str(p.key) for p in path)
    shape = str(jax.typeof(leaf_val))

    # Physical PartitionSpec from NamedSharding, with trivial axes dropped.
    pspec = sharding.remove_size_one_mesh_axis(leaf_sharding.spec, mesh)
    pspec_str = str(tuple(pspec))

    # Logical axes string representation.
    if isinstance(leaf_rule_value, P):
      semantic_parts = [str(get_semantic_names(s)) for s in leaf_rule_value]
      # Fixed label casing: "PartitionSpec", matching the JAX class name.
      semantic_str = "PartitionSpec(" + ", ".join(semantic_parts) + ")"
    elif leaf_rule_value is None:
      semantic_str = "PartitionSpec(None)"
    else:  # Should not be common.
      semantic_str = str(leaf_rule_value)

    # Multi-line logging for each parameter.
    max_logging.info(f"{path_str}")
    max_logging.info(f"{shape}")
    max_logging.info(f"{semantic_str}")
    max_logging.info(f"{pspec_str}")
    max_logging.info("-" * 120)
  print(flush=True)

src/MaxText/train_compile.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ def get_shaped_inputs(topology_mesh, config):
100100
shaped_rng = jax.ShapeDtypeStruct(example_rng.shape, example_rng.dtype)
101101

102102
# Shaped state
103-
abstract_state, _, state_mesh_shardings = maxtext_utils.get_abstract_state(
103+
abstract_state, state_logical_annotations, state_mesh_shardings = maxtext_utils.get_abstract_state(
104104
model, tx, config, example_rng, topology_mesh
105105
)
106106

@@ -109,7 +109,7 @@ def get_shaped_inputs(topology_mesh, config):
109109

110110
shaped_train_args = (abstract_state, shaped_batch, shaped_rng)
111111
shaped_train_kwargs = {}
112-
return shaped_train_args, shaped_train_kwargs, state_mesh_shardings, model
112+
return shaped_train_args, shaped_train_kwargs, state_mesh_shardings, state_logical_annotations, model
113113

114114

115115
def jit_and_compile(
@@ -158,7 +158,13 @@ def is_oom(argv: Sequence[str]) -> bool:
158158
max_utils.print_system_information()
159159

160160
# Get shaped inputs
161-
shaped_train_args, shaped_train_kwargs, state_mesh_shardings, model = get_shaped_inputs(topology_mesh, config)
161+
(
162+
shaped_train_args,
163+
shaped_train_kwargs,
164+
state_mesh_shardings,
165+
_,
166+
model,
167+
) = get_shaped_inputs(topology_mesh, config)
162168

163169
# Get data sharding
164170
data_sharding = sharding.get_input_data_sharding(config, topology_mesh)
@@ -213,7 +219,13 @@ def main(argv: Sequence[str]) -> None:
213219
max_utils.print_system_information()
214220

215221
# Get shaped inputs
216-
shaped_train_args, shaped_train_kwargs, state_mesh_shardings, model = get_shaped_inputs(topology_mesh, config)
222+
(
223+
shaped_train_args,
224+
shaped_train_kwargs,
225+
state_mesh_shardings,
226+
state_logical_annotations,
227+
model,
228+
) = get_shaped_inputs(topology_mesh, config)
217229

218230
# Get data sharding
219231
data_sharding = sharding.get_input_data_sharding(config, topology_mesh)
@@ -227,8 +239,15 @@ def main(argv: Sequence[str]) -> None:
227239

228240
# print weights sharding info under debug sharding mode
229241
if config.debug_sharding:
230-
max_utils.print_non_trivial_mesh_axis(topology_mesh)
231-
maxtext_utils.print_state_mesh_shardings_params(shaped_train_args[0], state_mesh_shardings, topology_mesh)
242+
# max_utils.print_non_trivial_mesh_axis(topology_mesh)
243+
max_utils.print_mesh_axes_info(topology_mesh)
244+
maxtext_utils.print_state_mesh_shardings_params(
245+
shaped_train_args[0],
246+
state_mesh_shardings,
247+
state_logical_annotations,
248+
topology_mesh,
249+
config.logical_axis_rules,
250+
)
232251

233252
# Compile
234253
print("Jitting and compiling train step...", flush=True)

src/MaxText/train_utils.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ def setup_train_loop(config, recorder, devices=None):
206206
eval_data_iterator,
207207
)
208208

209-
state, _, state_mesh_shardings, data_iterator = maxtext_utils.setup_training_state(
209+
state, state_mesh_annotations, state_mesh_shardings, data_iterator = maxtext_utils.setup_training_state(
210210
model, data_iterator, tx, config, init_rng, mesh, checkpoint_manager
211211
)
212212

@@ -218,7 +218,13 @@ def setup_train_loop(config, recorder, devices=None):
218218
# print weights sharding info under debug sharding mode
219219
if config.debug_sharding:
220220
max_utils.print_non_trivial_mesh_axis(model.mesh)
221-
maxtext_utils.print_state_mesh_shardings_params(state, state_mesh_shardings, model.mesh)
221+
maxtext_utils.print_state_mesh_shardings_params(
222+
state,
223+
state_mesh_shardings,
224+
state_mesh_annotations,
225+
model.mesh,
226+
config.logical_axis_rules
227+
)
222228

223229
if config.use_dpo:
224230
abstract_state, _, _ = maxtext_utils.get_abstract_state(model, tx, config, init_rng, mesh, is_training=True)

tests/sharding_compare_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ def test_sharding_dump_for_model(model_name: str, topology: str, num_slice: str)
9797
validate_config(config)
9898

9999
topology_mesh = get_topology_mesh(config)
100-
_, _, state_mesh_shardings, _ = get_shaped_inputs(topology_mesh, config)
100+
_, _, state_mesh_shardings, _, _ = get_shaped_inputs(topology_mesh, config)
101101
actual_json = named_shardings_to_json(state_mesh_shardings)
102102
expected_json = load_named_sharding_json(json_path)
103103

tests/sharding_dump.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,7 @@ def main(argv: Sequence[str]) -> None:
275275

276276
try:
277277
topology_mesh = get_topology_mesh(config)
278-
_, _, state_mesh_shardings, _ = get_shaped_inputs(topology_mesh, config)
278+
_, _, state_mesh_shardings, _, _ = get_shaped_inputs(topology_mesh, config)
279279
except: # pylint: disable=bare-except
280280
state_mesh_shardings = {}
281281

0 commit comments

Comments
 (0)