AI-Hypercomputer
diff --git a/‎docs/guides/optimization/benchmark_and_performance.md‎
Lines changed: 122 additions & 0 deletions b/‎docs/guides/optimization/benchmark_and_performance.md‎
Lines changed: 122 additions & 0 deletions
diff --git a/‎src/maxtext/configs/base.yml‎
Lines changed: 3 additions & 0 deletions b/‎src/maxtext/configs/base.yml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/maxtext/configs/types.py‎
Lines changed: 1 addition & 0 deletions b/‎src/maxtext/configs/types.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/maxtext/trainers/pre_train/train_compile.py‎
Lines changed: 28 additions & 84 deletions b/‎src/maxtext/trainers/pre_train/train_compile.py‎
Lines changed: 28 additions & 84 deletions
@@ -61,6 +61,128 @@ To use a custom policy, set `remat_policy` to `custom` and specify the layers in
 - `device`: The activation remains on the TPU device.
 - `Remat`: Rematerialization is performed during the backward pass.
 
+**Automatic remat policy search with the Estimator**
+
+Finding the optimal remat policy and batch size manually can be time-consuming. MaxText provides an **Estimator** tool (`estimator.py`) that automates this search using [Ahead-of-Time (AOT) compilation](../monitoring_and_debugging/features_and_diagnostics.md#ahead-of-time-compilation-aot). It leverages `train_compile` to test whether a given configuration causes an Out-Of-Memory (OOM) error *without* requiring the target hardware.
+
+The estimator supports two modes:
+
+1. **Search both batch size and remat policy** (when `per_device_batch_size` is *not* provided): It finds the Pareto frontier of batch size vs. remat policy by iterating through policies from full remat (every tensor is rematerialized) to full device (every tensor stays on the device) and, for each policy, using binary search to find the largest batch size that does not OOM.
+2. **Search remat policy only** (when `per_device_batch_size` *is* provided): It finds the least aggressive (fastest) remat policy that fits in memory for the given fixed batch size.
+
+*Mode 1 example: Search both batch size and remat policy (Llama 3.1 405B on tpu7x-1024)*
+
+```bash
+python -m maxtext.utils.estimator \
+  maxtext/configs/base.yml \
+  compile_topology=tpu7x-1024 \
+  compile_topology_num_slices=1 \
+  model_name=llama3.1-405b \
+  max_target_length=32768 \
+  ici_context_parallelism=8 \
+  ici_fsdp_parallelism=-1 \
+  log_config=False \
+  write_estimator_result=False
+```
+
+*Mode 2 example: Search for the best remat policy for a fixed batch size (DeepSeek3 671B on v5p-1024)*
+
+```bash
+python3 -m maxtext.utils.estimator maxtext/configs/base.yml \
+  model_name=deepseek3-671b \
+  compile_topology=v5p-1024 \
+  compile_topology_num_slices=1 \
+  ici_fsdp_parallelism=512 \
+  per_device_batch_size=2.0 \
+  dtype=bfloat16 \
+  weight_dtype=float32 \
+  max_target_length=8192 \
+  log_config=False \
+  write_estimator_result=False \
+  decoder_layer_input=offload
+```
+
+Key options:
+
+- `write_estimator_result=True`: Writes runnable training commands to `remat_commands_from_estimator.txt`.
+- `write_estimator_result=False` (default): Prints results to stdout only.
+- You can pin specific tensor remat actions (e.g., `context=offload`) to constrain the search space.
+
+*Advanced example: Search remat policy with XLA tuning flags (DeepSeek3 671B on tpu7x-512)*
+
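+For production workloads, you often want to combine the estimator with XLA compiler tuning flags for SparseCore offloading, latency-hiding scheduling, and other optimizations. Set these via `LIBTPU_INIT_ARGS` before invoking the estimator (in the command below, `${WORKLOAD_NAME}` and `${BASE_OUTPUT_DIR}` are placeholders for your own run name and output directory):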
+
+```bash
+export LIBTPU_INIT_ARGS=" \
+  --xla_tpu_dvfs_p_state=7 \
+  --xla_tpu_scoped_vmem_limit_kib=65536 \
+  --xla_tpu_bf16_emission_mode=NATIVE_EMISSION \
+  --xla_tpu_enable_sparse_core_reduce_scatter_v2=true \
+  --xla_tpu_enable_sparse_core_collective_offload_all_gather=true \
+  --xla_tpu_enable_sparse_core_collective_offload_2d_all_gather=true \
+  --xla_tpu_enable_all_gather_offload_tracing=true \
+  --xla_tpu_use_tc_device_shape_on_sc=True \
+  --xla_sc_disable_megacore_partitioning=True \
+  --xla_tpu_enable_async_collective_fusion_fuse_all_gather=false \
+  --xla_enable_async_all_gather=true \
+  --xla_tpu_prefer_async_allgather_to_allreduce=true \
+  --xla_tpu_enable_sparse_core_collective_offload_all_reduce=true \
+  --xla_tpu_enable_sparse_core_collective_offload_reduce_scatter=true \
+  --xla_tpu_enable_sparse_core_collective_offload_3d_all_gather=true \
+  --xla_tpu_use_single_sparse_core_for_all_gather_offload=true \
+  --xla_tpu_enable_concurrent_sparse_core_offloading=true \
+  --xla_tpu_aggressive_opt_barrier_removal=true \
+  --xla_tpu_enable_offloading_gather_to_sparsecore=true \
+  --xla_tpu_sparse_core_all_gather_latency_multiplier=1 \
+  --xla_tpu_sparse_core_reduce_scatter_latency_multiplier=3 \
+  --xla_tpu_enable_sparse_core_collective_aggregator=true \
+  --xla_tpu_enable_latency_hiding_layer_scheduler=true \
+  --xla_tpu_scheduler_percent_shared_memory_limit=150 \
+  --xla_tpu_enable_layer_scheduler_for_dependent_collectives=true \
+  --xla_tpu_enable_sparse_core_collective_offload_nd_reduce_scatter=true \
+  --xla_tpu_pcie_bandwidth_multiplier=0.03 \
+  --xla_tpu_enable_sparse_core_offload_queuing_in_lhs=true \
+  --xla_tpu_enable_multi_compute_overlap_in_layer_scheduler=false \
+  --xla_tpu_enable_3d_reduce_scatter_decomposer=false "
+
+python3 -m maxtext.utils.estimator maxtext/configs/base.yml \
+  compile_topology=tpu7x-512 \
+  compile_topology_num_slices=1 \
+  run_name=${WORKLOAD_NAME} \
+  skip_jax_distributed_system=true \
+  dtype=bfloat16 \
+  per_device_batch_size=4.0 \
+  model_name=deepseek3-671b \
+  remat_policy=custom \
+  decoder_layer_input=device \
+  mu_dtype=bfloat16 \
+  grad_dtype=bfloat16 \
+  ici_fsdp_parallelism=128 \
+  ici_expert_parallelism=4 \
+  dataset_type=synthetic \
+  dataset_path=gs://max-datasets-rogue \
+  opt_type=adamw \
+  steps=20 \
+  sa_use_fused_bwd_kernel=true \
+  use_max_logit_estimate=-1 \
+  cost_estimate_flops_fwd=5000000000000 \
+  cost_estimate_flops_bwd=5000000000000 \
+  float32_weight_sum=False \
+  megablox=true \
+  sparse_matmul=true \
+  use_tokamax_gmm=false \
+  use_tokamax_splash=true \
+  max_target_length=4096 \
+  use_random_routing=true \
+  use_ring_of_experts=true \
+  use_ragged_sort=true \
+  tokenizer_path=assets/tokenizer.mistral-v3 \
+  base_output_directory=${BASE_OUTPUT_DIR} \
+  merge_gating_gmm=false
+```
+
+This example fixes `per_device_batch_size=4.0` so the estimator runs in **Mode 2** (policy-only search), finding the least aggressive remat policy that fits the DeepSeek3 671B model on a tpu7x-512 pod. The XLA flags enable SparseCore collective offloading and latency-hiding scheduling; because these change the memory layout of the compiled program, they also move the OOM boundary that the estimator is searching for.
+
 ### Low precision training
 
 MaxText supports quantization via QWIX. To enable this, set `use_qwix_quantization=true`.
 
@@ -948,6 +948,9 @@ compiled_trainstep_file: "" # Name of saved serialized compiled train_step, e.g.
 compile_topology: '' # Target hardware version, e.g. 'v5e-256'
 compile_topology_num_slices: -1 # Number of target slices, set to a positive integer.
 
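+# MaxText Estimator configs: if True, estimator.py writes runnable training commands to remat_commands_from_estimator.txt; if False, results are only printed to stdout.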
+write_estimator_result: False
+
 decode_sampling_strategy: "greedy" # decode_sampling_strategy should be one of greedy, weighted, nucleus, topk, or composite(top_k -> top_p -> weighted temperature)
 decode_sampling_nucleus_p: -1 # set if you're doing nucleus / top-p
 decode_sampling_top_k: 0 # set if you're doing top-k
 
@@ -1674,6 +1674,7 @@ class AOT(BaseModel):
   compiled_trainstep_file: PathStr = Field("", description="Name of saved serialized compiled train_step.")
   compile_topology: str = Field("", description="Target hardware version, e.g. 'v5e-256'.")
   compile_topology_num_slices: int = Field(-1, description="Number of target slices.")
+  write_estimator_result: bool = Field(False, description="Write estimator.py results in a separate file.")
 
 
 class DevelopmentAndDebugging(BaseModel):
 
@@ -61,12 +61,9 @@
 def validate_config(config):
   """Validates that the config is set up correctly to compile, returning a useful error message if not."""
   assert config.compile_topology != "", (
-      "You must pass your desired target hardware in compile_topology, e.g."
-      " compile_topology=v5e-256"
+      "You must pass your desired target hardware in compile_topology, e.g." " compile_topology=v5e-256"
   )
-  assert (
-      config.compile_topology_num_slices > 0
-  ), "You must set compile_topology_num_slices to a positive integer"
+  assert config.compile_topology_num_slices > 0, "You must set compile_topology_num_slices to a positive integer"
 
 
 def get_topology_mesh(config):
@@ -78,18 +75,12 @@ def get_topology_mesh(config):
         num_slices=config.compile_topology_num_slices,
     ).devices
   else:
-    target_hardware = accelerator_to_spec_map.get_system_characteristics(
-        config.compile_topology
-    )
+    target_hardware = accelerator_to_spec_map.get_system_characteristics(config.compile_topology)
     if target_hardware.platform == "gpu":
       # Disable sharded autotuning. This is an optimization to distribute
       # autotuning across the fleet, but can cause hangs with AoT compilation.
-      os.environ["XLA_FLAGS"] = (
-          os.environ.get("XLA_FLAGS", "") + " --xla_gpu_shard_autotuning=false"
-      )
-      jax.config.update(
-          "mock_num_gpu_processes", config.compile_topology_num_slices
-      )
+      os.environ["XLA_FLAGS"] = os.environ.get("XLA_FLAGS", "") + " --xla_gpu_shard_autotuning=false"
+      jax.config.update("mock_num_gpu_processes", config.compile_topology_num_slices)
       topology_devices = jax.devices()
     else:
       topology_devices = get_topology_desc(
@@ -104,14 +95,8 @@ def get_topology_mesh(config):
       "jax_remove_size_one_mesh_axis_from_type",
       config.remove_size_one_mesh_axis_from_type,
   )
-  topology_device_mesh = maxtext_utils.create_device_mesh(
-      config, topology_devices
-  )
-  mesh_axis_type = (
-      AxisType.Explicit
-      if config.shard_mode == ShardMode.EXPLICIT
-      else AxisType.Auto
-  )
+  topology_device_mesh = maxtext_utils.create_device_mesh(config, topology_devices)
+  mesh_axis_type = AxisType.Explicit if config.shard_mode == ShardMode.EXPLICIT else AxisType.Auto
   topology_mesh = Mesh(
       topology_device_mesh,
       config.mesh_axes,
@@ -129,9 +114,7 @@ def _collect_nnx_activation_shardings(create_model_fn, config, mesh):
   input_shape = (config.micro_batch_size_to_train_on, config.max_target_length)
   abstract_input = jax.ShapeDtypeStruct(input_shape, jnp.int32)
 
-  def _nnx_forward(
-      decoder_input_tokens, decoder_positions, decoder_segment_ids
-  ):
+  def _nnx_forward(decoder_input_tokens, decoder_positions, decoder_segment_ids):
     model_instance = create_model_fn()
     return model_instance(
         decoder_input_tokens=decoder_input_tokens,
@@ -140,9 +123,7 @@ def _nnx_forward(
         enable_dropout=False,
     )
 
-  with jax.set_mesh(mesh), nn_partitioning.axis_rules(
-      config.logical_axis_rules
-  ):
+  with jax.set_mesh(mesh), nn_partitioning.axis_rules(config.logical_axis_rules):
     jax.eval_shape(_nnx_forward, abstract_input, abstract_input, abstract_input)
 
 
@@ -151,13 +132,9 @@ def get_shaped_inputs(topology_mesh, config):
   # Construct the model and optimizer to get shaped versions of the state
   quant = quantizations.configure_quantization(config)
   if config.pure_nnx:
-    _create_model_partial, model = (
-        model_creation_utils.create_nnx_abstract_model(config, topology_mesh)
-    )
+    _create_model_partial, model = model_creation_utils.create_nnx_abstract_model(config, topology_mesh)
   else:
-    model = Transformer(
-        config, topology_mesh, quant=quant, model_mode=MODEL_MODE_TRAIN
-    )
+    model = Transformer(config, topology_mesh, quant=quant, model_mode=MODEL_MODE_TRAIN)
   # The learning_rate_schedule is baked into the compiled object.
   learning_rate_schedule = maxtext_utils.create_learning_rate_schedule(config)
   # pass in model for muon
@@ -176,20 +153,14 @@ def create_train_state_fn():
 
     init_state_fn = create_train_state_fn
   else:
-    init_state_fn = functools.partial(
-        maxtext_utils.init_initial_state, model, tx, config, True, example_rng
-    )
+    init_state_fn = functools.partial(maxtext_utils.init_initial_state, model, tx, config, True, example_rng)
 
   # Shaped state
-  abstract_state, _, state_mesh_shardings = maxtext_utils.get_abstract_state(
-      config, topology_mesh, init_state_fn, True
-  )
+  abstract_state, _, state_mesh_shardings = maxtext_utils.get_abstract_state(config, topology_mesh, init_state_fn, True)
 
   if config.pure_nnx:
     # NNX doesn't use Linen logical annotations; derive PartitionSpecs from the physical shardings.
-    logical_annotations = maxtext_utils_nnx.get_partition_spec_nnx(
-        state_mesh_shardings
-    )
+    logical_annotations = maxtext_utils_nnx.get_partition_spec_nnx(state_mesh_shardings)
     # For NNX, get_functional_train_with_signature expects the graphdef (static structure),
     # not the raw model — mirroring how the training loop does nnx.split(train_state).
     with nn_partitioning.axis_rules(config.logical_axis_rules):
@@ -198,9 +169,7 @@ def create_train_state_fn():
     model = graphdef
   else:
     # unsharded logical annotations
-    logical_annotations = maxtext_utils.get_logical_annotations(
-        config, topology_mesh, init_state_fn
-    )
+    logical_annotations = maxtext_utils.get_logical_annotations(config, topology_mesh, init_state_fn)
 
   # Shaped batch
   shaped_batch = maxtext_utils.get_shaped_batch(config)
@@ -217,9 +186,7 @@ def create_train_state_fn():
   # Collect NNX activation shardings via an abstract forward pass (must run
   # after get_abstract_state, which only traces __init__).
   if config.debug_sharding and config.pure_nnx:
-    _collect_nnx_activation_shardings(
-        _create_model_partial, config, topology_mesh
-    )
+    _collect_nnx_activation_shardings(_create_model_partial, config, topology_mesh)
 
   return (
       shaped_train_args,
@@ -256,9 +223,7 @@ def jit_and_compile(
     maxtext_utils.maybe_dump_jaxpr(config, jitted, func_input_args)
     lowered = jitted.lower(*func_input_args, **func_input_kwargs)
   # Import libtpu flags as compiler options. Defaults to empty dict if string is empty.
-  compiler_options = max_utils.parse_libtpu_flags_to_dict(
-      config.compile_xla_flags
-  )
+  compiler_options = max_utils.parse_libtpu_flags_to_dict(config.compile_xla_flags)
   compiled = lowered.compile(compiler_options=compiler_options)
   return compiled
 
@@ -293,18 +258,12 @@ def is_oom(argv: Sequence[str]) -> bool:
   ) = get_shaped_inputs(topology_mesh, config)
 
   # Update params_shardings when shard_optimizer_over_data is enabled (Zero-1)
-  params_shardings, state_mesh_shardings = (
-      sharding.maybe_update_params_sharding_with_opt(
-          config, state_mesh_shardings
-      )
-  )
+  params_shardings, state_mesh_shardings = sharding.maybe_update_params_sharding_with_opt(config, state_mesh_shardings)
 
   # When ZeRO-1 is enabled, we need to use the original params_shardings for input shardings
   # but keep the updated state_mesh_shardings for the optimizer state
   if config.shard_optimizer_over_data:
-    input_state_mesh_shardings = state_mesh_shardings.replace(
-        params=params_shardings
-    )
+    input_state_mesh_shardings = state_mesh_shardings.replace(params=params_shardings)
   else:
     input_state_mesh_shardings = state_mesh_shardings
 
@@ -344,6 +303,7 @@ def is_oom(argv: Sequence[str]) -> bool:
   except Exception as e:
     # return true if OOM error happens
     # OOM error looks like
+    # Check failed: entries[i] <= std::numeric_limits<uint32_t>::max()
     # jax.errors.JaxRuntimeError: RESOURCE_EXHAUSTED: Allocation ...
     # jax.errors.JaxRuntimeError: INTERNAL: RET_CHECK failure ...
     message = str(e).lower()
@@ -355,8 +315,7 @@ def is_oom(argv: Sequence[str]) -> bool:
 def main(argv: Sequence[str]) -> None:
   jax.config.update("jax_default_prng_impl", "unsafe_rbg")
   os.environ["LIBTPU_INIT_ARGS"] = (
-      os.environ.get("LIBTPU_INIT_ARGS", "")
-      + " --xla_tpu_spmd_rng_bit_generator_unsafe=true"
+      os.environ.get("LIBTPU_INIT_ARGS", "") + " --xla_tpu_spmd_rng_bit_generator_unsafe=true"
   )
   print("Starting train_compile.py...", flush=True)
 
@@ -381,18 +340,12 @@ def main(argv: Sequence[str]) -> None:
   ) = get_shaped_inputs(topology_mesh, config)
 
   # Update params_shardings when shard_optimizer_over_data is enabled (Zero-1)
-  params_shardings, state_mesh_shardings = (
-      sharding.maybe_update_params_sharding_with_opt(
-          config, state_mesh_shardings
-      )
-  )
+  params_shardings, state_mesh_shardings = sharding.maybe_update_params_sharding_with_opt(config, state_mesh_shardings)
 
   # When ZeRO-1 is enabled, we need to use the original params_shardings for input shardings
   # but keep the updated state_mesh_shardings for the optimizer state
   if config.shard_optimizer_over_data:
-    input_state_mesh_shardings = state_mesh_shardings.replace(
-        params=params_shardings
-    )
+    input_state_mesh_shardings = state_mesh_shardings.replace(params=params_shardings)
   else:
     input_state_mesh_shardings = state_mesh_shardings
 
@@ -401,21 +354,15 @@ def main(argv: Sequence[str]) -> None:
   if config.enable_diloco:
     # Build abstract DiLoCo state and shardings for AOT compilation
     abstract_state = shaped_train_args[0]
-    diloco_state, state_mesh_shardings, inner_state_shardings = (
-        diloco.build_abstract_diloco_state(
-            config, abstract_state, state_mesh_shardings, topology_mesh
-        )
+    diloco_state, state_mesh_shardings, inner_state_shardings = diloco.build_abstract_diloco_state(
+        config, abstract_state, state_mesh_shardings, topology_mesh
     )
     # For NNX, shaped_train_args has 2 elements (state, batch) — no rng; pass None for prng.
-    shaped_rng_arg = (
-        shaped_train_args[2] if len(shaped_train_args) > 2 else None
-    )
+    shaped_rng_arg = shaped_train_args[2] if len(shaped_train_args) > 2 else None
     shaped_train_args = (diloco_state, shaped_train_args[1], shaped_rng_arg)
 
     # Wrap train_step with diloco
-    train_step_partial = functools.partial(
-        train.train_step, model, config, inner_state_shardings, params_shardings
-    )
+    train_step_partial = functools.partial(train.train_step, model, config, inner_state_shardings, params_shardings)
     train_step_fn = diloco.build_diloco_train_step(config, train_step_partial)
 
     # For DiLoCo, the train_step_fn is already fully wrapped and takes (state, batch, prng)
@@ -480,10 +427,7 @@ def main(argv: Sequence[str]) -> None:
   if config.compiled_trainstep_file != "":
     print("Saving compiled object...")
     save_compiled(compiled, config.compiled_trainstep_file)
-    print(
-        "Successfully saved compiled object as"
-        f" {config.compiled_trainstep_file}"
-    )
+    print("Successfully saved compiled object as" f" {config.compiled_trainstep_file}")
   print("Finished train_compile.py successfully!", flush=True)
   print(f"Cost analysis: {compiled.cost_analysis()}")
   print(f"Memory analysis: {compiled.memory_analysis()}")