Skip to content

Commit a8ecbbf

Browse files
Merge pull request #3673 from AI-Hypercomputer:aireen/elastic_data
PiperOrigin-RevId: 907833470
2 parents 58ffd43 + e48f295 commit a8ecbbf

8 files changed

Lines changed: 404 additions & 306 deletions

File tree

src/maxtext/common/checkpointing.py

Lines changed: 63 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
import jax
2525
from maxtext.utils.globals import DEFAULT_OCDBT_TARGET_DATA_FILE_SIZE
2626
from maxtext.input_pipeline.multihost_dataloading import MultiHostDataLoadIterator
27-
from maxtext.input_pipeline.multihost_dataloading import RemoteIterator
27+
from maxtext.input_pipeline.multihost_dataloading import RemoteIteratorWrapper
2828
from maxtext.input_pipeline.synthetic_data_processing import PlaceHolderDataIterator
2929
from maxtext.utils import exceptions
3030
from maxtext.utils import max_logging
@@ -44,6 +44,7 @@
4444

4545
import grain
4646
from grain.python import PyGrainCheckpointHandler
47+
from grain.experimental import ElasticIterator
4748

4849
CheckpointManager = ocp.CheckpointManager
4950
CheckpointManagerOptions = ocp.CheckpointManagerOptions
@@ -69,6 +70,22 @@ def save(
6970
"""Saves the given iterator to the checkpoint in `directory`."""
7071
item = item or args.item # pytype:disable=attribute-error
7172

73+
# RemoteIteratorWrapper handles checkpointing via colocated python
74+
if isinstance(item, RemoteIteratorWrapper):
75+
step = int(directory.parent.name)
76+
item.save_state(step)
77+
return
78+
79+
# ElasticIterator state is a single global scalar shared by all shards,
80+
# so we write one fixed `process_0.json` from process 0 only. This file
81+
# layout survives changes in `jax.process_count()`.
82+
if isinstance(item, ElasticIterator):
83+
if jax.process_index() == 0:
84+
directory.mkdir(parents=True, exist_ok=True)
85+
filename = directory / "process_0.json"
86+
filename.write_text(json.dumps(item.get_state(), indent=4))
87+
return
88+
7289
def save_single_process(item, process_index, process_count):
7390
filename = directory / f"process_{process_index}-of-{process_count}.json"
7491
if isinstance(item, grain.DatasetIterator):
@@ -95,6 +112,21 @@ def restore(
95112
process_index = getattr(args, "process_index", None)
96113
process_count = getattr(args, "process_count", None)
97114

115+
# In Pathways + colocated_python environment, RemoteIteratorWrapper handles checkpointing
116+
if isinstance(item, RemoteIteratorWrapper):
117+
step = int(directory.parent.name)
118+
item.restore_state(step)
119+
return item
120+
121+
# McJax and Pathways through controller cases
122+
# ElasticIterator: every process reads the same shared `process_0.json`.
123+
if isinstance(item, ElasticIterator):
124+
filename = directory / "process_0.json"
125+
if not filename.exists():
126+
raise ValueError(f"File {filename} does not exist.")
127+
item.set_state(json.loads(filename.read_text()))
128+
return item
129+
98130
def restore_single_process(item, process_index, process_count):
99131
filename = directory / f"process_{process_index}-of-{process_count}.json"
100132
if not filename.exists():
@@ -132,15 +164,6 @@ class GrainCheckpointRestore(ocp.args.CheckpointArgs):
132164
process_count: Optional[int] = None
133165

134166

135-
def _is_remote_iterator(data_iterator):
136-
"""Check if data_iterator is a RemoteIterator or contains RemoteIterator instances."""
137-
if isinstance(data_iterator, RemoteIterator):
138-
return True
139-
if isinstance(data_iterator, list):
140-
return any(isinstance(item, RemoteIterator) for item in data_iterator)
141-
return False
142-
143-
144167
def _load_full_state_from_path(
145168
path,
146169
abstract_unboxed_pre_state,
@@ -482,6 +505,17 @@ def _restore_grain_iterator(
482505
This function dispatches to the correct restore strategy based on
483506
the number of stored checkpoint files vs. current JAX processes.
484507
"""
508+
if isinstance(data_iterator, RemoteIteratorWrapper):
509+
grain_restore_args = GrainCheckpointRestore(item=data_iterator)
510+
restored_state = checkpoint_manager.restore(step, args=Composite(items=checkpoint_args, iter=grain_restore_args))
511+
return (restored_state, None)
512+
513+
# ElasticIterator: one shared `process_0.json` regardless of shard count.
514+
if not isinstance(data_iterator, list) and isinstance(data_iterator.local_iterator, ElasticIterator):
515+
grain_restore_args = GrainCheckpointRestore(item=data_iterator.local_iterator)
516+
restored_state = checkpoint_manager.restore(step, args=Composite(items=checkpoint_args, iter=grain_restore_args))
517+
return (restored_state, None)
518+
485519
directory = checkpoint_manager.directory / str(step) / "iter"
486520
process_count_jax = jax.process_count()
487521

@@ -625,7 +659,7 @@ def map_to_pspec(data):
625659
None,
626660
)
627661
# Case 2: Matches if dataset type is "grain" and the data iterator is not a
628-
# PlaceHolderDataIterator or RemoteIterator and a specific checkpoint file exists for the iterator
662+
# PlaceHolderDataIterator and a specific checkpoint file exists for the iterator
629663
case (
630664
checkpoint_manager,
631665
dataset_type,
@@ -634,7 +668,6 @@ def map_to_pspec(data):
634668
dataset_type == "grain"
635669
and data_iterator
636670
and not isinstance(data_iterator, PlaceHolderDataIterator)
637-
and not _is_remote_iterator(data_iterator)
638671
and (checkpoint_manager.directory / str(step) / "iter").exists()
639672
):
640673
return _restore_grain_iterator(
@@ -810,22 +843,24 @@ def save_checkpoint(checkpoint_manager, step, state, config=None, data_iterator=
810843
)
811844
save_args_composite = {"items": checkpoint_args}
812845

813-
if (
814-
config
815-
and config.dataset_type == "grain"
816-
and not isinstance(data_iterator, PlaceHolderDataIterator)
817-
and not _is_remote_iterator(data_iterator)
818-
):
819-
if not isinstance(data_iterator, list):
820-
data_iterator = [data_iterator]
821-
grain_iters_to_save = []
822-
process_count_total = jax.process_count() * len(data_iterator)
823-
if config.expansion_factor_real_data > 1:
824-
process_count_total = process_count_total // config.expansion_factor_real_data
825-
for i, data_iter in enumerate(data_iterator):
826-
process_index = jax.process_index() + i * jax.process_count()
827-
grain_iters_to_save.append((data_iter.local_iterator, process_index, process_count_total))
828-
save_args_composite["iter"] = GrainCheckpointSave(item=grain_iters_to_save)
846+
if config and config.dataset_type == "grain" and not isinstance(data_iterator, PlaceHolderDataIterator):
847+
if isinstance(data_iterator, RemoteIteratorWrapper):
848+
# Pass the wrapper directly; GrainCheckpointHandler will call save_state with the step
849+
save_args_composite["iter"] = GrainCheckpointSave(item=data_iterator)
850+
elif not isinstance(data_iterator, list) and isinstance(data_iterator.local_iterator, ElasticIterator):
851+
# ElasticIterator checkpoints a single global scalar shared by all shards.
852+
save_args_composite["iter"] = GrainCheckpointSave(item=data_iterator.local_iterator)
853+
else:
854+
if not isinstance(data_iterator, list):
855+
data_iterator = [data_iterator]
856+
grain_iters_to_save = []
857+
process_count_total = jax.process_count() * len(data_iterator)
858+
if config.expansion_factor_real_data > 1:
859+
process_count_total = process_count_total // config.expansion_factor_real_data
860+
for i, data_iter in enumerate(data_iterator):
861+
process_index = jax.process_index() + i * jax.process_count()
862+
grain_iters_to_save.append((data_iter.local_iterator, process_index, process_count_total))
863+
save_args_composite["iter"] = GrainCheckpointSave(item=grain_iters_to_save)
829864

830865
match (checkpoint_manager, config, data_iterator):
831866
case (checkpoint_manager, _, _) if isinstance(

src/maxtext/configs/base.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -729,6 +729,7 @@ grain_num_threads_eval: 16
729729
grain_prefetch_buffer_size_eval: 500
730730
grain_data_source_max_workers: 16 # Max workers for ThreadPoolExecutor when mixing multiple Grain data sources.
731731
grain_shuffle_buffer_size: 100 # shuffle buffer when using sequential access formats such as Parquet, TFRecord.
732+
grain_use_elastic_iterator: False # For elastic training, set this to True and set packing=False
732733
# for using pathways
733734
colocated_python_data_input: False # experimental feature, under testing
734735

src/maxtext/configs/post_train/dpo.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
base_config: "base.yml"
22

33
use_dpo: true
4+
packing: false
45
train_data_columns: ['chosen', 'rejected']
56
eval_data_columns: ['chosen', 'rejected']
67
base_output_directory: 'gs://maxtext-external/logs'

src/maxtext/configs/types.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1124,6 +1124,13 @@ class GrainDataset(BaseModel):
11241124
grain_file_type: str = Field(
11251125
"arrayrecord", description="File type for Grain data. Supported: arrayrecord, tfrecord, parquet."
11261126
)
1127+
grain_use_elastic_iterator: bool = Field(
1128+
False,
1129+
description=(
1130+
"Whether to use grain's `ElasticIterator` for data loading. When True, the iterator "
1131+
"checkpoint can be restored after a change in the number of data-loading shards."
1132+
),
1133+
)
11271134
grain_worker_count: int = Field(1, description="Number of workers for Grain data loading.")
11281135
grain_per_worker_buffer_size: int = Field(1, description="Per-worker buffer size for Grain train data loading.")
11291136
grain_worker_count_eval: int = Field(1, description="Number of workers for Grain eval data loading.")
@@ -2630,6 +2637,31 @@ def calculate_global_batch_sizes(per_device_batch_size, expansion_factor, num_de
26302637
raise ValueError("At most one of `load_parameters_path` or `load_full_state_path` should be set.")
26312638
if self.elastic_enabled and not self.enable_single_controller:
26322639
raise ValueError("Elastic training is only supported with Pathways (`enable_single_controller=True`).")
2640+
if self.grain_use_elastic_iterator and self.grain_file_type != "arrayrecord":
2641+
raise ValueError(
2642+
"`grain_use_elastic_iterator=True` only supports `grain_file_type=arrayrecord`. "
2643+
"tfrecord and parquet pipelines use `InterleaveIterDataset` (a many-to-one "
2644+
"IterDataset transform), which `ElasticIterator` forbids. "
2645+
f"Got grain_file_type={self.grain_file_type}."
2646+
)
2647+
if self.grain_use_elastic_iterator and self.packing:
2648+
raise ValueError("`grain_use_elastic_iterator=True` requires `packing=False`.")
2649+
if self.use_dpo and self.packing:
2650+
raise ValueError("DPO does not support packing. Set `packing=False`.")
2651+
if self.grain_use_elastic_iterator and not self.use_truncation:
2652+
raise ValueError(
2653+
"`grain_use_elastic_iterator=True` requires `use_truncation=True`. "
2654+
"`TokenizeAndChunk` uses `apply`, which produces a many-to-one "
2655+
"IterDataset transform that `ElasticIterator` forbids."
2656+
)
2657+
if self.grain_use_elastic_iterator and (
2658+
self.grain_train_mixture_config_path or ";" in (self.grain_train_files or "")
2659+
):
2660+
raise ValueError(
2661+
"`grain_use_elastic_iterator=True` does not support dataset mixtures. "
2662+
"Set `grain_train_mixture_config_path` to empty and use a single "
2663+
"`grain_train_files` pattern (no ';' separator)."
2664+
)
26332665
if (self.load_parameters_path or self.load_full_state_path) and not self.enable_checkpointing:
26342666
raise ValueError("You must set enable_checkpointing=True to load a checkpoint.")
26352667
if self.enable_multi_tier_checkpointing:

src/maxtext/input_pipeline/data_processing_utils.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,16 @@ def get_local_batch_size(config):
7878
return batch_size
7979

8080

81-
def format_and_batch(dataset, config, batch_size, pad_id, data_columns, tokenizer_model):
82-
"""Packs or pads the dataset according to config and batches it."""
81+
def format_and_batch(dataset, config, batch_size, pad_id, data_columns, tokenizer_model, shift=True):
82+
"""Packs or pads the dataset, batches it, and optionally shifts tokens for next-token prediction.
83+
84+
When `config.grain_use_elastic_iterator` is True, batching is skipped
85+
(ElasticIterator performs it internally) and, if `shift=True`, the shift is
86+
applied pre-batch on axis 0, which is equivalent to a post-batch axis=1 shift.
87+
88+
`shift` should be False for pipelines that don't do next-token prediction
89+
(e.g. DPO, which scores full sequences).
90+
"""
8391
if config.packing:
8492
length_struct = {col: config.max_target_length for col in data_columns}
8593
max_segments = config.max_segments_per_seq
@@ -117,23 +125,24 @@ def format_and_batch(dataset, config, batch_size, pad_id, data_columns, tokenize
117125
else:
118126
dataset = dataset.map(input_pipeline_utils.PadOrTrimToMaxLength(config.max_target_length, pad_id))
119127

128+
if config.grain_use_elastic_iterator:
129+
# ElasticIterator batches internally, so return the pre-batch dataset.
130+
if shift:
131+
dataset = dataset.map(input_pipeline_utils.ShiftData(ignored_ids=[pad_id], axis=0))
132+
return dataset
133+
120134
batch_fn = functools.partial(grain.experimental.batch_and_pad, batch_size=batch_size, pad_value=pad_id)
121135
dataset = dataset.batch(batch_size, batch_fn=batch_fn)
136+
if shift:
137+
dataset = dataset.map(input_pipeline_utils.ShiftData(ignored_ids=[pad_id], axis=1))
122138
return dataset
123139

124140

125-
def shift_dataset(dataset, pad_id):
126-
"""Shift tokens to create inputs and targets for standard next-token prediction."""
127-
return dataset.map(
128-
input_pipeline_utils.ShiftData(
129-
ignored_ids=[pad_id],
130-
axis=1,
131-
)
132-
)
133-
134-
135141
def apply_multiprocessing_and_prefetch(dataset, config, grain_worker_count, grain_per_worker_buffer_size):
136142
"""Applies multiprocessing and prefetching configurations to the dataset."""
143+
if config.grain_use_elastic_iterator:
144+
# ElasticIterator applies multiprocessing itself.
145+
return dataset
137146
multiprocessing_options = (
138147
pick_performance_config(
139148
ds=dataset,

0 commit comments

Comments
 (0)