fix: raise RuntimeError when completed checkpoint steps >= config.steps

Dr-Left · Dr-Left · commit 66ddc482ad08 · 2026-06-01T22:56:31.000Z
When a user sets steps=x and the latest checkpoint already covers x or more
completed steps (i.e. it was saved at step x-1 or later), the job should fail
with a clear error message instead of performing no computation or failing
with a confusing profiling error.

We add two checks so the job fails fast: an early check in setup_train_loop
(train_utils.py), which runs before loading the checkpoint/initializing TPU,
and a fallback check in train_loop (train.py), which runs before the expensive
TPU compilation step. Both checks use a shared validation helper,
validate_completed_steps. Unit tests are added to verify the validation logic.

TAG=agy
CONV=88c01cb5-28b2-4b67-8895-4a290d332d3f
diff --git a/src/maxtext/trainers/pre_train/train.py b/src/maxtext/trainers/pre_train/train.py
@@ -639,6 +639,9 @@ def train_loop(config, recorder, state=None):
       state,
   ) = train_utils.setup_train_loop(config, recorder)
 
+  start_step = get_first_step(model, state)  # this is the start_step for training
+  train_utils.validate_completed_steps(start_step, config.steps)
+
   if isinstance(model, nn.Module):
     if config.use_dpo:
       if "reference_params" not in state.params:
@@ -682,8 +685,6 @@ def train_loop(config, recorder, state=None):
       compiled = p_train_step.lower(*lower_args).compile(compiler_options=compiler_options)
       compiled_stats = compiled.memory_analysis()
       max_utils.print_compiled_memory_stats(compiled_stats)
-
-  start_step = get_first_step(model, state)  # this is the start_step for training
   prof = profiler.Profiler(config, offset_step=start_step)
   metric_logger_instance = metric_logger.MetricLogger(config=config, learning_rate_schedule=learning_rate_schedule)
 
diff --git a/src/maxtext/utils/train_utils.py b/src/maxtext/utils/train_utils.py
@@ -240,6 +240,11 @@ def create_train_state_fn():
     else:
       init_state_fn = partial(maxtext_utils.init_initial_state, model, tx, config, is_training, init_rng)
     checkpoint_manager = create_checkpoint_manager(config, mesh, init_state_fn)
+    if checkpoint_manager is not None:
+      checkpoint_step = checkpoint_manager.latest_step()
+      if checkpoint_step is not None:
+        validate_completed_steps(checkpoint_step + 1, config.steps)
+
 
   with maybe_record_goodput(recorder, GoodputEvent.TRAINING_PREPARATION):
     data_iterator, eval_data_iterator = create_data_iterator(config, mesh)
@@ -405,3 +410,15 @@ def validate_train_config(config):
         "WARNING: Sequence packing is essentially ignored for synthetic data. "
         "Please use a real dataset to use sequence packing."
     )
+
+
+def validate_completed_steps(completed_steps: int, config_steps: int):
+  """Raises RuntimeError if training has already completed up to config_steps."""
+  if completed_steps >= config_steps:
+    raise RuntimeError(
+        f"Requested training up to step {config_steps}, but a checkpoint already exists at step {completed_steps - 1} "
+        f"(which means {completed_steps} steps have been completed). "
+        f"Did you mean to continue training past step {completed_steps} (you should set steps > {completed_steps}) "
+        f"or to not load the checkpoint (use enable_checkpointing=False?)"
+    )
+
diff --git a/tests/unit/train_utils_test.py b/tests/unit/train_utils_test.py
@@ -18,7 +18,11 @@
 from dataclasses import dataclass
 from unittest.mock import MagicMock
 
-from maxtext.utils.train_utils import validate_train_config, create_training_optimizer
+from maxtext.utils.train_utils import (
+    validate_train_config,
+    create_training_optimizer,
+    validate_completed_steps,
+)
 
 
 @dataclass
@@ -185,12 +189,32 @@ def test_sgd_optimizer_returns_tx(self):
     config.learning_rate_schedule_steps = 100
     config.lr_schedule_type = "cosine"
     config.use_iota_embed = False
-
     _, tx = create_training_optimizer(config, model=None)
-
     self.assertIsNotNone(tx)
     self.assertTrue(hasattr(tx, "init"))
 
 
+class TestValidateCompletedSteps(unittest.TestCase):
+  """Tests for validate_completed_steps."""
+
+  def test_under_steps_passes(self):
+    """Verifies no exception raised when completed_steps < config_steps."""
+    # Should not raise
+    validate_completed_steps(completed_steps=50, config_steps=100)
+
+  def test_equal_steps_raises(self):
+    """Verifies RuntimeError raised when completed_steps == config_steps."""
+    with self.assertRaises(RuntimeError) as context:
+      validate_completed_steps(completed_steps=100, config_steps=100)
+    self.assertIn("Requested training up to step 100, but a checkpoint already exists at step 99", str(context.exception))
+
+  def test_over_steps_raises(self):
+    """Verifies RuntimeError raised when completed_steps > config_steps."""
+    with self.assertRaises(RuntimeError) as context:
+      validate_completed_steps(completed_steps=105, config_steps=100)
+    self.assertIn("Requested training up to step 100, but a checkpoint already exists at step 104", str(context.exception))
+
+
 if __name__ == "__main__":
   unittest.main()
+