Save multiple checkpoints per epoch (#509)

shanjiaz · web-flow · commit 3c811bce1dec · 2026-05-13T20:38:01.000Z
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED. ## Purpose Save multiple checkpoints per epoch.  ## Description Changed `checkpoint_freq` to a float. Setting it to 0.5 means saving twice per epoch, overwriting the same directory each time. We skip the last sub-epoch save right before validation, since the end-of-epoch save is handled separately. (Or should we?) When `checkpoint_freq` > 1 it must be a whole number, and we save once every that many epochs.  ## Related Issue  #493 ## Tests Tested locally with an example.  I have filled in: - [x] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". - [x] The test plan/results, such as providing test command and pasting the results. - [ ] (Optional) The necessary documentation update. - [x] I (a human) have written or reviewed the code in this PR to the best of my ability. --------- Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>
diff --git a/scripts/train.py b/scripts/train.py
@@ -383,11 +383,16 @@ def main(args: argparse.Namespace):
     maybe_destroy_distributed()
 
 
-def _checkpoint_freq(value: str) -> int:
-    ivalue = int(value)
-    if ivalue < 1:
-        raise argparse.ArgumentTypeError("--checkpoint-freq must be >= 1")
-    return ivalue
+def _checkpoint_freq(value: str) -> float:
+    fvalue = float(value)
+    if fvalue <= 0:
+        raise argparse.ArgumentTypeError("--checkpoint-freq must be > 0")
+    if fvalue > 1 and not fvalue.is_integer():
+        raise argparse.ArgumentTypeError(
+            f"--checkpoint-freq={fvalue} is not an integer. Values > 1 are treated "
+            "as epoch counts and must be whole numbers."
+        )
+    return fvalue
 
 
 def parse_args():
@@ -632,8 +637,9 @@ def parse_args():
     parser.add_argument(
         "--checkpoint-freq",
         type=_checkpoint_freq,
-        default=1,
-        help="Save a checkpoint every N epochs.",
+        default=1.0,
+        help="Save a checkpoint every N epochs. Values < 1 enable sub-epoch "
+        "checkpointing (e.g. 0.5 = every half epoch).",
     )
     parser.add_argument(
         "--save-best",
diff --git a/src/speculators/train/trainer.py b/src/speculators/train/trainer.py
@@ -29,6 +29,7 @@
 metric_logger = logging.getLogger("speculators.metrics")
 
 warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)
+MIN_STEP_PCT = 0.25
 
 
 class TrainerConfig(NamedTuple):
@@ -44,7 +45,7 @@ class TrainerConfig(NamedTuple):
     scheduler_warmup_steps: int | None = None
     scheduler_total_steps: int | None = None
     scheduler_num_cosine_cycles: float = 0.5
-    checkpoint_freq: int = 1
+    checkpoint_freq: float = 1
     save_best: bool = False
     hidden_states_dtype: torch.dtype = torch.bfloat16
     log_freq: int = 1
@@ -191,7 +192,13 @@ def train_epoch(self, epoch: int):
         if self.local_rank == 0:
             train_loader = tqdm(train_loader, desc=f"Epoch {epoch}")  # type: ignore[assignment]
 
-        for batch in train_loader:
+        num_steps = len(self.train_loader)
+        step_interval = (
+            max(1, round(num_steps * self.config.checkpoint_freq))
+            if self.config.checkpoint_freq < 1
+            else None
+        )
+        for local_step, batch in enumerate(train_loader, 1):
             gpu_batch = {
                 k: v.to(self.local_rank, non_blocking=True)
                 if isinstance(v, torch.Tensor)
@@ -229,6 +236,15 @@ def train_epoch(self, epoch: int):
                 )
             self.global_step += 1
 
+            if (
+                step_interval is not None
+                and not self.config.save_best
+                and local_step % step_interval == 0
+                and num_steps - local_step >= step_interval * MIN_STEP_PCT
+                # Avoid saving back to back at the end of each epoch
+            ):
+                self.maybe_save_checkpoint(epoch)
+
     @torch.no_grad()
     def val_epoch(self, epoch: int) -> dict[str, float] | None:
         if self.val_loader is None:
@@ -271,7 +287,8 @@ def maybe_save_checkpoint(self, epoch: int | str):
         if epoch != "interrupted" and (
             self.config.save_best
             or (
-                isinstance(epoch, int)
+                self.config.checkpoint_freq >= 1
+                and isinstance(epoch, int)
                 and epoch != 0
                 and (epoch + 1) % self.config.checkpoint_freq != 0
             )
@@ -294,7 +311,9 @@ def maybe_update_best(self, epoch: int, val_metrics: dict | None):
             self.checkpointer.save_checkpoint(self.model, self.opt, epoch)
             if self.scheduler is not None:
                 self.checkpointer.save_scheduler_state_dict(self.scheduler, epoch)
-        elif not (epoch == 0 or (epoch + 1) % self.config.checkpoint_freq == 0):
+        elif self.config.checkpoint_freq >= 1 and not (
+            epoch == 0 or (epoch + 1) % int(self.config.checkpoint_freq) == 0
+        ):
             return
 
         self.best_val_loss = val_metrics["loss_epoch"]