trainer: persist accumulator + grads across stop/restart

BitcrushedHeart · BitcrushedHeart · commit bdb1f35ea57d · 2026-05-17T23:31:18.000+01:00
When training is stopped mid-accumulation-window and resumed from a
backup (end()/backup_before_save path), accumulated_loss was a
function-local in GenericTrainer.train() so it reset to 0.0 on resume
while train_progress.global_step was restored verbatim. The first
post-resume update step then logged only the trailing micro-batches
of the affected window (a sharp downward spike of factor
(acc - k_lost)/acc) and the optimizer.step() at that boundary
applied under-accumulated grads.

This change persists the in-flight gradient-accumulation state:
accumulated_loss, per-trainable-parameter .grad tensors keyed by
NamedParameterGroup unique_name, GradScaler state, RNG snapshots
(torch_cpu/cuda + python + numpy), and a cheap dataset fingerprint.
Stored as accumulator/accumulator.pt alongside optimizer.pt and
meta.json. On resume the trainer reattaches grads, restores the
accumulator and RNG, and the next optimizer step proceeds as if the
stop never happened.

Mismatched fingerprints (a changed concept set or a different
gradient_accumulation_steps) only trigger a warning; the state is
restored anyway, because losing accumulated gradient state is worse
than an off-spec effective batch.
Legacy backups without the new file load silently with the prior
behavior.

Stop semantics are deliberately untouched.

Files modified:
- modules/model/BaseModel.py: new accumulator_state attribute
- modules/modelSaver/mixin/InternalModelSaverMixin.py: write
  accumulator/accumulator.pt when staged
- modules/modelLoader/mixin/InternalModelLoaderMixin.py: read it
  under contextlib.suppress(FileNotFoundError)
- modules/trainer/GenericTrainer.py: stage on backup/save, restore
  on train() entry, mirror loop-locals each iteration
- modules/util/NamedParameterGroup.py: iter_named_parameters() yields
  stable (group_unique_name.idx, param) pairs
- modules/util/dataset_fingerprint.py: SHA-256 over concept
  identifiers with concept_file_name fallback
diff --git a/modules/model/BaseModel.py b/modules/model/BaseModel.py
@@ -76,6 +76,7 @@ class BaseModel(metaclass=ABCMeta):
     embedding_state_dicts: dict[str, dict[str, Tensor]] | None
     autocast_context: torch.autocast | nullcontext
     train_dtype: DataType
+    accumulator_state: dict | None
 
     def __init__(
             self,
@@ -93,6 +94,7 @@ def __init__(
         self.embedding_state_dicts = {}
         self.autocast_context = nullcontext()
         self.train_dtype = DataType.FLOAT_32
+        self.accumulator_state = None
 
     @abstractmethod
     def to(self, device: torch.device):
diff --git a/modules/modelLoader/mixin/InternalModelLoaderMixin.py b/modules/modelLoader/mixin/InternalModelLoaderMixin.py
@@ -38,5 +38,13 @@ def _load_internal_data(
             with contextlib.suppress(FileNotFoundError):
                 model.ema_state_dict = torch.load(os.path.join(model_name, "ema", "ema.pt"), weights_only=True)
 
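+            # Optional grad-accum snapshot; legacy backups without it fall through to defaults.
+            # weights_only=False: the payload carries the numpy RNG state (an ndarray), which the
+            # restricted weights_only unpickler rejects by default. This is a full unpickle, so
+            # only load backups that come from a trusted source.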
+            with contextlib.suppress(FileNotFoundError):
+                model.accumulator_state = torch.load(
+                    os.path.join(model_name, "accumulator", "accumulator.pt"),
+                    weights_only=False,
+                )
+
             # meta
             model.train_progress = train_progress
diff --git a/modules/modelSaver/mixin/InternalModelSaverMixin.py b/modules/modelSaver/mixin/InternalModelSaverMixin.py
@@ -40,3 +40,12 @@ def _save_internal_data(
                     'global_step': model.train_progress.global_step,
                 },
             }, meta_file)
+
+        # In-flight grad-accum snapshot; staged by the trainer, skipped on non-training paths.
+        accumulator_state = getattr(model, "accumulator_state", None)
+        if accumulator_state is not None:
+            os.makedirs(os.path.join(destination, "accumulator"), exist_ok=True)
+            torch.save(
+                accumulator_state,
+                os.path.join(destination, "accumulator", "accumulator.pt"),
+            )
diff --git a/modules/trainer/GenericTrainer.py b/modules/trainer/GenericTrainer.py
@@ -3,6 +3,7 @@
 import json
 import math
 import os
+import random
 import shutil
 import traceback
 from collections.abc import Callable
@@ -22,6 +23,7 @@
 from modules.util.commands.TrainCommands import TrainCommands
 from modules.util.config.SampleConfig import SampleConfig
 from modules.util.config.TrainConfig import TrainConfig
+from modules.util.dataset_fingerprint import compute_concept_fingerprint
 from modules.util.dtype_util import create_grad_scaler, enable_grad_scaling
 from modules.util.enum.ConceptType import ConceptType
 from modules.util.enum.EMAMode import EMAMode
@@ -42,6 +44,7 @@
 from torchvision.transforms.functional import pil_to_tensor
 
 import huggingface_hub
+import numpy as np
 from requests.exceptions import ConnectionError
 from tqdm import tqdm
 
@@ -78,6 +81,11 @@ def __init__(self, config: TrainConfig, callbacks: TrainCallbacks, commands: Tra
         self.one_step_trained = False
         self.grad_hook_handles = []
 
+        # Loop locals mirrored so __backup/__save can read them without threading.
+        self._loop_accumulated_loss: float = 0.0
+        self._loop_accumulated_loss_tensor: torch.Tensor | None = None
+        self._loop_scaler = None
+
     def start(self):
         if multi.is_master():
             self.__save_config_to_workspace()
@@ -445,6 +453,7 @@ def __backup(self, train_progress: TrainProgress, print_msg: bool = True, print_
             if print_msg:
                 print_cb("Creating Backup " + backup_path)
 
+            self._stage_accumulator_state_for_save()
             self.model_saver.save(
                 self.model,
                 self.config.model_type,
@@ -464,6 +473,7 @@ def __backup(self, train_progress: TrainProgress, print_msg: bool = True, print_
                 traceback.print_exc()
                 print("Could not delete partial backup")
         finally:
+            self._clear_staged_accumulator_state()
             if self.config.rolling_backup:
                 self.__prune_backups(self.config.rolling_backup_count)
 
@@ -496,17 +506,20 @@ def __save(self, train_progress: TrainProgress, print_msg: bool = True, print_cb
             if self.config.optimizer.optimizer.is_schedule_free:
                 torch.clear_autocast_cache()
                 self.model.optimizer.eval()
+            self._stage_accumulator_state_for_save()
             self.model_saver.save(
                 model=self.model,
                 model_type=self.config.model_type,
                 output_model_format=self.config.output_model_format,
                 output_model_destination=save_path,
                 dtype=self.config.output_dtype.torch_dtype()
             )
+            self._clear_staged_accumulator_state()
             if self.config.optimizer.optimizer.is_schedule_free:
                 torch.clear_autocast_cache()
                 self.model.optimizer.train()
         except Exception:
+            self._clear_staged_accumulator_state()
             traceback.print_exc()
             print("Could not save model. Check your disk space!")
             try:
@@ -553,6 +566,142 @@ def __is_update_step(self, train_progress: TrainProgress) -> bool:
             "update_step", self.config.gradient_accumulation_steps, TimeUnit.STEP, train_progress, start_at_zero=False
         )
 
+    def _stage_accumulator_state_for_save(self):
+        # Build the in-flight grad-accum snapshot for InternalModelSaverMixin.
+        if not multi.is_master():
+            self.model.accumulator_state = None
+            return
+
+        if self._loop_accumulated_loss_tensor is not None and \
+                isinstance(self._loop_accumulated_loss_tensor, torch.Tensor):
+            try:
+                acc_loss_f = float(self._loop_accumulated_loss_tensor.item())
+            except Exception:
+                acc_loss_f = float(self._loop_accumulated_loss)
+        else:
+            acc_loss_f = float(self._loop_accumulated_loss)
+
+        param_grads: dict[str, torch.Tensor] = {}
+        if self.model is not None and self.model.parameters is not None:
+            for key, p in self.model.parameters.iter_named_parameters():
+                if not p.requires_grad or p.grad is None:
+                    continue
+                param_grads[key] = p.grad.detach().to(device="cpu", copy=True)
+
+        scaler_state = None
+        if self._loop_scaler is not None:
+            try:
+                scaler_state = self._loop_scaler.state_dict()
+            except Exception:
+                scaler_state = None
+
+        rng: dict = {
+            "torch_cpu": torch.get_rng_state(),
+            "torch_cuda": torch.cuda.get_rng_state_all() if torch.cuda.is_available() else None,
+            "python": random.getstate(),
+            # Snapshots the GLOBAL numpy RNG; Generator-based snapshots don't round-trip with set_state.
+            "numpy": np.random.get_state(legacy=True),  # noqa: NPY002
+        }
+
+        fp_hash, fp_count = compute_concept_fingerprint(
+            getattr(self.config, "concepts", None),
+            getattr(self.config, "concept_file_name", None),
+        )
+        self.model.accumulator_state = {
+            "accumulated_loss": acc_loss_f,
+            "param_grads": param_grads,
+            "scaler": scaler_state,
+            "rng": rng,
+            "fingerprint": {
+                "gradient_accumulation_steps": int(self.config.gradient_accumulation_steps),
+                "dataset_hash": fp_hash,
+                "concept_count": fp_count,
+            },
+        }
+
+    def _clear_staged_accumulator_state(self):
+        if self.model is not None:
+            self.model.accumulator_state = None
+
+    def _restore_accumulator_state(
+            self,
+            accumulated_loss: torch.Tensor,
+            train_device: torch.device,
+            scaler,
+    ) -> tuple[torch.Tensor, bool]:
+        # Returns (accumulated_loss, has_gradient). Warn-only on mismatch; never discards state.
+        if not multi.is_master():
+            return accumulated_loss, False
+        state = getattr(self.model, "accumulator_state", None)
+        if state is None:
+            return accumulated_loss, False
+
+        fp = state.get("fingerprint", {})
+        saved_acc = fp.get("gradient_accumulation_steps")
+        if saved_acc is not None and saved_acc != self.config.gradient_accumulation_steps:
+            print(
+                f"Warning: gradient_accumulation_steps mismatch on resume: "
+                f"saved={saved_acc} current={self.config.gradient_accumulation_steps}; "
+                f"restoring partial accumulator state anyway."
+            )
+        current_hash, current_count = compute_concept_fingerprint(
+            getattr(self.config, "concepts", None),
+            getattr(self.config, "concept_file_name", None),
+        )
+        if fp.get("dataset_hash") and fp.get("dataset_hash") != current_hash:
+            delta = current_count - int(fp.get("concept_count", current_count))
+            print(
+                f"Warning: dataset fingerprint mismatch on resume: "
+                f"saved_concepts={fp.get('concept_count')} current_concepts={current_count} "
+                f"(delta={delta}); restoring partial accumulator state anyway."
+            )
+
+        acc_loss_f = float(state.get("accumulated_loss", 0.0) or 0.0)
+        accumulated_loss = torch.tensor(acc_loss_f, device=train_device)
+
+        saved_grads: dict = state.get("param_grads", {}) or {}
+        if self.model is not None and self.model.parameters is not None:
+            current_keys = {k for k, _ in self.model.parameters.iter_named_parameters()}
+            missing = [k for k in saved_grads if k not in current_keys]
+            if saved_grads and len(missing) / len(saved_grads) > 0.10:
+                print(
+                    f"Warning: {len(missing)} of {len(saved_grads)} saved grad keys are "
+                    f"absent in the current model; skipping those grads."
+                )
+            applied = 0
+            for key, p in self.model.parameters.iter_named_parameters():
+                if not p.requires_grad:
+                    continue
+                if key in saved_grads:
+                    p.grad = saved_grads[key].to(device=p.device, dtype=p.dtype, non_blocking=True)
+                    applied += 1
+                else:
+                    p.grad = None
+            has_gradient = applied > 0
+        else:
+            has_gradient = False
+
+        if scaler is not None and state.get("scaler") is not None:
+            try:
+                scaler.load_state_dict(state["scaler"])
+            except Exception:
+                print("Warning: could not restore GradScaler state; continuing with a fresh scaler.")
+
+        rng = state.get("rng", {}) or {}
+        if "torch_cpu" in rng and rng["torch_cpu"] is not None:
+            torch.set_rng_state(rng["torch_cpu"])
+        if rng.get("torch_cuda") is not None and torch.cuda.is_available():
+            with contextlib.suppress(Exception):
+                torch.cuda.set_rng_state_all(rng["torch_cuda"])
+        if "python" in rng and rng["python"] is not None:
+            random.setstate(rng["python"])
+        if rng.get("numpy") is not None:
+            with contextlib.suppress(Exception):
+                np.random.set_state(rng["numpy"])  # noqa: NPY002
+
+        self.model.accumulator_state = None
+        return accumulated_loss, has_gradient
+
     def __apply_fused_back_pass(self, scaler):
         fused_optimizer_step = self.config.optimizer.optimizer.supports_fused_back_pass() and self.config.optimizer.fused_back_pass
         fused_reduce = self.config.multi_gpu and self.config.fused_gradient_reduce
@@ -621,6 +770,7 @@ def train(self):
             return
 
         scaler = create_grad_scaler() if enable_grad_scaling(self.config.train_dtype, self.parameters) else None
+        self._loop_scaler = scaler  # mirror so save-side staging can capture state_dict
 
         self.__apply_fused_back_pass(scaler)
 
@@ -634,6 +784,15 @@ def train(self):
         ema_loss_steps = 0
         epochs = range(train_progress.epoch, self.config.epochs, 1)
 
+        # If resuming from a mid-window save, restore in-flight accumulator + grads + RNG.
+        accumulated_loss, restored_has_grad = self._restore_accumulator_state(
+            accumulated_loss, train_device, scaler,
+        )
+        if restored_has_grad:
+            has_gradient = True
+        self._loop_accumulated_loss_tensor = accumulated_loss
+        self._loop_accumulated_loss = float(accumulated_loss.item()) if accumulated_loss is not None else 0.0
+
         for _epoch in tqdm(epochs, desc="epoch") if multi.is_master() else epochs:
             multi.sync_commands(self.commands)
             if self.commands.get_stop_command():
@@ -761,6 +920,7 @@ def sample_commands_fun():
                     detached_loss = loss.detach()
                     multi.reduce_tensor_mean(detached_loss)
                     accumulated_loss += detached_loss
+                    self._loop_accumulated_loss_tensor = accumulated_loss  # save-side stage mirror
 
                     if self.__is_update_step(train_progress):
                         if self.config.fused_gradient_reduce:
@@ -807,6 +967,8 @@ def sample_commands_fun():
                             self.tensorboard.add_scalar("smooth_loss/train_step", ema_loss, train_progress.global_step)
 
                         accumulated_loss = 0.0
+                        self._loop_accumulated_loss = 0.0  # clear save-side mirror at boundary
+                        self._loop_accumulated_loss_tensor = None
                         self.model_setup.after_optimizer_step(self.model, self.config, train_progress)
 
                         if self.model.ema:
diff --git a/modules/util/NamedParameterGroup.py b/modules/util/NamedParameterGroup.py
@@ -32,6 +32,12 @@ def add_group(self, group: NamedParameterGroup):
     def parameters(self) -> list[Parameter]:
         return [p for group in self.__groups for p in group.parameters]
 
+    def iter_named_parameters(self) -> Iterable[tuple[str, Parameter]]:
+        # Stable per-parameter keys for accumulator-state save/load.
+        for group in self.__groups:
+            for i, p in enumerate(group.parameters):
+                yield f"{group.unique_name}.{i}", p
+
     def parameters_for_optimizer(self, config: TrainConfig) -> list[dict]:
         parameters = []
 
diff --git a/modules/util/dataset_fingerprint.py b/modules/util/dataset_fingerprint.py
@@ -0,0 +1,50 @@
+"""SHA-256 fingerprint of the configured dataset, warn-only on resume mismatch."""
+from __future__ import annotations
+
+import hashlib
+import json
+import os
+from collections.abc import Iterable
+
+from modules.util.config.ConceptConfig import ConceptConfig
+
+
+def _identifier_tuple(c) -> tuple:
+    def g(name, default):
+        if hasattr(c, name):
+            return getattr(c, name)
+        if isinstance(c, dict):
+            return c.get(name, default)
+        return default
+
+    raw_type = g('type', '')
+    type_str = getattr(raw_type, 'value', raw_type)
+    return (
+        str(g('name', '') or ''),
+        str(g('path', '') or ''),
+        int(g('seed', 0) or 0),
+        str(type_str or ''),
+        bool(g('include_subdirectories', False)),
+        bool(g('enabled', True)),
+    )
+
+
+def compute_concept_fingerprint(
+        concepts: Iterable[ConceptConfig] | Iterable[dict] | None,
+        concept_file_name: str | None = None,
+) -> tuple[str, int]:
+    items: list = []
+    if concepts:
+        items = list(concepts)
+    elif concept_file_name and os.path.exists(concept_file_name):
+        # Mirrors TrainConfig.to_pack_dict: under the GUI, concepts live in a file.
+        try:
+            with open(concept_file_name, 'r') as f:
+                items = json.load(f) or []
+        except (OSError, ValueError):
+            items = []
+
+    payload = [_identifier_tuple(c) for c in items]
+    payload.sort(key=lambda t: t[1])
+    blob = json.dumps(payload, separators=(',', ':'), sort_keys=False).encode()
+    return hashlib.sha256(blob).hexdigest(), len(payload)