Commit b02db5b

Luodian and anxiangsir authored
fix: correct gradient accumulation off-by-one and lr_scheduler over-stepping (#82)
* fix: correct gradient accumulation off-by-one and lr_scheduler over-stepping

* fix: align scheduler total_iters with optimizer steps under gradient accumulation

  lr_scheduler total_iters was set to the micro-step count (total_steps), but after moving lr_scheduler.step() to fire only on optimizer steps, the scheduler would traverse only 1/backward_passes_per_step of its budget. Dividing total_iters by backward_passes_per_step lets the full LR curve (warmup + polynomial decay) complete over the actual optimizer steps. No-op when backward_passes_per_step=1 (Stage-1).

---------

Co-authored-by: Xiang An <anxiangsir@outlook.com>
1 parent 29826ef commit b02db5b

File tree

1 file changed: +4 -4 lines

training/train.py

Lines changed: 4 additions & 4 deletions
@@ -350,7 +350,8 @@ def _expand(name, v):
         optimizer_cls = torch.optim.AdamW

         opt = optimizer_cls(parameters, lr=args.lr, weight_decay=args.weight_decay)
-        lr_scheduler = PolynomialLRWarmup(opt, int(args.total_steps * args.warmup_ratio), args.total_steps, 2)
+        optimizer_total_steps = args.total_steps // args.backward_passes_per_step
+        lr_scheduler = PolynomialLRWarmup(opt, int(optimizer_total_steps * args.warmup_ratio), optimizer_total_steps, 2)
     else:
         raise ValueError(f"{args.opt} not support!")
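The total_iters change can be sanity-checked with a small sketch. The values below are hypothetical stand-ins for the repo's `args.total_steps`, `args.backward_passes_per_step`, and `args.warmup_ratio`:

```python
# Sketch with assumed values: total_iters must count optimizer steps,
# not micro-steps, once the scheduler is stepped once per optimizer step.
total_steps = 1000                # micro-step budget (hypothetical value)
backward_passes_per_step = 4      # gradient accumulation factor (hypothetical)
warmup_ratio = 0.1                # hypothetical

optimizer_total_steps = total_steps // backward_passes_per_step
warmup_iters = int(optimizer_total_steps * warmup_ratio)

print(optimizer_total_steps, warmup_iters)  # 250 25
```

With the old code the scheduler would have been built with total_iters=1000 but stepped only 250 times, leaving the decay curve three-quarters unfinished.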

@@ -652,7 +653,7 @@ def wrap_ddp(model):
             list_loss.append(head_loss)
             list_loss_float.append(head_loss.item())

-            is_accumulation_step = global_step % args.backward_passes_per_step != 0
+            is_accumulation_step = (global_step + 1) % args.backward_passes_per_step != 0
             scaled_loss = sum(list_loss) / args.backward_passes_per_step

             if is_accumulation_step:
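A quick sketch of why the `+ 1` matters, assuming `global_step` counts micro-steps from 0 (the accumulation factor below is a hypothetical value):

```python
# Which micro-steps run opt.step() (i.e. are NOT accumulation steps)
# under the old vs. fixed condition, with 4 micro-batches per update.
backward_passes_per_step = 4

old = [s for s in range(8) if s % backward_passes_per_step == 0]
new = [s for s in range(8) if (s + 1) % backward_passes_per_step == 0]

# Old: updates at micro-steps 0 and 4, so the first update lands after a
# single backward pass. Fixed: updates at 3 and 7, after exactly 4 passes.
print(old, new)  # [0, 4] [3, 7]
```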
@@ -665,8 +666,7 @@ def wrap_ddp(model):
                 clip_grad_norm_(pfc.parameters(), max_norm=5, norm_type=2)
                 opt.step()
                 opt.zero_grad(set_to_none=True)
-
-            lr_scheduler.step()
+                lr_scheduler.step()

             batch_end_callback(
                 global_step=global_step,
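Putting the two fixes together: a minimal simulation (not the repo's code, step numbering assumed to start at 0) showing that the scheduler now steps exactly `optimizer_total_steps` times over the run:

```python
# Simulate the loop's bookkeeping: lr_scheduler.step() fires only on
# optimizer steps, and the count matches the divided total_iters.
total_steps = 1000                # hypothetical micro-step budget
backward_passes_per_step = 4      # hypothetical accumulation factor
optimizer_total_steps = total_steps // backward_passes_per_step

scheduler_steps = 0
for global_step in range(total_steps):
    is_accumulation_step = (global_step + 1) % backward_passes_per_step != 0
    if not is_accumulation_step:
        # opt.step(); opt.zero_grad(set_to_none=True)
        scheduler_steps += 1  # lr_scheduler.step() fires only here

print(scheduler_steps == optimizer_total_steps)  # True
```

This is the no-op case the commit message mentions when `backward_passes_per_step=1`: every micro-step is an optimizer step, and the division leaves total_iters unchanged.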
