use built-in AdamW optimizer and linear warmup LR scheduler (adds f_min/f_max to get_linear_schedule_with_warmup)

terarachang · terarachang · commit c356980b2603 · 2026-04-28T05:23:50.000Z
diff --git a/examples/cosmos/optimizer_utils.py b/examples/cosmos/optimizer_utils.py
diff --git a/examples/cosmos/train_cosmos_predict25_lora.py b/examples/cosmos/train_cosmos_predict25_lora.py
@@ -25,7 +25,7 @@
 
 import diffusers
 from diffusers import Cosmos2_5_PredictBasePipeline
-from diffusers.optimization import get_scheduler
+from diffusers.optimization import get_linear_schedule_with_warmup, get_scheduler
 from diffusers.training_utils import cast_training_params
 from diffusers.utils.torch_utils import is_compiled_module
 from diffusers.utils import (
@@ -239,37 +239,26 @@ def parse_args():
     parser.add_argument(
         "--scheduler_warm_up_steps",
         type=int,
-        nargs="+",
-        default=[1000],
-        help="Warm-up steps per cycle for the LambdaLinearScheduler.",
+        default=1000,
+        help="Number of warmup steps for the linear LR scheduler.",
     )
     parser.add_argument(
-        "--scheduler_cycle_lengths",
+        "--num_training_steps",
         type=int,
-        nargs="+",
-        default=[100000],
-        help="Cycle lengths for the LambdaLinearScheduler.",
-    )
-    parser.add_argument(
-        "--scheduler_f_start",
-        type=float,
-        nargs="+",
-        default=[1e-6],
-        help="LR multiplier at the start of each warm-up cycle.",
+        default=100000,
+        help="Total number of training steps for the LR scheduler.",
     )
     parser.add_argument(
         "--scheduler_f_max",
         type=float,
-        nargs="+",
-        default=[0.5],
-        help="Maximum LR multiplier reached after warm-up.",
+        default=0.5,
+        help="Maximum LR multiplier (peak after warmup) for the linear scheduler.",
     )
     parser.add_argument(
         "--scheduler_f_min",
         type=float,
-        nargs="+",
-        default=[0.2],
-        help="Minimum LR multiplier at the end of each cycle.",
+        default=0.2,
+        help="Minimum LR multiplier (floor of linear decay) for the linear scheduler.",
     )
     parser.add_argument(
         "--do_final_eval",
@@ -585,16 +574,13 @@ def main():
     if args.allow_tf32:
         torch.backends.cuda.matmul.allow_tf32 = True
 
-    from optimizer_utils import build_optimizer_and_scheduler
-    optimizer, lr_scheduler = build_optimizer_and_scheduler(
-        lora_params,
-        lr=args.learning_rate,
-        weight_decay=args.weight_decay,
-        warm_up_steps=args.scheduler_warm_up_steps,
-        cycle_lengths=args.scheduler_cycle_lengths,
-        f_start=args.scheduler_f_start,
-        f_max=args.scheduler_f_max,
+    optimizer = torch.optim.AdamW(lora_params, lr=args.learning_rate, weight_decay=args.weight_decay)
+    lr_scheduler = get_linear_schedule_with_warmup(
+        optimizer,
+        num_warmup_steps=args.scheduler_warm_up_steps,
+        num_training_steps=args.num_training_steps,
         f_min=args.scheduler_f_min,
+        f_max=args.scheduler_f_max,
     )
 
     train_dataloader = build_dataloader(args)
diff --git a/src/diffusers/optimization.py b/src/diffusers/optimization.py
@@ -120,7 +120,12 @@ def rule_func(steps: int) -> float:
 
 
 def get_linear_schedule_with_warmup(
-    optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, last_epoch: int = -1
+    optimizer: Optimizer,
+    num_warmup_steps: int,
+    num_training_steps: int,
+    last_epoch: int = -1,
+    f_min: float = 0.0,
+    f_max: float = 1.0,
 ) -> LambdaLR:
     """
     Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
@@ -135,17 +140,20 @@ def get_linear_schedule_with_warmup(
             The total number of training steps.
         last_epoch (`int`, *optional*, defaults to -1):
             The index of the last epoch when resuming training.
+        f_min (`float`, *optional*, defaults to 0.0):
+            Minimum lr multiplier (floor of the linear decay). The lr will not fall below `f_min * initial_lr`.
+        f_max (`float`, *optional*, defaults to 1.0):
+            Maximum lr multiplier (peak reached after warmup). The lr peaks at `f_max * initial_lr`.
 
     Return:
         `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
     """
 
     def lr_lambda(current_step: int):
         if current_step < num_warmup_steps:
-            return float(current_step) / float(max(1, num_warmup_steps))
-        return max(
-            0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
-        )
+            return f_max * float(current_step) / float(max(1, num_warmup_steps))
+        progress = float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
+        return f_min + (f_max - f_min) * max(0.0, progress)
 
     return LambdaLR(optimizer, lr_lambda, last_epoch)