Skip to content

Commit e886dd2

Browse files
committed
Add Warmup-Stable-Decay (WSD) learning rate scheduler with configurable stable and decay phases
Signed-off-by: bzantium <ryumin93@gmail.com>
1 parent 08216c6 commit e886dd2

4 files changed

Lines changed: 211 additions & 27 deletions

File tree

src/MaxText/configs/base.yml

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -607,7 +607,7 @@ grain_file_type: 'arrayrecord' # arrayrecord or parquet
607607
grain_packing_type: 'first_fit' # 'first_fit' or 'concat_then_split'. See details of the corresponding module in https://google-grain.readthedocs.io/en/latest/grain.experimental.html
608608
grain_worker_count: 1 # Set to -1 to enable auto-tuning: automatically determines optimal worker count. See https://google-grain.readthedocs.io/en/latest/_autosummary/grain.experimental.pick_performance_config.html
609609
grain_per_worker_buffer_size: 1
610-
# num_threads and prefetch_buffer_size are per-worker per-dataset.
610+
# num_threads and prefetch_buffer_size are per-worker per-dataset.
611611
# When using array_records, they are used in ReadOptions (https://google-grain.readthedocs.io/en/latest/tutorials/data_loader_tutorial.html#per-worker-readoptions)
612612
# The default value matches that in the Grain package. If mixing multiple data sources, consider lowering these values to reduce memory usage.
613613
# When using parquet, grain_num_threads is the number of files to read and interleave in parallel
@@ -635,15 +635,28 @@ skip_jax_distributed_system: False # If True we will not initialize the jax dist
635635
# However when run on google internal TPUs the coordination service is started automatically
636636
# and we should set this to True so we won't try to initialize a second time manually.
637637

638-
# We take inspiration from Llama2's learning rate (LR) schedule, see https://arxiv.org/pdf/2307.09288.pdf section 2.2
639-
# Learning rate schedule has either two or three parts:
638+
# Learning rate schedule structure depends on lr_schedule_type:
639+
#
640+
# Cosine schedule (lr_schedule_type='cosine'):
641+
# Inspired by Llama2's learning rate schedule, see https://arxiv.org/pdf/2307.09288.pdf section 2.2
642+
# 1) Linear warmup from 0 to [learning_rate] over steps 0 to [learning_rate_schedule_steps * warmup_steps_fraction]
643+
# 2) Cosine decay from [learning_rate] to [learning_rate * learning_rate_final_fraction] until learning_rate_schedule_steps
644+
# 3) Constant learning rate of 0 from learning_rate_schedule_steps to steps (if steps > learning_rate_schedule_steps)
645+
#
646+
# WSD schedule (lr_schedule_type='wsd', Warmup-Stable-Decay):
640647
# 1) Linear warmup from 0 to [learning_rate] over steps 0 to [learning_rate_schedule_steps * warmup_steps_fraction]
641-
# 2) Cosine decay from [learning_rate] to [learning_rate * cosine_learning_rate_final_fraction] from warmup to learning_rate_schedule_steps
642-
# 3) Constant learning rate of 0 from learning_rate_schedule_steps to steps.
648+
# 2) Stable phase at [learning_rate] for the majority of training
649+
# 3) Decay from [learning_rate] to [learning_rate * learning_rate_final_fraction] over [learning_rate_schedule_steps * wsd_decay_steps_fraction] steps
650+
#    The decay is either linear or cosine, depending on wsd_decay_style
651+
# 4) Constant learning rate of 0 from learning_rate_schedule_steps to steps (if steps > learning_rate_schedule_steps)
652+
#
643653
# The zero learning rate section can be used to more accurately measure the fully trained model's performance.
644654
learning_rate: 3.e-5
645-
cosine_learning_rate_final_fraction: 0.1
646-
warmup_steps_fraction: 0.1
655+
lr_schedule_type: 'cosine' # Options: 'cosine' or 'wsd'
656+
learning_rate_final_fraction: 0.1 # Final LR as fraction of peak LR (applies to both cosine and WSD schedules)
657+
wsd_decay_steps_fraction: 0.1 # Fraction of learning_rate_schedule_steps used for decay phase in WSD (e.g., 0.1 = 10%)
658+
wsd_decay_style: 'linear' # Decay style for WSD schedule: 'linear' or 'cosine'
659+
warmup_steps_fraction: 0.1 # Fraction of learning_rate_schedule_steps used for warmup phase (applies to both schedules)
647660
learning_rate_schedule_steps: -1 # By default the length of the schedule is set to the number of steps.
648661
# However you may choose a longer schedule (learning_rate_schedule_steps > steps), in which case the training will end before
649662
# dropping fully down. Or you may choose a shorter schedule, where the unspecified steps will have a learning rate of 0.

src/MaxText/configs/types.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,20 @@ class OptimizerType(str, Enum):
124124
MUON = "muon"
125125

126126

127+
class LearningRateScheduleType(str, Enum):
128+
"""Supported learning rate schedule types."""
129+
130+
COSINE = "cosine"
131+
WSD = "wsd"
132+
133+
134+
class WsdDecayStyle(str, Enum):
135+
"""Supported decay styles for WSD schedule."""
136+
137+
LINEAR = "linear"
138+
COSINE = "cosine"
139+
140+
127141
class RopeType(str, Enum):
128142
"""Supported Rotary Positional Embedding (RoPE) implementations."""
129143

@@ -1005,8 +1019,17 @@ class Optimizer(BaseModel):
10051019
1.0, description="The threshold for gradient clipping. 0 disables clipping."
10061020
)
10071021
learning_rate: NonNegativeFloat = Field(3.0e-5, description="The peak learning rate.")
1008-
cosine_learning_rate_final_fraction: float = Field(
1009-
0.1, description="Final LR as a fraction of peak LR in cosine decay."
1022+
lr_schedule_type: LearningRateScheduleType = Field(
1023+
LearningRateScheduleType.COSINE, description="The type of learning rate schedule to use."
1024+
)
1025+
learning_rate_final_fraction: float = Field(
1026+
0.1, description="Final LR as a fraction of peak LR (applies to both cosine and WSD schedules)."
1027+
)
1028+
wsd_decay_steps_fraction: float = Field(
1029+
0.1, ge=0.0, le=1.0, description="Fraction of learning_rate_schedule_steps used for the decay phase in the WSD schedule."
1030+
)
1031+
wsd_decay_style: WsdDecayStyle = Field(
1032+
WsdDecayStyle.LINEAR, description="The decay style for WSD schedule ('linear' or 'cosine')."
10101033
)
10111034
warmup_steps_fraction: float = Field(0.1, ge=0.0, le=1.0, description="Fraction of learning_rate_schedule_steps used for LR warmup.")
10121035
learning_rate_schedule_steps: int = Field(
@@ -1748,6 +1771,17 @@ def set_derived_and_validate_values(self) -> "MaxTextConfig":
17481771
# If steps is -1, it defaults to the length of the learning rate schedule.
17491772
if self.steps == -1:
17501773
self.steps = self.learning_rate_schedule_steps
1774+
1775+
# Validate WSD learning rate schedule fractions
1776+
if self.lr_schedule_type == LearningRateScheduleType.WSD:
1777+
total_fraction = self.warmup_steps_fraction + self.wsd_decay_steps_fraction
1778+
if total_fraction > 1.0:
1779+
raise ValueError(
1780+
f"Invalid WSD schedule: warmup_steps_fraction ({self.warmup_steps_fraction}) + "
1781+
f"wsd_decay_steps_fraction ({self.wsd_decay_steps_fraction}) must not exceed 1.0. "
1782+
f"Current sum: {total_fraction}"
1783+
)
1784+
17511785
# If eval_per_device_batch_size is not set, it defaults to the training per_device_batch_size.
17521786
if getattr(self, "eval_per_device_batch_size", 0.0) == 0.0:
17531787
self.eval_per_device_batch_size = self.per_device_batch_size

src/MaxText/maxtext_utils.py

Lines changed: 47 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
from MaxText import max_utils
4141
from MaxText import multimodal_utils
4242
from MaxText import sharding
43+
from MaxText.configs import types
4344
from MaxText.common_types import DecoderBlockType, MODEL_MODE_PREFILL, MODEL_MODE_AUTOREGRESSIVE
4445
from MaxText.inference.page_manager import PageState
4546

@@ -1103,44 +1104,72 @@ def create_device_mesh(config, devices=None):
11031104

11041105

11051106
def create_learning_rate_schedule(config):
1106-
"""Creates a warmup and cosine decay learning rate schedule:
1107-
We take inspiration from Llama2's learning rate (LR) schedule, see https://arxiv.org/pdf/2307.09288.pdf section 2.2
1108-
Learning rate schedule has either two or three parts:
1107+
"""Creates a learning rate schedule with warmup and decay.
1108+
1109+
Supports two schedule types:
1110+
- Cosine: Inspired by Llama2's learning rate schedule, see https://arxiv.org/pdf/2307.09288.pdf section 2.2
1111+
- WSD (Warmup-Stable-Decay): Maintains constant learning rate for most of training before final decay
1112+
1113+
Schedule structure:
11091114
1) Linear warmup from 0 to [learning_rate] over steps 0 to [learning_rate_schedule_steps * warmup_steps_fraction]
1110-
2) Cosine from [learning_rate] to [learning_rate * cosine_learning_rate_final_fraction] until learning_rate_schedule_steps
1115+
2) Decay from [learning_rate] to a final value until learning_rate_schedule_steps
1116+
- Cosine: decays to [learning_rate * learning_rate_final_fraction]
1117+
- WSD: maintains [learning_rate] for a stable phase, then decays to [learning_rate * learning_rate_final_fraction]
1118+
using either linear or cosine decay based on wsd_decay_style
11111119
3) Constant learning rate of 0 from learning_rate_schedule_steps to steps.
11121120
The zero learning rate section can be used to more accurately measure the fully trained model's performance.
11131121
"""
11141122

11151123
def make_cos_schedule(init_lr, final_lr, len_steps):
11161124
def schedule(step):
1117-
pct = (step) / len_steps
1125+
pct = step / (len_steps - 1) if len_steps > 1 else 1.0
11181126
a = 0.5 * (jnp.cos(jnp.pi * pct) + 1)
11191127
lr = init_lr * a + final_lr * (1 - a)
11201128
return lr
11211129

11221130
return schedule
11231131

11241132
lr = config.learning_rate
1125-
cos_final_lr = lr * config.cosine_learning_rate_final_fraction
1126-
1133+
final_lr = lr * config.learning_rate_final_fraction
11271134
warmup_steps = int(config.learning_rate_schedule_steps * config.warmup_steps_fraction)
1128-
cos_steps = config.learning_rate_schedule_steps - warmup_steps
11291135
constant_zero_steps = config.steps - config.learning_rate_schedule_steps
11301136

1131-
warmup_schedule = optax.linear_schedule(init_value=0.0, end_value=lr, transition_steps=warmup_steps)
1132-
cos_schedule = make_cos_schedule(lr, cos_final_lr, cos_steps)
1133-
constant_schedule = optax.constant_schedule(0.0)
1134-
1135-
pieces = [warmup_schedule, cos_schedule]
1136-
boundaries = [
1137-
warmup_steps,
1138-
warmup_steps + cos_steps,
1139-
]
1137+
pieces = []
1138+
boundaries = []
1139+
1140+
if warmup_steps > 0:
1141+
warmup_schedule = optax.linear_schedule(init_value=0.0, end_value=lr, transition_steps=warmup_steps - 1)
1142+
pieces.append(warmup_schedule)
1143+
boundaries.append(warmup_steps)
1144+
1145+
if config.lr_schedule_type == types.LearningRateScheduleType.COSINE:
1146+
cos_steps = config.learning_rate_schedule_steps - warmup_steps
1147+
if cos_steps > 0:
1148+
cos_schedule = make_cos_schedule(lr, final_lr, cos_steps)
1149+
pieces.append(cos_schedule)
1150+
boundaries.append(warmup_steps + cos_steps)
1151+
1152+
else: # WSD
1153+
decay_steps = int(config.learning_rate_schedule_steps * config.wsd_decay_steps_fraction)
1154+
stable_steps = config.learning_rate_schedule_steps - warmup_steps - decay_steps
1155+
1156+
if stable_steps > 0:
1157+
stable_schedule = optax.constant_schedule(lr)
1158+
pieces.append(stable_schedule)
1159+
boundaries.append(warmup_steps + stable_steps)
1160+
if decay_steps > 0:
1161+
# Create decay schedule based on wsd_decay_style
1162+
if config.wsd_decay_style == types.WsdDecayStyle.LINEAR:
1163+
decay_schedule = optax.linear_schedule(init_value=lr, end_value=final_lr, transition_steps=decay_steps - 1)
1164+
else: # COSINE
1165+
decay_schedule = make_cos_schedule(lr, final_lr, decay_steps)
1166+
pieces.append(decay_schedule)
1167+
boundaries.append(warmup_steps + stable_steps + decay_steps)
11401168

11411169
if constant_zero_steps > 0:
1170+
constant_schedule = optax.constant_schedule(0.0)
11421171
pieces.append(constant_schedule)
1143-
boundaries.append(warmup_steps + cos_steps + constant_zero_steps)
1172+
boundaries.append(config.learning_rate_schedule_steps)
11441173

11451174
return optax.join_schedules(pieces, boundaries)
11461175

tests/maxtext_utils_test.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -682,5 +682,113 @@ def test_bytes_from_pytree_empty_dict(self):
682682
self.assertEqual(max_utils.calculate_bytes_from_pytree({}), 0)
683683

684684

685+
class TestLearningRateSchedules(unittest.TestCase):
686+
"""Test suite for learning rate schedule functions."""
687+
688+
def test_cosine_schedule(self):
689+
"""Tests cosine learning rate schedule."""
690+
learning_rate = 1e-3
691+
learning_rate_schedule_steps = 1000
692+
steps = 1200
693+
warmup_steps_fraction = 0.1
694+
learning_rate_final_fraction = 0.1
695+
696+
warmup_steps = int(learning_rate_schedule_steps * warmup_steps_fraction)
697+
698+
config = pyconfig.initialize(
699+
[None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
700+
enable_checkpointing=False,
701+
learning_rate=learning_rate,
702+
learning_rate_schedule_steps=learning_rate_schedule_steps,
703+
steps=steps,
704+
warmup_steps_fraction=warmup_steps_fraction,
705+
lr_schedule_type="cosine",
706+
learning_rate_final_fraction=learning_rate_final_fraction,
707+
)
708+
709+
schedule_fn = maxtext_utils.create_learning_rate_schedule(config)
710+
711+
# Warmup phase: 0 -> peak
712+
self.assertAlmostEqual(float(schedule_fn(0)), 0.0, places=6)
713+
self.assertAlmostEqual(float(schedule_fn(warmup_steps)), learning_rate, places=6)
714+
715+
# Cosine decay phase
716+
lr_end = schedule_fn(learning_rate_schedule_steps - 1)
717+
expected_final = learning_rate * learning_rate_final_fraction
718+
self.assertLess(float(lr_end), learning_rate)
719+
self.assertAlmostEqual(float(lr_end), expected_final, places=6)
720+
721+
# Zero phase
722+
self.assertAlmostEqual(float(schedule_fn(steps - 1)), 0.0, places=6)
723+
724+
def test_wsd_schedule(self):
725+
"""Tests WSD learning rate schedule with both linear and cosine decay styles."""
726+
learning_rate = 1e-3
727+
learning_rate_schedule_steps = 1000
728+
steps = 1200
729+
warmup_steps_fraction = 0.1
730+
learning_rate_final_fraction = 0.1
731+
wsd_decay_steps_fraction = 0.1
732+
733+
warmup_steps = int(learning_rate_schedule_steps * warmup_steps_fraction)
734+
decay_steps = int(learning_rate_schedule_steps * wsd_decay_steps_fraction)
735+
stable_steps = learning_rate_schedule_steps - warmup_steps - decay_steps
736+
decay_start = warmup_steps + stable_steps
737+
738+
# Test both decay styles: linear and cosine
739+
for decay_style in ["linear", "cosine"]:
740+
config = pyconfig.initialize(
741+
[None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
742+
enable_checkpointing=False,
743+
learning_rate=learning_rate,
744+
learning_rate_schedule_steps=learning_rate_schedule_steps,
745+
steps=steps,
746+
warmup_steps_fraction=warmup_steps_fraction,
747+
lr_schedule_type="wsd",
748+
learning_rate_final_fraction=learning_rate_final_fraction,
749+
wsd_decay_steps_fraction=wsd_decay_steps_fraction,
750+
wsd_decay_style=decay_style,
751+
)
752+
schedule_fn = maxtext_utils.create_learning_rate_schedule(config)
753+
754+
# Warmup phase: 0 -> peak
755+
self.assertAlmostEqual(float(schedule_fn(0)), 0.0, places=6)
756+
self.assertAlmostEqual(float(schedule_fn(warmup_steps)), learning_rate, places=6)
757+
758+
# Stable phase: constant at peak
759+
self.assertAlmostEqual(float(schedule_fn(warmup_steps + 10)), learning_rate, places=6)
760+
self.assertAlmostEqual(float(schedule_fn(warmup_steps + stable_steps // 2)), learning_rate, places=6)
761+
self.assertAlmostEqual(float(schedule_fn(decay_start - 1)), learning_rate, places=6)
762+
763+
# Decay phase: peak -> final
764+
lr_mid_decay = schedule_fn(decay_start + decay_steps // 2)
765+
expected_final = learning_rate * learning_rate_final_fraction
766+
self.assertLess(float(lr_mid_decay), learning_rate)
767+
self.assertGreater(float(lr_mid_decay), expected_final)
768+
769+
# End of decay phase: should reach expected_final
770+
lr_end = schedule_fn(learning_rate_schedule_steps - 1)
771+
self.assertAlmostEqual(float(lr_end), expected_final, places=6)
772+
773+
# Zero phase
774+
self.assertAlmostEqual(float(schedule_fn(steps - 1)), 0.0, places=6)
775+
776+
# Test invalid fractions - should raise during config initialization
777+
with self.assertRaises(ValueError) as cm:
778+
pyconfig.initialize(
779+
[None, os.path.join(MAXTEXT_PKG_DIR, "configs", "base.yml")],
780+
enable_checkpointing=False,
781+
learning_rate=learning_rate,
782+
learning_rate_schedule_steps=learning_rate_schedule_steps,
783+
steps=steps,
784+
warmup_steps_fraction=0.6,
785+
lr_schedule_type="wsd",
786+
learning_rate_final_fraction=learning_rate_final_fraction,
787+
wsd_decay_steps_fraction=0.5, # Sum > 1.0
788+
)
789+
self.assertIn("warmup_steps_fraction", str(cm.exception))
790+
self.assertIn("wsd_decay_steps_fraction", str(cm.exception))
791+
792+
685793
if __name__ == "__main__":
686794
unittest.main()

0 commit comments

Comments
 (0)