Add LoRA LR multiplier and detach base logits in EAGLE loss

yeyu-nvidia · claude · yeyu-nvidia · commit ddddb81ebe44 · 2026-03-19T17:11:15.000-07:00
Detach base_outputs.logits when used as soft labels in the EAGLE loss so
gradients do not flow back to LoRA through the label path (which causes
circular collapse). LoRA still receives EAGLE gradients via the hidden-
state path (out_hiddens -> eagle_input_hiddens).

Add eagle_base_lora_lr_multiplier (default 10x) to compensate for the
weaker hidden-state gradient signal: LoRA parameters are split into a
separate optimizer param group with lr = base_lr * multiplier.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu &lt;yeyu@nvidia.com&gt;
diff --git a/examples/speculative_decoding/eagle_utils.py b/examples/speculative_decoding/eagle_utils.py
@@ -170,6 +170,38 @@ def make_eagle_supervised_data_module(
 class EagleTrainerWithAccLog(Trainer):
     """Wrapper around Trainer that logs training accuracy."""
 
+    def __init__(self, *args, lora_lr_multiplier: float = 1.0, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.lora_lr_multiplier = lora_lr_multiplier
+
+    def create_optimizer(self):
+        """Override to give LoRA parameters a higher learning rate."""
+        super().create_optimizer()
+        if self.lora_lr_multiplier != 1.0:
+            lora_ids = {
+                id(p)
+                for n, p in self.model.named_parameters()
+                if "lora_" in n and p.requires_grad
+            }
+            if lora_ids:
+                new_groups = []
+                for group in self.optimizer.param_groups:
+                    lora = [p for p in group["params"] if id(p) in lora_ids]
+                    others = [p for p in group["params"] if id(p) not in lora_ids]
+                    if lora and others:
+                        new_groups.append({**group, "params": others})
+                        new_groups.append(
+                            {**group, "params": lora, "lr": group["lr"] * self.lora_lr_multiplier}
+                        )
+                    elif lora:
+                        new_groups.append(
+                            {**group, "lr": group["lr"] * self.lora_lr_multiplier}
+                        )
+                    else:
+                        new_groups.append(group)
+                self.optimizer.param_groups = new_groups
+        return self.optimizer
+
     def compute_loss(self, *args, **kwargs):
         """Override compute_loss to save train accs in trainer state."""
         if not hasattr(self.state, "training_accs"):
diff --git a/examples/speculative_decoding/launch_train.sh b/examples/speculative_decoding/launch_train.sh
@@ -134,6 +134,10 @@ while [ $# -gt 0 ]; do
       if [[ "$1" != *=* ]]; then shift; fi
       EAGLE_BASE_LORA_PRESERVATION_LOSS_WEIGHT="${1#*=}"
       ;;
+    --eagle_base_lora_lr_multiplier*)
+      if [[ "$1" != *=* ]]; then shift; fi
+      EAGLE_BASE_LORA_LR_MULTIPLIER="${1#*=}"
+      ;;
     --eagle_base_lora*)
       if [[ "$1" != *=* ]]; then shift; fi
       EAGLE_BASE_LORA="${1#*=}"
@@ -184,6 +188,7 @@ EAGLE_BASE_LORA_RANK=${EAGLE_BASE_LORA_RANK:-64}
 EAGLE_BASE_LORA_ALPHA=${EAGLE_BASE_LORA_ALPHA:-16.0}
 EAGLE_BASE_LORA_TARGET_MODULES=${EAGLE_BASE_LORA_TARGET_MODULES:-""}
 EAGLE_BASE_LORA_PRESERVATION_LOSS_WEIGHT=${EAGLE_BASE_LORA_PRESERVATION_LOSS_WEIGHT:-1.0}
+EAGLE_BASE_LORA_LR_MULTIPLIER=${EAGLE_BASE_LORA_LR_MULTIPLIER:-10.0}
 
 
 if [[ "$MODE" == "eagle3" ]]; then
@@ -219,7 +224,8 @@ if [[ "$EAGLE_BASE_LORA" == "True" ]]; then
   LORA_ARGS="--eagle_base_lora True \
              --eagle_base_lora_rank $EAGLE_BASE_LORA_RANK \
              --eagle_base_lora_alpha $EAGLE_BASE_LORA_ALPHA \
-             --eagle_base_lora_preservation_loss_weight $EAGLE_BASE_LORA_PRESERVATION_LOSS_WEIGHT"
+             --eagle_base_lora_preservation_loss_weight $EAGLE_BASE_LORA_PRESERVATION_LOSS_WEIGHT \
+             --eagle_base_lora_lr_multiplier $EAGLE_BASE_LORA_LR_MULTIPLIER"
   if [[ "$EAGLE_BASE_LORA_TARGET_MODULES" != "" ]]; then
     LORA_ARGS="$LORA_ARGS --eagle_base_lora_target_modules $EAGLE_BASE_LORA_TARGET_MODULES"
   fi
diff --git a/examples/speculative_decoding/main.py b/examples/speculative_decoding/main.py
@@ -169,6 +169,14 @@ class EagleArguments:
             )
         },
     )
+    eagle_base_lora_lr_multiplier: float = field(
+        default=10.0,
+        metadata={
+            "help": (
+                "Learning rate multiplier for LoRA parameters relative to the base learning rate."
+            )
+        },
+    )
 
 
 def train():
@@ -285,6 +293,7 @@ def train():
         processing_class=tokenizer,
         args=training_args,
         callbacks=[EagleTrainingPlot(training_args.ar_validate_steps, training_args.estimate_ar)],
+        lora_lr_multiplier=eagle_args.eagle_base_lora_lr_multiplier,
         **data_module,
     )
 
diff --git a/modelopt/torch/speculative/plugins/transformers.py b/modelopt/torch/speculative/plugins/transformers.py
@@ -1012,7 +1012,10 @@ def forward(
                     # base model predict +1 tok, while eagle predict +2
                     # so we shift base model outputs compared to eagle outputs
                     # additionally, we mask the first n tok of eagle outputs at nth TTT step
-                    base_outputs.logits[:, 1 + i + ttt_step :],
+                    # Detach so the EAGLE loss treats base logits as fixed soft labels and does
+                    # not backprop into the base model through this path.  LoRA still receives
+                    # EAGLE gradients via the hidden-state path (out_hiddens -> eagle_input_hiddens).
+                    base_outputs.logits.detach()[:, 1 + i + ttt_step :],
                     eagle_logit[:, ttt_step : -(1 + i)],
                     loss_mask[:, 1 + ttt_step :] if i == 0 else loss_mask[:, 1 + ttt_step : -i],
                 )

Original file line number	Diff line number	Diff line change
`@@ -169,6 +169,14 @@ class EagleArguments:`
`169`	`169`	`)`
`170`	`170`	`},`
`171`	`171`	`)`
	`172`	`+ eagle_base_lora_lr_multiplier: float = field(`
	`173`	`+ default=10.0,`
	`174`	`+ metadata={`
	`175`	`+ "help": (`
	`176`	`+ "Learning rate multiplier for LoRA parameters relative to the base learning rate."`
	`177`	`+ )`
	`178`	`+ },`
	`179`	`+ )`
`172`	`180`
`173`	`181`
`174`	`182`	`def train():`
`@@ -285,6 +293,7 @@ def train():`
`285`	`293`	`processing_class=tokenizer,`
`286`	`294`	`args=training_args,`
`287`	`295`	`callbacks=[EagleTrainingPlot(training_args.ar_validate_steps, training_args.estimate_ar)],`
	`296`	`+ lora_lr_multiplier=eagle_args.eagle_base_lora_lr_multiplier,`
`288`	`297`	`**data_module,`
`289`	`298`	`)`
`290`	`299`