Add LoRA co-training support for HF EAGLE speculative decoding (#1060)
### What does this PR do?
Type of change: New feature + bug fixes
Adds **LoRA co-training** support for HF EAGLE speculative decoding.
When `eagle_base_lora=True`, HF PEFT LoRA adapters are injected into the
base model and co-trained alongside the EAGLE draft module in a single
online training pass. A preservation loss (KL divergence between the
original frozen base model output and the LoRA-adapted output) prevents
base model drift. LoRA adapter weights are exported in standard peft
format alongside EAGLE draft artifacts.
### Key features
- **LoRA injection**: `peft.inject_adapter_in_model` applied in-place
(no wrapper), keeping the existing `HFEagleModel` structure intact.
- **Preservation loss**: Cross-entropy `H(ref, lora)` — equivalent
gradient to `KL(ref || lora)` since `H(ref)` is constant w.r.t. LoRA
params.
- **Warmup schedule**: `eagle_base_lora_warmup_steps` freezes LoRA for N
steps while the EAGLE head stabilizes, then enables co-training via a
`LoRAWarmupCallback`.
- **Logits detach regularization**: `eagle_base_lora_logits_detach_prob`
stochastically detaches base logits from the EAGLE loss path, preventing
LoRA from degenerating to maximize EAGLE accuracy at the cost of base
model quality.
- **Export**: Standard peft format (`adapter_model.safetensors` +
`adapter_config.json`) alongside EAGLE draft model.
- **Merge script**: `scripts/merge_lora.py` merges LoRA weights into the
base model and restores the original `config.json` (avoids transformers
5.x rewriting `rope_theta` → `rope_parameters` which breaks
vLLM/TRT-LLM).
- **Multinode fix**: `dp_shard_size` now uses `WORLD_SIZE` instead of
local GPU count.
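The preservation-loss and logits-detach mechanics described above can be sketched in a few lines of PyTorch. This is an illustrative sketch, not the PR's actual implementation; `preservation_loss` and `maybe_detach_logits` are hypothetical names.

```python
import torch
import torch.nn.functional as F

def preservation_loss(ref_logits: torch.Tensor, lora_logits: torch.Tensor) -> torch.Tensor:
    """Cross-entropy H(ref, lora). Its gradient w.r.t. the LoRA params equals
    that of KL(ref || lora), since the entropy term H(ref) is constant."""
    ref_probs = F.softmax(ref_logits.detach(), dim=-1)  # frozen reference: no grad
    log_probs = F.log_softmax(lora_logits, dim=-1)      # LoRA-adapted output
    return -(ref_probs * log_probs).sum(dim=-1).mean()

def maybe_detach_logits(logits: torch.Tensor, detach_prob: float) -> torch.Tensor:
    """Stochastically cut the EAGLE-loss gradient path into the base model
    (0 = never detach, 1 = always detach)."""
    if torch.rand(()) < detach_prob:
        return logits.detach()
    return logits
```

With `detach_prob` near 1, LoRA is trained almost entirely by the preservation loss, which is why high values keep base-model quality while the EAGLE loss still occasionally shapes the adapters.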
### Config options
```python
mtsp.convert(model, mode=[("eagle", {
"eagle_base_lora": True, # enable LoRA co-training
"eagle_base_lora_rank": 64, # LoRA rank
"eagle_base_lora_alpha": 16.0, # LoRA scaling
"eagle_base_lora_target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
"eagle_base_lora_preservation_loss_weight": 0.1, # preservation loss weight
"eagle_base_lora_warmup_steps": 0, # freeze LoRA for N steps
"eagle_base_lora_logits_detach_prob": 0.5, # detach prob (0=never, 1=always)
})])
```
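The `eagle_base_lora_warmup_steps` behavior can be approximated with a small scheduler. The class below is a simplified stand-in for the PR's `LoRAWarmupCallback`, written without the `transformers` callback machinery; the class and method names here are illustrative assumptions.

```python
import torch

class LoRAWarmupSchedule:
    """Freeze all `lora_*` params for `warmup_steps` steps while the EAGLE
    head stabilizes, then enable LoRA co-training."""

    def __init__(self, model: torch.nn.Module, warmup_steps: int):
        self.warmup_steps = warmup_steps
        self.enabled = False
        self.lora_params = [p for n, p in model.named_parameters() if "lora_" in n]
        for p in self.lora_params:  # frozen during warmup
            p.requires_grad_(False)
        if warmup_steps == 0:
            self._enable()

    def _enable(self):
        for p in self.lora_params:
            p.requires_grad_(True)
        self.enabled = True

    def on_step(self, global_step: int):
        # Called once per optimizer step by the training loop.
        if not self.enabled and global_step >= self.warmup_steps:
            self._enable()
```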
### Experimental results (Qwen3-8B, checkpoint-60000)
Base model quality is preserved across the `detach_prob` sweep (lm_eval: IFEval,
ARC-C, Winogrande; results pending final collection).
**Acceptance rate** (mt_bench, draft_length=3, output_length=4096,
temperature=0):
| detach_prob | vLLM AR | TRT-LLM AR |
|---|---|---|
| baseline (no LoRA) | 2.14 | 2.15 |
| 0.5 | 1.45 | 1.44 |
| 0.8 | **3.06** | **3.01** |
| 0.85 | 2.90 | 2.90 |
| 0.9 | 2.76 | 2.77 |
| 0.95 | 2.51 | 2.58 |
| 0.99 | 2.37 | 2.37 |
| 0.999 | 2.30 | 2.27 |
| 0.9999 | 2.31 | 2.26 |
Best AR at `detach_prob=0.8`: ~40% improvement over baseline.
### Testing
`tests/unit/torch/speculative/plugins/test_hf_speculative_lora.py` (5
tests):
- `test_lora_layers_injected` — LoRA layers present after conversion
- `test_trainable_params` — only `lora_*` and `eagle_module` params are
trainable
- `test_forward_returns_loss` — forward returns non-zero scalar loss
- `test_eagle_offline_incompatible` — `eagle_base_lora=True` +
`eagle_offline=True` raises `ValueError`
- `test_export_lora_artifacts` — export produces standard peft adapter
files
### Bug fixes (included in this PR)
1. **`launch_train.sh` case pattern ordering**: the broad glob
`--eagle_base_lora*` appeared before the more specific patterns
(`--eagle_base_lora_rank*`, etc.), so it matched first and silently
swallowed the LoRA-specific args.
2. **LoRA optimizer exclusion during warmup**: because LoRA params were
frozen when the optimizer was built, warmup freezing excluded them from
the optimizer entirely; fixed by registering them with `add_param_group`
in the callback once warmup ends.
3. **`merge_lora.py` config.json**: `save_pretrained()` with
transformers >=5.x rewrites `rope_theta` → `rope_parameters`, breaking
vLLM positional embeddings. Fixed by copying the original base model
config.
4. **Multinode `dp_shard_size`**: used local GPU count instead of
`WORLD_SIZE`.
### Checklist
- [x] Backward compatible (all new config fields have defaults)
- [x] Uses `peft` via lazy imports (no hard dependency)
- [x] Unit tests added
- [x] Online HF training only (`eagle_offline=True` blocked)
---------
Signed-off-by: Ye Yu <yeyu@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>