vllm-project
diff --git a/examples/train/peagle_qwen3_8b_sharegpt_online_5k.sh b/examples/train/peagle_qwen3_8b_sharegpt_online_5k.sh
Lines changed: 112 additions & 0 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 2 additions & 2 deletions
diff --git a/scripts/train.py b/scripts/train.py
Lines changed: 24 additions & 4 deletions
diff --git a/src/speculators/models/__init__.py b/src/speculators/models/__init__.py
Lines changed: 3 additions & 0 deletions
diff --git a/src/speculators/models/dflash/core.py b/src/speculators/models/dflash/core.py
Lines changed: 2 additions & 12 deletions
diff --git a/src/speculators/models/dflash/metrics.py b/src/speculators/models/dflash/metrics.py
Lines changed: 9 additions & 6 deletions
diff --git a/src/speculators/models/eagle3/core.py b/src/speculators/models/eagle3/core.py
Lines changed: 3 additions & 2 deletions
diff --git a/src/speculators/models/eagle3/metrics.py b/src/speculators/models/eagle3/metrics.py
Lines changed: 7 additions & 4 deletions
diff --git a/src/speculators/models/metrics.py b/src/speculators/models/metrics.py
Lines changed: 15 additions & 21 deletions
diff --git a/src/speculators/models/peagle/__init__.py b/src/speculators/models/peagle/__init__.py
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,112 @@
+#!/bin/bash
+# Online P-EAGLE Training Script
+#
+# Runs the full online P-EAGLE training pipeline: data preparation, vLLM server launch,
+# and training (with hidden states generated on-the-fly from the live server).
+#
+# Usage: Copy this script, modify the configuration variables below, then run:
+#   bash examples/train/peagle_qwen3_8b_sharegpt_online_5k.sh
+
+### Example E2E run for P-EAGLE Qwen3-8B on 5k samples from ShareGPT ###
+
+# P-EAGLE (Parallel EAGLE) extends EAGLE-3 with parallel multi-token prediction using
+# Conditional-On-Distribution (COD) sampling for memory-efficient training.
+
+# Note: With just 5k samples, the model performance will not be very good. However,
+# there are enough samples to verify that the pipeline is working correctly and that
+# the model is learning something. This is a good sanity check when creating a drafter
+# for a new target model.
+
+# Timing (on 4x NVIDIA H100 80GB GPUs, DP=2)
+# Data Preprocessing: 26 seconds
+# vLLM Server Startup: 82 seconds (1 min 22 secs)
+# Training (5 epochs): 2793 seconds (46 mins 33 secs)
+# Total: 2901 seconds (48 mins 21 secs)
+
+# Results on SpecBench (80 prompts, 256 output tokens):
+# acceptance rate: 13.35%
+# acceptance length: 1.53
+# per-position acceptance:
+#   position 0: 40.84%
+#   position 1: 10.84%
+#   position 2: 1.58%
+#   position 3: 0.15%
+
+set -euo pipefail
+
+# ============ Configuration ============
+MODEL="Qwen/Qwen3-8B"
+DATASET="sharegpt"                # sharegpt, ultrachat, or path to custom data
+OUTPUT_DIR="./output/peagle_qwen3_8b_sharegpt"
+VLLM_PORT=8108
+MAX_SAMPLES=5000
+SEQ_LENGTH=4096
+EPOCHS=5
+LR=6e-4
+
+# P-EAGLE-specific parameters
+SPECULATOR_TYPE="peagle"
+NUM_LAYERS=4
+NUM_DEPTHS=4
+DOWN_SAMPLE_RATIO=0.7
+DOWN_SAMPLE_RATIO_MIN=0.2
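+# GPU assignments (online training needs separate GPUs for vLLM and training).
+# These are host device indices, so the defaults below require devices 2-5 to exist.
+# The vLLM launch in Step 2 hard-codes --data-parallel-size 2; if you change the number
+# of VLLM_GPUS, update that flag as well.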
+VLLM_GPUS="2,3"
+TRAIN_GPUS="4,5"
+NUM_TRAIN_GPUS=2
+# =======================================
+
+# Step 1: Prepare data
+echo "=== Step 1: Preparing data ==="
+python scripts/prepare_data.py \
+    --model "$MODEL" \
+    --data "$DATASET" \
+    --output "$OUTPUT_DIR" \
+    --max-samples "$MAX_SAMPLES" \
+    --seq-length "$SEQ_LENGTH"
+
+# Step 2: Launch vLLM server in the background
+echo "=== Step 2: Launching vLLM server ==="
+CUDA_VISIBLE_DEVICES="$VLLM_GPUS" python scripts/launch_vllm.py "$MODEL" \
+    --hidden-states-path "$OUTPUT_DIR/hidden_states" \
+    -- --data-parallel-size 2 --port "$VLLM_PORT" &
+VLLM_PID=$!
+
+# Ensure vLLM is cleaned up on exit
+cleanup() {
+    echo "Stopping vLLM server..."
+    kill "$VLLM_PID" 2>/dev/null || true
+    wait "$VLLM_PID" 2>/dev/null || true
+}
+trap cleanup EXIT
+
+echo "Waiting for vLLM server to be ready..."
+until curl -sf "http://localhost:${VLLM_PORT}/health" > /dev/null 2>&1; do
+    sleep 2
+done
+echo "vLLM server ready."
+
+# Step 3: Train against the live vLLM server
+echo "=== Step 3: Training ==="
+CUDA_VISIBLE_DEVICES="$TRAIN_GPUS" torchrun \
+    --standalone --nproc_per_node "$NUM_TRAIN_GPUS" \
+    scripts/train.py \
+    --verifier-name-or-path "$MODEL" \
+    --data-path "$OUTPUT_DIR" \
+    --vllm-endpoint "http://localhost:${VLLM_PORT}/v1" \
+    --hidden-states-path "$OUTPUT_DIR/hidden_states" \
+    --save-path "$OUTPUT_DIR/checkpoints" \
+    --epochs "$EPOCHS" \
+    --lr "$LR" \
+    --total-seq-len "$SEQ_LENGTH" \
+    --speculator-type "$SPECULATOR_TYPE" \
+    --num-layers "$NUM_LAYERS" \
+    --num-depths "$NUM_DEPTHS" \
+    --down-sample-ratio "$DOWN_SAMPLE_RATIO" \
+    --down-sample-ratio-min "$DOWN_SAMPLE_RATIO_MIN" \
+    --no-norm-before-residual \
+    --scheduler-type cosine \
+    --on-missing generate \
+    --on-generate delete
+
+echo "Done. Checkpoints saved to $OUTPUT_DIR/checkpoints/"
@@ -143,7 +143,7 @@ ignore_missing_imports=true
 [tool.ruff]
 line-length = 88
 indent-width = 4
-exclude = ["build", "dist", "env", ".venv"]
+exclude = ["build", "dist", "env", ".venv", "output"]
 
 [tool.ruff.format]
 quote-style = "double"
@@ -249,13 +249,13 @@ select = [
     "INP001", # allow implicit namespace packages in scripts
     "PTH", # os.path is acceptable in scripts
     "T201", # print statements are acceptable in scripts
+    "SLF001", # allow private member access for model configuration
 ]
 
 "examples/**/*.py" = [
     "INP001", # allow implicit namespace packages in examples
 ]
 
-
 [tool.ruff.lint.isort]
 known-first-party = ["speculators", "tests"]
 
 
@@ -269,22 +269,22 @@ def main(args: argparse.Namespace):
         )
 
     model_class = registry[args.speculator_type]
+
     if args.from_pretrained:
         draft_model = model_class.from_pretrained(
             args.from_pretrained, t2d=t2d, d2t=d2t
         )
     else:
-        args_dict = vars(args)
-        args_dict["draft_vocab_size"] = draft_vocab_size
+        args.draft_vocab_size = draft_vocab_size
         draft_model = model_class.from_training_args(
             verifier_config=transformer_layer_config,
             t2d=t2d,
             d2t=d2t,
-            **args_dict,
+            **vars(args),
         )
 
     # Setup dataloaders
-    preprocess = shift_batch if args.speculator_type == "eagle3" else None
+    preprocess = shift_batch if args.speculator_type in ("eagle3", "peagle") else None
 
     noise_transform = AddUniformNoise(std=args.noise_std)
     if args.legacy_data:
@@ -608,6 +608,7 @@ def parse_args():
         help="Use RMSNorm before fc in Eagle3 draft path "
         "(e.g. for gpt-oss). Omit for other models.",
     )
+    # D-Flash specific parameters
     parser.add_argument(
         "--block-size",
         type=int,
@@ -620,6 +621,25 @@ def parse_args():
         default=256,
         help="Maximum anchor positions for DFlash training (default: 256)",
     )
+    # P-EAGLE specific parameters
+    parser.add_argument(
+        "--num-depths",
+        type=int,
+        default=8,
+        help="Number of parallel prediction depths for P-EAGLE (default: 8)",
+    )
+    parser.add_argument(
+        "--down-sample-ratio",
+        type=float,
+        default=0.7,
+        help="Geometric decay ratio for COD sampling in P-EAGLE (default: 0.7)",
+    )
+    parser.add_argument(
+        "--down-sample-ratio-min",
+        type=float,
+        default=0.2,
+        help="Minimum retention ratio for COD sampling in P-EAGLE (default: 0.2)",
+    )
     # Dataloader parameters
     parser.add_argument(
         "--num-workers", type=int, default=12, help="Number of dataloader workers"
 
@@ -2,10 +2,13 @@
 
 from .dflash import DFlashDraftModel, DFlashSpeculatorConfig
 from .eagle3 import Eagle3DraftModel, Eagle3SpeculatorConfig
+from .peagle import PEagleDraftModel, PEagleSpeculatorConfig
 
 __all__ = [
     "DFlashDraftModel",
     "DFlashSpeculatorConfig",
     "Eagle3DraftModel",
     "Eagle3SpeculatorConfig",
+    "PEagleDraftModel",
+    "PEagleSpeculatorConfig",
 ]
@@ -106,10 +106,8 @@ def from_training_args(
             verifier_config: Verifier model configuration. This should be a config
                 with num_hidden_layers set to the number of DRAFT layers (created
                 by create_transformer_layer_config in train.py).
-            t2d: Target-to-draft vocabulary mapping tensor (optional, creates
-                identity mapping if None)
-            d2t: Draft-to-target vocabulary mapping tensor (optional, creates
-                identity mapping if None)
+            t2d: Target-to-draft vocabulary mapping tensor (optional)
+            d2t: Draft-to-target vocabulary mapping tensor (optional)
             **kwargs: Training arguments with DFlash-specific params
                 - draft_vocab_size: Size of draft vocabulary
                 - block_size: Block size for draft predictions (default: 8)
@@ -158,14 +156,6 @@ def from_training_args(
             ),
         )
 
-        # Create identity mappings if t2d/d2t not provided (no vocab reduction)
-        if t2d is None or d2t is None:
-            vocab_size = kwargs["draft_vocab_size"]
-            # t2d: all tokens in target vocab are in draft vocab
-            t2d = torch.ones(vocab_size, dtype=torch.bool)
-            # d2t: identity mapping (zero offset for all tokens)
-            d2t = torch.zeros(vocab_size, dtype=torch.long)
-
         model = cls(config=config)
         model.load_vocab_mappings(t2d, d2t)
         model.load_verifier_weights()
 
@@ -51,15 +51,18 @@ def compute_metrics(
     pred_ids = torch.argmax(logits, dim=-1)
     target_ids = torch.argmax(targets, dim=-1)
 
-    full_acc, per_position_acc = compute_accuracy_multi_step(
+    correct_per_pos, total_per_pos = compute_accuracy_multi_step(
         pred_ids, target_ids, loss_mask, pos_idx, block_size
     )
 
     metrics: dict[str, Any] = {}
-    metrics["loss"] = loss.detach().clone()
-    metrics["full_acc"] = full_acc
+    metrics["loss_sum"] = loss.detach().clone()
+    metrics["loss_total"] = torch.tensor(1.0, device=logits.device)
+    # Position 0 is the anchor — intentionally excluded from accuracy
+    metrics["full_acc_sum"] = correct_per_pos[1:].sum()
+    metrics["full_acc_total"] = total_per_pos[1:].sum()
 
-    # Intentionally drop position 0
-    for pos in range(1, len(per_position_acc)):
-        metrics[f"position {pos} acc"] = per_position_acc[pos]
+    for pos in range(1, block_size):
+        metrics[f"position_{pos}_acc_sum"] = correct_per_pos[pos]
+        metrics[f"position_{pos}_acc_total"] = total_per_pos[pos]
     return loss, metrics
@@ -28,7 +28,7 @@ def conditional_torch_compile(func):
 @SpeculatorModel.register("eagle3")
 class Eagle3DraftModel(DraftVocabMixin, SpeculatorModel):
     config_class: ClassVar[type[Eagle3SpeculatorConfig]] = Eagle3SpeculatorConfig  # type: ignore[misc]
-    _keys_to_ignore_on_load_missing: ClassVar[list[str]] = [  # type: ignore[misc,assignment]
+    _keys_to_ignore_on_load_missing: ClassVar[list[str]] = [  # type: ignore[misc]
         "embed_tokens.weight",
         "verifier_norm.weight",
         "verifier_lm_head.weight",
@@ -255,7 +255,8 @@ def forward(  # noqa: C901
             # shape: [1, total_seq_len]
 
         if return_loss:
-            metrics["loss"] = loss.detach().clone()
+            metrics["loss_sum"] = loss.detach().clone()
+            metrics["loss_total"] = torch.tensor(1.0, device=device)
             return draft_tokens, loss, metrics
         else:
             return draft_tokens
 
@@ -95,13 +95,16 @@ def compute_metrics(
     pred_ids = torch.argmax(s_logits, dim=-1)
     target_ids = torch.argmax(s_targets, dim=-1)
 
-    s_full_acc, s_cond_acc = compute_accuracy_single_step(
+    full_correct, full_total, cond_correct, cond_total = compute_accuracy_single_step(
         pred_ids, target_ids, s_loss_mask, s_prev_correct
     )
 
     s_metrics = {}
-    s_metrics[f"loss_{ttt_step}"] = s_loss.detach().clone()
-    s_metrics[f"full_acc_{ttt_step}"] = s_full_acc
-    s_metrics[f"cond_acc_{ttt_step}"] = s_cond_acc
+    s_metrics[f"loss_{ttt_step}_sum"] = s_loss.detach().clone()
+    s_metrics[f"loss_{ttt_step}_total"] = torch.tensor(1.0, device=s_loss.device)
+    s_metrics[f"full_acc_{ttt_step}_sum"] = full_correct
+    s_metrics[f"full_acc_{ttt_step}_total"] = full_total
+    s_metrics[f"cond_acc_{ttt_step}_sum"] = cond_correct
+    s_metrics[f"cond_acc_{ttt_step}_total"] = cond_total
 
     return s_loss, s_metrics
@@ -11,7 +11,7 @@ def compute_accuracy_single_step(
     loss_mask: torch.Tensor | None,  # shape: [1, seq_len]
     prev_correct: torch.Tensor | None,  # shape: [1, seq_len]
 ):
-    """Compute full and conditional accuracy for a single speculative step.
+    """Compute full and conditional accuracy counts for a single speculative step.
 
     Args:
         pred_ids: Predicted token IDs.
@@ -21,22 +21,21 @@ def compute_accuracy_single_step(
             via logical AND with the current step's correctness.
 
     Returns:
-        Tuple of (full_accuracy, conditional_accuracy) where conditional accuracy
-        is accuracy given all previous steps were also correct.
+        Tuple of (full_correct, full_total, cond_correct, cond_total) as raw
+        counts suitable for distributed reduction before computing ratios.
     """
     correct = pred_ids == target_ids
-    cond_denom: torch.Tensor | int = correct.numel()
+    cond_total = torch.tensor(correct.numel(), dtype=torch.float, device=correct.device)
     if prev_correct is not None:
-        cond_denom = prev_correct.sum()
-        # Update prev_correct in place
+        cond_total = prev_correct.sum().float()
         correct = torch.logical_and(prev_correct, correct, out=prev_correct)
     if loss_mask is not None:
         correct = torch.masked_select(correct, loss_mask.to(torch.bool))
 
     correct_sum = correct.float().sum()
-    full_denom = correct.numel()
+    full_total = torch.tensor(correct.numel(), dtype=torch.float, device=correct.device)
 
-    return correct_sum / (full_denom + _EPS), correct_sum / (cond_denom + _EPS)
+    return correct_sum, full_total, correct_sum, cond_total
 
 
 @torch.no_grad()
@@ -47,7 +46,7 @@ def compute_accuracy_multi_step(
     pos_idx: torch.Tensor,  # shape: [1, seq_len]
     num_pos: int,
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    """Compute overall and per-position accuracy across multiple speculative steps.
+    """Compute per-position correct/total counts across multiple speculative steps.
 
     Args:
         pred_ids: Predicted token IDs.
@@ -57,24 +56,19 @@ def compute_accuracy_multi_step(
         num_pos: Number of distinct positions (i.e. block size).
 
     Returns:
-        Tuple of (overall_accuracy, per_position_accuracy) where per_position_accuracy
-        has shape [num_pos].
+        Tuple of (correct_per_pos, total_per_pos) both with shape [num_pos].
+        Overall counts can be derived by summing these.
     """
     correct = pred_ids == target_ids
     correct = torch.masked_select(correct, loss_mask.to(torch.bool))
     pos_idx = torch.masked_select(pos_idx, loss_mask.to(torch.bool))
 
-    correct_sum = correct.float().sum()
-    full_denom = correct.numel()
-    overall_acc = correct_sum / (full_denom + _EPS)
-
-    sums = torch.zeros(num_pos, dtype=torch.long, device=correct.device)
-    counts = torch.zeros(num_pos, dtype=torch.long, device=correct.device)
-    sums.scatter_add_(0, pos_idx, correct.long())
-    counts.scatter_add_(0, pos_idx, torch.ones_like(correct, dtype=torch.long))
-    per_pos_idx_acc = sums.float() / (counts.float() + _EPS)
+    correct_per_pos = torch.zeros(num_pos, dtype=torch.float, device=correct.device)
+    total_per_pos = torch.zeros(num_pos, dtype=torch.float, device=correct.device)
+    correct_per_pos.scatter_add_(0, pos_idx, correct.float())
+    total_per_pos.scatter_add_(0, pos_idx, torch.ones_like(correct, dtype=torch.float))
 
-    return overall_acc, per_pos_idx_acc  # shape: [], [block_size]
+    return correct_per_pos, total_per_pos  # shape: [num_pos], [num_pos]
 
 
 def kl_div_loss(
 
@@ -0,0 +1,7 @@
+from speculators.models.peagle.config import PEagleSpeculatorConfig
+from speculators.models.peagle.core import PEagleDraftModel
+
+__all__ = [
+    "PEagleDraftModel",
+    "PEagleSpeculatorConfig",
+]