
Commit 9567c0a

pyc96tcligg
authored and committed
Add gemma4 data regen.
1 parent 155854b commit 9567c0a

8 files changed: 360 additions & 28 deletions


configs/gemma4-26b-a4b-eagle3.json

Lines changed: 32 additions & 0 deletions
{
  "architectures": [
    "LlamaForCausalLMEagle3"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2816,
  "initializer_range": 0.02,
  "intermediate_size": 2112,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 1,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": 512,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.50.0",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 262144,
  "draft_vocab_size": 262144,
  "target_model_type": "gemma4_text"
}
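
This draft config pairs a single Llama-style decoder layer with Gemma's tokenizer; the vocab fields are what tie it to the target. A minimal sanity-check sketch (plain json plus assertions; the relative path follows the file listing above, nothing project-specific is assumed):

import json

# Load the Eagle3 draft config added in this commit.
with open("configs/gemma4-26b-a4b-eagle3.json") as f:
    cfg = json.load(f)

# Eagle3 drafts with a single decoder layer and must share the target's
# vocabulary so the draft head scores the same token IDs the target emits.
assert cfg["num_hidden_layers"] == 1
assert cfg["draft_vocab_size"] == cfg["vocab_size"] == 262144

# head_dim is set explicitly (128) rather than derived, so do not assume
# hidden_size == num_attention_heads * head_dim here (2816 != 32 * 128).
print(cfg["hidden_size"], cfg["num_attention_heads"], cfg["head_dim"])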

examples/regen_gemma4_26b_data.sh

Lines changed: 174 additions & 0 deletions
#!/usr/bin/env bash
# Regenerate training data for Gemma4-26B Eagle3.
#
# This script:
#   1. Launches SGLang server(s) for Gemma4-26B on available GPUs.
#   2. Waits for the server(s) to become healthy.
#   3. Runs regenerate_train_data.py with thinking-ratio support.
#   4. Shuts down the server(s) on exit.
#
# Usage:
#   bash examples/regen_gemma4_26b_data.sh
#
# Environment variables (override defaults):
#   MODEL          - HuggingFace model ID (default: google/gemma-4-26b-a4b-it)
#   TP_SIZE        - Tensor-parallel size (default: 1)
#   NUM_SERVERS    - Number of server instances (default: 8)
#   BASE_PORT      - First server port (default: 30000)
#   CONCURRENCY    - Requests per server (default: 128)
#   MAX_TOKENS     - Max generation tokens (default: 2048)
#   TEMPERATURE    - Sampling temperature (default: 1)
#   THINKING_RATIO - Fraction with thinking (default: 0.7)
#   INPUT_FILE     - Input JSONL path (default: cache/dataset/ultrachat_train.jsonl)
#   OUTPUT_FILE    - Output JSONL path (default: outputs/dataset/ultrachat_regen_gemma4.jsonl)
#   NUM_SAMPLES    - Max samples to process (default: all)
#   PYTHON_BIN     - Python used to launch SGLang servers (default: python3)

set -euo pipefail

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
ROOT_DIR=$(dirname "$SCRIPT_DIR")

# ── Configurable defaults ────────────────────────────────────────────────────
MODEL="${MODEL:-google/gemma-4-26b-a4b-it}"
TP_SIZE="${TP_SIZE:-1}"
NUM_SERVERS="${NUM_SERVERS:-8}"
BASE_PORT="${BASE_PORT:-30000}"
CONCURRENCY="${CONCURRENCY:-128}"
MAX_TOKENS="${MAX_TOKENS:-2048}"
TEMPERATURE="${TEMPERATURE:-1}"
THINKING_RATIO="${THINKING_RATIO:-0.7}"
INPUT_FILE="${INPUT_FILE:-$ROOT_DIR/cache/dataset/ultrachat_train.jsonl}"
OUTPUT_FILE="${OUTPUT_FILE:-$ROOT_DIR/outputs/dataset/ultrachat_regen_gemma4.jsonl}"
NUM_SAMPLES="${NUM_SAMPLES:-}"
PYTHON_BIN="${PYTHON_BIN:-python3}"

# ── Derived ──────────────────────────────────────────────────────────────────
TOTAL_GPUS=$(( TP_SIZE * NUM_SERVERS ))
# `|| true` (not `|| echo 0`) so a missing nvidia-smi yields wc's "0" alone.
AVAIL_GPUS=$(nvidia-smi -L 2>/dev/null | wc -l || true)

if [ "$AVAIL_GPUS" -lt "$TOTAL_GPUS" ]; then
  echo "Error: Need ${TOTAL_GPUS} GPUs (${NUM_SERVERS} servers x TP ${TP_SIZE}) but only ${AVAIL_GPUS} available."
  exit 1
fi

echo "============================================================"
echo " Gemma4-26B Data Regeneration"
echo "============================================================"
echo " Model:          ${MODEL}"
echo " TP size:        ${TP_SIZE}"
echo " Servers:        ${NUM_SERVERS}"
echo " Ports:          ${BASE_PORT}..$(( BASE_PORT + (NUM_SERVERS - 1) * 10 ))"
echo " Concurrency:    ${CONCURRENCY} per server"
echo " Max tokens:     ${MAX_TOKENS}"
echo " Temperature:    ${TEMPERATURE}"
echo " Thinking ratio: ${THINKING_RATIO}"
echo " Input:          ${INPUT_FILE}"
echo " Output:         ${OUTPUT_FILE}"
echo "============================================================"

# ── Cleanup on exit ──────────────────────────────────────────────────────────
SERVER_PIDS=()

cleanup() {
  echo ""
  echo "Shutting down SGLang server(s)..."
  for pid in "${SERVER_PIDS[@]}"; do
    if kill -0 "$pid" 2>/dev/null; then
      kill "$pid" 2>/dev/null || true
    fi
  done
  # Wait briefly, then force-kill stragglers.
  sleep 2
  for pid in "${SERVER_PIDS[@]}"; do
    if kill -0 "$pid" 2>/dev/null; then
      kill -9 "$pid" 2>/dev/null || true
    fi
  done
  echo "All servers stopped."
}
trap cleanup EXIT

# ── Launch servers ───────────────────────────────────────────────────────────
SERVER_ADDRESSES=()

for i in $(seq 0 $(( NUM_SERVERS - 1 ))); do
  PORT=$(( BASE_PORT + i * 10 ))
  GPU_START=$(( i * TP_SIZE ))
  GPU_END=$(( GPU_START + TP_SIZE - 1 ))
  CUDA_DEVICES=$(seq -s, "$GPU_START" "$GPU_END")

  echo "Starting server $((i+1))/${NUM_SERVERS} on GPUs ${CUDA_DEVICES}, port ${PORT}..."

  CUDA_VISIBLE_DEVICES="${CUDA_DEVICES}" "${PYTHON_BIN}" -m sglang.launch_server \
    --model "${MODEL}" \
    --tp "${TP_SIZE}" \
    --port "${PORT}" \
    --host 0.0.0.0 \
    --cuda-graph-max-bs 128 \
    --trust-remote-code --enable-torch-compile \
    > "${ROOT_DIR}/cache/sglang_server_${PORT}.log" 2>&1 &

  SERVER_PIDS+=($!)
  SERVER_ADDRESSES+=("localhost:${PORT}")
done

# ── Wait for servers to be healthy ───────────────────────────────────────────
echo ""
echo "Waiting for servers to become healthy..."

wait_for_server() {
  local addr=$1
  local max_wait=600  # 10 minutes
  local elapsed=0
  while [ $elapsed -lt $max_wait ]; do
    if curl -sf "http://${addr}/health" > /dev/null 2>&1; then
      return 0
    fi
    sleep 5
    elapsed=$(( elapsed + 5 ))
  done
  return 1
}

for addr in "${SERVER_ADDRESSES[@]}"; do
  if wait_for_server "$addr"; then
    echo " ${addr} is healthy."
  else
    echo "Error: ${addr} did not become healthy within 10 minutes."
    echo "Check logs at: ${ROOT_DIR}/cache/sglang_server_*.log"
    exit 1
  fi
done

echo "All ${NUM_SERVERS} server(s) are ready."
echo "------------------------------------------------------------"

# ── Build regen command ──────────────────────────────────────────────────────
REGEN_ARGS=(
  python3 "${ROOT_DIR}/scripts/regenerate_train_data.py"
  --model "${MODEL}"
  --is-reasoning-model
  --thinking-ratio "${THINKING_RATIO}"
  --concurrency "${CONCURRENCY}"
  --max-tokens "${MAX_TOKENS}"
  --temperature "${TEMPERATURE}"
  --server-address "${SERVER_ADDRESSES[@]}"
  --input-file-path "${INPUT_FILE}"
  --output-file-path "${OUTPUT_FILE}"
  --resume
)

if [ -n "${NUM_SAMPLES}" ]; then
  REGEN_ARGS+=(--num-samples "${NUM_SAMPLES}")
fi

# ── Run regeneration ─────────────────────────────────────────────────────────
echo "Starting data regeneration..."
echo ""

mkdir -p "$(dirname "${OUTPUT_FILE}")"
"${REGEN_ARGS[@]}"

echo ""
echo "============================================================"
echo " Done! Output saved to: ${OUTPUT_FILE}"
echo "============================================================"

examples/run_gemma3_27b_eagle3_online.sh

Lines changed: 4 additions & 4 deletions
@@ -15,16 +15,16 @@ torchrun \
     --train-data-path $ROOT_DIR/cache/dataset/ultrachat_train.jsonl \
     --output-dir $ROOT_DIR/outputs/gemma3-27b-eagle3-ultrachat \
     --num-epochs 10 \
-    --batch-size 2 \
+    --batch-size 8 \
     --tp-size $TP_SIZE \
     --learning-rate 1e-4 \
     --max-length 2048 \
     --chat-template gemma \
     --cache-dir $ROOT_DIR/cache \
     --attention-backend sdpa \
     --target-model-backend hf \
-    --log-interval 100 \
-    --eval-interval 500 \
-    --save-interval 10000 \
+    --log-interval 500 \
+    --eval-interval 2500 \
+    --save-interval 5000 \
     --report-to tensorboard \
     --embedding-key=language_model.model.embed_tokens.weight
Lines changed: 31 additions & 0 deletions
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
ROOT_DIR=$(dirname "$SCRIPT_DIR")
export TORCHINDUCTOR_CACHE_DIR=$ROOT_DIR/cache/compiled_kernels

# train eagle3 for gemma4-26b-a4b
NUM_GPUS=${1:-8}
TP_SIZE=${2:-2}

torchrun \
    --standalone \
    --nproc_per_node $NUM_GPUS \
    $ROOT_DIR/scripts/train_eagle3.py \
    --target-model-path google/gemma-4-26b-a4b-it \
    --draft-model-config $ROOT_DIR/configs/gemma4-26b-a4b-eagle3.json \
    --train-data-path $ROOT_DIR/cache/dataset/ultrachat_train.jsonl \
    --output-dir $ROOT_DIR/outputs/gemma4-26b-a4b-eagle3-ultrachat \
    --num-epochs 10 \
    --batch-size 4 \
    --tp-size $TP_SIZE \
    --learning-rate 1e-4 \
    --max-length 2048 \
    --chat-template gemma-4 \
    --cache-dir $ROOT_DIR/cache \
    --attention-backend sdpa \
    --target-model-backend hf \
    --log-interval 500 \
    --eval-interval 2500 \
    --save-interval 10000 \
    --report-to tensorboard \
    --embedding-key=model.language_model.embed_tokens.weight \
    --eval-holdout-ratio 0.05
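
One easy thing to get wrong here is --embedding-key, which differs from the gemma3 script above (model.language_model.embed_tokens.weight vs. language_model.model.embed_tokens.weight). A hedged way to double-check the key before a long training run, assuming the target checkpoint is published as sharded safetensors on the Hub:

import json

from huggingface_hub import hf_hub_download

# Fetch only the safetensors index (a small JSON file), not the weights.
index_path = hf_hub_download(
    "google/gemma-4-26b-a4b-it", "model.safetensors.index.json"
)
with open(index_path) as f:
    weight_map = json.load(f)["weight_map"]

# The value passed as --embedding-key must appear here verbatim.
print([k for k in weight_map if "embed_tokens" in k])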

scripts/regenerate_train_data.py

Lines changed: 21 additions & 1 deletion
@@ -1,6 +1,6 @@
 """
 This script will re-generate the dataset from target model,
-which better aligns the draft model with the target models output distribution.
+which better aligns the draft model with the target model's output distribution.
 
 Usage:
 1. Set up one or more SGLang servers for the target model.
@@ -60,6 +60,15 @@ def parse_arguments():
         action="store_true",
         help="Whether the model is a GPT-OSS model",
     )
+    model_group.add_argument(
+        "--thinking-ratio",
+        type=float,
+        default=None,
+        help="Fraction of requests sent with thinking enabled (0 to 1). "
+        "Requires --is-reasoning-model. When set, each request randomly "
+        "enables or disables thinking based on this ratio. "
+        "E.g., 0.7 means 70%% of samples use thinking, 30%% do not.",
+    )
 
     # sampling params
     sampling_params_group = parser.add_argument_group("sampling parameters")
@@ -184,6 +193,9 @@ def build_query_kwargs(args, messages, max_tokens=None):
     extra_body = {}
     if args.top_k is not None:
         extra_body["top_k"] = args.top_k
+    if args.thinking_ratio is not None:
+        enable_thinking = random.random() < args.thinking_ratio
+        extra_body["chat_template_kwargs"] = {"enable_thinking": enable_thinking}
     if extra_body:
         query_kwargs["extra_body"] = extra_body
     if args.is_gpt_oss:
@@ -255,11 +267,19 @@ def main():
     if args.max_tokens <= 0:
         raise ValueError("Max tokens must be greater than 0")
 
+    if args.thinking_ratio is not None:
+        if not (0.0 <= args.thinking_ratio <= 1.0):
+            raise ValueError("--thinking-ratio must be between 0.0 and 1.0")
+        if not args.is_reasoning_model:
+            raise ValueError("--thinking-ratio requires --is-reasoning-model")
+
     print(f"Configuration:")
     print(f"  Model path: {args.model}")
     print(f"  Max tokens: {args.max_tokens}")
     print(f"  Concurrency: {args.concurrency}")
     print(f"  Temperature: {args.temperature}")
+    if args.thinking_ratio is not None:
+        print(f"  Thinking ratio: {args.thinking_ratio:.0%}")
     print(f"  API URL: {args.server_address}")
     print(f"  Input file: {args.input_file_path}")
     print(f"  Output file: {args.output_file_path}")
