Skip to content

Commit 1c21d90

Browse files
liusong1222wilettliu
authored andcommitted
Add Eagle3 PyTorch inference module and benchmark scripts
1 parent 62d3e69 commit 1c21d90

27 files changed

Lines changed: 6403 additions & 0 deletions
File renamed without changes.
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Benchmark Instructions
2+
3+
本指南介绍如何使用 **PyTorch** 运行基线模型和 **Eagle3** 模型的推理基准测试,以及如何计算 **接受长度** 和 **加速比**。
4+
5+
---
6+
7+
## 目录
8+
- [1. 运行 Baseline 模型基准测试(PyTorch)](#1-运行-baseline-模型基准测试pytorch)
9+
- [2. 运行 Eagle3 模型基准测试(PyTorch)](#2-运行-eagle3-模型基准测试pytorch)
10+
- [3. 计算接受长度与加速比](#3-计算接受长度与加速比)
11+
- [3.1 计算接受长度](#31-计算接受长度)
12+
- [3.2 计算加速比](#32-计算加速比)
13+
14+
---
15+
16+
## 1. 运行 Baseline 模型基准测试(PyTorch)
17+
18+
```bash
19+
cd angelslim/compressor/speculative_decoding/scripts
20+
sh run_baseline_benchmark_pytorch.sh
21+
```
22+
23+
## 2. 运行 Eagle3 模型基准测试(PyTorch)
24+
```bash
25+
cd angelslim/compressor/speculative_decoding/scripts
26+
sh run_eagle3_benchmark_pytorch.sh
27+
```
28+
29+
## 3. 计算接受长度与加速比
30+
### 3.1 计算接受长度
31+
```bash
32+
python3 get_alpha_and_speed.py acceptance --input_file [results.jsonl]
33+
```
34+
参数说明
35+
* `--input_file`:Eagle3 模型推理benchmark结果文件
36+
37+
### 3.2 计算加速比
38+
```bash
39+
python3 get_alpha_and_speed.py speedup --model_path /path/to/model --baseline_json [baseline.jsonl] --eagle_json [eagle.jsonl]
40+
```
41+
参数说明
42+
* `--model_path`:Baseline 模型路径
43+
* `--baseline_json`:Baseline 模型的benchmark结果文件
44+
* `--eagle_json`:Eagle3 模型的benchmark结果文件
Lines changed: 323 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,323 @@
1+
import argparse
2+
import json
3+
import os
4+
import random
5+
import time
6+
from typing import Any, Dict, List
7+
8+
import numpy as np
9+
import ray
10+
import shortuuid
11+
import torch
12+
from fastchat.llm_judge.common import load_questions
13+
from tqdm import tqdm
14+
15+
from angelslim.compressor.speculative.inference.models import Eagle3Model
16+
17+
# System message placed at the head of each benchmark conversation. The text
# consists of two paragraphs separated by a blank line.
SYSTEM_PROMPT = dict(
    role="system",
    content="\n\n".join(
        (
            "You are a helpful, respectful and honest assistant. Always answer "
            "as helpfully as possible, while being safe. Your answers should "
            "not include any harmful, unethical, racist, sexist, toxic, "
            "dangerous, or illegal content. Please ensure that your responses "
            "are socially unbiased and positive in nature.",
            "If a question does not make any sense, or is not factually "
            "coherent, explain why instead of answering something not correct. "
            "If you don't know the answer to a question, please don't share "
            "false information.",
        )
    ),
)
30+
31+
32+
class EvaluationConfig:
    """Container for evaluation configuration.

    Copies the benchmark options from the parsed command line onto attributes
    and derives the question and answer file locations.

    Args:
        args: Parsed command line namespace. The attributes read are
            ``base_model_path``, ``eagle_model_path``, ``model_id``,
            ``temperature``, ``bench_name``, ``answer_file``, ``num_choices``,
            ``total_token``, ``depth`` and ``top_k``.
    """

    def __init__(self, args: argparse.Namespace):
        self.base_model_path = args.base_model_path
        self.eagle_model_path = args.eagle_model_path
        # The temperature is part of the id, so the default answer file name
        # differs between temperatures.
        self.model_id = f"{args.model_id}-temperature-{args.temperature}"
        self.question_file = self._get_question_file_path(args)
        # Must come after ``self.model_id``: the default answer path embeds it.
        self.answer_file = self._get_answer_file_path(args)
        self.num_choices = args.num_choices
        self.temperature = args.temperature
        self.total_token = args.total_token
        self.depth = args.depth
        self.top_k = args.top_k

    @staticmethod
    def _project_root() -> str:
        """Return the directory two levels above this script's directory.

        ``__file__`` is made absolute first; with a relative ``__file__`` the
        dirname chain would collapse to ``""`` and the derived paths would
        silently become ``/data/...`` and ``/output/...``.
        """
        script_dir = os.path.dirname(os.path.abspath(__file__))
        return os.path.dirname(os.path.dirname(script_dir))

    def _get_question_file_path(self, args: argparse.Namespace) -> str:
        """Return ``<root>/data/<bench_name>/question.jsonl``."""
        return f"{self._project_root()}/data/{args.bench_name}/question.jsonl"

    def _get_answer_file_path(self, args: argparse.Namespace) -> str:
        """Return the answer file path.

        An explicit ``args.answer_file`` wins (with a leading ``~`` expanded);
        otherwise the path is ``<root>/output/<bench_name>/<model_id>.jsonl``.
        """
        if args.answer_file:
            # Expand "~" here so every consumer of ``answer_file`` (directory
            # creation, writing, re-sorting) sees the same concrete path.
            return os.path.expanduser(args.answer_file)

        return f"{self._project_root()}/output/{args.bench_name}/{self.model_id}.jsonl"
59+
60+
61+
def setup_seed(seed: int) -> None:
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN."""
    # The seeding calls are independent of each other, so they are applied
    # uniformly from a single table.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
67+
torch.backends.cudnn.deterministic = True
68+
69+
70+
def initialize_model(config: EvaluationConfig) -> Eagle3Model:
    """Load the Eagle3 model described by ``config`` and put it in eval mode.

    Before returning, prints the model's ``training`` flag and the value of
    the ``CUDA_VISIBLE_DEVICES`` environment variable.
    """
    load_kwargs = {
        "base_model_path": config.base_model_path,
        "eagle_model_path": config.eagle_model_path,
        "total_token": config.total_token,
        "depth": config.depth,
        "top_k": config.top_k,
        "device_map": "auto",
        "torch_dtype": "auto",
    }
    eagle_model = Eagle3Model.from_pretrained(**load_kwargs)
    eagle_model.eval()

    print(f"Model training state: {eagle_model.training}")
    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
    print(f"CUDA VISIBLE DEVICES: {visible_devices}")
    return eagle_model
85+
86+
87+
def process_conversation_turn(
    model: Eagle3Model,
    tokenizer: Any,
    conv: List[Dict[str, str]],
    qs: str,
    temperature: float,
) -> Dict[str, Any]:
    """Run one user turn through the model and time the generation.

    ``conv`` is mutated in place: the user message is appended before
    generation and the cleaned assistant reply afterwards, so the same list
    can be passed again for the next turn.

    Args:
        model: Model exposing ``naive_generate``.
        tokenizer: Tokenizer used for the chat template, encoding and decoding.
        conv: Conversation so far, as ``{"role": ..., "content": ...}`` dicts.
        qs: Text of the user turn.
        temperature: Sampling temperature forwarded to ``naive_generate``.

    Returns:
        Dict with ``output`` (decoded reply with special tokens removed and
        surrounding whitespace stripped), ``new_token`` and ``idx`` (the second
        and third values returned by ``naive_generate``, cast to ``int``) and
        ``wall_time`` (seconds spent in ``naive_generate``, measured between
        two CUDA synchronisation points).
    """
    conv.append({"role": "user", "content": qs})
    # NOTE(review): add_generation_prompt=False means the template does not
    # append the assistant header — confirm this is what naive_generate expects.
    conversation = tokenizer.apply_chat_template(
        conv, tokenize=False, add_generation_prompt=False, enable_thinking=False
    )

    # NOTE(review): max_length is passed without a truncation argument —
    # confirm whether the 2048 limit is actually meant to be enforced.
    input_ids = tokenizer(
        conversation, return_tensors="pt", max_length=2048, add_special_tokens=False
    ).input_ids

    # Synchronise before and after so queued GPU work is not attributed to the
    # wrong side of the measurement.
    torch.cuda.synchronize()
    # perf_counter is monotonic and high resolution; time.time() can jump when
    # the system clock is adjusted, which would corrupt benchmark timings.
    start_time = time.perf_counter()

    output_ids, new_token, idx = model.naive_generate(
        torch.as_tensor(input_ids).cuda(), temperature=temperature, log=True
    )

    torch.cuda.synchronize()
    total_time = time.perf_counter() - start_time

    # Keep only the tokens that follow the prompt.
    output_ids = output_ids[0][len(input_ids[0]) :]

    output = tokenizer.decode(output_ids, spaces_between_special_tokens=False)
    # special_tokens_map values are either a single token string or a list of
    # them; remove every occurrence, preserving the map's iteration order.
    for entry in tokenizer.special_tokens_map.values():
        tokens = entry if isinstance(entry, list) else [entry]
        for token in tokens:
            output = output.replace(token, "")
    output = output.strip()

    conv.append({"role": "assistant", "content": output})

    return {
        "output": output,
        "idx": int(idx),
        "new_token": int(new_token),
        "wall_time": total_time,
    }
132+
133+
134+
def generate_answer_for_question(
    model: Eagle3Model,
    tokenizer: Any,
    question: Dict[str, Any],
    num_choices: int,
    temperature: float,
) -> List[Dict[str, Any]]:
    """Answer every turn of ``question`` once per requested choice.

    Each choice reseeds torch with its own index, starts a fresh conversation
    from the system prompt and feeds the turns in order. The returned list has
    one dict per choice holding its ``index`` plus per-turn lists ``turns``,
    ``idxs``, ``new_tokens`` and ``wall_time``.
    """
    all_choices = []
    for choice_index in range(num_choices):
        torch.manual_seed(choice_index)
        history = [SYSTEM_PROMPT]
        # The history list is shared across turns, so each call sees the
        # messages appended by the previous one.
        turn_results = [
            process_conversation_turn(
                model, tokenizer, history, turn_text, temperature
            )
            for turn_text in question["turns"]
        ]
        all_choices.append(
            {
                "index": choice_index,
                "turns": [res["output"] for res in turn_results],
                "idxs": [res["idx"] for res in turn_results],
                "new_tokens": [res["new_token"] for res in turn_results],
                "wall_time": [res["wall_time"] for res in turn_results],
            }
        )
    return all_choices
169+
170+
171+
def warmup_model(
    model: Eagle3Model, tokenizer: Any, question: Dict[str, Any], temperature: float
) -> None:
    """Run ``question`` through the model three times and discard the results.

    Every pass reseeds torch with 0 and starts a new conversation from the
    system prompt; ``"Warmup done"`` is printed once all passes finish.
    """
    warmup_rounds = 3
    completed = 0
    while completed < warmup_rounds:
        torch.manual_seed(0)
        history = [SYSTEM_PROMPT]
        for turn_text in question["turns"]:
            process_conversation_turn(
                model, tokenizer, history, turn_text, temperature
            )
        completed += 1
    print("Warmup done")
181+
182+
183+
@torch.inference_mode()
def get_model_answers(
    model_id: str,
    questions: List[Dict[str, Any]],
    answer_file: str,
    num_choices: int,
    temperature: float,
    args: argparse.Namespace,
) -> None:
    """Generate answers for a batch of questions and append them to a file.

    Loads the model, warms it up on the first question (if any), then writes
    one JSON line per question to ``answer_file``.

    Args:
        model_id: Identifier stored in every answer record.
        questions: Question dicts; each needs ``question_id`` and ``turns``.
        answer_file: Destination JSONL path; a leading ``~`` is expanded.
        num_choices: Number of completions generated per question.
        temperature: Sampling temperature.
        args: Parsed command line, used to build the model configuration.
    """
    config = EvaluationConfig(args)
    model = initialize_model(config)
    tokenizer = model.get_tokenizer()

    if questions:
        warmup_model(model, tokenizer, questions[0], temperature)

    # Expand "~" once so the directory that gets created and the file that
    # gets opened always refer to the same location.
    answer_path = os.path.expanduser(answer_file)
    answer_dir = os.path.dirname(answer_path)
    if answer_dir:
        # A bare file name has an empty dirname, and os.makedirs("") raises.
        os.makedirs(answer_dir, exist_ok=True)

    for question in tqdm(questions):
        choices = generate_answer_for_question(
            model, tokenizer, question, num_choices, temperature
        )

        # The file is opened in append mode and closed again for every
        # question, so each record is written out as soon as it is ready.
        with open(answer_path, "a") as fout:
            ans_json = {
                "question_id": question["question_id"],
                "answer_id": shortuuid.uuid(),
                "model_id": model_id,
                "choices": choices,
                "tstamp": time.time(),
            }
            fout.write(json.dumps(ans_json) + "\n")
216+
217+
218+
def run_evaluation(config: EvaluationConfig, args: argparse.Namespace) -> None:
    """Run the evaluation with optional distributed processing.

    The questions are split into contiguous chunks, one per model replica
    (``num_gpus_total // num_gpus_per_model``). With more than one replica
    each chunk is dispatched as a Ray remote task and the call blocks until
    all tasks finish; otherwise the single chunk is processed in-process.

    Args:
        config: Supplies the question file, model id, answer file, number of
            choices and temperature.
        args: Parsed command line; provides the question range and GPU counts
            and is forwarded to the workers.
    """
    questions = load_questions(
        config.question_file, args.question_begin, args.question_end
    )

    num_workers = args.num_gpus_total // args.num_gpus_per_model
    use_ray = num_workers > 1
    get_answers_func = (
        ray.remote(num_gpus=args.num_gpus_per_model)(get_model_answers).remote
        if use_ray
        else get_model_answers
    )

    # Ceiling division keeps the number of chunks at or below the number of
    # workers. The max(1, ...) guards cover an empty question list or fewer
    # questions than workers, where a floor division would give a step of 0
    # and make range() raise ValueError.
    effective_workers = max(1, num_workers)
    chunk_size = max(1, -(-len(questions) // effective_workers))
    ans_handles = [
        get_answers_func(
            config.model_id,
            questions[i : i + chunk_size],
            config.answer_file,
            config.num_choices,
            config.temperature,
            args,
        )
        for i in range(0, len(questions), chunk_size)
    ]

    if use_ray:
        # Remote calls return futures; wait for every worker to finish.
        ray.get(ans_handles)
246+
247+
248+
def reorg_answer_file(answer_file: str) -> None:
    """Sort answers by question id and remove duplicates.

    The file is rewritten in place with one JSON record per line. When a
    question id occurs more than once, the last occurrence wins. Blank lines
    are ignored and every record is written with exactly one trailing
    newline, so a missing final newline in the input cannot glue two records
    together after sorting.

    Args:
        answer_file: Path of the JSONL answer file; a leading ``~`` is
            expanded.
    """
    answer_path = os.path.expanduser(answer_file)

    answers = {}
    with open(answer_path, "r") as fin:
        for line in fin:
            record = line.strip()
            if not record:
                continue
            answers[json.loads(record)["question_id"]] = record

    with open(answer_path, "w") as fout:
        fout.writelines(f"{answers[qid]}\n" for qid in sorted(answers))
259+
260+
261+
def parse_args() -> argparse.Namespace:
    """Parse command line arguments.

    Returns:
        The parsed namespace. Option names use dashes on the command line and
        underscores as attributes (e.g. ``--bench-name`` -> ``bench_name``).

    Raises:
        SystemExit: On invalid arguments, including GPU counts below 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--eagle-model-path",
        type=str,
        default="down_checkpoints/LC70B",
        help="Path to the weights (local folder or Hugging Face repo ID)",
    )
    parser.add_argument("--base-model-path", type=str, default="")
    # store_true so that passing the flag switches 8-bit loading on; the
    # previous store_false made the flag do the opposite of its help text.
    parser.add_argument(
        "--load-in-8bit", action="store_true", help="Use 8-bit quantization"
    )
    parser.add_argument("--model-id", type=str, default="")
    parser.add_argument(
        "--bench-name", type=str, default="mt_bench", help="Benchmark question set name"
    )
    parser.add_argument(
        "--question-begin", type=int, help="Begin index of questions (debug)"
    )
    parser.add_argument(
        "--question-end", type=int, help="End index of questions (debug)"
    )
    parser.add_argument("--answer-file", type=str, help="Output answer file path")
    parser.add_argument(
        "--max-new-token", type=int, default=1024, help="Max new generated tokens"
    )
    parser.add_argument(
        "--total-token", type=int, default=60, help="Total nodes in draft tree"
    )
    parser.add_argument("--depth", type=int, default=5)
    parser.add_argument("--top-k", type=int, default=10)
    parser.add_argument(
        "--num-choices", type=int, default=1, help="Number of completion choices"
    )
    parser.add_argument(
        "--num-gpus-per-model", type=int, default=1, help="GPUs per model"
    )
    parser.add_argument("--num-gpus-total", type=int, default=1, help="Total GPUs")
    parser.add_argument("--max-gpu-memory", type=str, help="Max GPU memory per GPU")
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--seed", type=int, default=42)

    args = parser.parse_args()
    # Both counts feed integer divisions later on; reject non-positive values
    # here with a usage error instead of a ZeroDivisionError traceback.
    if args.num_gpus_per_model < 1 or args.num_gpus_total < 1:
        parser.error("--num-gpus-per-model and --num-gpus-total must be >= 1")
    return args
304+
305+
306+
def main() -> None:
    """Parse arguments, run the benchmark and tidy up the answer file.

    Seeds the RNGs, starts Ray when more than one model replica fits on the
    requested GPUs, makes sure the answer file's directory exists, runs the
    evaluation and finally sorts/deduplicates the answer file.
    """
    args = parse_args()
    setup_seed(args.seed)

    if args.num_gpus_total // args.num_gpus_per_model > 1:
        ray.init()

    config = EvaluationConfig(args)
    answer_dir = os.path.dirname(config.answer_file)
    if answer_dir:
        # An answer file given as a bare file name has an empty dirname, and
        # os.makedirs("") raises FileNotFoundError.
        os.makedirs(answer_dir, exist_ok=True)
    print(f"Output to {config.answer_file}")

    run_evaluation(config, args)
    reorg_answer_file(config.answer_file)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)