|
import argparse
import json
import os
import random
import time
from typing import Any, Dict, List

import numpy as np
import ray
import shortuuid
import torch
from fastchat.llm_judge.common import load_questions
from tqdm import tqdm

from angelslim.compressor.speculative.inference.models import Eagle3Model
| 16 | + |
# System message placed at the start of every conversation built by this
# script. The text is assembled from adjacent string literals; the only line
# break in it is the explicit "\n\n" between the two paragraphs.
SYSTEM_PROMPT = dict(
    role="system",
    content=(
        "You are a helpful, respectful and honest assistant. "
        "Always answer as helpfully as possible, while being safe. "
        "Your answers should not include any harmful, unethical, racist, sexist, "
        "toxic, dangerous, or illegal content. Please ensure that your responses are "
        "socially unbiased and positive in nature.\n\nIf a question does not make any "
        "sense, or is not factually coherent, explain why instead of answering "
        "something not correct. If you don't know the answer to a question, please "
        "don't share false information."
    ),
)
| 30 | + |
| 31 | + |
class EvaluationConfig:
    """Container for evaluation configuration.

    Copies the model paths, sampling settings and draft-tree settings from the
    parsed command-line arguments and resolves the question and answer file
    locations.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``base_model_path``, ``eagle_model_path``, ``model_id``,
            ``temperature``, ``bench_name``, ``answer_file``, ``num_choices``,
            ``total_token``, ``depth`` and ``top_k``.
    """

    def __init__(self, args: argparse.Namespace):
        self.base_model_path = args.base_model_path
        self.eagle_model_path = args.eagle_model_path
        # model_id must be assigned before the answer path is resolved: the
        # default answer file name embeds it.
        self.model_id = f"{args.model_id}-temperature-{args.temperature}"
        self.question_file = self._get_question_file_path(args)
        self.answer_file = self._get_answer_file_path(args)
        self.num_choices = args.num_choices
        self.temperature = args.temperature
        self.total_token = args.total_token
        self.depth = args.depth
        self.top_k = args.top_k

    @staticmethod
    def _project_root() -> str:
        """Return the directory two levels above this file's directory.

        ``__file__`` is made absolute first. A relative ``__file__`` (for
        example ``script.py``) would otherwise collapse to an empty string and
        turn the derived paths into root-anchored ones such as ``/data/...``.
        """
        script_dir = os.path.dirname(os.path.abspath(__file__))
        return os.path.dirname(os.path.dirname(script_dir))

    def _get_question_file_path(self, args: argparse.Namespace) -> str:
        """Return ``<root>/data/<bench_name>/question.jsonl``."""
        return f"{self._project_root()}/data/{args.bench_name}/question.jsonl"

    def _get_answer_file_path(self, args: argparse.Namespace) -> str:
        """Return the answer file path.

        A user-supplied ``args.answer_file`` wins and has ``~`` expanded so
        that every later consumer (directory creation, appending, re-sorting)
        sees the same path. Otherwise the default is
        ``<root>/output/<bench_name>/<model_id>.jsonl``.
        """
        if args.answer_file:
            return os.path.expanduser(args.answer_file)

        return (
            f"{self._project_root()}/output/{args.bench_name}/{self.model_id}.jsonl"
        )
| 59 | + |
| 60 | + |
def setup_seed(seed: int) -> None:
    """Seed every random number generator this script relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    with the same value and sets the cuDNN ``deterministic`` flag.

    Args:
        seed: Value passed to each generator.
    """
    # Python-level generators first, then the torch ones; the calls are
    # independent of each other.
    random.seed(seed)
    np.random.seed(seed)
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True
| 68 | + |
| 69 | + |
def initialize_model(config: EvaluationConfig) -> Eagle3Model:
    """Load the Eagle3 model described by ``config`` and put it in eval mode.

    Args:
        config: Supplies the base/eagle model paths and the draft settings
            (``total_token``, ``depth``, ``top_k``).

    Returns:
        The loaded model after ``eval()`` has been called on it.
    """
    load_kwargs = {
        "base_model_path": config.base_model_path,
        "eagle_model_path": config.eagle_model_path,
        "total_token": config.total_token,
        "depth": config.depth,
        "top_k": config.top_k,
        "device_map": "auto",
        "torch_dtype": "auto",
    }
    eagle_model = Eagle3Model.from_pretrained(**load_kwargs)
    eagle_model.eval()

    # Diagnostics: confirm eval mode and show which devices are exposed.
    print(f"Model training state: {eagle_model.training}")
    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
    print(f"CUDA VISIBLE DEVICES: {visible_devices}")
    return eagle_model
| 85 | + |
| 86 | + |
def process_conversation_turn(
    model: Eagle3Model,
    tokenizer: Any,
    conv: List[Dict[str, str]],
    qs: str,
    temperature: float,
) -> Dict[str, Any]:
    """Run one user turn through the model and time the generation.

    ``conv`` is mutated in place: the user message is appended before
    generation and the cleaned model reply is appended afterwards, so the same
    list can be passed again for the next turn.

    Args:
        model: Model whose ``naive_generate`` produces the reply.
        tokenizer: Tokenizer used for the chat template, encoding and decoding.
        conv: Running list of chat messages (``role``/``content`` dicts).
        qs: Text of the user message for this turn.
        temperature: Sampling temperature forwarded to ``naive_generate``.

    Returns:
        Dict with ``output`` (reply text with special tokens removed and
        whitespace stripped), ``idx`` and ``new_token`` (the second and third
        values returned by ``naive_generate``, cast to ``int``) and
        ``wall_time`` (seconds spent in generation).
    """
    conv.append({"role": "user", "content": qs})

    # NOTE(review): add_generation_prompt=False means the rendered prompt does
    # not end with an assistant header -- confirm this is intended.
    prompt_text = tokenizer.apply_chat_template(
        conv, tokenize=False, add_generation_prompt=False, enable_thinking=False
    )

    # NOTE(review): max_length is passed without truncation=True; presumably
    # no truncation happens -- confirm against the tokenizer in use.
    encoded = tokenizer(
        prompt_text, return_tensors="pt", max_length=2048, add_special_tokens=False
    )
    input_ids = encoded.input_ids

    # Synchronize on both sides so the timer brackets the GPU work.
    torch.cuda.synchronize()
    started_at = time.time()

    output_ids, new_token, idx = model.naive_generate(
        torch.as_tensor(input_ids).cuda(), temperature=temperature, log=True
    )

    torch.cuda.synchronize()
    elapsed = time.time() - started_at

    # Keep only the tokens that follow the prompt.
    prompt_length = len(input_ids[0])
    generated_ids = output_ids[0][prompt_length:]

    reply = tokenizer.decode(generated_ids, spaces_between_special_tokens=False)
    # special_tokens_map values are either a single token or a list of tokens;
    # treat both uniformly and remove every occurrence from the reply.
    for entry in tokenizer.special_tokens_map.values():
        tokens = entry if isinstance(entry, list) else [entry]
        for token in tokens:
            reply = reply.replace(token, "")
    reply = reply.strip()

    conv.append({"role": "assistant", "content": reply})

    return {
        "output": reply,
        "idx": int(idx),
        "new_token": int(new_token),
        "wall_time": elapsed,
    }
| 132 | + |
| 133 | + |
def generate_answer_for_question(
    model: Eagle3Model,
    tokenizer: Any,
    question: Dict[str, Any],
    num_choices: int,
    temperature: float,
) -> List[Dict[str, Any]]:
    """Produce ``num_choices`` independent completions for one question.

    Every choice starts a fresh conversation containing only the system
    prompt, seeds torch with the choice index, and then answers each entry of
    ``question["turns"]`` in order.

    Args:
        model: Model used for generation.
        tokenizer: Tokenizer paired with ``model``.
        question: Question record; only its ``"turns"`` list is read here.
        num_choices: Number of completions to generate.
        temperature: Sampling temperature for every turn.

    Returns:
        One dict per choice with ``index``, ``turns`` (reply texts), ``idxs``,
        ``new_tokens`` and ``wall_time``, each a per-turn list.
    """
    choices = []
    for choice_index in range(num_choices):
        torch.manual_seed(choice_index)
        conv = [SYSTEM_PROMPT]

        # Turns run sequentially because each call extends ``conv`` with the
        # user message and the model's reply.
        turn_results = [
            process_conversation_turn(model, tokenizer, conv, qs, temperature)
            for qs in question["turns"]
        ]

        choices.append(
            {
                "index": choice_index,
                "turns": [res["output"] for res in turn_results],
                "idxs": [res["idx"] for res in turn_results],
                "new_tokens": [res["new_token"] for res in turn_results],
                "wall_time": [res["wall_time"] for res in turn_results],
            }
        )

    return choices
| 169 | + |
| 170 | + |
def warmup_model(
    model: Eagle3Model, tokenizer: Any, question: Dict[str, Any], temperature: float
) -> None:
    """Run the given question through the model three times and discard output.

    Args:
        model: Model to warm up.
        tokenizer: Tokenizer paired with ``model``.
        question: Question record whose ``"turns"`` are replayed.
        temperature: Sampling temperature used for the warmup generations.
    """
    warmup_rounds = 3
    for _round in range(warmup_rounds):
        torch.manual_seed(0)
        # Fresh conversation per round; results are intentionally ignored.
        history = [SYSTEM_PROMPT]
        for user_prompt in question["turns"]:
            process_conversation_turn(
                model, tokenizer, history, user_prompt, temperature
            )
    print("Warmup done")
| 181 | + |
| 182 | + |
@torch.inference_mode()
def get_model_answers(
    model_id: str,
    questions: List[Dict[str, Any]],
    answer_file: str,
    num_choices: int,
    temperature: float,
    args: argparse.Namespace,
) -> None:
    """Generate answers for a batch of questions and append them to a file.

    Loads the model, warms it up on the first question (when there is one),
    then writes one JSON line per question to ``answer_file``. The whole
    function runs under ``torch.inference_mode()``.

    Args:
        model_id: Identifier stored in every answer record.
        questions: Question records; each needs ``"question_id"`` and
            ``"turns"``.
        answer_file: Output JSONL path; ``~`` is expanded. Records are
            appended, so an existing file is extended rather than replaced.
        num_choices: Number of completions generated per question.
        temperature: Sampling temperature.
        args: Parsed command-line namespace used to build the model config.
    """
    config = EvaluationConfig(args)
    model = initialize_model(config)
    tokenizer = model.get_tokenizer()

    if questions:
        warmup_model(model, tokenizer, questions[0], temperature)

    # Expand "~" once so the directory that is created and the file that is
    # opened refer to the same location.
    answer_path = os.path.expanduser(answer_file)
    answer_dir = os.path.dirname(answer_path)
    # A bare file name has an empty dirname, and os.makedirs("") raises.
    if answer_dir:
        os.makedirs(answer_dir, exist_ok=True)

    for question in tqdm(questions):
        choices = generate_answer_for_question(
            model, tokenizer, question, num_choices, temperature
        )

        ans_json = {
            "question_id": question["question_id"],
            "answer_id": shortuuid.uuid(),
            "model_id": model_id,
            "choices": choices,
            "tstamp": time.time(),
        }
        # Re-open per question so each finished answer is on disk even if a
        # later question fails.
        with open(answer_path, "a") as fout:
            fout.write(json.dumps(ans_json) + "\n")
| 216 | + |
| 217 | + |
def run_evaluation(config: EvaluationConfig, args: argparse.Namespace) -> None:
    """Run the evaluation, fanning out over Ray workers when GPUs allow it.

    The question list is split into contiguous chunks, one per worker. With a
    single worker the chunk is processed in-process; with more than one, each
    chunk is submitted as a Ray task requesting ``num_gpus_per_model`` GPUs and
    the call blocks until all tasks finish.

    Args:
        config: Resolved evaluation settings (question file, answer file,
            model id, number of choices, temperature).
        args: Parsed command-line namespace; supplies ``question_begin``,
            ``question_end``, ``num_gpus_total`` and ``num_gpus_per_model`` and
            is forwarded to the workers.
    """
    questions = load_questions(
        config.question_file, args.question_begin, args.question_end
    )

    # At least one worker, even if fewer GPUs are available than one model
    # asks for; otherwise the chunk-size division below would divide by zero.
    num_workers = max(1, args.num_gpus_total // args.num_gpus_per_model)
    use_ray = num_workers > 1
    get_answers_func = (
        ray.remote(num_gpus=args.num_gpus_per_model)(get_model_answers).remote
        if use_ray
        else get_model_answers
    )

    # Ceiling division yields at most ``num_workers`` chunks instead of an
    # extra undersized one. The floor of 1 keeps range() valid when there are
    # fewer questions than workers (a step of 0 raises ValueError).
    chunk_size = max(1, (len(questions) + num_workers - 1) // num_workers)
    ans_handles = [
        get_answers_func(
            config.model_id,
            questions[i : i + chunk_size],
            config.answer_file,
            config.num_choices,
            config.temperature,
            args,
        )
        for i in range(0, len(questions), chunk_size)
    ]

    if use_ray:
        ray.get(ans_handles)
| 246 | + |
| 247 | + |
def reorg_answer_file(answer_file: str) -> None:
    """Sort answers by question id and remove duplicates.

    Rewrites ``answer_file`` in place. When a question id occurs more than
    once, the last record in the file is kept. Blank lines are dropped and
    every record is written back with exactly one trailing newline.

    Args:
        answer_file: Path to a JSONL file whose records each contain a
            ``"question_id"`` key; ``~`` is expanded.
    """
    answer_path = os.path.expanduser(answer_file)

    answers = {}
    with open(answer_path, "r") as fin:
        for line in fin:
            record = line.strip()
            if not record:
                # A blank line would make json.loads raise.
                continue
            answers[json.loads(record)["question_id"]] = record

    with open(answer_path, "w") as fout:
        for qid in sorted(answers):
            # Add the newline here: a final input line that lacked one must
            # not be glued to the record that follows it after sorting.
            fout.write(answers[qid] + "\n")
| 259 | + |
| 260 | + |
def parse_args() -> argparse.Namespace:
    """Parse command line arguments.

    Returns:
        Namespace with the model paths, benchmark selection, question range,
        output path, draft settings (``total_token``, ``depth``, ``top_k``),
        GPU allocation, sampling temperature and seed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--eagle-model-path",
        type=str,
        default="down_checkpoints/LC70B",
        help="Path to the weights (local folder or Hugging Face repo ID)",
    )
    parser.add_argument("--base-model-path", type=str, default="")
    # store_true so the flag is off by default and passing it turns 8-bit on,
    # matching the help text. The previous store_false did the opposite.
    parser.add_argument(
        "--load-in-8bit", action="store_true", help="Use 8-bit quantization"
    )
    parser.add_argument("--model-id", type=str, default="")
    parser.add_argument(
        "--bench-name", type=str, default="mt_bench", help="Benchmark question set name"
    )
    parser.add_argument(
        "--question-begin", type=int, help="Begin index of questions (debug)"
    )
    parser.add_argument(
        "--question-end", type=int, help="End index of questions (debug)"
    )
    parser.add_argument("--answer-file", type=str, help="Output answer file path")
    parser.add_argument(
        "--max-new-token", type=int, default=1024, help="Max new generated tokens"
    )
    parser.add_argument(
        "--total-token", type=int, default=60, help="Total nodes in draft tree"
    )
    parser.add_argument("--depth", type=int, default=5)
    parser.add_argument("--top-k", type=int, default=10)
    parser.add_argument(
        "--num-choices", type=int, default=1, help="Number of completion choices"
    )
    parser.add_argument(
        "--num-gpus-per-model", type=int, default=1, help="GPUs per model"
    )
    parser.add_argument("--num-gpus-total", type=int, default=1, help="Total GPUs")
    parser.add_argument("--max-gpu-memory", type=str, help="Max GPU memory per GPU")
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--seed", type=int, default=42)
    return parser.parse_args()
| 304 | + |
| 305 | + |
def main() -> None:
    """Parse arguments, run the evaluation and tidy the answer file.

    Steps: seed the random number generators, start Ray when more than one
    model worker fits on the available GPUs, create the output directory, run
    the evaluation, then sort and de-duplicate the answer file.
    """
    args = parse_args()
    setup_seed(args.seed)

    if args.num_gpus_total // args.num_gpus_per_model > 1:
        ray.init()

    config = EvaluationConfig(args)

    # Expand "~" once so the directory created here and the file re-sorted
    # below match the location the answers are written to.
    answer_file = os.path.expanduser(config.answer_file)
    answer_dir = os.path.dirname(answer_file)
    # A bare file name has an empty dirname, and os.makedirs("") raises.
    if answer_dir:
        os.makedirs(answer_dir, exist_ok=True)
    print(f"Output to {answer_file}")

    run_evaluation(config, args)
    reorg_answer_file(answer_file)


if __name__ == "__main__":
    main()
0 commit comments