diff --git a/angelslim/compressor/speculative/benchmark/vllm/benchmark_engine.py b/angelslim/compressor/speculative/benchmark/vllm/benchmark_engine.py index 92567f5e..f93dbeda 100644 --- a/angelslim/compressor/speculative/benchmark/vllm/benchmark_engine.py +++ b/angelslim/compressor/speculative/benchmark/vllm/benchmark_engine.py @@ -20,7 +20,7 @@ from enum import Enum from typing import Any, Dict, Optional -from fastchat.llm_judge.common import load_questions +from angelslim.utils.lazy_imports import fastchat from .generate_baseline_answer import get_model_answers as get_baseline_answers from .generate_eagle_answer import get_model_answers as get_eagle_answers @@ -147,7 +147,7 @@ def _run_eagle_benchmark(self): os.makedirs(os.path.dirname(self.eagle_file), exist_ok=True) question_file = self._get_question_file_path() - questions = load_questions( + questions = fastchat.llm_judge.common.load_questions( question_file, self.config.question_begin, self.config.question_end, @@ -211,7 +211,7 @@ def _run_baseline_benchmark(self): os.makedirs(os.path.dirname(self.baseline_file), exist_ok=True) question_file = self._get_question_file_path() - questions = load_questions( + questions = fastchat.llm_judge.common.load_questions( question_file, self.config.question_begin, self.config.question_end, diff --git a/angelslim/compressor/speculative/benchmark/vllm/generate_baseline_answer.py b/angelslim/compressor/speculative/benchmark/vllm/generate_baseline_answer.py index 85f6f8fe..48979ac4 100644 --- a/angelslim/compressor/speculative/benchmark/vllm/generate_baseline_answer.py +++ b/angelslim/compressor/speculative/benchmark/vllm/generate_baseline_answer.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + import argparse import json import multiprocessing as mp @@ -21,12 +23,11 @@ from typing import Any, Dict, List, Optional import numpy as np -import shortuuid import torch -from fastchat.llm_judge.common import load_questions from tqdm import tqdm from transformers import AutoTokenizer -from vllm import LLM, SamplingParams + +from angelslim.utils.lazy_imports import fastchat, shortuuid, vllm SYSTEM_PROMPT = { "role": "system", @@ -84,9 +85,9 @@ def setup_seed(seed: int) -> None: torch.backends.cudnn.deterministic = True -def initialize_model(config: EvaluationConfig, args: argparse.Namespace) -> LLM: +def initialize_model(config: EvaluationConfig, args: argparse.Namespace): """Initialize and return the vLLM model""" - llm = LLM( + llm = vllm.LLM( model=config.base_model_path, tensor_parallel_size=args.num_gpus_per_model, trust_remote_code=True, @@ -97,7 +98,7 @@ def initialize_model(config: EvaluationConfig, args: argparse.Namespace) -> LLM: def process_conversation_turn( - llm: LLM, + llm: vllm.LLM, tokenizer: Any, conv: List[Dict[str, str]], qs: str, @@ -109,7 +110,7 @@ def process_conversation_turn( conv, tokenize=False, add_generation_prompt=True ) - sampling_params = SamplingParams(**kwargs) + sampling_params = vllm.SamplingParams(**kwargs) start_time = time.time() outputs = llm.generate([conversation], sampling_params) @@ -128,7 +129,7 @@ def process_conversation_turn( def generate_answer_for_question( - llm: LLM, + llm: vllm.LLM, tokenizer: Any, question: Dict[str, Any], num_choices: int, @@ -162,7 +163,7 @@ def generate_answer_for_question( def warmup_model( - llm: LLM, + llm: vllm.LLM, tokenizer: Any, question: Dict[str, Any], temperature: float, @@ -258,7 +259,7 @@ def get_model_answers( ) prompts.append(prompt) - sampling_params = SamplingParams( + sampling_params = vllm.SamplingParams( temperature=temperature, max_tokens=config.max_tokens, top_k=config.top_k, @@ -317,7 +318,7 @@ def get_model_answers( def run_evaluation(config: EvaluationConfig, args: argparse.Namespace) -> None: """Run the evaluation. Standalone execution is single-process.""" - questions = load_questions( + questions = fastchat.llm_judge.common.load_questions( config.question_file, args.question_begin, args.question_end ) diff --git a/angelslim/compressor/speculative/benchmark/vllm/generate_eagle_answer.py b/angelslim/compressor/speculative/benchmark/vllm/generate_eagle_answer.py index d221e184..c95ff5a4 100644 --- a/angelslim/compressor/speculative/benchmark/vllm/generate_eagle_answer.py +++ b/angelslim/compressor/speculative/benchmark/vllm/generate_eagle_answer.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + import argparse import json import multiprocessing as mp @@ -21,12 +23,11 @@ from typing import Any, Dict, List, Optional import numpy as np -import shortuuid import torch -from fastchat.llm_judge.common import load_questions from tqdm import tqdm from transformers import AutoTokenizer -from vllm import LLM, SamplingParams + +from angelslim.utils.lazy_imports import fastchat, shortuuid, vllm SYSTEM_PROMPT = { "role": "system", @@ -43,7 +44,7 @@ } -def calculate_acceptance_length(llm: LLM) -> float | None: +def calculate_acceptance_length(llm) -> float | None: """Calculate average acceptance length from vLLM metrics.""" try: metrics = llm.get_metrics() @@ -116,14 +117,14 @@ def setup_seed(seed: int) -> None: torch.backends.cudnn.deterministic = True -def initialize_model(config: EvaluationConfig, args: argparse.Namespace) -> LLM: +def initialize_model(config: EvaluationConfig, args: argparse.Namespace): """Initialize and return the vLLM model with speculative decoding""" speculative_config = { "method": "eagle3", "model": config.eagle_model_path, "num_speculative_tokens": config.depth, } - llm = LLM( + llm = vllm.LLM( model=config.base_model_path, tensor_parallel_size=args.num_gpus_per_model, trust_remote_code=True, @@ -136,7 +137,7 @@ def initialize_model(config: EvaluationConfig, args: argparse.Namespace) -> LLM: def process_conversation_turn( - llm: LLM, + llm, tokenizer: Any, conv: List[Dict[str, str]], qs: str, @@ -148,7 +149,7 @@ def process_conversation_turn( conv, tokenize=False, add_generation_prompt=True ) - sampling_params = SamplingParams(**kwargs) + sampling_params = vllm.SamplingParams(**kwargs) start_time = time.time() outputs = llm.generate([conversation], sampling_params) @@ -176,7 +177,7 @@ def process_conversation_turn( def generate_answer_for_question( - llm: LLM, + llm: vllm.LLM, tokenizer: Any, question: Dict[str, Any], num_choices: int, @@ -210,7 +211,7 @@ def generate_answer_for_question( def warmup_model( - llm: LLM, + llm: vllm.LLM, tokenizer: Any, question: Dict[str, Any], temperature: float, @@ -304,7 +305,7 @@ def get_model_answers( ) prompts.append(prompt) - sampling_params = SamplingParams( + sampling_params = vllm.SamplingParams( temperature=temperature, max_tokens=config.max_tokens, top_k=config.top_k, @@ -369,7 +370,7 @@ def get_model_answers( def run_evaluation(config: EvaluationConfig, args: argparse.Namespace) -> List[Any]: """Run the evaluation. Standalone execution is single-process.""" - questions = load_questions( + questions = fastchat.llm_judge.common.load_questions( config.question_file, args.question_begin, args.question_end ) diff --git a/angelslim/utils/lazy_imports.py b/angelslim/utils/lazy_imports.py index 2db14ea4..d7cceccb 100644 --- a/angelslim/utils/lazy_imports.py +++ b/angelslim/utils/lazy_imports.py @@ -201,6 +201,8 @@ def __getattr__(self, name: str) -> Any: jsonschema_specifications = LazyModule("jsonschema_specifications", "speculative") referencing = LazyModule("referencing", "speculative") deepspeed = LazyModule("deepspeed", "speculative") +vllm = LazyModule("vllm", "speculative") +shortuuid = LazyModule("shortuuid", "speculative") # --- multimodal related lazy imports --- diff --git a/requirements/requirements_speculative.txt b/requirements/requirements_speculative.txt index 98ace8bc..1a8965a2 100644 --- a/requirements/requirements_speculative.txt +++ b/requirements/requirements_speculative.txt @@ -5,4 +5,5 @@ ray referencing jsonschema_specifications deepspeed -wandb \ No newline at end of file +wandb +vllm>=0.11.0 \ No newline at end of file