Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from enum import Enum
from typing import Any, Dict, Optional

-from fastchat.llm_judge.common import load_questions
+from angelslim.utils.lazy_imports import fastchat

from .generate_baseline_answer import get_model_answers as get_baseline_answers
from .generate_eagle_answer import get_model_answers as get_eagle_answers
Expand Down Expand Up @@ -147,7 +147,7 @@ def _run_eagle_benchmark(self):
os.makedirs(os.path.dirname(self.eagle_file), exist_ok=True)

question_file = self._get_question_file_path()
-questions = load_questions(
+questions = fastchat.llm_judge.common.load_questions(
question_file,
self.config.question_begin,
self.config.question_end,
Expand Down Expand Up @@ -211,7 +211,7 @@ def _run_baseline_benchmark(self):
os.makedirs(os.path.dirname(self.baseline_file), exist_ok=True)

question_file = self._get_question_file_path()
-questions = load_questions(
+questions = fastchat.llm_judge.common.load_questions(
question_file,
self.config.question_begin,
self.config.question_end,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+from __future__ import annotations
+
import argparse
import json
import multiprocessing as mp
Expand All @@ -21,12 +23,11 @@
from typing import Any, Dict, List, Optional

import numpy as np
-import shortuuid
import torch
-from fastchat.llm_judge.common import load_questions
from tqdm import tqdm
from transformers import AutoTokenizer
-from vllm import LLM, SamplingParams

+from angelslim.utils.lazy_imports import fastchat, shortuuid, vllm
+
SYSTEM_PROMPT = {
"role": "system",
Expand Down Expand Up @@ -84,9 +85,9 @@ def setup_seed(seed: int) -> None:
torch.backends.cudnn.deterministic = True


-def initialize_model(config: EvaluationConfig, args: argparse.Namespace) -> LLM:
+def initialize_model(config: EvaluationConfig, args: argparse.Namespace):
"""Initialize and return the vLLM model"""
-llm = LLM(
+llm = vllm.LLM(
model=config.base_model_path,
tensor_parallel_size=args.num_gpus_per_model,
trust_remote_code=True,
Expand All @@ -97,7 +98,7 @@ def initialize_model(config: EvaluationConfig, args: argparse.Namespace) -> LLM:


def process_conversation_turn(
-llm: LLM,
+llm: vllm.LLM,
tokenizer: Any,
conv: List[Dict[str, str]],
qs: str,
Expand All @@ -109,7 +110,7 @@ def process_conversation_turn(
conv, tokenize=False, add_generation_prompt=True
)

-sampling_params = SamplingParams(**kwargs)
+sampling_params = vllm.SamplingParams(**kwargs)

start_time = time.time()
outputs = llm.generate([conversation], sampling_params)
Expand All @@ -128,7 +129,7 @@ def process_conversation_turn(


def generate_answer_for_question(
-llm: LLM,
+llm: vllm.LLM,
tokenizer: Any,
question: Dict[str, Any],
num_choices: int,
Expand Down Expand Up @@ -162,7 +163,7 @@ def generate_answer_for_question(


def warmup_model(
-llm: LLM,
+llm: vllm.LLM,
tokenizer: Any,
question: Dict[str, Any],
temperature: float,
Expand Down Expand Up @@ -258,7 +259,7 @@ def get_model_answers(
)
prompts.append(prompt)

-sampling_params = SamplingParams(
+sampling_params = vllm.SamplingParams(
temperature=temperature,
max_tokens=config.max_tokens,
top_k=config.top_k,
Expand Down Expand Up @@ -317,7 +318,7 @@ def get_model_answers(

def run_evaluation(config: EvaluationConfig, args: argparse.Namespace) -> None:
"""Run the evaluation. Standalone execution is single-process."""
-questions = load_questions(
+questions = fastchat.llm_judge.common.load_questions(
config.question_file, args.question_begin, args.question_end
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+from __future__ import annotations
+
import argparse
import json
import multiprocessing as mp
Expand All @@ -21,12 +23,11 @@
from typing import Any, Dict, List, Optional

import numpy as np
-import shortuuid
import torch
-from fastchat.llm_judge.common import load_questions
from tqdm import tqdm
from transformers import AutoTokenizer
-from vllm import LLM, SamplingParams

+from angelslim.utils.lazy_imports import fastchat, shortuuid, vllm
+
SYSTEM_PROMPT = {
"role": "system",
Expand All @@ -43,7 +44,7 @@
}


-def calculate_acceptance_length(llm: LLM) -> float | None:
+def calculate_acceptance_length(llm) -> float | None:
"""Calculate average acceptance length from vLLM metrics."""
try:
metrics = llm.get_metrics()
Expand Down Expand Up @@ -116,14 +117,14 @@ def setup_seed(seed: int) -> None:
torch.backends.cudnn.deterministic = True


-def initialize_model(config: EvaluationConfig, args: argparse.Namespace) -> LLM:
+def initialize_model(config: EvaluationConfig, args: argparse.Namespace):
"""Initialize and return the vLLM model with speculative decoding"""
speculative_config = {
"method": "eagle3",
"model": config.eagle_model_path,
"num_speculative_tokens": config.depth,
}
-llm = LLM(
+llm = vllm.LLM(
model=config.base_model_path,
tensor_parallel_size=args.num_gpus_per_model,
trust_remote_code=True,
Expand All @@ -136,7 +137,7 @@ def initialize_model(config: EvaluationConfig, args: argparse.Namespace) -> LLM:


def process_conversation_turn(
-llm: LLM,
+llm,
tokenizer: Any,
conv: List[Dict[str, str]],
qs: str,
Expand All @@ -148,7 +149,7 @@ def process_conversation_turn(
conv, tokenize=False, add_generation_prompt=True
)

-sampling_params = SamplingParams(**kwargs)
+sampling_params = vllm.SamplingParams(**kwargs)

start_time = time.time()
outputs = llm.generate([conversation], sampling_params)
Expand Down Expand Up @@ -176,7 +177,7 @@ def process_conversation_turn(


def generate_answer_for_question(
-llm: LLM,
+llm: vllm.LLM,
tokenizer: Any,
question: Dict[str, Any],
num_choices: int,
Expand Down Expand Up @@ -210,7 +211,7 @@ def generate_answer_for_question(


def warmup_model(
-llm: LLM,
+llm: vllm.LLM,
tokenizer: Any,
question: Dict[str, Any],
temperature: float,
Expand Down Expand Up @@ -304,7 +305,7 @@ def get_model_answers(
)
prompts.append(prompt)

-sampling_params = SamplingParams(
+sampling_params = vllm.SamplingParams(
temperature=temperature,
max_tokens=config.max_tokens,
top_k=config.top_k,
Expand Down Expand Up @@ -369,7 +370,7 @@ def get_model_answers(

def run_evaluation(config: EvaluationConfig, args: argparse.Namespace) -> List[Any]:
"""Run the evaluation. Standalone execution is single-process."""
-questions = load_questions(
+questions = fastchat.llm_judge.common.load_questions(
config.question_file, args.question_begin, args.question_end
)

Expand Down
2 changes: 2 additions & 0 deletions angelslim/utils/lazy_imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ def __getattr__(self, name: str) -> Any:
jsonschema_specifications = LazyModule("jsonschema_specifications", "speculative")
referencing = LazyModule("referencing", "speculative")
deepspeed = LazyModule("deepspeed", "speculative")
+vllm = LazyModule("vllm", "speculative")
+shortuuid = LazyModule("shortuuid", "speculative")


# --- multimodal related lazy imports ---
Expand Down
3 changes: 2 additions & 1 deletion requirements/requirements_speculative.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ ray
referencing
jsonschema_specifications
deepspeed
-wandb
+wandb
+vllm>=0.11.0