Tencent · RuBing-Yang · Nov 5, 2025 · Nov 5, 2025 · Nov 5, 2025 · Nov 5, 2025
diff --git a/angelslim/compressor/speculative/benchmark/vllm/benchmark_engine.py b/angelslim/compressor/speculative/benchmark/vllm/benchmark_engine.py
@@ -20,7 +20,7 @@
 from enum import Enum
 from typing import Any, Dict, Optional
 
-from fastchat.llm_judge.common import load_questions
+from angelslim.utils.lazy_imports import fastchat
 
 from .generate_baseline_answer import get_model_answers as get_baseline_answers
 from .generate_eagle_answer import get_model_answers as get_eagle_answers
@@ -147,7 +147,7 @@ def _run_eagle_benchmark(self):
         os.makedirs(os.path.dirname(self.eagle_file), exist_ok=True)
 
         question_file = self._get_question_file_path()
-        questions = load_questions(
+        questions = fastchat.llm_judge.common.load_questions(
             question_file,
             self.config.question_begin,
             self.config.question_end,
@@ -211,7 +211,7 @@ def _run_baseline_benchmark(self):
         os.makedirs(os.path.dirname(self.baseline_file), exist_ok=True)
 
         question_file = self._get_question_file_path()
-        questions = load_questions(
+        questions = fastchat.llm_judge.common.load_questions(
             question_file,
             self.config.question_begin,
             self.config.question_end,

diff --git a/angelslim/compressor/speculative/benchmark/vllm/generate_baseline_answer.py b/angelslim/compressor/speculative/benchmark/vllm/generate_baseline_answer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import argparse
 import json
 import multiprocessing as mp
@@ -21,12 +23,11 @@
 from typing import Any, Dict, List, Optional
 
 import numpy as np
-import shortuuid
 import torch
-from fastchat.llm_judge.common import load_questions
 from tqdm import tqdm
 from transformers import AutoTokenizer
-from vllm import LLM, SamplingParams
+
+from angelslim.utils.lazy_imports import fastchat, shortuuid, vllm
 
 SYSTEM_PROMPT = {
     "role": "system",
@@ -84,9 +85,9 @@ def setup_seed(seed: int) -> None:
     torch.backends.cudnn.deterministic = True
 
 
-def initialize_model(config: EvaluationConfig, args: argparse.Namespace) -> LLM:
+def initialize_model(config: EvaluationConfig, args: argparse.Namespace):
     """Initialize and return the vLLM model"""
-    llm = LLM(
+    llm = vllm.LLM(
         model=config.base_model_path,
         tensor_parallel_size=args.num_gpus_per_model,
         trust_remote_code=True,
@@ -97,7 +98,7 @@ def initialize_model(config: EvaluationConfig, args: argparse.Namespace) -> LLM:
 
 
 def process_conversation_turn(
-    llm: LLM,
+    llm: vllm.LLM,
     tokenizer: Any,
     conv: List[Dict[str, str]],
     qs: str,
@@ -109,7 +110,7 @@ def process_conversation_turn(
         conv, tokenize=False, add_generation_prompt=True
     )
 
-    sampling_params = SamplingParams(**kwargs)
+    sampling_params = vllm.SamplingParams(**kwargs)
 
     start_time = time.time()
     outputs = llm.generate([conversation], sampling_params)
@@ -128,7 +129,7 @@ def process_conversation_turn(
 
 
 def generate_answer_for_question(
-    llm: LLM,
+    llm: vllm.LLM,
     tokenizer: Any,
     question: Dict[str, Any],
     num_choices: int,
@@ -162,7 +163,7 @@ def generate_answer_for_question(
 
 
 def warmup_model(
-    llm: LLM,
+    llm: vllm.LLM,
     tokenizer: Any,
     question: Dict[str, Any],
     temperature: float,
@@ -258,7 +259,7 @@ def get_model_answers(
                         )
                         prompts.append(prompt)
 
-                sampling_params = SamplingParams(
+                sampling_params = vllm.SamplingParams(
                     temperature=temperature,
                     max_tokens=config.max_tokens,
                     top_k=config.top_k,
@@ -317,7 +318,7 @@ def get_model_answers(
 
 def run_evaluation(config: EvaluationConfig, args: argparse.Namespace) -> None:
     """Run the evaluation. Standalone execution is single-process."""
-    questions = load_questions(
+    questions = fastchat.llm_judge.common.load_questions(
         config.question_file, args.question_begin, args.question_end
     )
 

diff --git a/angelslim/compressor/speculative/benchmark/vllm/generate_eagle_answer.py b/angelslim/compressor/speculative/benchmark/vllm/generate_eagle_answer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import argparse
 import json
 import multiprocessing as mp
@@ -21,12 +23,11 @@
 from typing import Any, Dict, List, Optional
 
 import numpy as np
-import shortuuid
 import torch
-from fastchat.llm_judge.common import load_questions
 from tqdm import tqdm
 from transformers import AutoTokenizer
-from vllm import LLM, SamplingParams
+
+from angelslim.utils.lazy_imports import fastchat, shortuuid, vllm
 
 SYSTEM_PROMPT = {
     "role": "system",
@@ -43,7 +44,7 @@
 }
 
 
-def calculate_acceptance_length(llm: LLM) -> float | None:
+def calculate_acceptance_length(llm) -> float | None:
     """Calculate average acceptance length from vLLM metrics."""
     try:
         metrics = llm.get_metrics()
@@ -116,14 +117,14 @@ def setup_seed(seed: int) -> None:
     torch.backends.cudnn.deterministic = True
 
 
-def initialize_model(config: EvaluationConfig, args: argparse.Namespace) -> LLM:
+def initialize_model(config: EvaluationConfig, args: argparse.Namespace):
     """Initialize and return the vLLM model with speculative decoding"""
     speculative_config = {
         "method": "eagle3",
         "model": config.eagle_model_path,
         "num_speculative_tokens": config.depth,
     }
-    llm = LLM(
+    llm = vllm.LLM(
         model=config.base_model_path,
         tensor_parallel_size=args.num_gpus_per_model,
         trust_remote_code=True,
@@ -136,7 +137,7 @@ def initialize_model(config: EvaluationConfig, args: argparse.Namespace) -> LLM:
 
 
 def process_conversation_turn(
-    llm: LLM,
+    llm,
     tokenizer: Any,
     conv: List[Dict[str, str]],
     qs: str,
@@ -148,7 +149,7 @@ def process_conversation_turn(
         conv, tokenize=False, add_generation_prompt=True
     )
 
-    sampling_params = SamplingParams(**kwargs)
+    sampling_params = vllm.SamplingParams(**kwargs)
 
     start_time = time.time()
     outputs = llm.generate([conversation], sampling_params)
@@ -176,7 +177,7 @@ def process_conversation_turn(
 
 
 def generate_answer_for_question(
-    llm: LLM,
+    llm: vllm.LLM,
     tokenizer: Any,
     question: Dict[str, Any],
     num_choices: int,
@@ -210,7 +211,7 @@ def generate_answer_for_question(
 
 
 def warmup_model(
-    llm: LLM,
+    llm: vllm.LLM,
     tokenizer: Any,
     question: Dict[str, Any],
     temperature: float,
@@ -304,7 +305,7 @@ def get_model_answers(
                         )
                         prompts.append(prompt)
 
-                sampling_params = SamplingParams(
+                sampling_params = vllm.SamplingParams(
                     temperature=temperature,
                     max_tokens=config.max_tokens,
                     top_k=config.top_k,
@@ -369,7 +370,7 @@ def get_model_answers(
 
 def run_evaluation(config: EvaluationConfig, args: argparse.Namespace) -> List[Any]:
     """Run the evaluation. Standalone execution is single-process."""
-    questions = load_questions(
+    questions = fastchat.llm_judge.common.load_questions(
         config.question_file, args.question_begin, args.question_end
     )
 

diff --git a/angelslim/utils/lazy_imports.py b/angelslim/utils/lazy_imports.py
@@ -201,6 +201,8 @@ def __getattr__(self, name: str) -> Any:
 jsonschema_specifications = LazyModule("jsonschema_specifications", "speculative")
 referencing = LazyModule("referencing", "speculative")
 deepspeed = LazyModule("deepspeed", "speculative")
+vllm = LazyModule("vllm", "speculative")
+shortuuid = LazyModule("shortuuid", "speculative")
 
 
 # --- multimodal related lazy imports ---

diff --git a/requirements/requirements_speculative.txt b/requirements/requirements_speculative.txt
@@ -5,4 +5,5 @@ ray
 referencing
 jsonschema_specifications
 deepspeed
-wandb
+wandb
+vllm>=0.11.0