Skip to content

Commit 34889df

Browse files
lewtun and codex authored
Add support for vllm >= 0.19.0 (#1211)
* Fix vLLM 0.11 compatibility and restore hellaswag_cf
  Co-authored-by: OpenAI Codex <codex@openai.com>
* Support vLLM 0.19 prompt schema
  Co-authored-by: OpenAI Codex <codex@openai.com>
* Address vLLM PR review feedback
  Co-authored-by: OpenAI Codex <codex@openai.com>
* Remove temporary hellaswag_cf task
  Co-authored-by: OpenAI Codex <codex@openai.com>
* Clarify vLLM compatibility branches
  Co-authored-by: OpenAI Codex <codex@openai.com>
* Handle tied MCQ logits in slow sample comparisons
  Co-authored-by: OpenAI Codex <codex@openai.com>
* Handle flat VLM token outputs in tie checks
  Co-authored-by: OpenAI Codex <codex@openai.com>
---------
Co-authored-by: OpenAI Codex <codex@openai.com>
1 parent e274b37 commit 34889df

4 files changed

Lines changed: 217 additions & 8 deletions

File tree

src/lighteval/models/vllm/vllm_model.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,13 @@
4444
logger = logging.getLogger(__name__)
4545

4646

47+
def build_vllm_token_prompts(inputs: list[list[int]]) -> list:
    """Wrap each pre-tokenized request in a vLLM ``TokensPrompt``.

    Args:
        inputs: One list of token ids per request.

    Returns:
        One ``TokensPrompt`` per entry of ``inputs``, in the same order.
    """
    # Imported at call time: this helper is defined outside the
    # `is_package_available("vllm")` guard, so vLLM must not be required at import.
    from vllm.inputs import TokensPrompt

    prompts = []
    for token_ids in inputs:
        prompts.append(TokensPrompt(prompt_token_ids=token_ids))
    return prompts
52+
53+
4754
if is_package_available("vllm"):
4855
import ray
4956
from more_itertools import distribute
@@ -52,9 +59,15 @@
5259
destroy_distributed_environment,
5360
destroy_model_parallel,
5461
)
55-
from vllm.tokenizers import get_tokenizer
5662
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
5763

64+
try:
65+
# vLLM moved `get_tokenizer` to `vllm.tokenizers` in v0.12.0.
66+
# Keep the fallback while our lower bound remains on v0.11.x.
67+
from vllm.tokenizers import get_tokenizer
68+
except ModuleNotFoundError:
69+
from vllm.transformers_utils.tokenizer import get_tokenizer
70+
5871
logging.getLogger("vllm").propagate = True
5972
logging.getLogger("vllm").handlers.clear()
6073

@@ -302,6 +315,7 @@ def _create_auto_tokenizer(self, config: VLLMModelConfig):
302315
trust_remote_code=config.trust_remote_code,
303316
revision=config.revision,
304317
)
318+
305319
tokenizer.pad_token = tokenizer.eos_token
306320
return tokenizer
307321

@@ -439,8 +453,7 @@ def _generate(
439453
@ray.remote(num_gpus=self.tensor_parallel_size)
440454
def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests):
441455
llm = LLM(**model_args)
442-
# Convert token IDs to TokensPrompt format for vLLM v0.15+
443-
prompts = [{"prompt_token_ids": req} for req in requests]
456+
prompts = build_vllm_token_prompts(requests)
444457
return llm.generate(prompts=prompts, sampling_params=sampling_params)
445458

446459
# dispatch requests to all self.data_parallel_size workers, in interleaved fashion
@@ -458,10 +471,7 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, r
458471
if x is not None
459472
]
460473
else:
461-
from vllm.inputs import TokenInputs
462-
463-
# Convert token IDs to TokensPrompt format for vLLM v0.15+
464-
prompts = [TokenInputs(prompt_token_ids=token_ids) for token_ids in inputs]
474+
prompts = build_vllm_token_prompts(inputs)
465475
outputs = self.model.generate(
466476
prompts=prompts,
467477
sampling_params=sampling_params,

tests/slow_tests/sample_comparison.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@
2727
from datasets import Dataset
2828

2929

30+
LOGIT_TIE_EPSILON = 0.05
31+
32+
3033
def _to_plain_list(value):
3134
"""convert a list of tensors to a list of plain values"""
3235
new_value = []
@@ -37,6 +40,83 @@ def _to_plain_list(value):
3740
return new_value
3841

3942

43+
def to_plain_data(value):
    """Convert nested tensor-like values to plain Python data.

    Any object exposing a ``tolist()`` method is converted through it first.
    Lists and tuples are then walked recursively so that tensor-like objects
    nested inside them are converted as well. Tuples come back as lists, so
    callers only ever have to handle one sequence type.

    Args:
        value: A scalar, a tensor-like object, or a (possibly nested) list or
            tuple of those.

    Returns:
        The same data expressed with plain Python lists and scalars. Values
        that are neither tensor-like nor a list/tuple are returned unchanged.
    """
    if hasattr(value, "tolist"):
        value = value.tolist()

    # Tuples are walked too; otherwise tensor-likes inside them stay unconverted.
    if isinstance(value, (list, tuple)):
        return [to_plain_data(item) for item in value]

    return value
52+
53+
54+
def first_generated_token_id(model_response: dict) -> int | None:
    """Return the first generated token id for the first sequence.

    Both layouts of ``output_tokens`` are accepted: nested
    (``[[tok, ...], ...]``, one list per sequence) and flat (``[tok, ...]``).

    Args:
        model_response: Mapping that may carry an ``"output_tokens"`` entry.

    Returns:
        The first token id, or ``None`` when ``output_tokens`` is missing,
        empty, not a sequence, or its first sequence is empty.
    """
    raw_tokens = model_response.get("output_tokens")
    if raw_tokens is None:
        return None

    # Convert before any truthiness test: array-like containers (e.g. NumPy
    # arrays or torch tensors with several elements) raise in a boolean context.
    output_tokens = to_plain_data(raw_tokens)
    if not isinstance(output_tokens, list) or not output_tokens:
        return None

    first_sequence = output_tokens[0]
    if isinstance(first_sequence, list):
        # Nested layout: take the head of the first sequence, if it has one.
        return first_sequence[0] if first_sequence else None

    # Flat layout: the first element is the token id itself. No truthiness
    # check here, because 0 is a legitimate token id.
    return first_sequence
65+
66+
67+
def first_step_logits(model_response: dict) -> list[float] | None:
    """Return the logits row for the first generated token, if available.

    ``model_response["logits"]`` is converted to plain Python data first. When
    its first element is itself a list, that element is returned (one row per
    step); otherwise the converted value is returned as-is (a single flat row).

    Returns:
        The first-step logits, or ``None`` when logits are missing or empty.
    """
    plain_logits = to_plain_data(model_response.get("logits"))
    if plain_logits:
        head = plain_logits[0]
        return head if isinstance(head, list) else plain_logits

    return None
78+
79+
80+
def is_within_logit_tie_margin(logits: list[float], token_id: int, epsilon: float = LOGIT_TIE_EPSILON) -> bool:
    """Tell whether ``token_id``'s logit lies within ``epsilon`` of the row maximum.

    Args:
        logits: One logit per vocabulary index.
        token_id: Index into ``logits`` to check.
        epsilon: Largest allowed gap to the maximum logit (inclusive).

    Returns:
        ``False`` when ``token_id`` is outside ``logits``; otherwise whether the
        gap between the maximum logit and ``logits[token_id]`` is at most ``epsilon``.
    """
    if not 0 <= token_id < len(logits):
        return False

    best_logit = max(logits)
    return best_logit - logits[token_id] <= epsilon
86+
87+
88+
def is_tied_choice_prediction(current: dict, reference: dict, epsilon: float = LOGIT_TIE_EPSILON) -> bool:
    """Decide whether two differing multiple-choice predictions are a logit tie.

    A tie requires all of the following:
      * both samples carry the same non-empty ``doc["choices"]`` with at least two entries;
      * both have a first generated token, and those tokens differ;
      * in the reference first-step logits, both tokens are within ``epsilon`` of the maximum;
      * the same holds in the current first-step logits, when the current sample has any.

    Args:
        current: Sample detail from the current run.
        reference: Sample detail from the reference run.
        epsilon: Tie margin forwarded to ``is_within_logit_tie_margin``.

    Returns:
        ``True`` only when every condition above is met.
    """
    choices = current.get("doc", {}).get("choices")
    expected_choices = reference.get("doc", {}).get("choices")
    if not choices or choices != expected_choices or len(choices) < 2:
        return False

    response = current.get("model_response", {})
    expected_response = reference.get("model_response", {})

    token = first_generated_token_id(response)
    expected_token = first_generated_token_id(expected_response)
    if token is None or expected_token is None or token == expected_token:
        return False

    # Reference logits are mandatory: without them a tie cannot be established.
    expected_logits = first_step_logits(expected_response)
    if expected_logits is None:
        return False

    candidates = (token, expected_token)
    if not all(is_within_logit_tie_margin(expected_logits, candidate, epsilon) for candidate in candidates):
        return False

    # Current logits are optional; when present they must agree on the tie.
    observed_logits = first_step_logits(response)
    if observed_logits is None:
        return True

    return all(is_within_logit_tie_margin(observed_logits, candidate, epsilon) for candidate in candidates)
118+
119+
40120
def load_sample_details(details_dir: str):
41121
"""Load sample-level details from parquet files in the details directory."""
42122
details = {}
@@ -140,6 +220,10 @@ def _compare_single_sample(current, reference, sample_index):
140220
if "doc" in current and "doc" in reference:
141221
sample_diff.update(_compare_doc_info(current, reference))
142222

223+
if sample_diff and set(sample_diff).issubset({"output_tokens_difference", "metric_differences"}):
224+
if is_tied_choice_prediction(current, reference):
225+
return {}
226+
143227
if sample_diff:
144228
sample_diff["sample_index"] = sample_index
145229

tests/unit/models/vllm/test_vllm_model.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,36 @@
2020
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2121
# SOFTWARE.
2222

23+
import sys
2324
import unittest
25+
from types import ModuleType
2426
from unittest.mock import Mock, patch
2527

2628
from transformers import AutoTokenizer
2729

28-
from lighteval.models.vllm.vllm_model import VLLMModel, VLLMModelConfig
30+
from lighteval.models.vllm.vllm_model import VLLMModel, VLLMModelConfig, build_vllm_token_prompts
31+
32+
33+
class TestVLLMPromptConstruction(unittest.TestCase):
    """Checks that token prompts are built through ``vllm.inputs.TokensPrompt``."""

    def test_build_vllm_token_prompts_uses_tokens_prompt_when_available(self):
        # Stand-in for vLLM's TokensPrompt that records how it was called.
        def fake_tokens_prompt(*, prompt_token_ids):
            return {
                "kind": "tokens_prompt",
                "prompt_token_ids": prompt_token_ids,
            }

        inputs_module = ModuleType("vllm.inputs")
        inputs_module.TokensPrompt = fake_tokens_prompt
        vllm_module = ModuleType("vllm")
        vllm_module.inputs = inputs_module
        stubbed_modules = {"vllm": vllm_module, "vllm.inputs": inputs_module}

        # The helper imports TokensPrompt at call time, so patching sys.modules
        # around the call is enough to route it to the stub.
        with patch.dict(sys.modules, stubbed_modules):
            prompts = build_vllm_token_prompts([[1, 2], [3]])

        expected_prompts = [
            {"kind": "tokens_prompt", "prompt_token_ids": [1, 2]},
            {"kind": "tokens_prompt", "prompt_token_ids": [3]},
        ]
        self.assertEqual(prompts, expected_prompts)
2953

3054

3155
class TestVLLMTokenizerCreation(unittest.TestCase):
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# MIT License
2+
3+
# Copyright (c) 2024 The HuggingFace Team
4+
5+
from dataclasses import asdict, dataclass
6+
7+
from lighteval.models.model_output import ModelResponse
8+
from tests.slow_tests.sample_comparison import compare_sample_details
9+
10+
11+
@dataclass
class DetailSample:
    """Minimal stand-in for one sample-level detail record used by these tests."""

    # Document fields; the fixtures below fill in "query" and "choices".
    doc: dict
    # Metric name -> value for this sample.
    metric: dict
    # Model output for this sample (tokens and, optionally, logits).
    model_response: ModelResponse
16+
17+
18+
def make_logits(logit_b: float, logit_c: float) -> list[list[float]]:
    """Build a one-step logits fixture with 40 entries.

    Index 33 holds ``logit_b``, index 34 holds ``logit_c``, and every other
    entry is ``0.0``. The single row is wrapped in an outer list.
    """
    overrides = {33: logit_b, 34: logit_c}
    return [[overrides.get(index, 0.0) for index in range(40)]]
23+
24+
25+
def make_current_detail(
    token_id: int,
    logit_b: float,
    logit_c: float,
    metric: float,
    *,
    flat_output_tokens: bool = False,
    include_logits: bool = True,
) -> DetailSample:
    """Build the "current run" sample as a ``DetailSample``.

    Args:
        token_id: First generated token id.
        logit_b: Logit stored at index 33 of the fixture logits.
        logit_c: Logit stored at index 34 of the fixture logits.
        metric: Value stored under ``"extractive_match"``.
        flat_output_tokens: Emit ``[token_id, 151645]`` instead of the nested
            ``[[token_id, 151645]]`` layout.
        include_logits: When ``False``, the response carries ``logits=None``.
    """
    sequence = [token_id, 151645]
    response = ModelResponse(
        output_tokens=sequence if flat_output_tokens else [sequence],
        logits=make_logits(logit_b, logit_c) if include_logits else None,
    )

    return DetailSample(
        doc={"query": "query", "choices": ["A", "B", "C", "D"]},
        metric={"extractive_match": metric},
        model_response=response,
    )
44+
45+
46+
def make_reference_detail(token_id: int, logit_b: float, logit_c: float, metric: float) -> dict:
    """Build the "reference run" sample as a plain dict.

    Unlike ``make_current_detail``, the model response is flattened with
    ``dataclasses.asdict`` and always uses nested output tokens with logits.
    """
    response = ModelResponse(
        output_tokens=[[token_id, 151645]],
        logits=make_logits(logit_b, logit_c),
    )

    detail = {
        "doc": {"query": "query", "choices": ["A", "B", "C", "D"]},
        "metric": {"extractive_match": metric},
        "model_response": asdict(response),
    }
    return detail
57+
58+
59+
def test_compare_sample_details_ignores_tied_multiple_choice_predictions():
    # Current run: picks token 34, flat output tokens, no logits recorded.
    current_sample = make_current_detail(
        token_id=34,
        logit_b=10.0,
        logit_c=10.0,
        metric=1.0,
        flat_output_tokens=True,
        include_logits=False,
    )
    # Reference run: picks token 33, with equal logits at indices 33 and 34.
    reference_sample = make_reference_detail(token_id=33, logit_b=10.0, logit_c=10.0, metric=0.0)

    differences = compare_sample_details({"task": [current_sample]}, {"task": [reference_sample]})

    assert differences == {}
77+
78+
79+
def test_compare_sample_details_keeps_non_tied_multiple_choice_predictions():
    # The two runs disagree by a full logit unit, well outside the tie margin,
    # so the mismatch must still be reported.
    current_sample = make_current_detail(token_id=34, logit_b=9.0, logit_c=10.0, metric=1.0)
    reference_sample = make_reference_detail(token_id=33, logit_b=10.0, logit_c=9.0, metric=0.0)

    differences = compare_sample_details({"task": [current_sample]}, {"task": [reference_sample]})

    first_difference = differences["task"][0]
    assert first_difference["sample_index"] == 0
    assert "output_tokens_difference" in first_difference
    assert "metric_differences" in first_difference

0 commit comments

Comments
 (0)