
Commit 63c994e

Author: Daniel Zautner (committed)
Message: Sync with upstream huggingface/lighteval and fix ruff formatting
1 parent 3a93167, commit 63c994e

17 files changed: 272 additions & 48 deletions


.github/workflows/slow_tests.yaml
Lines changed: 38 additions & 2 deletions

@@ -35,11 +35,47 @@ jobs:
           enable-cache: true

       - name: Install the project
-        run: uv sync --extra dev
+        run: uv sync --extra dev-gpu

+      - name: Install Python development headers
+        run: sudo apt-get update && sudo apt-get install -y python3.12-dev
+
+      - name: Cache CUDA Toolkit
+        id: cache-cuda
+        uses: actions/cache@v4
+        with:
+          path: /usr/local/cuda-12.8
+          key: cuda-toolkit-12-8-${{ runner.os }}
+
+      - name: Install CUDA Toolkit
+        if: steps.cache-cuda.outputs.cache-hit != 'true'
+        run: |
+          # Add NVIDIA package repositories
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update
+          # Install CUDA toolkit 12.8 to match nvidia-cuda-runtime-cu12==12.8.90
+          sudo apt-get install -y cuda-toolkit-12-8
+
+      - name: Verify CUDA installation
+        run: |
+          ls -la /usr/local/cuda-12.8/bin/nvcc || echo "WARNING: nvcc not found at /usr/local/cuda-12.8/bin/nvcc"
+          if [ -f /usr/local/cuda-12.8/bin/nvcc ]; then
+            /usr/local/cuda-12.8/bin/nvcc --version
+          fi
+
+      - name: Setup CUDA environment
+        run: |
+          export CUDA_HOME=/usr/local/cuda-12.8
+          export PATH="/usr/local/cuda-12.8/bin:$PATH"
+          echo "CUDA_HOME=/usr/local/cuda-12.8" >> $GITHUB_ENV
+          echo "/usr/local/cuda-12.8/bin" >> $GITHUB_PATH

       - name: run nvidia-smi
         run: nvidia-smi

       - name: Run tests
-        run: uv run pytest --disable-pytest-warnings --runslow tests/slow_tests/
+        run: |
+          export CUDA_HOME=/usr/local/cuda-12.8
+          export PATH="/usr/local/cuda-12.8/bin:$PATH"
+          uv run pytest --disable-pytest-warnings --runslow -v -s tests/slow_tests/
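
These steps export CUDA_HOME and put nvcc on PATH so the GPU extras can build against the cached toolkit. A rough local equivalent of the "Verify CUDA installation" step, written as a hypothetical Python helper (the function name and default path are illustrative, not part of this commit):

```python
import os
import shutil
import subprocess


def check_cuda_toolkit(expected_home: str = "/usr/local/cuda-12.8") -> None:
    """Mirror the workflow's CUDA verification step on a local machine."""
    cuda_home = os.environ.get("CUDA_HOME", expected_home)
    nvcc = shutil.which("nvcc") or os.path.join(cuda_home, "bin", "nvcc")

    if not os.path.exists(nvcc):
        print(f"WARNING: nvcc not found at {nvcc}")
        return

    # nvcc --version reports the toolkit release, which should line up with the
    # CUDA 12.8 toolkit the workflow installs and caches.
    result = subprocess.run([nvcc, "--version"], capture_output=True, text=True, check=True)
    print(result.stdout.strip())


if __name__ == "__main__":
    check_cuda_toolkit()
```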

.github/workflows/tests.yaml
Lines changed: 1 addition & 1 deletion

@@ -46,7 +46,7 @@ jobs:
           enable-cache: true

       - name: Install the project
-        run: uv sync --extra dev
+        run: uv sync --extra dev-gpu

       - name: Ensure cache directories exist
         run: mkdir -p cache/models cache/datasets
Lines changed: 79 additions & 0 deletions

@@ -0,0 +1,79 @@
+name: vLLM Main Branch Tests
+
+on:
+  schedule:
+    - cron: '0 2 * * 1' # Every Monday at 2 AM UTC
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  test_vllm_main:
+    name: Test with vLLM main branch
+    runs-on: 'aws-g4dn-2xlarge-use1-public-80'
+    continue-on-error: true
+
+    steps:
+      - name: Install Git LFS
+        run: |
+          if ! command -v git-lfs &> /dev/null; then
+            sudo apt-get update && sudo apt-get install -y git-lfs
+            git lfs install
+          fi
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+
+      - name: Install the project
+        run: uv sync --extra dev-gpu
+
+      - name: Install Python development headers
+        run: sudo apt-get update && sudo apt-get install -y python3.12-dev
+
+      - name: Cache CUDA Toolkit
+        id: cache-cuda
+        uses: actions/cache@v4
+        with:
+          path: /usr/local/cuda-12.8
+          key: cuda-toolkit-12-8-${{ runner.os }}
+
+      - name: Install CUDA Toolkit
+        if: steps.cache-cuda.outputs.cache-hit != 'true'
+        run: |
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update
+          sudo apt-get install -y cuda-toolkit-12-8
+
+      - name: Setup CUDA environment
+        run: |
+          echo "CUDA_HOME=/usr/local/cuda-12.8" >> $GITHUB_ENV
+          echo "/usr/local/cuda-12.8/bin" >> $GITHUB_PATH
+
+      - name: Verify CUDA
+        run: |
+          nvidia-smi
+          nvcc --version
+
+      - name: Install vLLM from main branch
+        run: |
+          uv pip uninstall -y vllm || true
+          uv pip install git+https://github.com/vllm-project/vllm.git@main
+
+      - name: Get vLLM version
+        id: vllm-info
+        run: |
+          VERSION=$(uv run python -c "import vllm; print(vllm.__version__)")
+          echo "version=$VERSION" >> $GITHUB_OUTPUT
+          echo "Testing vLLM version: $VERSION"
+
+      - name: Run tests
+        run: uv run pytest --disable-pytest-warnings --runslow -v -s tests/slow_tests/test_vllm_model.py
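
The scheduled job swaps the pinned vLLM release for a build from the main branch and records vllm.__version__ in the step output. A hedged sketch of the same check as a standalone script (the helper name is made up; only vllm.__version__ is assumed from the library):

```python
import importlib.util


def report_vllm_version() -> str:
    """Print and return the installed vLLM version, or fail loudly if it is missing."""
    if importlib.util.find_spec("vllm") is None:
        raise RuntimeError("vLLM is not installed; run `uv sync --extra dev-gpu` first")

    import vllm

    version = vllm.__version__
    # A build installed from git main usually carries a dev/commit suffix rather than
    # a plain release number, which is what this nightly-style job is meant to exercise.
    print(f"Testing vLLM version: {version}")
    return version


if __name__ == "__main__":
    report_vllm_version()
```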

README.md
Lines changed: 5 additions & 0 deletions

@@ -187,7 +187,12 @@ If you're adding a **new feature**, please *open an issue first*.
 If you open a PR, don't forget to **run the styling**!

 ```bash
+# For basic development (code quality, tests)
 pip install -e ".[dev]"
+
+# Or for GPU/vllm development and slow tests
+pip install -e ".[dev-gpu]"
+
 pre-commit install
 pre-commit run --all-files
 ```

examples/model_configs/vllm_model_config.yaml
Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@ model_parameters:
   tensor_parallel_size: 1
   data_parallel_size: 1
   pipeline_parallel_size: 1
-  gpu_memory_utilization: 0.6
+  gpu_memory_utilization: 0.4
   max_model_length: null
   swap_space: 4
   seed: 42
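
gpu_memory_utilization bounds the fraction of GPU memory vLLM may claim for its engine, so dropping it from 0.6 to 0.4 presumably leaves more headroom when other processes share the test GPU. A sketch of constructing a vLLM engine with a subset of these fields (illustrative model name; only keyword arguments known to exist on vllm.LLM):

```python
from vllm import LLM

# Sketch only: lighteval builds the engine from model_parameters in the YAML;
# this mirrors a few of the example config's values after the change.
llm = LLM(
    model="HuggingFaceTB/SmolLM2-135M-Instruct",  # illustrative, not from the config
    tensor_parallel_size=1,
    pipeline_parallel_size=1,
    gpu_memory_utilization=0.4,  # fraction of GPU memory the engine may use
    swap_space=4,
    seed=42,
)
```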

pyproject.toml
Lines changed: 3 additions & 2 deletions

@@ -103,11 +103,12 @@ nanotron = [
     "tensorboardX"
 ]
 tensorboardX = ["tensorboardX"]
-vllm = ["vllm>=0.10.0,<0.10.2", "ray", "more_itertools"]
+vllm = ["vllm>=0.11.0", "ray", "more_itertools"]
 sglang = ["sglang"]
 quality = ["ruff>=v0.11.0","pre-commit"]
 tests = ["pytest>=7.4.0","deepdiff","pip>=25.2"]
-dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"]
+dev = ["lighteval[quality,tests]"]
+dev-gpu = ["lighteval[dev,multilingual,math,extended_tasks,vllm]"]
 docs = ["hf-doc-builder", "watchdog"]
 extended_tasks = [
     "langdetect", # ifeval

src/lighteval/metrics/utils/llm_as_judge.py
Lines changed: 4 additions & 2 deletions

@@ -168,7 +168,7 @@ def __lazy_load_client(self): # noqa: C901
             raise_if_package_not_available("vllm")
             if self.pipe is None:
                 from vllm import LLM, SamplingParams
-                from vllm.transformers_utils.tokenizer import get_tokenizer
+                from vllm.tokenizers import get_tokenizer

                 self.sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=self.max_tokens)
                 self.tokenizer = get_tokenizer(self.model, tokenizer_mode="auto")

@@ -296,7 +296,9 @@ def __call_transformers(self, prompt):

     def __call_vllm(self, prompt):
         tokenized = [self.tokenizer.apply_chat_template(p) for p in prompt]
-        output = self.pipe.generate(prompt_token_ids=tokenized, sampling_params=self.sampling_params, use_tqdm=True)
+        # Convert token IDs to TokensPrompt format for vLLM v0.15+
+        prompts = [{"prompt_token_ids": token_ids} for token_ids in tokenized]
+        output = self.pipe.generate(prompts=prompts, sampling_params=self.sampling_params, use_tqdm=True)
         outputs = [output.outputs[0].text for output in output]
         return outputs
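
The __call_vllm change reflects newer vLLM versions dropping the prompt_token_ids= keyword on LLM.generate in favor of prompt objects that carry the token IDs; the same pattern appears again in vllm_model.py below. A minimal sketch of the new calling convention, with the model name, sampling settings, and token IDs chosen purely for illustration:

```python
from vllm import LLM, SamplingParams

# Illustrative setup; in lighteval the judge model and prompts come from the caller.
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", gpu_memory_utilization=0.4)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)

# Pre-tokenized prompts are wrapped as TokensPrompt-style dicts instead of being
# passed through the removed prompt_token_ids= keyword.
tokenized = [[151644, 872, 198], [151644, 8948, 198]]  # arbitrary example token IDs
prompts = [{"prompt_token_ids": token_ids} for token_ids in tokenized]

outputs = llm.generate(prompts=prompts, sampling_params=sampling_params, use_tqdm=True)
texts = [out.outputs[0].text for out in outputs]
print(texts)
```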

src/lighteval/models/vllm/vllm_model.py
Lines changed: 28 additions & 15 deletions

@@ -52,7 +52,7 @@
     destroy_distributed_environment,
     destroy_model_parallel,
 )
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM

 logging.getLogger("vllm").propagate = True

@@ -291,7 +291,7 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]:
         # Inferring from the tokenizer will cause vllm to bug for models with mismatches between model
         # config and tk config, like mistralai/Mistral-7B-v0.1
         if self._max_length is None:
-            self._max_length = model.llm_engine.model_config.max_seq_len_to_capture
+            self._max_length = model.llm_engine.model_config.max_model_len

         return model

@@ -415,9 +415,9 @@ def _generate(
         generate: bool = True,
     ) -> list:
         """Contains the actual logic of the generation."""
-        sampling_params = SamplingParams(**self.config.generation_parameters.to_vllm_dict())

         if generate:
+            sampling_params = SamplingParams(**self.config.generation_parameters.to_vllm_dict())
             sampling_params.n = num_samples
             sampling_params.max_tokens = max_new_tokens
             sampling_params.stop = stop_tokens

@@ -427,17 +427,21 @@
                     "num_samples > 1 is not supported with temperature=0, please set temperature > 0 or use non sampling metrics."
                 )
         else:
-            sampling_params.temperature = 0
-            sampling_params.prompt_logprobs = 1
-            sampling_params.max_tokens = 1
-            sampling_params.detokenize = False
+            sampling_params = SamplingParams(
+                temperature=0.0,
+                prompt_logprobs=1,
+                max_tokens=1,
+                detokenize=False,
+            )

         if self.data_parallel_size > 1:

            @ray.remote(num_gpus=self.tensor_parallel_size)
            def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests):
                llm = LLM(**model_args)
-                return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
+                # Convert token IDs to TokensPrompt format for vLLM v0.15+
+                prompts = [{"prompt_token_ids": req} for req in requests]
+                return llm.generate(prompts=prompts, sampling_params=sampling_params)

            # dispatch requests to all self.data_parallel_size workers, in interleaved fashion
            # interleaved important to balance context lengths across workers

@@ -454,8 +458,12 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests):
                 if x is not None
             ]
         else:
+            from vllm.inputs import TokenInputs
+
+            # Convert token IDs to TokensPrompt format for vLLM v0.15+
+            prompts = [TokenInputs(prompt_token_ids=token_ids) for token_ids in inputs]
             outputs = self.model.generate(
-                prompt_token_ids=inputs,
+                prompts=prompts,
                 sampling_params=sampling_params,
                 use_tqdm=True,
             )

@@ -489,9 +497,6 @@ def _loglikelihood_tokens(
             tokenized_continuations_batch.append(tokenized_continuation)
             tokenized_contexts_batch.append(tokenized_context)

-        # Left truncate the inputs to the maximum length
-        if self.max_length:  # can be None if the model is initialized with ray
-            inputs = [input[-self.max_length :] for input in inputs]
         outputs = self._generate(inputs, generate=False)

         flat_index = 0

@@ -507,12 +512,18 @@
             for output, context, continuation in zip(
                 outputs_doc, tokenized_contexts_doc, tokenized_continuations_doc
             ):
+                actual_input_len = len(output.prompt_token_ids)
+                continuation_len = len(continuation)
+                continuation_start_idx = actual_input_len - continuation_len
+                continuation_prompt_logprobs = output.prompt_logprobs[continuation_start_idx:]
+
                 continuation_logprobs = []
-                for token, logprobs in zip(continuation[::-1], output.prompt_logprobs[::-1]):
-                    continuation_logprobs.append(logprobs[token])
+                for token, logprobs_at_position in zip(continuation, continuation_prompt_logprobs):
+                    continuation_logprobs.append(logprobs_at_position[token])

                 bool_score = all(logprob.rank == 1 for logprob in continuation_logprobs)
                 continuation_logprobs = [logprob.logprob for logprob in continuation_logprobs]
+
                 continuation_logprobs = sum(continuation_logprobs)
                 logprobs_doc.append(continuation_logprobs)
                 argmax_doc.append(bool_score)

@@ -544,6 +555,8 @@ class AsyncVLLMModel(VLLMModel):
     is_async = True

     def cleanup(self):
+        if self.model is not None:
+            del self.model
         gc.collect()
         destroy_distributed_environment()
         torch.cuda.empty_cache()

@@ -578,7 +591,7 @@ def _create_auto_model(self, config: VLLMModelConfig):

         # If the max_length can't get extracted from the config, it will be inferred from the model
         if self._max_length is None:
-            self._max_length = model.model_config.max_seq_len_to_capture
+            self._max_length = model.model_config.max_model_len

         return model
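
The _loglikelihood_tokens rewrite replaces the old reversed zip over prompt_logprobs with explicit index arithmetic: the continuation occupies the last len(continuation) positions of the prompt that vLLM actually scored, so its logprobs start at len(prompt_token_ids) - len(continuation). A small self-contained sketch of that alignment, using stand-in structures instead of vLLM's real output objects:

```python
from dataclasses import dataclass


@dataclass
class FakeLogprob:
    """Stand-in for vLLM's per-token logprob entries (only the fields used here)."""
    logprob: float
    rank: int


def score_continuation(prompt_token_ids, prompt_logprobs, continuation):
    """Sum the continuation's logprobs and check whether every token was ranked first."""
    # The continuation is the tail of the scored prompt, so slice from the back.
    start = len(prompt_token_ids) - len(continuation)
    tail = prompt_logprobs[start:]

    per_token = [logprobs_at_pos[token] for token, logprobs_at_pos in zip(continuation, tail)]
    is_greedy = all(entry.rank == 1 for entry in per_token)
    total_logprob = sum(entry.logprob for entry in per_token)
    return total_logprob, is_greedy


# Context tokens [10, 11, 12] followed by continuation [20, 21]; all values made up.
prompt_token_ids = [10, 11, 12, 20, 21]
prompt_logprobs = [
    None,  # vLLM reports no logprob for the first prompt token
    {11: FakeLogprob(-1.2, 1)},
    {12: FakeLogprob(-0.7, 1)},
    {20: FakeLogprob(-0.3, 1)},
    {21: FakeLogprob(-0.9, 2)},
]
print(score_continuation(prompt_token_ids, prompt_logprobs, [20, 21]))  # (approx. -1.2, False)
```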

src/lighteval/tasks/multilingual/tasks/exams.py
Lines changed: 5 additions & 3 deletions

@@ -167,9 +167,11 @@
         hf_subset="multilingual",
         # Weird bug in dataset
         hf_filter=partial(
-            lambda language, subject, line: line["answerKey"] != "@"
-            and line["info"]["language"] == LangCodeLanguage(standardize_tag(language.value)).language_name()
-            and line["info"]["subject"] == subject,
+            lambda language, subject, line: (
+                line["answerKey"] != "@"
+                and line["info"]["language"] == LangCodeLanguage(standardize_tag(language.value)).language_name()
+                and line["info"]["subject"] == subject
+            ),
             language,
             subject,
         ),

src/lighteval/tasks/multilingual/tasks/filipino.py
Lines changed: 6 additions & 3 deletions

@@ -355,9 +355,12 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
         evaluation_splits=("test",),
         few_shots_split="dev",
         hf_filter=partial(
-            lambda subset, sensitivity_label, x: x["subject"].lower() == subset
-            and (
-                sensitivity_label == "ALL" or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK")
+            lambda subset, sensitivity_label, x: (
+                x["subject"].lower() == subset
+                and (
+                    sensitivity_label == "ALL"
+                    or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK")
+                )
             ),
             subset,
             sensitivity_label,
