Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 38 additions & 2 deletions .github/workflows/slow_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,47 @@ jobs:
enable-cache: true

- name: Install the project
run: uv sync --extra dev
run: uv sync --extra dev-gpu

- name: Install Python development headers
run: sudo apt-get update && sudo apt-get install -y python3.12-dev

- name: Cache CUDA Toolkit
id: cache-cuda
uses: actions/cache@v4
with:
path: /usr/local/cuda-12.8
key: cuda-toolkit-12-8-${{ runner.os }}

- name: Install CUDA Toolkit
if: steps.cache-cuda.outputs.cache-hit != 'true'
run: |
# Add NVIDIA package repositories
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
# Install CUDA toolkit 12.8 to match nvidia-cuda-runtime-cu12==12.8.90
sudo apt-get install -y cuda-toolkit-12-8

- name: Verify CUDA installation
run: |
ls -la /usr/local/cuda-12.8/bin/nvcc || echo "WARNING: nvcc not found at /usr/local/cuda-12.8/bin/nvcc"
if [ -f /usr/local/cuda-12.8/bin/nvcc ]; then
/usr/local/cuda-12.8/bin/nvcc --version
fi

- name: Setup CUDA environment
run: |
export CUDA_HOME=/usr/local/cuda-12.8
export PATH="/usr/local/cuda-12.8/bin:$PATH"
echo "CUDA_HOME=/usr/local/cuda-12.8" >> $GITHUB_ENV
echo "/usr/local/cuda-12.8/bin" >> $GITHUB_PATH

- name: run nvidia-smi
run: nvidia-smi

- name: Run tests
run: uv run pytest --disable-pytest-warnings --runslow tests/slow_tests/
run: |
export CUDA_HOME=/usr/local/cuda-12.8
export PATH="/usr/local/cuda-12.8/bin:$PATH"
uv run pytest --disable-pytest-warnings --runslow -v -s tests/slow_tests/
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ jobs:
enable-cache: true

- name: Install the project
run: uv sync --extra dev
run: uv sync --extra dev-gpu

- name: Ensure cache directories exist
run: mkdir -p cache/models cache/datasets
Expand Down
79 changes: 79 additions & 0 deletions .github/workflows/vllm_main_tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Weekly (and manually triggerable) job that runs the vLLM slow tests against
# the vLLM `main` branch instead of the released version the project pins.
# The job is marked `continue-on-error`, so a failure here is reported without
# failing the overall workflow run.
name: vLLM Main Branch Tests

on:
  schedule:
    - cron: '0 2 * * 1'  # Every Monday at 2 AM UTC
  workflow_dispatch:

permissions:
  contents: read

jobs:
  test_vllm_main:
    name: Test with vLLM main branch
    runs-on: 'aws-g4dn-2xlarge-use1-public-80'
    continue-on-error: true

    steps:
      # Only install git-lfs when the runner image does not already ship it.
      - name: Install Git LFS
        run: |
          if ! command -v git-lfs &> /dev/null; then
            sudo apt-get update && sudo apt-get install -y git-lfs
            git lfs install
          fi

      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Install uv
        uses: astral-sh/setup-uv@v5
        with:
          enable-cache: true

      - name: Install the project
        run: uv sync --extra dev-gpu

      - name: Install Python development headers
        run: sudo apt-get update && sudo apt-get install -y python3.12-dev

      # The toolkit directory is cached so the apt install below is skipped on
      # a cache hit.
      - name: Cache CUDA Toolkit
        id: cache-cuda
        uses: actions/cache@v4
        with:
          path: /usr/local/cuda-12.8
          key: cuda-toolkit-12-8-${{ runner.os }}

      - name: Install CUDA Toolkit
        if: steps.cache-cuda.outputs.cache-hit != 'true'
        run: |
          # Add the NVIDIA package repository, then install the 12.8 toolkit.
          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
          sudo dpkg -i cuda-keyring_1.1-1_all.deb
          sudo apt-get update
          sudo apt-get install -y cuda-toolkit-12-8

      # Written to GITHUB_ENV / GITHUB_PATH so later steps see CUDA_HOME and
      # find nvcc on PATH.
      - name: Setup CUDA environment
        run: |
          echo "CUDA_HOME=/usr/local/cuda-12.8" >> "$GITHUB_ENV"
          echo "/usr/local/cuda-12.8/bin" >> "$GITHUB_PATH"

      - name: Verify CUDA
        run: |
          nvidia-smi
          nvcc --version

      - name: Install vLLM from main branch
        run: |
          # uv never prompts for confirmation, so pip's `-y` is unnecessary; it
          # also risks being rejected as an unknown argument, which `|| true`
          # would silently hide.
          uv pip uninstall vllm || true
          uv pip install git+https://github.com/vllm-project/vllm.git@main

      # `--no-sync` keeps `uv run` from re-syncing the environment to the
      # lockfile, which could replace the packages installed by the step above.
      - name: Get vLLM version
        id: vllm-info
        run: |
          VERSION=$(uv run --no-sync python -c "import vllm; print(vllm.__version__)")
          echo "version=$VERSION" >> "$GITHUB_OUTPUT"
          echo "Testing vLLM version: $VERSION"

      - name: Run tests
        run: uv run --no-sync pytest --disable-pytest-warnings --runslow -v -s tests/slow_tests/test_vllm_model.py
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,12 @@ If you're adding a **new feature**, please *open an issue first*.
If you open a PR, don't forget to **run the styling**!

```bash
# For basic development (code quality, tests)
pip install -e ".[dev]"

# Or for GPU/vllm development and slow tests
pip install -e ".[dev-gpu]"

pre-commit install
pre-commit run --all-files
```
Expand Down
2 changes: 1 addition & 1 deletion examples/model_configs/vllm_model_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ model_parameters:
tensor_parallel_size: 1
data_parallel_size: 1
pipeline_parallel_size: 1
gpu_memory_utilization: 0.6
gpu_memory_utilization: 0.4
max_model_length: null
swap_space: 4
seed: 42
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -103,11 +103,12 @@ nanotron = [
"tensorboardX"
]
tensorboardX = ["tensorboardX"]
vllm = ["vllm>=0.10.0,<0.10.2", "ray", "more_itertools"]
vllm = ["vllm>=0.11.0", "ray", "more_itertools"]
sglang = ["sglang"]
quality = ["ruff>=v0.11.0","pre-commit"]
tests = ["pytest>=7.4.0","deepdiff","pip>=25.2"]
dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"]
dev = ["lighteval[quality,tests]"]
dev-gpu = ["lighteval[dev,multilingual,math,extended_tasks,vllm]"]
docs = ["hf-doc-builder", "watchdog"]
extended_tasks = [
"langdetect", # ifeval
Expand Down
6 changes: 4 additions & 2 deletions src/lighteval/metrics/utils/llm_as_judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def __lazy_load_client(self): # noqa: C901
raise_if_package_not_available("vllm")
if self.pipe is None:
from vllm import LLM, SamplingParams
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer

self.sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=self.max_tokens)
self.tokenizer = get_tokenizer(self.model, tokenizer_mode="auto")
Expand Down Expand Up @@ -296,7 +296,9 @@ def __call_transformers(self, prompt):

def __call_vllm(self, prompt):
tokenized = [self.tokenizer.apply_chat_template(p) for p in prompt]
output = self.pipe.generate(prompt_token_ids=tokenized, sampling_params=self.sampling_params, use_tqdm=True)
# Convert token IDs to TokensPrompt format for vLLM v0.15+
prompts = [{"prompt_token_ids": token_ids} for token_ids in tokenized]
output = self.pipe.generate(prompts=prompts, sampling_params=self.sampling_params, use_tqdm=True)
outputs = [output.outputs[0].text for output in output]
return outputs

Expand Down
43 changes: 28 additions & 15 deletions src/lighteval/models/vllm/vllm_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
destroy_distributed_environment,
destroy_model_parallel,
)
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.tokenizers import get_tokenizer
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM

logging.getLogger("vllm").propagate = True
Expand Down Expand Up @@ -291,7 +291,7 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]:
# Inferring from the tokenizer will cause vllm to bug for models with mismatches between model
# config and tk config, like mistralai/Mistral-7B-v0.1
if self._max_length is None:
self._max_length = model.llm_engine.model_config.max_seq_len_to_capture
self._max_length = model.llm_engine.model_config.max_model_len

return model

Expand Down Expand Up @@ -415,9 +415,9 @@ def _generate(
generate: bool = True,
) -> list:
"""Contains the actual logic of the generation."""
sampling_params = SamplingParams(**self.config.generation_parameters.to_vllm_dict())

if generate:
sampling_params = SamplingParams(**self.config.generation_parameters.to_vllm_dict())
sampling_params.n = num_samples
sampling_params.max_tokens = max_new_tokens
sampling_params.stop = stop_tokens
Expand All @@ -427,17 +427,21 @@ def _generate(
"num_samples > 1 is not supported with temperature=0, please set temperature > 0 or use non sampling metrics."
)
else:
sampling_params.temperature = 0
sampling_params.prompt_logprobs = 1
sampling_params.max_tokens = 1
sampling_params.detokenize = False
sampling_params = SamplingParams(
temperature=0.0,
prompt_logprobs=1,
max_tokens=1,
detokenize=False,
)

if self.data_parallel_size > 1:

@ray.remote(num_gpus=self.tensor_parallel_size)
def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests):
llm = LLM(**model_args)
return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
# Convert token IDs to TokensPrompt format for vLLM v0.15+
prompts = [{"prompt_token_ids": req} for req in requests]
return llm.generate(prompts=prompts, sampling_params=sampling_params)

# dispatch requests to all self.data_parallel_size workers, in interleaved fashion
# interleaved important to balance context lengths across workers
Expand All @@ -454,8 +458,12 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, r
if x is not None
]
else:
from vllm.inputs import TokenInputs

# Convert token IDs to TokensPrompt format for vLLM v0.15+
prompts = [TokenInputs(prompt_token_ids=token_ids) for token_ids in inputs]
outputs = self.model.generate(
prompt_token_ids=inputs,
prompts=prompts,
sampling_params=sampling_params,
use_tqdm=True,
)
Expand Down Expand Up @@ -489,9 +497,6 @@ def _loglikelihood_tokens(
tokenized_continuations_batch.append(tokenized_continuation)
tokenized_contexts_batch.append(tokenized_context)

# Left truncate the inputs to the maximum length
if self.max_length: # can be None if the model is initialized with ray
inputs = [input[-self.max_length :] for input in inputs]
outputs = self._generate(inputs, generate=False)

flat_index = 0
Expand All @@ -507,12 +512,18 @@ def _loglikelihood_tokens(
for output, context, continuation in zip(
outputs_doc, tokenized_contexts_doc, tokenized_continuations_doc
):
actual_input_len = len(output.prompt_token_ids)
continuation_len = len(continuation)
continuation_start_idx = actual_input_len - continuation_len
continuation_prompt_logprobs = output.prompt_logprobs[continuation_start_idx:]

continuation_logprobs = []
for token, logprobs in zip(continuation[::-1], output.prompt_logprobs[::-1]):
continuation_logprobs.append(logprobs[token])
for token, logprobs_at_position in zip(continuation, continuation_prompt_logprobs):
continuation_logprobs.append(logprobs_at_position[token])

bool_score = all(logprob.rank == 1 for logprob in continuation_logprobs)
continuation_logprobs = [logprob.logprob for logprob in continuation_logprobs]

continuation_logprobs = sum(continuation_logprobs)
logprobs_doc.append(continuation_logprobs)
argmax_doc.append(bool_score)
Expand Down Expand Up @@ -544,6 +555,8 @@ class AsyncVLLMModel(VLLMModel):
is_async = True

def cleanup(self):
if self.model is not None:
del self.model
gc.collect()
destroy_distributed_environment()
torch.cuda.empty_cache()
Expand Down Expand Up @@ -578,7 +591,7 @@ def _create_auto_model(self, config: VLLMModelConfig):

# If the max_length can't get extracted from the config, it will be inferred from the model
if self._max_length is None:
self._max_length = model.model_config.max_seq_len_to_capture
self._max_length = model.model_config.max_model_len

return model

Expand Down
8 changes: 5 additions & 3 deletions src/lighteval/tasks/multilingual/tasks/exams.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,9 +167,11 @@
hf_subset="multilingual",
# Weird bug in dataset
hf_filter=partial(
lambda language, subject, line: line["answerKey"] != "@"
and line["info"]["language"] == LangCodeLanguage(standardize_tag(language.value)).language_name()
and line["info"]["subject"] == subject,
lambda language, subject, line: (
line["answerKey"] != "@"
and line["info"]["language"] == LangCodeLanguage(standardize_tag(language.value)).language_name()
and line["info"]["subject"] == subject
),
language,
subject,
),
Expand Down
9 changes: 6 additions & 3 deletions src/lighteval/tasks/multilingual/tasks/filipino.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,9 +355,12 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
evaluation_splits=("test",),
few_shots_split="dev",
hf_filter=partial(
lambda subset, sensitivity_label, x: x["subject"].lower() == subset
and (
sensitivity_label == "ALL" or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK")
lambda subset, sensitivity_label, x: (
x["subject"].lower() == subset
and (
sensitivity_label == "ALL"
or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK")
)
),
subset,
sensitivity_label,
Expand Down
13 changes: 8 additions & 5 deletions src/lighteval/tasks/multilingual/tasks/global_mmlu.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,14 @@
evaluation_splits=("test",),
few_shots_split="dev",
hf_filter=partial(
lambda subset, sensitivity_label, x: x["subject"].lower() == subset
and (
sensitivity_label == "ALL" or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK")
)
and all(x[f"option_{opt}"] is not None and x[f"option_{opt}"].strip() for opt in "abcd"),
lambda subset, sensitivity_label, x: (
x["subject"].lower() == subset
and (
sensitivity_label == "ALL"
or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK")
)
and all(x[f"option_{opt}"] is not None and x[f"option_{opt}"].strip() for opt in "abcd")
),
subset,
sensitivity_label,
),
Expand Down
1 change: 1 addition & 0 deletions src/lighteval/tasks/multilingual/tasks/gpqa_fi.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


random.seed(42)


Expand Down
9 changes: 4 additions & 5 deletions src/lighteval/tasks/multilingual/tasks/maime.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,16 @@
# Prompt template adapted from AIME task
# Note: Uses English instructions for consistency with AIME
MATH_PROMPT_TEMPLATE = dedent("""
Solve the following math problem efficiently and clearly.
The last line of your response should be of the following format:
'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct'
(without quotes) where ANSWER is just the final number or expression
Solve the following math problem efficiently and clearly.
The last line of your response should be of the following format:
'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct'
(without quotes) where ANSWER is just the final number or expression
that solves the problem. Think step by step before answering.

{prompt}
""")



def record_to_sample(record):
return Sample(input=record["question"], target=record["solution"])

Expand Down
Loading
Loading