diff --git a/.github/workflows/slow_tests.yaml b/.github/workflows/slow_tests.yaml index a1f68f9d4..4394a73ce 100644 --- a/.github/workflows/slow_tests.yaml +++ b/.github/workflows/slow_tests.yaml @@ -35,11 +35,47 @@ jobs: enable-cache: true - name: Install the project - run: uv sync --extra dev + run: uv sync --extra dev-gpu + - name: Install Python development headers + run: sudo apt-get update && sudo apt-get install -y python3.12-dev + + - name: Cache CUDA Toolkit + id: cache-cuda + uses: actions/cache@v4 + with: + path: /usr/local/cuda-12.8 + key: cuda-toolkit-12-8-${{ runner.os }} + + - name: Install CUDA Toolkit + if: steps.cache-cuda.outputs.cache-hit != 'true' + run: | + # Add NVIDIA package repositories + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb + sudo dpkg -i cuda-keyring_1.1-1_all.deb + sudo apt-get update + # Install CUDA toolkit 12.8 to match nvidia-cuda-runtime-cu12==12.8.90 + sudo apt-get install -y cuda-toolkit-12-8 + + - name: Verify CUDA installation + run: | + ls -la /usr/local/cuda-12.8/bin/nvcc || echo "WARNING: nvcc not found at /usr/local/cuda-12.8/bin/nvcc" + if [ -f /usr/local/cuda-12.8/bin/nvcc ]; then + /usr/local/cuda-12.8/bin/nvcc --version + fi + + - name: Setup CUDA environment + run: | + export CUDA_HOME=/usr/local/cuda-12.8 + export PATH="/usr/local/cuda-12.8/bin:$PATH" + echo "CUDA_HOME=/usr/local/cuda-12.8" >> $GITHUB_ENV + echo "/usr/local/cuda-12.8/bin" >> $GITHUB_PATH - name: run nvidia-smi run: nvidia-smi - name: Run tests - run: uv run pytest --disable-pytest-warnings --runslow tests/slow_tests/ + run: | + export CUDA_HOME=/usr/local/cuda-12.8 + export PATH="/usr/local/cuda-12.8/bin:$PATH" + uv run pytest --disable-pytest-warnings --runslow -v -s tests/slow_tests/ diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index e1a2cad51..f854ca694 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -46,7 +46,7 @@ jobs: enable-cache: true - name: Install the project - run: uv sync --extra dev + run: uv sync --extra dev-gpu - name: Ensure cache directories exist run: mkdir -p cache/models cache/datasets diff --git a/.github/workflows/vllm_main_tests.yaml b/.github/workflows/vllm_main_tests.yaml new file mode 100644 index 000000000..73b0e2d0d --- /dev/null +++ b/.github/workflows/vllm_main_tests.yaml @@ -0,0 +1,79 @@ +name: vLLM Main Branch Tests + +on: + schedule: + - cron: '0 2 * * 1' # Every Monday at 2 AM UTC + workflow_dispatch: + +permissions: + contents: read + +jobs: + test_vllm_main: + name: Test with vLLM main branch + runs-on: 'aws-g4dn-2xlarge-use1-public-80' + continue-on-error: true + + steps: + - name: Install Git LFS + run: | + if ! command -v git-lfs &> /dev/null; then + sudo apt-get update && sudo apt-get install -y git-lfs + git lfs install + fi + + - name: Checkout repository + uses: actions/checkout@v4 + with: + lfs: true + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + + - name: Install the project + run: uv sync --extra dev-gpu + + - name: Install Python development headers + run: sudo apt-get update && sudo apt-get install -y python3.12-dev + + - name: Cache CUDA Toolkit + id: cache-cuda + uses: actions/cache@v4 + with: + path: /usr/local/cuda-12.8 + key: cuda-toolkit-12-8-${{ runner.os }} + + - name: Install CUDA Toolkit + if: steps.cache-cuda.outputs.cache-hit != 'true' + run: | + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb + sudo dpkg -i cuda-keyring_1.1-1_all.deb + sudo apt-get update + sudo apt-get install -y cuda-toolkit-12-8 + + - name: Setup CUDA environment + run: | + echo "CUDA_HOME=/usr/local/cuda-12.8" >> $GITHUB_ENV + echo "/usr/local/cuda-12.8/bin" >> $GITHUB_PATH + + - name: Verify CUDA + run: | + nvidia-smi + nvcc --version + + - name: Install vLLM from main branch + run: | + uv pip uninstall -y vllm || true + uv pip install git+https://github.com/vllm-project/vllm.git@main + + - name: Get vLLM version + id: vllm-info + run: | + VERSION=$(uv run python -c "import vllm; print(vllm.__version__)") + echo "version=$VERSION" >> $GITHUB_OUTPUT + echo "Testing vLLM version: $VERSION" + + - name: Run tests + run: uv run pytest --disable-pytest-warnings --runslow -v -s tests/slow_tests/test_vllm_model.py diff --git a/README.md b/README.md index 72473a34a..3d09a0d34 100644 --- a/README.md +++ b/README.md @@ -187,7 +187,12 @@ If you're adding a **new feature**, please *open an issue first*. If you open a PR, don't forget to **run the styling**! ```bash +# For basic development (code quality, tests) pip install -e ".[dev]" + +# Or for GPU/vllm development and slow tests +pip install -e ".[dev-gpu]" + pre-commit install pre-commit run --all-files ``` diff --git a/examples/model_configs/vllm_model_config.yaml b/examples/model_configs/vllm_model_config.yaml index 74f0afb92..f002fbfc8 100644 --- a/examples/model_configs/vllm_model_config.yaml +++ b/examples/model_configs/vllm_model_config.yaml @@ -5,7 +5,7 @@ model_parameters: tensor_parallel_size: 1 data_parallel_size: 1 pipeline_parallel_size: 1 - gpu_memory_utilization: 0.6 + gpu_memory_utilization: 0.4 max_model_length: null swap_space: 4 seed: 42 diff --git a/pyproject.toml b/pyproject.toml index 589e24f5e..5ca850182 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -103,11 +103,12 @@ nanotron = [ "tensorboardX" ] tensorboardX = ["tensorboardX"] -vllm = ["vllm>=0.10.0,<0.10.2", "ray", "more_itertools"] +vllm = ["vllm>=0.11.0", "ray", "more_itertools"] sglang = ["sglang"] quality = ["ruff>=v0.11.0","pre-commit"] tests = ["pytest>=7.4.0","deepdiff","pip>=25.2"] -dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"] +dev = ["lighteval[quality,tests]"] +dev-gpu = ["lighteval[dev,multilingual,math,extended_tasks,vllm]"] docs = ["hf-doc-builder", "watchdog"] extended_tasks = [ "langdetect", # ifeval diff --git a/src/lighteval/metrics/utils/llm_as_judge.py b/src/lighteval/metrics/utils/llm_as_judge.py index 4a68a1ed3..0f9b3315c 100644 --- a/src/lighteval/metrics/utils/llm_as_judge.py +++ b/src/lighteval/metrics/utils/llm_as_judge.py @@ -168,7 +168,7 @@ def __lazy_load_client(self): # noqa: C901 raise_if_package_not_available("vllm") if self.pipe is None: from vllm import LLM, SamplingParams - from vllm.transformers_utils.tokenizer import get_tokenizer + from vllm.tokenizers import get_tokenizer self.sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=self.max_tokens) self.tokenizer = get_tokenizer(self.model, tokenizer_mode="auto") @@ -296,7 +296,9 @@ def __call_transformers(self, prompt): def __call_vllm(self, prompt): tokenized = [self.tokenizer.apply_chat_template(p) for p in prompt] - output = self.pipe.generate(prompt_token_ids=tokenized, sampling_params=self.sampling_params, use_tqdm=True) + # Convert token IDs to TokensPrompt format for vLLM v0.15+ + prompts = [{"prompt_token_ids": token_ids} for token_ids in tokenized] + output = self.pipe.generate(prompts=prompts, sampling_params=self.sampling_params, use_tqdm=True) outputs = [output.outputs[0].text for output in output] return outputs diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 969caf8fa..3100c56b7 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -52,7 +52,7 @@ destroy_distributed_environment, destroy_model_parallel, ) - from vllm.transformers_utils.tokenizer import get_tokenizer + from vllm.tokenizers import get_tokenizer from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM logging.getLogger("vllm").propagate = True @@ -291,7 +291,7 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]: # Inferring from the tokenizer will cause vllm to bug for models with mismatches between model # config and tk config, like mistralai/Mistral-7B-v0.1 if self._max_length is None: - self._max_length = model.llm_engine.model_config.max_seq_len_to_capture + self._max_length = model.llm_engine.model_config.max_model_len return model @@ -415,9 +415,9 @@ def _generate( generate: bool = True, ) -> list: """Contains the actual logic of the generation.""" - sampling_params = SamplingParams(**self.config.generation_parameters.to_vllm_dict()) if generate: + sampling_params = SamplingParams(**self.config.generation_parameters.to_vllm_dict()) sampling_params.n = num_samples sampling_params.max_tokens = max_new_tokens sampling_params.stop = stop_tokens @@ -427,17 +427,21 @@ def _generate( "num_samples > 1 is not supported with temperature=0, please set temperature > 0 or use non sampling metrics." ) else: - sampling_params.temperature = 0 - sampling_params.prompt_logprobs = 1 - sampling_params.max_tokens = 1 - sampling_params.detokenize = False + sampling_params = SamplingParams( + temperature=0.0, + prompt_logprobs=1, + max_tokens=1, + detokenize=False, + ) if self.data_parallel_size > 1: @ray.remote(num_gpus=self.tensor_parallel_size) def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests): llm = LLM(**model_args) - return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params) + # Convert token IDs to TokensPrompt format for vLLM v0.15+ + prompts = [{"prompt_token_ids": req} for req in requests] + return llm.generate(prompts=prompts, sampling_params=sampling_params) # dispatch requests to all self.data_parallel_size workers, in interleaved fashion # interleaved important to balance context lengths across workers @@ -454,8 +458,12 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, r if x is not None ] else: + from vllm.inputs import TokenInputs + + # Convert token IDs to TokensPrompt format for vLLM v0.15+ + prompts = [TokenInputs(prompt_token_ids=token_ids) for token_ids in inputs] outputs = self.model.generate( - prompt_token_ids=inputs, + prompts=prompts, sampling_params=sampling_params, use_tqdm=True, ) @@ -489,9 +497,6 @@ def _loglikelihood_tokens( tokenized_continuations_batch.append(tokenized_continuation) tokenized_contexts_batch.append(tokenized_context) - # Left truncate the inputs to the maximum length - if self.max_length: # can be None if the model is initialized with ray - inputs = [input[-self.max_length :] for input in inputs] outputs = self._generate(inputs, generate=False) flat_index = 0 @@ -507,12 +512,18 @@ def _loglikelihood_tokens( for output, context, continuation in zip( outputs_doc, tokenized_contexts_doc, tokenized_continuations_doc ): + actual_input_len = len(output.prompt_token_ids) + continuation_len = len(continuation) + continuation_start_idx = actual_input_len - continuation_len + continuation_prompt_logprobs = output.prompt_logprobs[continuation_start_idx:] + continuation_logprobs = [] - for token, logprobs in zip(continuation[::-1], output.prompt_logprobs[::-1]): - continuation_logprobs.append(logprobs[token]) + for token, logprobs_at_position in zip(continuation, continuation_prompt_logprobs): + continuation_logprobs.append(logprobs_at_position[token]) bool_score = all(logprob.rank == 1 for logprob in continuation_logprobs) continuation_logprobs = [logprob.logprob for logprob in continuation_logprobs] + continuation_logprobs = sum(continuation_logprobs) logprobs_doc.append(continuation_logprobs) argmax_doc.append(bool_score) @@ -544,6 +555,8 @@ class AsyncVLLMModel(VLLMModel): is_async = True def cleanup(self): + if self.model is not None: + del self.model gc.collect() destroy_distributed_environment() torch.cuda.empty_cache() @@ -578,7 +591,7 @@ def _create_auto_model(self, config: VLLMModelConfig): # If the max_length can't get extracted from the config, it will be inferred from the model if self._max_length is None: - self._max_length = model.model_config.max_seq_len_to_capture + self._max_length = model.model_config.max_model_len return model diff --git a/src/lighteval/tasks/multilingual/tasks/exams.py b/src/lighteval/tasks/multilingual/tasks/exams.py index 28e40e989..2d447ab01 100644 --- a/src/lighteval/tasks/multilingual/tasks/exams.py +++ b/src/lighteval/tasks/multilingual/tasks/exams.py @@ -167,9 +167,11 @@ hf_subset="multilingual", # Weird bug in dataset hf_filter=partial( - lambda language, subject, line: line["answerKey"] != "@" - and line["info"]["language"] == LangCodeLanguage(standardize_tag(language.value)).language_name() - and line["info"]["subject"] == subject, + lambda language, subject, line: ( + line["answerKey"] != "@" + and line["info"]["language"] == LangCodeLanguage(standardize_tag(language.value)).language_name() + and line["info"]["subject"] == subject + ), language, subject, ), diff --git a/src/lighteval/tasks/multilingual/tasks/filipino.py b/src/lighteval/tasks/multilingual/tasks/filipino.py index 5138c49eb..d1dbdf8bc 100644 --- a/src/lighteval/tasks/multilingual/tasks/filipino.py +++ b/src/lighteval/tasks/multilingual/tasks/filipino.py @@ -355,9 +355,12 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc: evaluation_splits=("test",), few_shots_split="dev", hf_filter=partial( - lambda subset, sensitivity_label, x: x["subject"].lower() == subset - and ( - sensitivity_label == "ALL" or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK") + lambda subset, sensitivity_label, x: ( + x["subject"].lower() == subset + and ( + sensitivity_label == "ALL" + or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK") + ) ), subset, sensitivity_label, diff --git a/src/lighteval/tasks/multilingual/tasks/global_mmlu.py b/src/lighteval/tasks/multilingual/tasks/global_mmlu.py index 894f15a3c..95d027781 100644 --- a/src/lighteval/tasks/multilingual/tasks/global_mmlu.py +++ b/src/lighteval/tasks/multilingual/tasks/global_mmlu.py @@ -118,11 +118,14 @@ evaluation_splits=("test",), few_shots_split="dev", hf_filter=partial( - lambda subset, sensitivity_label, x: x["subject"].lower() == subset - and ( - sensitivity_label == "ALL" or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK") - ) - and all(x[f"option_{opt}"] is not None and x[f"option_{opt}"].strip() for opt in "abcd"), + lambda subset, sensitivity_label, x: ( + x["subject"].lower() == subset + and ( + sensitivity_label == "ALL" + or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK") + ) + and all(x[f"option_{opt}"] is not None and x[f"option_{opt}"].strip() for opt in "abcd") + ), subset, sensitivity_label, ), diff --git a/src/lighteval/tasks/multilingual/tasks/gpqa_fi.py b/src/lighteval/tasks/multilingual/tasks/gpqa_fi.py index 22c9a8ef2..a7e5c0b94 100644 --- a/src/lighteval/tasks/multilingual/tasks/gpqa_fi.py +++ b/src/lighteval/tasks/multilingual/tasks/gpqa_fi.py @@ -32,6 +32,7 @@ from lighteval.tasks.lighteval_task import LightevalTaskConfig from lighteval.tasks.requests import Doc + random.seed(42) diff --git a/src/lighteval/tasks/multilingual/tasks/maime.py b/src/lighteval/tasks/multilingual/tasks/maime.py index 19aa6d6a3..bf1a17592 100644 --- a/src/lighteval/tasks/multilingual/tasks/maime.py +++ b/src/lighteval/tasks/multilingual/tasks/maime.py @@ -36,17 +36,16 @@ # Prompt template adapted from AIME task # Note: Uses English instructions for consistency with AIME MATH_PROMPT_TEMPLATE = dedent(""" -Solve the following math problem efficiently and clearly. -The last line of your response should be of the following format: -'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' -(without quotes) where ANSWER is just the final number or expression +Solve the following math problem efficiently and clearly. +The last line of your response should be of the following format: +'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' +(without quotes) where ANSWER is just the final number or expression that solves the problem. Think step by step before answering. {prompt} """) - def record_to_sample(record): return Sample(input=record["question"], target=record["solution"]) diff --git a/src/lighteval/tasks/multilingual/tasks/meta_mmlu.py b/src/lighteval/tasks/multilingual/tasks/meta_mmlu.py index 2026b00f5..6876af9fc 100644 --- a/src/lighteval/tasks/multilingual/tasks/meta_mmlu.py +++ b/src/lighteval/tasks/multilingual/tasks/meta_mmlu.py @@ -114,8 +114,9 @@ hf_repo="meta-llama/Meta-Llama-3.1-8B-Instruct-evals", hf_subset=f"Llama-3.1-8B-Instruct-evals__multilingual_mmlu_{standardize_tag(language.value)}__details", hf_filter=partial( - lambda language, subset, line: line["subtask_name"] - == f"mmlu_{standardize_tag(language.value)}_chat.{subset}", + lambda language, subset, line: ( + line["subtask_name"] == f"mmlu_{standardize_tag(language.value)}_chat.{subset}" + ), language, subset, ), diff --git a/src/lighteval/tasks/multilingual/tasks/mkqa.py b/src/lighteval/tasks/multilingual/tasks/mkqa.py index 4e27ffecb..486bf5889 100644 --- a/src/lighteval/tasks/multilingual/tasks/mkqa.py +++ b/src/lighteval/tasks/multilingual/tasks/mkqa.py @@ -56,10 +56,12 @@ hf_subset="mkqa", hf_revision="325131889721ae0ed885b76ecb8011369d75abad", hf_filter=partial( - lambda language, subset, line: line["answers"][ - "zh_cn" if language == Language.CHINESE else standardize_tag(language.value) - ][0]["type"] - == MKQA_TASK_TO_ID[subset], + lambda language, subset, line: ( + line["answers"]["zh_cn" if language == Language.CHINESE else standardize_tag(language.value)][0][ + "type" + ] + == MKQA_TASK_TO_ID[subset] + ), language, subset, ), diff --git a/src/lighteval/tasks/multilingual/tasks/xnli2.py b/src/lighteval/tasks/multilingual/tasks/xnli2.py index 59f4facaf..1339ef8f5 100644 --- a/src/lighteval/tasks/multilingual/tasks/xnli2.py +++ b/src/lighteval/tasks/multilingual/tasks/xnli2.py @@ -60,9 +60,9 @@ relations=["entailment", "contradiction"], formulation=formulation, ), - hf_filter=lambda line: line["label"] in [0, 2] - and line["premise"] is not None - and line["hypothesis"] is not None, + hf_filter=lambda line: ( + line["label"] in [0, 2] and line["premise"] is not None and line["hypothesis"] is not None + ), hf_repo=f"Harsit/xnli2.0_train_{LangCodeLanguage(standardize_tag(language.value)).language_name().lower()}", hf_subset="default", evaluation_splits=["train"], diff --git a/tests/conftest.py b/tests/conftest.py index a568cc130..b4b425b17 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,9 +2,40 @@ # Copyright (c) 2024 The HuggingFace Team +import gc + import pytest +def _log_gpu_memory(stage: str, test_name: str = ""): + """Print GPU memory statistics for debugging slow tests.""" + try: + import torch + + if not torch.cuda.is_available(): + return + + device = torch.cuda.current_device() + props = torch.cuda.get_device_properties(device) + allocated = torch.cuda.memory_allocated(device) / (1024**3) # Convert to GiB + reserved = torch.cuda.memory_reserved(device) / (1024**3) + total = props.total_memory / (1024**3) + free = total - reserved + + test_info = f" [{test_name}]" if test_name else "" + print(f"\n{'=' * 80}") + print(f"GPU Memory {stage}{test_info}") + print(f"{'=' * 80}") + print(f" Device: {props.name}") + print(f" Total: {total:.2f} GiB") + print(f" Allocated: {allocated:.2f} GiB") + print(f" Reserved: {reserved:.2f} GiB") + print(f" Free: {free:.2f} GiB") + print(f"{'=' * 80}\n") + except ImportError: + pass + + def pytest_addoption(parser): parser.addoption("--runslow", action="store_true", default=False, help="run slow tests") @@ -21,3 +52,49 @@ def pytest_collection_modifyitems(config, items): for item in items: if "slow" in item.keywords: item.add_marker(skip_slow) + + +@pytest.fixture(autouse=True, scope="function") +def cleanup_gpu_memory(request): + """Cleanup GPU memory before and after each test to prevent OOM errors.""" + # Cleanup before test (especially important for tests that run after other GPU-heavy tests) + if "slow" in request.keywords: + test_name = request.node.name + + # Log memory BEFORE cleanup + _log_gpu_memory("BEFORE cleanup", test_name) + + try: + import torch + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + except ImportError: + pass + gc.collect() + + # Log memory AFTER cleanup (before test starts) + _log_gpu_memory("AFTER cleanup (test starting)", test_name) + + yield + + # Cleanup after test + if "slow" in request.keywords: + test_name = request.node.name + + # Log memory AFTER test (before cleanup) + _log_gpu_memory("AFTER test (before cleanup)", test_name) + + try: + import torch + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + except ImportError: + pass + gc.collect() + + # Log memory AFTER cleanup + _log_gpu_memory("AFTER cleanup", test_name)