LumiOpen · dzautner · Mar 16, 2026 · Mar 16, 2026
diff --git a/.github/workflows/slow_tests.yaml b/.github/workflows/slow_tests.yaml
@@ -35,11 +35,45 @@ jobs:
           enable-cache: true
 
       - name: Install the project
-        run: uv sync --extra dev
+        run: uv sync --extra dev-gpu
 
+      - name: Install Python development headers
+        run: sudo apt-get update && sudo apt-get install -y python3.12-dev
+
+      - name: Cache CUDA Toolkit
+        id: cache-cuda
+        uses: actions/cache@v4
+        with:
+          path: /usr/local/cuda-12.8
+          key: cuda-toolkit-12-8-${{ runner.os }}
+
+      - name: Install CUDA Toolkit
+        if: steps.cache-cuda.outputs.cache-hit != 'true'
+        run: |
+          # Add NVIDIA package repositories
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update
+          # Install CUDA toolkit 12.8 to match nvidia-cuda-runtime-cu12==12.8.90
+          sudo apt-get install -y cuda-toolkit-12-8
+
+      - name: Verify CUDA installation
+        run: |
+          ls -la /usr/local/cuda-12.8/bin/nvcc || echo "WARNING: nvcc not found at /usr/local/cuda-12.8/bin/nvcc"
+          if [ -f /usr/local/cuda-12.8/bin/nvcc ]; then
+            /usr/local/cuda-12.8/bin/nvcc --version
+          fi
+
+      - name: Setup CUDA environment
+        run: |
+          # Persist CUDA_HOME and the toolkit bin directory for all later steps;
+          # a plain `export` would only affect this step's own shell.
+          echo "CUDA_HOME=/usr/local/cuda-12.8" >> "$GITHUB_ENV"
+          echo "/usr/local/cuda-12.8/bin" >> "$GITHUB_PATH"
 
       - name: run nvidia-smi
         run: nvidia-smi
 
       - name: Run tests
-        run: uv run pytest --disable-pytest-warnings --runslow tests/slow_tests/
+        # CUDA_HOME and PATH come from the "Setup CUDA environment" step.
+        run: uv run pytest --disable-pytest-warnings --runslow -v -s tests/slow_tests/
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -46,7 +46,7 @@ jobs:
           enable-cache: true
 
       - name: Install the project
-        run: uv sync --extra dev
+        run: uv sync --extra dev-gpu
 
       - name: Ensure cache directories exist
         run: mkdir -p cache/models cache/datasets

diff --git a/.github/workflows/vllm_main_tests.yaml b/.github/workflows/vllm_main_tests.yaml
@@ -0,0 +1,85 @@
+# Runs the vLLM model slow test against vLLM's main branch every Monday
+# (or on manual dispatch). The job is non-blocking: continue-on-error is set.
+name: vLLM Main Branch Tests
+
+on:
+  schedule:
+    - cron: '0 2 * * 1'  # Every Monday at 2 AM UTC
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  test_vllm_main:
+    name: Test with vLLM main branch
+    runs-on: 'aws-g4dn-2xlarge-use1-public-80'
+    continue-on-error: true
+
+    steps:
+      - name: Install Git LFS
+        run: |
+          if ! command -v git-lfs &> /dev/null; then
+            sudo apt-get update && sudo apt-get install -y git-lfs
+            git lfs install
+          fi
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          enable-cache: true
+
+      - name: Install the project
+        run: uv sync --extra dev-gpu
+
+      - name: Install Python development headers
+        run: sudo apt-get update && sudo apt-get install -y python3.12-dev
+
+      - name: Cache CUDA Toolkit
+        id: cache-cuda
+        uses: actions/cache@v4
+        with:
+          path: /usr/local/cuda-12.8
+          key: cuda-toolkit-12-8-${{ runner.os }}
+
+      - name: Install CUDA Toolkit
+        if: steps.cache-cuda.outputs.cache-hit != 'true'
+        run: |
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update
+          sudo apt-get install -y cuda-toolkit-12-8
+
+      - name: Setup CUDA environment
+        run: |
+          # Exposed to later steps through GITHUB_ENV / GITHUB_PATH.
+          echo "CUDA_HOME=/usr/local/cuda-12.8" >> "$GITHUB_ENV"
+          echo "/usr/local/cuda-12.8/bin" >> "$GITHUB_PATH"
+
+      - name: Verify CUDA
+        run: |
+          nvidia-smi
+          nvcc --version
+
+      - name: Install vLLM from main branch
+        run: |
+          # uv does not prompt on uninstall; ignore the error if vllm is absent.
+          uv pip uninstall vllm || true
+          uv pip install git+https://github.com/vllm-project/vllm.git@main
+
+      - name: Get vLLM version
+        id: vllm-info
+        run: |
+          # --no-sync keeps `uv run` from re-syncing the environment to the
+          # lockfile, which could undo the main-branch install above.
+          VERSION=$(uv run --no-sync python -c "import vllm; print(vllm.__version__)")
+          echo "version=$VERSION" >> "$GITHUB_OUTPUT"
+          echo "Testing vLLM version: $VERSION"
+
+      - name: Run tests
+        run: uv run --no-sync pytest --disable-pytest-warnings --runslow -v -s tests/slow_tests/test_vllm_model.py
diff --git a/README.md b/README.md
@@ -187,7 +187,12 @@ If you're adding a **new feature**, please *open an issue first*.
 If you open a PR, don't forget to **run the styling**!
 
 ```bash
+# For basic development (code quality, tests)
 pip install -e ".[dev]"
+
+# Or for GPU/vLLM development and slow tests (includes everything in [dev])
+pip install -e ".[dev-gpu]"
+
 pre-commit install
 pre-commit run --all-files
 ```

diff --git a/examples/model_configs/vllm_model_config.yaml b/examples/model_configs/vllm_model_config.yaml
@@ -5,7 +5,7 @@ model_parameters:
   tensor_parallel_size: 1
   data_parallel_size: 1
   pipeline_parallel_size: 1
-  gpu_memory_utilization: 0.6
+  gpu_memory_utilization: 0.4
   max_model_length: null
   swap_space: 4
   seed: 42

diff --git a/pyproject.toml b/pyproject.toml
@@ -103,11 +103,12 @@ nanotron = [
   "tensorboardX"
 ]
 tensorboardX = ["tensorboardX"]
-vllm = ["vllm>=0.10.0,<0.10.2", "ray", "more_itertools"]
+vllm = ["vllm>=0.11.0", "ray", "more_itertools"]
 sglang = ["sglang"]
 quality = ["ruff>=v0.11.0","pre-commit"]
 tests = ["pytest>=7.4.0","deepdiff","pip>=25.2"]
-dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"]
+dev = ["lighteval[quality,tests]"]
+dev-gpu = ["lighteval[dev,multilingual,math,extended_tasks,vllm]"]
 docs = ["hf-doc-builder", "watchdog"]
 extended_tasks = [
   "langdetect", # ifeval

diff --git a/src/lighteval/metrics/utils/llm_as_judge.py b/src/lighteval/metrics/utils/llm_as_judge.py
@@ -168,7 +168,7 @@ def __lazy_load_client(self):  # noqa: C901
                 raise_if_package_not_available("vllm")
                 if self.pipe is None:
                     from vllm import LLM, SamplingParams
-                    from vllm.transformers_utils.tokenizer import get_tokenizer
+                    from vllm.tokenizers import get_tokenizer
 
                     self.sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=self.max_tokens)
                     self.tokenizer = get_tokenizer(self.model, tokenizer_mode="auto")
@@ -296,7 +296,9 @@ def __call_transformers(self, prompt):
 
     def __call_vllm(self, prompt):
         tokenized = [self.tokenizer.apply_chat_template(p) for p in prompt]
-        output = self.pipe.generate(prompt_token_ids=tokenized, sampling_params=self.sampling_params, use_tqdm=True)
+        # Convert token IDs to TokensPrompt format for vLLM v0.15+
+        prompts = [{"prompt_token_ids": token_ids} for token_ids in tokenized]
+        output = self.pipe.generate(prompts=prompts, sampling_params=self.sampling_params, use_tqdm=True)
         outputs = [output.outputs[0].text for output in output]
         return outputs
 

diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
@@ -52,7 +52,7 @@
         destroy_distributed_environment,
         destroy_model_parallel,
     )
-    from vllm.transformers_utils.tokenizer import get_tokenizer
+    from vllm.tokenizers import get_tokenizer
     from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
 
     logging.getLogger("vllm").propagate = True
@@ -291,7 +291,7 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]:
         # Inferring from the tokenizer will cause vllm to bug for models with mismatches between model
         # config and tk config, like mistralai/Mistral-7B-v0.1
         if self._max_length is None:
-            self._max_length = model.llm_engine.model_config.max_seq_len_to_capture
+            self._max_length = model.llm_engine.model_config.max_model_len
 
         return model
 
@@ -415,9 +415,9 @@ def _generate(
         generate: bool = True,
     ) -> list:
         """Contains the actual logic of the generation."""
-        sampling_params = SamplingParams(**self.config.generation_parameters.to_vllm_dict())
 
         if generate:
+            sampling_params = SamplingParams(**self.config.generation_parameters.to_vllm_dict())
             sampling_params.n = num_samples
             sampling_params.max_tokens = max_new_tokens
             sampling_params.stop = stop_tokens
@@ -427,17 +427,21 @@ def _generate(
                     "num_samples > 1 is not supported with temperature=0, please set temperature > 0 or use non sampling metrics."
                 )
         else:
-            sampling_params.temperature = 0
-            sampling_params.prompt_logprobs = 1
-            sampling_params.max_tokens = 1
-            sampling_params.detokenize = False
+            sampling_params = SamplingParams(
+                temperature=0.0,
+                prompt_logprobs=1,
+                max_tokens=1,
+                detokenize=False,
+            )
 
         if self.data_parallel_size > 1:
 
             @ray.remote(num_gpus=self.tensor_parallel_size)
             def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests):
                 llm = LLM(**model_args)
-                return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
+                # Convert token IDs to TokensPrompt format for vLLM v0.15+
+                prompts = [{"prompt_token_ids": req} for req in requests]
+                return llm.generate(prompts=prompts, sampling_params=sampling_params)
 
             # dispatch requests to all self.data_parallel_size workers, in interleaved fashion
             # interleaved important to balance context lengths across workers
@@ -454,8 +458,12 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, r
                 if x is not None
             ]
         else:
+            from vllm.inputs import TokensPrompt
+
+            # Wrap token IDs as TokensPrompt (the prompt schema generate() accepts) for vLLM v0.15+
+            prompts = [TokensPrompt(prompt_token_ids=token_ids) for token_ids in inputs]
             outputs = self.model.generate(
-                prompt_token_ids=inputs,
+                prompts=prompts,
                 sampling_params=sampling_params,
                 use_tqdm=True,
             )
@@ -489,9 +497,6 @@ def _loglikelihood_tokens(
                     tokenized_continuations_batch.append(tokenized_continuation)
                     tokenized_contexts_batch.append(tokenized_context)
 
-            # Left truncate the inputs to the maximum length
-            if self.max_length:  # can be None if the model is initialized with ray
-                inputs = [input[-self.max_length :] for input in inputs]
             outputs = self._generate(inputs, generate=False)
 
             flat_index = 0
@@ -507,12 +512,18 @@ def _loglikelihood_tokens(
                 for output, context, continuation in zip(
                     outputs_doc, tokenized_contexts_doc, tokenized_continuations_doc
                 ):
+                    actual_input_len = len(output.prompt_token_ids)
+                    continuation_len = len(continuation)
+                    continuation_start_idx = actual_input_len - continuation_len
+                    continuation_prompt_logprobs = output.prompt_logprobs[continuation_start_idx:]
+
                     continuation_logprobs = []
-                    for token, logprobs in zip(continuation[::-1], output.prompt_logprobs[::-1]):
-                        continuation_logprobs.append(logprobs[token])
+                    for token, logprobs_at_position in zip(continuation, continuation_prompt_logprobs):
+                        continuation_logprobs.append(logprobs_at_position[token])
 
                     bool_score = all(logprob.rank == 1 for logprob in continuation_logprobs)
                     continuation_logprobs = [logprob.logprob for logprob in continuation_logprobs]
+
                     continuation_logprobs = sum(continuation_logprobs)
                     logprobs_doc.append(continuation_logprobs)
                     argmax_doc.append(bool_score)
@@ -544,6 +555,8 @@ class AsyncVLLMModel(VLLMModel):
     is_async = True
 
     def cleanup(self):
+        if self.model is not None:
+            del self.model
         gc.collect()
         destroy_distributed_environment()
         torch.cuda.empty_cache()
@@ -578,7 +591,7 @@ def _create_auto_model(self, config: VLLMModelConfig):
 
         # If the max_length can't get extracted from the config, it will be inferred from the model
         if self._max_length is None:
-            self._max_length = model.model_config.max_seq_len_to_capture
+            self._max_length = model.model_config.max_model_len
 
         return model
 

diff --git a/src/lighteval/tasks/multilingual/tasks/exams.py b/src/lighteval/tasks/multilingual/tasks/exams.py
@@ -167,9 +167,11 @@
         hf_subset="multilingual",
         # Weird bug in dataset
         hf_filter=partial(
-            lambda language, subject, line: line["answerKey"] != "@"
-            and line["info"]["language"] == LangCodeLanguage(standardize_tag(language.value)).language_name()
-            and line["info"]["subject"] == subject,
+            lambda language, subject, line: (
+                line["answerKey"] != "@"
+                and line["info"]["language"] == LangCodeLanguage(standardize_tag(language.value)).language_name()
+                and line["info"]["subject"] == subject
+            ),
             language,
             subject,
         ),

diff --git a/src/lighteval/tasks/multilingual/tasks/filipino.py b/src/lighteval/tasks/multilingual/tasks/filipino.py
@@ -355,9 +355,12 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
         evaluation_splits=("test",),
         few_shots_split="dev",
         hf_filter=partial(
-            lambda subset, sensitivity_label, x: x["subject"].lower() == subset
-            and (
-                sensitivity_label == "ALL" or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK")
+            lambda subset, sensitivity_label, x: (
+                x["subject"].lower() == subset
+                and (
+                    sensitivity_label == "ALL"
+                    or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK")
+                )
             ),
             subset,
             sensitivity_label,

diff --git a/src/lighteval/tasks/multilingual/tasks/global_mmlu.py b/src/lighteval/tasks/multilingual/tasks/global_mmlu.py
@@ -118,11 +118,14 @@
         evaluation_splits=("test",),
         few_shots_split="dev",
         hf_filter=partial(
-            lambda subset, sensitivity_label, x: x["subject"].lower() == subset
-            and (
-                sensitivity_label == "ALL" or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK")
-            )
-            and all(x[f"option_{opt}"] is not None and x[f"option_{opt}"].strip() for opt in "abcd"),
+            lambda subset, sensitivity_label, x: (
+                x["subject"].lower() == subset
+                and (
+                    sensitivity_label == "ALL"
+                    or sensitivity_label in x["cultural_sensitivity_label"].replace("-", "UNK")
+                )
+                and all(x[f"option_{opt}"] is not None and x[f"option_{opt}"].strip() for opt in "abcd")
+            ),
             subset,
             sensitivity_label,
         ),

diff --git a/src/lighteval/tasks/multilingual/tasks/gpqa_fi.py b/src/lighteval/tasks/multilingual/tasks/gpqa_fi.py
@@ -32,6 +32,7 @@
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
 
+
 random.seed(42)
 
 

diff --git a/src/lighteval/tasks/multilingual/tasks/maime.py b/src/lighteval/tasks/multilingual/tasks/maime.py
@@ -36,17 +36,16 @@
 # Prompt template adapted from AIME task
 # Note: Uses English instructions for consistency with AIME
 MATH_PROMPT_TEMPLATE = dedent("""
-Solve the following math problem efficiently and clearly.  
-The last line of your response should be of the following format: 
-'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct' 
-(without quotes) where ANSWER is just the final number or expression 
+Solve the following math problem efficiently and clearly.
+The last line of your response should be of the following format:
+'Therefore, the final answer is: $\\boxed{{ANSWER}}$. I hope it is correct'
+(without quotes) where ANSWER is just the final number or expression
 that solves the problem. Think step by step before answering.
 
 {prompt}
 """)
 
 
-
 def record_to_sample(record):
     return Sample(input=record["question"], target=record["solution"])
Original file line number	Diff line number	Diff line change
Expand Up		@@ -32,6 +32,7 @@
		from lighteval.tasks.lighteval_task import LightevalTaskConfig
		from lighteval.tasks.requests import Doc


		random.seed(42)


Expand Down