Skip to content

Commit 597ecca

Browse files
committed
Added unit test for vLLM
Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
1 parent b02e888 commit 597ecca

6 files changed

Lines changed: 364 additions & 6 deletions

File tree

.github/workflows/gpu_tests.yml

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,27 +30,34 @@ jobs:
3030
tests/gpu/**
3131
tests/gpu_megatron/**
3232
tests/gpu_trtllm/**
33+
tests/gpu_vllm/**
3334
3435
gpu-tests:
3536
needs: [pr-gate]
3637
if: needs.pr-gate.outputs.any_changed == 'true'
3738
strategy:
3839
fail-fast: false
3940
matrix:
41+
# ``container_image`` is the full image path so non-nvcr.io registries
42+
# (e.g. docker.io/vllm) can be used alongside nvcr.io/nvidia images.
4043
include:
4144
- example: gpu
4245
timeout: 75
43-
container_image: pytorch:26.03-py3
46+
container_image: nvcr.io/nvidia/pytorch:26.03-py3
4447
- example: gpu_megatron
4548
timeout: 45
46-
container_image: nemo:26.04
49+
container_image: nvcr.io/nvidia/nemo:26.04
4750
- example: gpu_trtllm
4851
timeout: 30
49-
container_image: tensorrt-llm/release:1.3.0rc12
52+
container_image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc12
53+
- example: gpu_vllm
54+
timeout: 30
55+
# Keep in sync with examples/vllm_serve/Dockerfile.
56+
container_image: docker.io/vllm/vllm-openai:v0.20.0
5057
runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }}
5158
timeout-minutes: ${{ matrix.timeout }}
5259
container:
53-
image: nvcr.io/nvidia/${{ matrix.container_image }}
60+
image: ${{ matrix.container_image }}
5461
env:
5562
GIT_DEPTH: 1000 # For correct version
5663
PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
@@ -63,10 +70,11 @@ jobs:
6370
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
6471
- name: Run gpu tests
6572
env:
66-
COVERAGE_PROCESS_START: ${{ github.workspace }}/pyproject.toml
73+
# Skip subprocess coverage for gpu_vllm — the hook deadlocks vLLM's engine-core IPC.
74+
# '' is falsy in Actions expressions, so `cond && '' || x` would always yield x; keep the non-empty value in the `&&` position.
COVERAGE_PROCESS_START: ${{ matrix.example != 'gpu_vllm' && format('{0}/pyproject.toml', github.workspace) || '' }}
6775
COVERAGE_FILE: ${{ github.workspace }}/.coverage
6876
run: |
69-
python -m pip install nox && nox -s ${{ matrix.example }}
77+
python3 -m pip install nox && nox -s ${{ matrix.example }}
7078
- name: Upload GPU coverage to Codecov
7179
uses: codecov/codecov-action@v5
7280
with:

CONTRIBUTING.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ We use [pytest](https://docs.pytest.org/) for all tests. For any new features /
146146
- `tests/gpu`: Fast GPU-based unit tests for the core ModelOpt library. In most cases, they should not take more than a few seconds to run.
147147
- `tests/gpu_megatron`: Fast GPU-based unit tests for the core ModelOpt library for Megatron-Core features. In most cases, they should not take more than a few seconds to run.
148148
- `tests/gpu_trtllm`: Fast GPU-based unit tests for the core ModelOpt library for TensorRT-LLM features. In most cases, they should not take more than a few seconds to run.
149+
- `tests/gpu_vllm`: Fast GPU-based unit tests for the core ModelOpt library for vLLM features. In most cases, they should not take more than a few seconds to run.
149150
- `tests/examples`: Integration tests for ModelOpt examples. They should not take more than a few minutes to run. Please refer to [example test README](./tests/examples/README.md) for more details.
150151

151152
For lightweight focused local validation, run `pytest` directly on the relevant test path. For example:

noxfile.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,14 @@ def gpu_trtllm(session):
135135
session.run("python", "-m", "pytest", "tests/gpu_trtllm", *_cov_args())
136136

137137

138+
# Container: docker.io/vllm/vllm-openai (the published image ships vLLM + CUDA + torch).
139+
# Pin must stay in sync with examples/vllm_serve/Dockerfile.
140+
@nox.session(venv_backend="none")
def gpu_vllm(session):
    """Install the package in editable mode and run the ``tests/gpu_vllm`` suite.

    Both steps are invoked through ``python3 -m ...`` in the current
    environment (``venv_backend="none"``), so no virtualenv is created.
    """
    interpreter = "python3"
    install_cmd = (interpreter, "-m", "pip", "install", "-e", ".[hf,dev-test]")
    pytest_cmd = (interpreter, "-m", "pytest", "tests/gpu_vllm")

    session.run(*install_cmd)
    session.run(*pytest_cmd, *_cov_args())
144+
145+
138146
# Container: nvcr.io/nvidia/pytorch:26.01-py3 or later
139147
@nox.session(venv_backend="none")
140148
def regression(session):

tests/_test_utils/torch/transformers_models.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
AutoModelForQuestionAnswering,
2727
AutoTokenizer,
2828
BertConfig,
29+
DeepseekV3Config,
2930
GptOssConfig,
3031
LlamaConfig,
3132
PreTrainedModel,
@@ -120,6 +121,44 @@ def create_tiny_qwen3_moe_dir(
120121
return qwen3_moe_dir
121122

122123

124+
##### DeepSeek V3 #####
125+
def get_tiny_deepseek_v3(**config_kwargs) -> PreTrainedModel:
    """Build a tiny DeepSeek V3 causal-LM from a small default config.

    The global seed is set to ``SEED`` before the model is instantiated.

    Args:
        **config_kwargs: Config overrides; any key given here replaces the
            matching tiny default before ``DeepseekV3Config`` is constructed.

    Returns:
        The model produced by ``AutoModelForCausalLM.from_config``.
    """
    set_seed(SEED)

    tiny_defaults = {
        "dtype": torch.bfloat16,
        "vocab_size": 128,
        "hidden_size": 128,
        "intermediate_size": 256,
        "moe_intermediate_size": 64,
        "num_hidden_layers": 2,
        "num_attention_heads": 2,
        "num_key_value_heads": 2,
        "n_routed_experts": 4,
        "num_experts_per_tok": 2,
        "n_shared_experts": 1,
        "first_k_dense_replace": 0,
        "kv_lora_rank": 16,
        "q_lora_rank": 32,
        "qk_rope_head_dim": 16,
        "qk_nope_head_dim": 16,
        "v_head_dim": 16,
        "max_position_embeddings": 128,
        # Required so vLLM allocates ``gate.e_score_correction_bias`` (HF saves it unconditionally).
        "topk_method": "noaux_tc",
    }
    # Caller-supplied values win over the tiny defaults.
    merged = {**tiny_defaults, **config_kwargs}

    config = DeepseekV3Config(**merged)
    # Re-apply explicitly: some transformers versions drop unknown kwargs from the dataclass.
    config.topk_method = merged["topk_method"]

    return AutoModelForCausalLM.from_config(config)
154+
155+
156+
def create_tiny_deepseek_v3_dir(tmp_path: Path | str, **config_kwargs) -> Path:
    """Save a tiny DeepSeek V3 model under ``tmp_path`` and return its directory.

    Args:
        tmp_path: Parent directory for the saved model.
        **config_kwargs: Forwarded unchanged to ``get_tiny_deepseek_v3``.

    Returns:
        ``<tmp_path>/tiny_deepseek_v3``, the directory passed to ``save_pretrained``.
    """
    target_dir = Path(tmp_path, "tiny_deepseek_v3")
    tiny_model = get_tiny_deepseek_v3(**config_kwargs)
    tiny_model.save_pretrained(target_dir)
    return target_dir
160+
161+
123162
##### GPT-OSS #####
124163
def get_tiny_gpt_oss(**config_kwargs) -> PreTrainedModel:
125164
set_seed(SEED)

tests/gpu_vllm/conftest.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Shared setup for vLLM tests.
17+
18+
vLLM handles its own distributed init, current-vllm-config context, and
19+
parallel-state setup when ``LLM(...)`` is constructed, so this conftest only
20+
opts into ``VLLM_ALLOW_INSECURE_SERIALIZATION=1`` *before* importing vLLM so
21+
``LLM.collective_rpc(callable)`` can ship our worker callables over the engine
22+
IPC channel via pickle. Without this, the default msgpack encoder rejects raw
23+
functions and the call raises ``TypeError``. Only safe in a controlled test
24+
environment.
25+
"""
26+
27+
import os
28+
29+
# Must precede any ``import vllm``: the env is read at module-import time.
30+
# Opt in only when the caller has not already chosen a value for this variable.
if "VLLM_ALLOW_INSECURE_SERIALIZATION" not in os.environ:
    os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"

0 commit comments

Comments
 (0)