Added unit test for vLLM

kinjalpatel27 · kinjalpatel27 · commit ef363d81bd41 · 2026-05-29T00:02:07.000Z
Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
@@ -30,27 +30,34 @@ jobs:
         tests/gpu/**
         tests/gpu_megatron/**
         tests/gpu_trtllm/**
+        tests/gpu_vllm/**
 
   gpu-tests:
     needs: [pr-gate]
     if: needs.pr-gate.outputs.any_changed == 'true'
     strategy:
       fail-fast: false
       matrix:
+        # ``container_image`` is the full image path so non-nvcr.io registries
+        # (e.g. docker.io/vllm) can be used alongside nvcr.io/nvidia images.
         include:
           - example: gpu
             timeout: 75
-            container_image: pytorch:26.03-py3
+            container_image: nvcr.io/nvidia/pytorch:26.03-py3
           - example: gpu_megatron
             timeout: 45
-            container_image: nemo:26.04
+            container_image: nvcr.io/nvidia/nemo:26.04
           - example: gpu_trtllm
             timeout: 30
-            container_image: tensorrt-llm/release:1.3.0rc12
+            container_image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc12
+          - example: gpu_vllm
+            timeout: 30
+            # Keep in sync with examples/vllm_serve/Dockerfile.
+            container_image: docker.io/vllm/vllm-openai:v0.20.0
     runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }}
     timeout-minutes: ${{ matrix.timeout }}
     container:
-      image: nvcr.io/nvidia/${{ matrix.container_image }}
+      image: ${{ matrix.container_image }}
       credentials:
         username: $oauthtoken
         password: ${{ secrets.NGC_API_KEY }}
@@ -66,10 +73,11 @@ jobs:
           echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
       - name: Run gpu tests
         env:
-          COVERAGE_PROCESS_START: ${{ github.workspace }}/pyproject.toml
+          # Skip subprocess coverage for gpu_vllm — the hook deadlocks vLLM's engine-core IPC.
+          # The empty string must be the `||` fallback: `cond && '' || x` always yields x because '' is falsy.
+          COVERAGE_PROCESS_START: ${{ matrix.example != 'gpu_vllm' && format('{0}/pyproject.toml', github.workspace) || '' }}
           COVERAGE_FILE: ${{ github.workspace }}/.coverage
         run: |
-          python -m pip install nox && nox -s ${{ matrix.example }}
+          python3 -m pip install nox && nox -s ${{ matrix.example }}
       - name: Upload GPU coverage to Codecov
         uses: codecov/codecov-action@v5
         with:
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -146,6 +146,7 @@ We use [pytest](https://docs.pytest.org/) for all tests. For any new features /
 - `tests/gpu`: Fast GPU-based unit tests for the core ModelOpt library. In most cases, they should not take more than a few seconds to run.
 - `tests/gpu_megatron`: Fast GPU-based unit tests for the core ModelOpt library for Megatron-Core features. In most cases, they should not take more than a few seconds to run.
 - `tests/gpu_trtllm`: Fast GPU-based unit tests for the core ModelOpt library for TensorRT-LLM features. In most cases, they should not take more than a few seconds to run.
+- `tests/gpu_vllm`: Fast GPU-based unit tests for the core ModelOpt library for vLLM features. In most cases, they should not take more than a few seconds to run.
 - `tests/examples`: Integration tests for ModelOpt examples. They should not take more than a few minutes to run. Please refer to [example test README](./tests/examples/README.md) for more details.
 
 For lightweight focused local validation, run `pytest` directly on the relevant test path. For example:
diff --git a/noxfile.py b/noxfile.py
@@ -135,6 +135,14 @@ def gpu_trtllm(session):
     session.run("python", "-m", "pytest", "tests/gpu_trtllm", *_cov_args())
 
 
+# Container: docker.io/vllm/vllm-openai (the published image ships vLLM + CUDA + torch).
+# Pin must stay in sync with examples/vllm_serve/Dockerfile.
+@nox.session(venv_backend="none")
+def gpu_vllm(session):
+    # Both steps are launched through ``python3 -m``.
+    launcher = ("python3", "-m")
+    # Editable install of the checkout with the ``hf`` and ``dev-test`` extras.
+    session.run(*launcher, "pip", "install", "-e", ".[hf,dev-test]")
+    # Run the vLLM GPU tests, appending whatever ``_cov_args()`` returns.
+    session.run(*launcher, "pytest", "tests/gpu_vllm", *_cov_args())
+
+
 # Container: nvcr.io/nvidia/pytorch:26.01-py3 or later
 @nox.session(venv_backend="none")
 def regression(session):
diff --git a/tests/_test_utils/torch/transformers_models.py b/tests/_test_utils/torch/transformers_models.py
@@ -26,6 +26,7 @@
     AutoModelForQuestionAnswering,
     AutoTokenizer,
     BertConfig,
+    DeepseekV3Config,
     GptOssConfig,
     LlamaConfig,
     PreTrainedModel,
@@ -120,6 +121,44 @@ def create_tiny_qwen3_moe_dir(
     return qwen3_moe_dir
 
 
+##### DeepSeek V3 #####
+def get_tiny_deepseek_v3(**config_kwargs) -> PreTrainedModel:
+    """Build a tiny DeepSeek V3 causal LM from a small default config.
+
+    ``set_seed(SEED)`` is called before anything else is constructed.
+
+    Args:
+        **config_kwargs: ``DeepseekV3Config`` fields that override or extend the
+            defaults listed in the function body.
+
+    Returns:
+        The model produced by ``AutoModelForCausalLM.from_config``.
+    """
+    set_seed(SEED)
+    default_fields = {
+        "dtype": torch.bfloat16,
+        "vocab_size": 128,
+        "hidden_size": 128,
+        "intermediate_size": 256,
+        "moe_intermediate_size": 64,
+        "num_hidden_layers": 2,
+        "num_attention_heads": 2,
+        "num_key_value_heads": 2,
+        "n_routed_experts": 4,
+        "num_experts_per_tok": 2,
+        "n_shared_experts": 1,
+        "first_k_dense_replace": 0,
+        "kv_lora_rank": 16,
+        "q_lora_rank": 32,
+        "qk_rope_head_dim": 16,
+        "qk_nope_head_dim": 16,
+        "v_head_dim": 16,
+        "max_position_embeddings": 128,
+        # Required so vLLM allocates ``gate.e_score_correction_bias`` (HF saves it unconditionally).
+        "topk_method": "noaux_tc",
+    }
+    # Caller-supplied values win over the defaults.
+    merged_fields = {**default_fields, **config_kwargs}
+    config = DeepseekV3Config(**merged_fields)
+    # Set the attribute explicitly so it survives transformers versions that
+    # drop unknown kwargs from the config.
+    config.topk_method = merged_fields["topk_method"]
+    return AutoModelForCausalLM.from_config(config)
+
+
+def create_tiny_deepseek_v3_dir(tmp_path: Path | str, **config_kwargs) -> Path:
+    """Save a tiny DeepSeek V3 model under ``tmp_path`` and return its directory.
+
+    Args:
+        tmp_path: Parent directory; the model is written to ``<tmp_path>/tiny_deepseek_v3``.
+        **config_kwargs: Forwarded unchanged to ``get_tiny_deepseek_v3``.
+
+    Returns:
+        The directory that was passed to ``save_pretrained``.
+    """
+    output_dir = Path(tmp_path).joinpath("tiny_deepseek_v3")
+    tiny_model = get_tiny_deepseek_v3(**config_kwargs)
+    tiny_model.save_pretrained(output_dir)
+    return output_dir
+
+
 ##### GPT-OSS #####
 def get_tiny_gpt_oss(**config_kwargs) -> PreTrainedModel:
     set_seed(SEED)
diff --git a/tests/gpu_vllm/conftest.py b/tests/gpu_vllm/conftest.py
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Shared setup for vLLM tests.
+
+vLLM handles its own distributed init, current-vllm-config context, and
+parallel-state setup when ``LLM(...)`` is constructed, so this conftest only
+opts into ``VLLM_ALLOW_INSECURE_SERIALIZATION=1`` *before* importing vLLM so
+``LLM.collective_rpc(callable)`` can ship our worker callables over the engine
+IPC channel via pickle. Without this, the default msgpack encoder rejects raw
+functions and the call raises ``TypeError``. Only safe in a controlled test
+environment.
+"""
+
+import os
+
+# Must precede any ``import vllm``: the env is read at module-import time.
+# ``setdefault`` leaves a value that is already present in the environment untouched.
+os.environ.setdefault("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
diff --git a/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py b/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py