From 597e70122442cd637b3de123a45434449b76674d Mon Sep 17 00:00:00 2001
From: Kinjal Patel <kinjalpravin@nvidia.com>
Date: Thu, 21 May 2026 23:07:46 +0000
Subject: [PATCH 01/10] Added unit test for vLLM

Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
---
 .github/workflows/gpu_tests.yml               |  20 +-
 CONTRIBUTING.md                               |   1 +
 noxfile.py                                    |   8 +
 .../_test_utils/torch/transformers_models.py  |  39 +++
 tests/gpu_vllm/conftest.py                    |  30 ++
 .../quantization/test_vllm_dynamic_modules.py | 272 ++++++++++++++++++
 6 files changed, 364 insertions(+), 6 deletions(-)
 create mode 100644 tests/gpu_vllm/conftest.py
 create mode 100644 tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py

diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
index 2dcc96e577e..fef5de9e955 100644
--- a/.github/workflows/gpu_tests.yml
+++ b/.github/workflows/gpu_tests.yml
@@ -29,6 +29,7 @@ jobs:
         tests/gpu/**
         tests/gpu_megatron/**
         tests/gpu_trtllm/**
+        tests/gpu_vllm/**
 
   gpu-tests:
     needs: [pr-gate]
@@ -36,20 +37,26 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
+        # ``container_image`` is the full image path so non-nvcr.io registries
+        # (e.g. docker.io/vllm) can be used alongside nvcr.io/nvidia images.
         include:
           - example: gpu
             timeout: 75
-            container_image: pytorch:26.04-py3
+            container_image: nvcr.io/nvidia/pytorch:26.04-py3
           - example: gpu_megatron
             timeout: 45
-            container_image: nemo:26.04
+            container_image: nvcr.io/nvidia/nemo:26.04
           - example: gpu_trtllm
             timeout: 30
-            container_image: tensorrt-llm/release:1.3.0rc16
+            container_image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc16
+          - example: gpu_vllm
+            timeout: 30
+            # Keep in sync with examples/vllm_serve/Dockerfile.
+            container_image: docker.io/vllm/vllm-openai:v0.20.0
     runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }}
     timeout-minutes: ${{ matrix.timeout }}
     container:
-      image: nvcr.io/nvidia/${{ matrix.container_image }}
+      image: ${{ matrix.container_image }}
       credentials:
         username: $oauthtoken
         password: ${{ secrets.NGC_API_KEY }}
@@ -65,10 +72,11 @@ jobs:
           echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
       - name: Run gpu tests
         env:
-          COVERAGE_PROCESS_START: ${{ github.workspace }}/pyproject.toml
+          # Skip subprocess coverage for gpu_vllm — the hook deadlocks vLLM's engine-core IPC.
+          COVERAGE_PROCESS_START: ${{ matrix.example == 'gpu_vllm' && '' || format('{0}/pyproject.toml', github.workspace) }}
           COVERAGE_FILE: ${{ github.workspace }}/.coverage
         run: |
-          python -m pip install nox && nox -s ${{ matrix.example }}
+          python3 -m pip install nox && nox -s ${{ matrix.example }}
       - name: Upload GPU coverage to Codecov
         uses: codecov/codecov-action@v5
         with:
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index bcc70b64d65..541ab6a51d4 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -146,6 +146,7 @@ We use [pytest](https://docs.pytest.org/) for all tests. For any new features /
 - `tests/gpu`: Fast GPU-based unit tests for the core ModelOpt library. In most cases, they should not take more than a few seconds to run.
 - `tests/gpu_megatron`: Fast GPU-based unit tests for the core ModelOpt library for Megatron-Core features. In most cases, they should not take more than a few seconds to run.
 - `tests/gpu_trtllm`: Fast GPU-based unit tests for the core ModelOpt library for TensorRT-LLM features. In most cases, they should not take more than a few seconds to run.
+- `tests/gpu_vllm`: Fast GPU-based unit tests for the core ModelOpt library for vLLM features. In most cases, they should not take more than a few seconds to run.
 - `tests/examples`: Integration tests for ModelOpt examples. They should not take more than a few minutes to run. Please refer to [example test README](./tests/examples/README.md) for more details.
 
 For lightweight focused local validation, run `pytest` directly on the relevant test path. For example:
diff --git a/noxfile.py b/noxfile.py
index a902a729bdf..059f351b7f9 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -138,6 +138,14 @@ def gpu_trtllm(session):
     session.run("python", "-m", "pytest", "tests/gpu_trtllm", *_cov_args())
 
 
+# Container: docker.io/vllm/vllm-openai (the published image ships vLLM + CUDA + torch).
+# Pin must stay in sync with examples/vllm_serve/Dockerfile.
+@nox.session(venv_backend="none")
+def gpu_vllm(session):
+    session.run("python3", "-m", "pip", "install", "-e", ".[hf,dev-test]")
+    session.run("python3", "-m", "pytest", "tests/gpu_vllm", *_cov_args())
+
+
 # Container: nvcr.io/nvidia/pytorch:26.01-py3 or later
 @nox.session(venv_backend="none")
 def regression(session):
diff --git a/tests/_test_utils/torch/transformers_models.py b/tests/_test_utils/torch/transformers_models.py
index 34bc96cd0ae..611bc31b1b4 100644
--- a/tests/_test_utils/torch/transformers_models.py
+++ b/tests/_test_utils/torch/transformers_models.py
@@ -26,6 +26,7 @@
     AutoModelForQuestionAnswering,
     AutoTokenizer,
     BertConfig,
+    DeepseekV3Config,
     GptOssConfig,
     LlamaConfig,
     PreTrainedModel,
@@ -120,6 +121,44 @@ def create_tiny_qwen3_moe_dir(
     return qwen3_moe_dir
 
 
+##### DeepSeek V3 #####
+def get_tiny_deepseek_v3(**config_kwargs) -> PreTrainedModel:
+    set_seed(SEED)
+    kwargs = {
+        "dtype": torch.bfloat16,
+        "vocab_size": 128,
+        "hidden_size": 128,
+        "intermediate_size": 256,
+        "moe_intermediate_size": 64,
+        "num_hidden_layers": 2,
+        "num_attention_heads": 2,
+        "num_key_value_heads": 2,
+        "n_routed_experts": 4,
+        "num_experts_per_tok": 2,
+        "n_shared_experts": 1,
+        "first_k_dense_replace": 0,
+        "kv_lora_rank": 16,
+        "q_lora_rank": 32,
+        "qk_rope_head_dim": 16,
+        "qk_nope_head_dim": 16,
+        "v_head_dim": 16,
+        "max_position_embeddings": 128,
+        # Required so vLLM allocates ``gate.e_score_correction_bias`` (HF saves it unconditionally).
+        "topk_method": "noaux_tc",
+    }
+    kwargs.update(**config_kwargs)
+    cfg = DeepseekV3Config(**kwargs)
+    # Survive transformers versions that drop unknown kwargs from the dataclass.
+    cfg.topk_method = kwargs["topk_method"]
+    return AutoModelForCausalLM.from_config(cfg)
+
+
+def create_tiny_deepseek_v3_dir(tmp_path: Path | str, **config_kwargs) -> Path:
+    deepseek_dir = Path(tmp_path) / "tiny_deepseek_v3"
+    get_tiny_deepseek_v3(**config_kwargs).save_pretrained(deepseek_dir)
+    return deepseek_dir
+
+
 ##### GPT-OSS #####
 def get_tiny_gpt_oss(**config_kwargs) -> PreTrainedModel:
     set_seed(SEED)
diff --git a/tests/gpu_vllm/conftest.py b/tests/gpu_vllm/conftest.py
new file mode 100644
index 00000000000..7da0432129e
--- /dev/null
+++ b/tests/gpu_vllm/conftest.py
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Shared setup for vLLM tests.
+
+vLLM handles its own distributed init, current-vllm-config context, and
+parallel-state setup when ``LLM(...)`` is constructed, so this conftest only
+opts into ``VLLM_ALLOW_INSECURE_SERIALIZATION=1`` *before* importing vLLM so
+``LLM.collective_rpc(callable)`` can ship our worker callables over the engine
+IPC channel via pickle. Without this, the default msgpack encoder rejects raw
+functions and the call raises ``TypeError``. Only safe in a controlled test
+environment.
+"""
+
+import os
+
+# Must precede any ``import vllm``: the env is read at module-import time.
+os.environ.setdefault("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
diff --git a/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py b/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py
new file mode 100644
index 00000000000..2dd354f32f7
--- /dev/null
+++ b/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py
@@ -0,0 +1,272 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""End-to-end tests for the vLLM fakequant dynamic modules.
+
+Boots ``vllm.LLM`` on tiny HF models (saved via
+``_test_utils.torch.transformers_models``) and runs ``mtq.quantize`` inside the
+worker via ``LLM.collective_rpc``. Asserts every ``_QuantVLLM…`` class is
+installed and every enabled quantizer ends up with a registered tensor-level
+``_amax`` after calibration. Mirrors the
+``examples/vllm_serve/fakequant_worker.py`` production path.
+
+Architectures: TinyLlama (Linear + Attention), TinyQwen3MoE (+ FusedMoE),
+TinyDeepseekV3 (+ MLAAttention).
+"""
+
+from __future__ import annotations
+
+import gc
+
+import pytest
+from _test_utils.torch.transformers_models import (
+    create_tiny_deepseek_v3_dir,
+    create_tiny_llama_dir,
+    create_tiny_qwen3_moe_dir,
+)
+from vllm import LLM
+from vllm.distributed import cleanup_dist_env_and_memory
+
+import modelopt.torch.quantization as mtq
+from modelopt.torch.quantization.nn import TensorQuantizer
+from modelopt.torch.quantization.plugins.vllm import (
+    _ATTENTION_TYPES,
+    VllmMLAAttention,
+    _QuantFusedMoEBase,
+    _VLLMParallelLinear,
+    disable_compilation,
+)
+
+
+def _quantize_and_summarize(self):
+    """Run on the worker via ``LLM.collective_rpc``.
+
+    Module-level so it survives pickle over engine-core IPC. ``self`` is the
+    vLLM worker — needed to drive ``model_runner._dummy_run`` from the
+    calibration forward_loop. Returns a JSON-able summary.
+    """
+    model = self.get_model()
+
+    def _forward_loop(_model):
+        # ``num_tokens=1`` is enough for the ``"max"`` calibrator.
+        self.model_runner._dummy_run(1)
+
+    with disable_compilation(model):
+        mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop=_forward_loop)
+
+    parallel_linear_counts: dict[str, int] = {}
+    moe_count = 0
+    attention_count = 0
+    mla_count = 0
+    missing_quantizers: list[str] = []
+    quantizers_without_amax: list[str] = []
+    enabled_quantizer_count = 0
+
+    def _missing(module, name, slots):
+        return (
+            f"{name}.{slot}"
+            for slot in slots
+            if not isinstance(getattr(module, slot, None), TensorQuantizer)
+        )
+
+    for name, module in model.named_modules():
+        if isinstance(module, _VLLMParallelLinear):
+            kind = type(module).__name__
+            parallel_linear_counts[kind] = parallel_linear_counts.get(kind, 0) + 1
+            missing_quantizers.extend(
+                _missing(module, name, ("input_quantizer", "weight_quantizer", "output_quantizer"))
+            )
+        elif isinstance(module, _QuantFusedMoEBase):
+            moe_count += 1
+            missing_quantizers.extend(
+                _missing(
+                    module,
+                    name,
+                    (
+                        "w13_input_quantizer",
+                        "w2_input_quantizer",
+                        "w13_weight_quantizer",
+                        "w2_weight_quantizer",
+                    ),
+                )
+            )
+        elif VllmMLAAttention is not None and isinstance(module, VllmMLAAttention):
+            mla_count += 1
+            missing_quantizers.extend(
+                _missing(
+                    module, name, ("q_bmm_quantizer", "kv_c_bmm_quantizer", "k_pe_bmm_quantizer")
+                )
+            )
+        elif isinstance(module, _ATTENTION_TYPES):
+            attention_count += 1
+            missing_quantizers.extend(
+                _missing(module, name, ("q_bmm_quantizer", "k_bmm_quantizer", "v_bmm_quantizer"))
+            )
+
+        # Static-amax invariant: every enabled quantizer must own an ``_amax``
+        # after calibration. ``kv_b_proj`` is exempt — vLLM's MLA decode path
+        # reads its weight directly and never calls its forward.
+        if isinstance(module, TensorQuantizer) and module.is_enabled:
+            enabled_quantizer_count += 1
+            if not hasattr(module, "_amax") and "kv_b_proj" not in name:
+                quantizers_without_amax.append(name)
+
+    return {
+        "parallel_linear_counts": parallel_linear_counts,
+        "moe_count": moe_count,
+        "attention_count": attention_count,
+        "mla_count": mla_count,
+        "missing_quantizers": missing_quantizers,
+        "quantizers_without_amax": quantizers_without_amax,
+        "enabled_quantizer_count": enabled_quantizer_count,
+    }
+
+
+def _boot_llm(model_dir, **extra):
+    """Construct a vLLM engine on a tiny model.
+
+    MoE fixtures override with ``moe_backend="triton"`` (pins the Triton
+    experts kernel whose module-level entries the modelopt plugin patches —
+    FlashInfer/TRTLLM kernels bypass them) and ``enable_expert_parallel=True``
+    (keeps modelopt's MoE-specific calibration paths live).
+    """
+    return LLM(
+        model=str(model_dir),
+        enforce_eager=True,
+        gpu_memory_utilization=0.2,
+        max_model_len=64,
+        max_num_seqs=1,
+        dtype="bfloat16",
+        skip_tokenizer_init=True,
+        **extra,
+    )
+
+
+def _shutdown_llm(llm):
+    del llm
+    gc.collect()
+    cleanup_dist_env_and_memory(shutdown_ray=False)
+
+
+@pytest.fixture(scope="module")
+def tiny_llama_llm(tmp_path_factory):
+    tmp = tmp_path_factory.mktemp("tiny_llama")
+    model_dir = create_tiny_llama_dir(tmp)
+    llm = _boot_llm(model_dir)
+    try:
+        yield llm
+    finally:
+        _shutdown_llm(llm)
+
+
+@pytest.fixture(scope="module")
+def tiny_qwen3_moe_llm(tmp_path_factory):
+    tmp = tmp_path_factory.mktemp("tiny_qwen3_moe")
+    # head_dim=64 with num_heads=2 is broadly supported by vLLM's attention backends.
+    model_dir = create_tiny_qwen3_moe_dir(
+        tmp,
+        hidden_size=128,
+        intermediate_size=256,
+        moe_intermediate_size=64,
+        num_hidden_layers=2,
+        num_attention_heads=2,
+        num_key_value_heads=1,
+        max_position_embeddings=128,
+        vocab_size=128,
+        head_dim=64,
+        num_experts=4,
+        num_experts_per_tok=2,
+        decoder_sparse_step=1,
+    )
+    llm = _boot_llm(model_dir, moe_backend="triton", enable_expert_parallel=True)
+    try:
+        yield llm
+    finally:
+        _shutdown_llm(llm)
+
+
+@pytest.fixture(scope="module")
+def tiny_deepseek_llm(tmp_path_factory):
+    tmp = tmp_path_factory.mktemp("tiny_deepseek")
+    model_dir = create_tiny_deepseek_v3_dir(tmp)
+    llm = _boot_llm(model_dir, moe_backend="triton", enable_expert_parallel=True)
+    try:
+        yield llm
+    finally:
+        _shutdown_llm(llm)
+
+
+def _assert_quantizer_amax_is_static(summary):
+    """Every enabled quantizer must own a registered ``_amax`` after
+    calibration. Missing ``_amax`` → repr ``amax=dynamic`` → regression.
+    """
+    assert summary["enabled_quantizer_count"] > 0, summary
+    assert summary["quantizers_without_amax"] == [], summary["quantizers_without_amax"]
+
+
+def test_tiny_llama_quantize(tiny_llama_llm):
+    """Covers QKV/Row/MergedColumn ParallelLinear + Attention on a dense Llama."""
+    summaries = tiny_llama_llm.collective_rpc(_quantize_and_summarize)
+    summary = summaries[0]
+
+    assert summary["missing_quantizers"] == [], summary["missing_quantizers"]
+
+    parallel_linear_counts = summary["parallel_linear_counts"]
+    # Each decoder layer has one QKV and one MergedColumn linear; with num_hidden_layers=2:
+    assert parallel_linear_counts.get("QuantQKVParallelLinear", 0) >= 2, parallel_linear_counts
+    # o_proj + down_proj per layer
+    assert parallel_linear_counts.get("QuantRowParallelLinear", 0) >= 4, parallel_linear_counts
+    assert parallel_linear_counts.get("QuantMergedColumnParallelLinear", 0) >= 2, (
+        parallel_linear_counts
+    )
+
+    # Llama uses the base Attention type — one per decoder layer.
+    assert summary["attention_count"] >= 2, summary
+
+    # No MoE in a dense Llama.
+    assert summary["moe_count"] == 0
+
+    _assert_quantizer_amax_is_static(summary)
+
+
+def test_tiny_qwen3_moe_quantize(tiny_qwen3_moe_llm):
+    """Tiny Qwen3-MoE adds FusedMoE coverage on top of the dense linears."""
+    summaries = tiny_qwen3_moe_llm.collective_rpc(_quantize_and_summarize)
+    summary = summaries[0]
+
+    assert summary["missing_quantizers"] == [], summary["missing_quantizers"]
+
+    parallel_linear_counts = summary["parallel_linear_counts"]
+    assert parallel_linear_counts.get("QuantQKVParallelLinear", 0) >= 2, parallel_linear_counts
+    assert parallel_linear_counts.get("QuantRowParallelLinear", 0) >= 2, parallel_linear_counts
+
+    # decoder_sparse_step=1 → every layer is MoE. With 2 layers we expect ≥2 FusedMoE.
+    assert summary["moe_count"] >= 2, summary
+    assert summary["attention_count"] >= 2, summary
+
+    _assert_quantizer_amax_is_static(summary)
+
+
+def test_tiny_deepseek_mla_quantize(tiny_deepseek_llm):
+    """Tiny DeepSeek-V3 covers MLAAttention (and again FusedMoE)."""
+    summaries = tiny_deepseek_llm.collective_rpc(_quantize_and_summarize)
+    summary = summaries[0]
+
+    assert summary["missing_quantizers"] == [], summary["missing_quantizers"]
+    assert summary["mla_count"] >= 2, summary
+    # ``first_k_dense_replace=0`` → every layer is MoE.
+    assert summary["moe_count"] >= 2, summary
+
+    _assert_quantizer_amax_is_static(summary)

From 824de51500d96664ae3ec5058b7efd38571e0c78 Mon Sep 17 00:00:00 2001
From: Kinjal Patel <kinjalpravin@nvidia.com>
Date: Fri, 22 May 2026 01:10:32 +0000
Subject: [PATCH 02/10] minor

Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
---
 examples/vllm_serve/Dockerfile                                | 4 ++--
 .../gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py  | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/vllm_serve/Dockerfile b/examples/vllm_serve/Dockerfile
index 352896ca2cc..177a33f6dca 100644
--- a/examples/vllm_serve/Dockerfile
+++ b/examples/vllm_serve/Dockerfile
@@ -1,4 +1,4 @@
-FROM vllm/vllm-openai:v0.10.2
+FROM vllm/vllm-openai:v0.20.0
 
 # Set environment variables
 ENV PIP_NO_CACHE_DIR=off \
@@ -23,7 +23,7 @@ RUN cd Model-Optimizer && \
     pip install -e ".[all,dev-test]"
 
 # Llama4 requires this
-RUN pip install flash-attn==2.7.4.post1
+RUN pip install flash-attn==2.7.4.post1 --no-build-isolation
 
 # Pre-compile CUDA extensions to avoid compilation time during runtime
 RUN python3 -c "import modelopt.torch.quantization.extensions as ext; ext.precompile()" || true
diff --git a/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py b/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py
index 2dd354f32f7..2136f959754 100644
--- a/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py
+++ b/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py
@@ -163,7 +163,9 @@ def _shutdown_llm(llm):
 @pytest.fixture(scope="module")
 def tiny_llama_llm(tmp_path_factory):
     tmp = tmp_path_factory.mktemp("tiny_llama")
-    model_dir = create_tiny_llama_dir(tmp)
+    # Helper default ``max_position_embeddings=32`` would clash with vLLM's
+    # ``max_model_len=64`` set in ``_boot_llm``.
+    model_dir = create_tiny_llama_dir(tmp, max_position_embeddings=64)
     llm = _boot_llm(model_dir)
     try:
         yield llm

From 7234ae82062a0ee21dc953ac8ac369ce59cd1750 Mon Sep 17 00:00:00 2001
From: Kinjal Patel <kinjalpravin@nvidia.com>
Date: Fri, 22 May 2026 23:12:37 +0000
Subject: [PATCH 03/10] addressed comments

Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
---
 .github/workflows/gpu_tests.yml                |  3 +++
 tests/_test_utils/torch/transformers_models.py |  8 +++++++-
 tests/gpu_vllm/conftest.py                     | 14 ++++----------
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
index fef5de9e955..47fb07dd6c2 100644
--- a/.github/workflows/gpu_tests.yml
+++ b/.github/workflows/gpu_tests.yml
@@ -78,6 +78,9 @@ jobs:
         run: |
           python3 -m pip install nox && nox -s ${{ matrix.example }}
       - name: Upload GPU coverage to Codecov
+        # vLLM container has no ``git``, which codecov-action needs; gpu_vllm
+        # also runs without ``--cov`` so there's no coverage.xml to upload.
+        if: matrix.example != 'gpu_vllm'
         uses: codecov/codecov-action@v5
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
diff --git a/tests/_test_utils/torch/transformers_models.py b/tests/_test_utils/torch/transformers_models.py
index 611bc31b1b4..0365f07ffa8 100644
--- a/tests/_test_utils/torch/transformers_models.py
+++ b/tests/_test_utils/torch/transformers_models.py
@@ -153,8 +153,14 @@ def get_tiny_deepseek_v3(**config_kwargs) -> PreTrainedModel:
     return AutoModelForCausalLM.from_config(cfg)
 
 
-def create_tiny_deepseek_v3_dir(tmp_path: Path | str, **config_kwargs) -> Path:
+def create_tiny_deepseek_v3_dir(
+    tmp_path: Path | str, with_tokenizer: bool = False, **config_kwargs
+) -> Path:
     deepseek_dir = Path(tmp_path) / "tiny_deepseek_v3"
+    if with_tokenizer:
+        tokenizer = get_tiny_tokenizer()
+        tokenizer.save_pretrained(deepseek_dir)
+        config_kwargs["vocab_size"] = tokenizer.vocab_size
     get_tiny_deepseek_v3(**config_kwargs).save_pretrained(deepseek_dir)
     return deepseek_dir
 
diff --git a/tests/gpu_vllm/conftest.py b/tests/gpu_vllm/conftest.py
index 7da0432129e..8b4e966e987 100644
--- a/tests/gpu_vllm/conftest.py
+++ b/tests/gpu_vllm/conftest.py
@@ -13,18 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Shared setup for vLLM tests.
-
-vLLM handles its own distributed init, current-vllm-config context, and
-parallel-state setup when ``LLM(...)`` is constructed, so this conftest only
-opts into ``VLLM_ALLOW_INSECURE_SERIALIZATION=1`` *before* importing vLLM so
-``LLM.collective_rpc(callable)`` can ship our worker callables over the engine
-IPC channel via pickle. Without this, the default msgpack encoder rejects raw
-functions and the call raises ``TypeError``. Only safe in a controlled test
-environment.
+"""Set ``VLLM_ALLOW_INSECURE_SERIALIZATION=1`` before vLLM is imported so
+``LLM.collective_rpc(callable)`` can pickle worker callables (only safe in a
+controlled test environment). pytest imports this conftest before the test
+modules beneath it, so this precedes their ``from vllm import LLM``.
 """
 
 import os
 
-# Must precede any ``import vllm``: the env is read at module-import time.
 os.environ.setdefault("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")

From b9f5b8dd5f203e322aa4e18e0b497ceb8b5f2534 Mon Sep 17 00:00:00 2001
From: Kinjal Patel <kinjalpravin@nvidia.com>
Date: Fri, 22 May 2026 23:15:10 +0000
Subject: [PATCH 04/10] removed requirements.txt install

Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
---
 examples/vllm_serve/Dockerfile | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/examples/vllm_serve/Dockerfile b/examples/vllm_serve/Dockerfile
index 177a33f6dca..7213c6fc430 100644
--- a/examples/vllm_serve/Dockerfile
+++ b/examples/vllm_serve/Dockerfile
@@ -28,12 +28,6 @@ RUN pip install flash-attn==2.7.4.post1 --no-build-isolation
 # Pre-compile CUDA extensions to avoid compilation time during runtime
 RUN python3 -c "import modelopt.torch.quantization.extensions as ext; ext.precompile()" || true
 
-# Install requirements from examples (excluding windows examples)
-RUN find Model-Optimizer/examples -name "requirements.txt" | grep -v "windows" | while read req_file; do \
-        echo "Installing from $req_file"; \
-        pip install -r "$req_file" || echo "Warning: Failed to install from $req_file"; \
-    done
-
 # Allow users to run without root
 RUN chmod -R 777 /workspace
 

From 7fc5ebfc25b6ef210d7db2d9c7442813b5f430c6 Mon Sep 17 00:00:00 2001
From: Kinjal Patel <kinjalpravin@nvidia.com>
Date: Thu, 28 May 2026 19:50:24 +0000
Subject: [PATCH 05/10] fixing docker permission for vllm

Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
---
 .github/workflows/gpu_tests.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
index 47fb07dd6c2..bd493614d6b 100644
--- a/.github/workflows/gpu_tests.yml
+++ b/.github/workflows/gpu_tests.yml
@@ -57,9 +57,11 @@ jobs:
     timeout-minutes: ${{ matrix.timeout }}
     container:
       image: ${{ matrix.container_image }}
+      # NGC creds only for ``nvcr.io/*`` images; ``docker.io/*`` is anonymous-pull.
+      # Empty username/password short-circuits the runner's ``docker login`` step.
       credentials:
-        username: $oauthtoken
-        password: ${{ secrets.NGC_API_KEY }}
+        username: ${{ startsWith(matrix.container_image, 'nvcr.io/') && '$oauthtoken' || '' }}
+        password: ${{ startsWith(matrix.container_image, 'nvcr.io/') && secrets.NGC_API_KEY || '' }}
       env:
         GIT_DEPTH: 1000 # For correct version
         PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages

From 351b369402adbeab57f033b2b938f9d0c9877fa3 Mon Sep 17 00:00:00 2001
From: Kinjal Patel <kinjalpravin@nvidia.com>
Date: Thu, 28 May 2026 23:14:19 +0000
Subject: [PATCH 06/10] updated gpu_tests.yml

Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
---
 .github/workflows/gpu_tests.yml | 59 ++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
index bd493614d6b..3e2681d8854 100644
--- a/.github/workflows/gpu_tests.yml
+++ b/.github/workflows/gpu_tests.yml
@@ -37,8 +37,6 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        # ``container_image`` is the full image path so non-nvcr.io registries
-        # (e.g. docker.io/vllm) can be used alongside nvcr.io/nvidia images.
         include:
           - example: gpu
             timeout: 75
@@ -51,52 +49,51 @@ jobs:
             container_image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc16
           - example: gpu_vllm
             timeout: 30
-            # Keep in sync with examples/vllm_serve/Dockerfile.
             container_image: docker.io/vllm/vllm-openai:v0.20.0
     runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }}
     timeout-minutes: ${{ matrix.timeout }}
     container:
       image: ${{ matrix.container_image }}
-      # NGC creds only for ``nvcr.io/*`` images; ``docker.io/*`` is anonymous-pull.
-      # Empty username/password short-circuits the runner's ``docker login`` step.
       credentials:
-        username: ${{ startsWith(matrix.container_image, 'nvcr.io/') && '$oauthtoken' || '' }}
-        password: ${{ startsWith(matrix.container_image, 'nvcr.io/') && secrets.NGC_API_KEY || '' }}
+        username: $oauthtoken
+        password: ${{ secrets.NGC_API_KEY }}
       env:
         GIT_DEPTH: 1000 # For correct version
         PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
         HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
       - uses: actions/checkout@v6
-      - uses: nv-gha-runners/setup-proxy-cache@main
-      - name: Setup environment variables
-        run: |
-          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
-      - name: Run gpu tests
-        env:
-          # Skip subprocess coverage for gpu_vllm — the hook deadlocks vLLM's engine-core IPC.
-          COVERAGE_PROCESS_START: ${{ matrix.example == 'gpu_vllm' && '' || format('{0}/pyproject.toml', github.workspace) }}
-          COVERAGE_FILE: ${{ github.workspace }}/.coverage
-        run: |
-          python3 -m pip install nox && nox -s ${{ matrix.example }}
-      - name: Upload GPU coverage to Codecov
-        # vLLM container has no ``git``, which codecov-action needs; gpu_vllm
-        # also runs without ``--cov`` so there's no coverage.xml to upload.
-        if: matrix.example != 'gpu_vllm'
-        uses: codecov/codecov-action@v5
+      - uses: ./.github/actions/gpu-test-run
         with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          files: coverage.xml
-          flags: gpu
-          fail_ci_if_error: false # test may be skipped if relevant file changes are not detected
-          verbose: true
+          example: ${{ matrix.example }}
+          codecov_token: ${{ secrets.CODECOV_TOKEN }}
+
+  # Docker Hub image: anonymous pull (no ``credentials:``) and no coverage
+  gpu-tests-vllm:
+    needs: [pr-gate]
+    if: needs.pr-gate.outputs.any_changed == 'true'
+    runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }}
+    timeout-minutes: 30
+    container:
+      # Keep in sync with examples/vllm_serve/Dockerfile.
+      image: docker.io/vllm/vllm-openai:v0.20.0
+      env:
+        GIT_DEPTH: 1000 # For correct version
+        PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+    steps:
+      - uses: actions/checkout@v6
+      - uses: ./.github/actions/gpu-test-run
+        with:
+          example: gpu_vllm
+          with_coverage: "false"
 
   gpu-pr-required-check:
-    # Run even if gpu-tests is skipped
+    # Run even if any of the gpu jobs is skipped
     if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
-    needs: [pr-gate, gpu-tests]
+    needs: [pr-gate, gpu-tests, gpu-tests-vllm]
     runs-on: ubuntu-latest
     steps:
       - name: Required GPU tests did not succeed
-        if: ${{ needs.pr-gate.result != 'success' || (needs.pr-gate.outputs.any_changed == 'true' && needs.gpu-tests.result != 'success') }}
+        if: ${{ needs.pr-gate.result != 'success' || (needs.pr-gate.outputs.any_changed == 'true' && (needs.gpu-tests.result != 'success' || needs.gpu-tests-vllm.result != 'success')) }}
         run: exit 1

From 76ba5855b24c6b77a000317f9627923d2bef4aa3 Mon Sep 17 00:00:00 2001
From: Kinjal Patel <kinjalpravin@nvidia.com>
Date: Fri, 29 May 2026 18:17:12 +0000
Subject: [PATCH 07/10] minor

Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
---
 .github/workflows/gpu_tests.yml | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
index 3e2681d8854..1e1901a7780 100644
--- a/.github/workflows/gpu_tests.yml
+++ b/.github/workflows/gpu_tests.yml
@@ -40,20 +40,17 @@ jobs:
         include:
           - example: gpu
             timeout: 75
-            container_image: nvcr.io/nvidia/pytorch:26.04-py3
+            container_image: pytorch:26.04-py3
           - example: gpu_megatron
             timeout: 45
-            container_image: nvcr.io/nvidia/nemo:26.04
+            container_image: nemo:26.04
           - example: gpu_trtllm
             timeout: 30
-            container_image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc16
-          - example: gpu_vllm
-            timeout: 30
-            container_image: docker.io/vllm/vllm-openai:v0.20.0
+            container_image: tensorrt-llm/release:1.3.0rc16
     runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }}
     timeout-minutes: ${{ matrix.timeout }}
     container:
-      image: ${{ matrix.container_image }}
+      image: nvcr.io/nvidia/${{ matrix.container_image }}
       credentials:
         username: $oauthtoken
         password: ${{ secrets.NGC_API_KEY }}
@@ -75,7 +72,6 @@ jobs:
     runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }}
     timeout-minutes: 30
     container:
-      # Keep in sync with examples/vllm_serve/Dockerfile.
       image: docker.io/vllm/vllm-openai:v0.20.0
       env:
         GIT_DEPTH: 1000 # For correct version

From caeef809709fc109c361f03818c59b421a81bb9e Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Fri, 29 May 2026 11:51:19 -0700
Subject: [PATCH 08/10] simplify gpu_tests.yml

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
---
 .github/workflows/gpu_tests.yml | 67 ++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
index 1e1901a7780..451e8b6c632 100644
--- a/.github/workflows/gpu_tests.yml
+++ b/.github/workflows/gpu_tests.yml
@@ -40,56 +40,61 @@ jobs:
         include:
           - example: gpu
             timeout: 75
-            container_image: pytorch:26.04-py3
+            container_image: nvcr.io/nvidia/pytorch:26.04-py3
           - example: gpu_megatron
             timeout: 45
-            container_image: nemo:26.04
+            container_image: nvcr.io/nvidia/nemo:26.04
           - example: gpu_trtllm
             timeout: 30
-            container_image: tensorrt-llm/release:1.3.0rc16
+            container_image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc16
+          - example: gpu_vllm
+            timeout: 30
+            container_image: docker.io/vllm/vllm-openai:v0.20.0
     runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }}
     timeout-minutes: ${{ matrix.timeout }}
     container:
-      image: nvcr.io/nvidia/${{ matrix.container_image }}
+      image: ${{ matrix.container_image }}
+      # nvcr.io images require NGC auth; public docker.io images (e.g. vllm) are pulled
+      # anonymously (the runner skips docker login when username/password are empty).
       credentials:
-        username: $oauthtoken
-        password: ${{ secrets.NGC_API_KEY }}
-      env:
-        GIT_DEPTH: 1000 # For correct version
-        PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
-        HF_TOKEN: ${{ secrets.HF_TOKEN }}
-    steps:
-      - uses: actions/checkout@v6
-      - uses: ./.github/actions/gpu-test-run
-        with:
-          example: ${{ matrix.example }}
-          codecov_token: ${{ secrets.CODECOV_TOKEN }}
-
-  # Docker Hub image: anonymous pull (no ``credentials:``) and no coverage
-  gpu-tests-vllm:
-    needs: [pr-gate]
-    if: needs.pr-gate.outputs.any_changed == 'true'
-    runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }}
-    timeout-minutes: 30
-    container:
-      image: docker.io/vllm/vllm-openai:v0.20.0
+        username: ${{ startsWith(matrix.container_image, 'nvcr.io') && '$oauthtoken' || '' }}
+        password: ${{ startsWith(matrix.container_image, 'nvcr.io') && secrets.NGC_API_KEY || '' }}
       env:
         GIT_DEPTH: 1000 # For correct version
         PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
         HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
+      - name: Install git
+        # The vllm container ships without git, which is needed for a real checkout
+        # (so setuptools-scm derives the correct version) and for the Codecov upload below.
+        if: matrix.example == 'gpu_vllm'
+        run: apt-get update && apt-get install -y git
       - uses: actions/checkout@v6
-      - uses: ./.github/actions/gpu-test-run
+      - uses: nv-gha-runners/setup-proxy-cache@main
+      - name: Setup environment variables
+        run: |
+          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
+      - name: Run gpu tests
+        env:
+          COVERAGE_PROCESS_START: ${{ github.workspace }}/pyproject.toml
+          COVERAGE_FILE: ${{ github.workspace }}/.coverage
+        run: |
+          python -m pip install nox && nox -s ${{ matrix.example }}
+      - name: Upload GPU coverage to Codecov
+        uses: codecov/codecov-action@v5
         with:
-          example: gpu_vllm
-          with_coverage: "false"
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: coverage.xml
+          flags: gpu
+          fail_ci_if_error: false # test may be skipped if relevant file changes are not detected
+          verbose: true
 
   gpu-pr-required-check:
-    # Run even if any of the gpu jobs is skipped
+    # Run even if gpu-tests is skipped
     if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
-    needs: [pr-gate, gpu-tests, gpu-tests-vllm]
+    needs: [pr-gate, gpu-tests]
     runs-on: ubuntu-latest
     steps:
       - name: Required GPU tests did not succeed
-        if: ${{ needs.pr-gate.result != 'success' || (needs.pr-gate.outputs.any_changed == 'true' && (needs.gpu-tests.result != 'success' || needs.gpu-tests-vllm.result != 'success')) }}
+        if: ${{ needs.pr-gate.result != 'success' || (needs.pr-gate.outputs.any_changed == 'true' && needs.gpu-tests.result != 'success') }}
         run: exit 1

From 45d25ab82b8b06ae8dd4a9c86fedcaf3f494297c Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Fri, 29 May 2026 12:28:13 -0700
Subject: [PATCH 09/10] Remove credentials as they are also not needed for nvcr.io

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
---
 .github/workflows/_example_tests_runner.yml | 3 ---
 .github/workflows/gpu_tests.yml             | 5 -----
 2 files changed, 8 deletions(-)

diff --git a/.github/workflows/_example_tests_runner.yml b/.github/workflows/_example_tests_runner.yml
index ea0fc0c19f8..8adadbac7af 100644
--- a/.github/workflows/_example_tests_runner.yml
+++ b/.github/workflows/_example_tests_runner.yml
@@ -34,9 +34,6 @@ jobs:
     timeout-minutes: ${{ inputs.timeout_minutes }}
     container:
       image: ${{ inputs.docker_image }}
-      credentials:
-        username: $oauthtoken
-        password: ${{ secrets.NGC_API_KEY }}
       options: --shm-size=2gb # TRT-LLM tests on 2-GPU runner needs more shared memory
       env:
         PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
index 451e8b6c632..11ef88290ee 100644
--- a/.github/workflows/gpu_tests.yml
+++ b/.github/workflows/gpu_tests.yml
@@ -54,11 +54,6 @@ jobs:
     timeout-minutes: ${{ matrix.timeout }}
     container:
       image: ${{ matrix.container_image }}
-      # nvcr.io images require NGC auth; public docker.io images (e.g. vllm) are pulled
-      # anonymously (the runner skips docker login when username/password are empty).
-      credentials:
-        username: ${{ startsWith(matrix.container_image, 'nvcr.io') && '$oauthtoken' || '' }}
-        password: ${{ startsWith(matrix.container_image, 'nvcr.io') && secrets.NGC_API_KEY || '' }}
       env:
         GIT_DEPTH: 1000 # For correct version
         PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages

From 8a6bcee29e6da1cac3a72be041dc36636805dbea Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Fri, 29 May 2026 12:52:04 -0700
Subject: [PATCH 10/10] use python3 instead of python for vllm

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
---
 .github/workflows/gpu_tests.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
index 11ef88290ee..9d290cae04f 100644
--- a/.github/workflows/gpu_tests.yml
+++ b/.github/workflows/gpu_tests.yml
@@ -74,7 +74,8 @@ jobs:
           COVERAGE_PROCESS_START: ${{ github.workspace }}/pyproject.toml
           COVERAGE_FILE: ${{ github.workspace }}/.coverage
         run: |
-          python -m pip install nox && nox -s ${{ matrix.example }}
+          # Use `python3` (the vllm image has no `python` on PATH)
+          python3 -m pip install nox && nox -s ${{ matrix.example }}
       - name: Upload GPU coverage to Codecov
         uses: codecov/codecov-action@v5
         with: