From 597e70122442cd637b3de123a45434449b76674d Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Thu, 21 May 2026 23:07:46 +0000 Subject: [PATCH 01/10] Added unit test for vLLM Signed-off-by: Kinjal Patel --- .github/workflows/gpu_tests.yml | 20 +- CONTRIBUTING.md | 1 + noxfile.py | 8 + .../_test_utils/torch/transformers_models.py | 39 +++ tests/gpu_vllm/conftest.py | 30 ++ .../quantization/test_vllm_dynamic_modules.py | 272 ++++++++++++++++++ 6 files changed, 364 insertions(+), 6 deletions(-) create mode 100644 tests/gpu_vllm/conftest.py create mode 100644 tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index 2dcc96e577e..fef5de9e955 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -29,6 +29,7 @@ jobs: tests/gpu/** tests/gpu_megatron/** tests/gpu_trtllm/** + tests/gpu_vllm/** gpu-tests: needs: [pr-gate] @@ -36,20 +37,26 @@ jobs: strategy: fail-fast: false matrix: + # ``container_image`` is the full image path so non-nvcr.io registries + # (e.g. docker.io/vllm) can be used alongside nvcr.io/nvidia images. include: - example: gpu timeout: 75 - container_image: pytorch:26.04-py3 + container_image: nvcr.io/nvidia/pytorch:26.04-py3 - example: gpu_megatron timeout: 45 - container_image: nemo:26.04 + container_image: nvcr.io/nvidia/nemo:26.04 - example: gpu_trtllm timeout: 30 - container_image: tensorrt-llm/release:1.3.0rc16 + container_image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc16 + - example: gpu_vllm + timeout: 30 + # Keep in sync with examples/vllm_serve/Dockerfile. 
+ container_image: docker.io/vllm/vllm-openai:v0.20.0 runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }} timeout-minutes: ${{ matrix.timeout }} container: - image: nvcr.io/nvidia/${{ matrix.container_image }} + image: ${{ matrix.container_image }} credentials: username: $oauthtoken password: ${{ secrets.NGC_API_KEY }} @@ -65,10 +72,11 @@ jobs: echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV - name: Run gpu tests env: - COVERAGE_PROCESS_START: ${{ github.workspace }}/pyproject.toml + # Skip subprocess coverage for gpu_vllm — the hook deadlocks vLLM's engine-core IPC. + COVERAGE_PROCESS_START: ${{ matrix.example == 'gpu_vllm' && '' || format('{0}/pyproject.toml', github.workspace) }} COVERAGE_FILE: ${{ github.workspace }}/.coverage run: | - python -m pip install nox && nox -s ${{ matrix.example }} + python3 -m pip install nox && nox -s ${{ matrix.example }} - name: Upload GPU coverage to Codecov uses: codecov/codecov-action@v5 with: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index bcc70b64d65..541ab6a51d4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -146,6 +146,7 @@ We use [pytest](https://docs.pytest.org/) for all tests. For any new features / - `tests/gpu`: Fast GPU-based unit tests for the core ModelOpt library. In most cases, they should not take more than a few seconds to run. - `tests/gpu_megatron`: Fast GPU-based unit tests for the core ModelOpt library for Megatron-Core features. In most cases, they should not take more than a few seconds to run. - `tests/gpu_trtllm`: Fast GPU-based unit tests for the core ModelOpt library for TensorRT-LLM features. In most cases, they should not take more than a few seconds to run. +- `tests/gpu_vllm`: Fast GPU-based unit tests for the core ModelOpt library for vLLM features. In most cases, they should not take more than a few seconds to run. 
- `tests/examples`: Integration tests for ModelOpt examples. They should not take more than a few minutes to run. Please refer to [example test README](./tests/examples/README.md) for more details. For lightweight focused local validation, run `pytest` directly on the relevant test path. For example: diff --git a/noxfile.py b/noxfile.py index a902a729bdf..059f351b7f9 100644 --- a/noxfile.py +++ b/noxfile.py @@ -138,6 +138,14 @@ def gpu_trtllm(session): session.run("python", "-m", "pytest", "tests/gpu_trtllm", *_cov_args()) +# Container: docker.io/vllm/vllm-openai (the published image ships vLLM + CUDA + torch). +# Pin must stay in sync with examples/vllm_serve/Dockerfile. +@nox.session(venv_backend="none") +def gpu_vllm(session): + session.run("python3", "-m", "pip", "install", "-e", ".[hf,dev-test]") + session.run("python3", "-m", "pytest", "tests/gpu_vllm", *_cov_args()) + + # Container: nvcr.io/nvidia/pytorch:26.01-py3 or later @nox.session(venv_backend="none") def regression(session): diff --git a/tests/_test_utils/torch/transformers_models.py b/tests/_test_utils/torch/transformers_models.py index 34bc96cd0ae..611bc31b1b4 100644 --- a/tests/_test_utils/torch/transformers_models.py +++ b/tests/_test_utils/torch/transformers_models.py @@ -26,6 +26,7 @@ AutoModelForQuestionAnswering, AutoTokenizer, BertConfig, + DeepseekV3Config, GptOssConfig, LlamaConfig, PreTrainedModel, @@ -120,6 +121,44 @@ def create_tiny_qwen3_moe_dir( return qwen3_moe_dir +##### DeepSeek V3 ##### +def get_tiny_deepseek_v3(**config_kwargs) -> PreTrainedModel: + set_seed(SEED) + kwargs = { + "dtype": torch.bfloat16, + "vocab_size": 128, + "hidden_size": 128, + "intermediate_size": 256, + "moe_intermediate_size": 64, + "num_hidden_layers": 2, + "num_attention_heads": 2, + "num_key_value_heads": 2, + "n_routed_experts": 4, + "num_experts_per_tok": 2, + "n_shared_experts": 1, + "first_k_dense_replace": 0, + "kv_lora_rank": 16, + "q_lora_rank": 32, + "qk_rope_head_dim": 16, + "qk_nope_head_dim": 
16, + "v_head_dim": 16, + "max_position_embeddings": 128, + # Required so vLLM allocates ``gate.e_score_correction_bias`` (HF saves it unconditionally). + "topk_method": "noaux_tc", + } + kwargs.update(**config_kwargs) + cfg = DeepseekV3Config(**kwargs) + # Survive transformers versions that drop unknown kwargs from the dataclass. + cfg.topk_method = kwargs["topk_method"] + return AutoModelForCausalLM.from_config(cfg) + + +def create_tiny_deepseek_v3_dir(tmp_path: Path | str, **config_kwargs) -> Path: + deepseek_dir = Path(tmp_path) / "tiny_deepseek_v3" + get_tiny_deepseek_v3(**config_kwargs).save_pretrained(deepseek_dir) + return deepseek_dir + + ##### GPT-OSS ##### def get_tiny_gpt_oss(**config_kwargs) -> PreTrainedModel: set_seed(SEED) diff --git a/tests/gpu_vllm/conftest.py b/tests/gpu_vllm/conftest.py new file mode 100644 index 00000000000..7da0432129e --- /dev/null +++ b/tests/gpu_vllm/conftest.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Shared setup for vLLM tests. + +vLLM handles its own distributed init, current-vllm-config context, and +parallel-state setup when ``LLM(...)`` is constructed, so this conftest only +opts into ``VLLM_ALLOW_INSECURE_SERIALIZATION=1`` *before* importing vLLM so +``LLM.collective_rpc(callable)`` can ship our worker callables over the engine +IPC channel via pickle. 
Without this, the default msgpack encoder rejects raw +functions and the call raises ``TypeError``. Only safe in a controlled test +environment. +""" + +import os + +# Must precede any ``import vllm``: the env is read at module-import time. +os.environ.setdefault("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") diff --git a/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py b/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py new file mode 100644 index 00000000000..2dd354f32f7 --- /dev/null +++ b/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py @@ -0,0 +1,272 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""End-to-end tests for the vLLM fakequant dynamic modules. + +Boots ``vllm.LLM`` on tiny HF models (saved via +``_test_utils.torch.transformers_models``) and runs ``mtq.quantize`` inside the +worker via ``LLM.collective_rpc``. Asserts every ``_QuantVLLM…`` class is +installed and every enabled quantizer ends up with a registered tensor-level +``_amax`` after calibration. Mirrors the +``examples/vllm_serve/fakequant_worker.py`` production path. + +Architectures: TinyLlama (Linear + Attention), TinyQwen3MoE (+ FusedMoE), +TinyDeepseekV3 (+ MLAAttention). 
+""" + +from __future__ import annotations + +import gc + +import pytest +from _test_utils.torch.transformers_models import ( + create_tiny_deepseek_v3_dir, + create_tiny_llama_dir, + create_tiny_qwen3_moe_dir, +) +from vllm import LLM +from vllm.distributed import cleanup_dist_env_and_memory + +import modelopt.torch.quantization as mtq +from modelopt.torch.quantization.nn import TensorQuantizer +from modelopt.torch.quantization.plugins.vllm import ( + _ATTENTION_TYPES, + VllmMLAAttention, + _QuantFusedMoEBase, + _VLLMParallelLinear, + disable_compilation, +) + + +def _quantize_and_summarize(self): + """Run on the worker via ``LLM.collective_rpc``. + + Module-level so it survives pickle over engine-core IPC. ``self`` is the + vLLM worker — needed to drive ``model_runner._dummy_run`` from the + calibration forward_loop. Returns a JSON-able summary. + """ + model = self.get_model() + + def _forward_loop(_model): + # ``num_tokens=1`` is enough for the ``"max"`` calibrator. + self.model_runner._dummy_run(1) + + with disable_compilation(model): + mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop=_forward_loop) + + parallel_linear_counts: dict[str, int] = {} + moe_count = 0 + attention_count = 0 + mla_count = 0 + missing_quantizers: list[str] = [] + quantizers_without_amax: list[str] = [] + enabled_quantizer_count = 0 + + def _missing(module, name, slots): + return ( + f"{name}.{slot}" + for slot in slots + if not isinstance(getattr(module, slot, None), TensorQuantizer) + ) + + for name, module in model.named_modules(): + if isinstance(module, _VLLMParallelLinear): + kind = type(module).__name__ + parallel_linear_counts[kind] = parallel_linear_counts.get(kind, 0) + 1 + missing_quantizers.extend( + _missing(module, name, ("input_quantizer", "weight_quantizer", "output_quantizer")) + ) + elif isinstance(module, _QuantFusedMoEBase): + moe_count += 1 + missing_quantizers.extend( + _missing( + module, + name, + ( + "w13_input_quantizer", + "w2_input_quantizer", + 
"w13_weight_quantizer", + "w2_weight_quantizer", + ), + ) + ) + elif VllmMLAAttention is not None and isinstance(module, VllmMLAAttention): + mla_count += 1 + missing_quantizers.extend( + _missing( + module, name, ("q_bmm_quantizer", "kv_c_bmm_quantizer", "k_pe_bmm_quantizer") + ) + ) + elif isinstance(module, _ATTENTION_TYPES): + attention_count += 1 + missing_quantizers.extend( + _missing(module, name, ("q_bmm_quantizer", "k_bmm_quantizer", "v_bmm_quantizer")) + ) + + # Static-amax invariant: every enabled quantizer must own an ``_amax`` + # after calibration. ``kv_b_proj`` is exempt — vLLM's MLA decode path + # reads its weight directly and never calls its forward. + if isinstance(module, TensorQuantizer) and module.is_enabled: + enabled_quantizer_count += 1 + if not hasattr(module, "_amax") and "kv_b_proj" not in name: + quantizers_without_amax.append(name) + + return { + "parallel_linear_counts": parallel_linear_counts, + "moe_count": moe_count, + "attention_count": attention_count, + "mla_count": mla_count, + "missing_quantizers": missing_quantizers, + "quantizers_without_amax": quantizers_without_amax, + "enabled_quantizer_count": enabled_quantizer_count, + } + + +def _boot_llm(model_dir, **extra): + """Construct a vLLM engine on a tiny model. + + MoE fixtures override with ``moe_backend="triton"`` (pins the Triton + experts kernel whose module-level entries the modelopt plugin patches — + FlashInfer/TRTLLM kernels bypass them) and ``enable_expert_parallel=True`` + (keeps modelopt's MoE-specific calibration paths live). 
+ """ + return LLM( + model=str(model_dir), + enforce_eager=True, + gpu_memory_utilization=0.2, + max_model_len=64, + max_num_seqs=1, + dtype="bfloat16", + skip_tokenizer_init=True, + **extra, + ) + + +def _shutdown_llm(llm): + del llm + gc.collect() + cleanup_dist_env_and_memory(shutdown_ray=False) + + +@pytest.fixture(scope="module") +def tiny_llama_llm(tmp_path_factory): + tmp = tmp_path_factory.mktemp("tiny_llama") + model_dir = create_tiny_llama_dir(tmp) + llm = _boot_llm(model_dir) + try: + yield llm + finally: + _shutdown_llm(llm) + + +@pytest.fixture(scope="module") +def tiny_qwen3_moe_llm(tmp_path_factory): + tmp = tmp_path_factory.mktemp("tiny_qwen3_moe") + # head_dim=64 with num_heads=2 is broadly supported by vLLM's attention backends. + model_dir = create_tiny_qwen3_moe_dir( + tmp, + hidden_size=128, + intermediate_size=256, + moe_intermediate_size=64, + num_hidden_layers=2, + num_attention_heads=2, + num_key_value_heads=1, + max_position_embeddings=128, + vocab_size=128, + head_dim=64, + num_experts=4, + num_experts_per_tok=2, + decoder_sparse_step=1, + ) + llm = _boot_llm(model_dir, moe_backend="triton", enable_expert_parallel=True) + try: + yield llm + finally: + _shutdown_llm(llm) + + +@pytest.fixture(scope="module") +def tiny_deepseek_llm(tmp_path_factory): + tmp = tmp_path_factory.mktemp("tiny_deepseek") + model_dir = create_tiny_deepseek_v3_dir(tmp) + llm = _boot_llm(model_dir, moe_backend="triton", enable_expert_parallel=True) + try: + yield llm + finally: + _shutdown_llm(llm) + + +def _assert_quantizer_amax_is_static(summary): + """Every enabled quantizer must own a registered ``_amax`` after + calibration. Missing ``_amax`` → repr ``amax=dynamic`` → regression. 
+ """ + assert summary["enabled_quantizer_count"] > 0, summary + assert summary["quantizers_without_amax"] == [], summary["quantizers_without_amax"] + + +def test_tiny_llama_quantize(tiny_llama_llm): + """Covers QKV/Row/MergedColumn ParallelLinear + Attention on a dense Llama.""" + summaries = tiny_llama_llm.collective_rpc(_quantize_and_summarize) + summary = summaries[0] + + assert summary["missing_quantizers"] == [], summary["missing_quantizers"] + + parallel_linear_counts = summary["parallel_linear_counts"] + # Each decoder layer contributes one of each. With num_hidden_layers=2: + assert parallel_linear_counts.get("QuantQKVParallelLinear", 0) >= 2, parallel_linear_counts + # o_proj + down_proj per layer + assert parallel_linear_counts.get("QuantRowParallelLinear", 0) >= 4, parallel_linear_counts + assert parallel_linear_counts.get("QuantMergedColumnParallelLinear", 0) >= 2, ( + parallel_linear_counts + ) + + # Llama uses the base Attention type — one per decoder layer. + assert summary["attention_count"] >= 2, summary + + # No MoE in a dense Llama. + assert summary["moe_count"] == 0 + + _assert_quantizer_amax_is_static(summary) + + +def test_tiny_qwen3_moe_quantize(tiny_qwen3_moe_llm): + """Tiny Qwen3-MoE adds FusedMoE coverage on top of the dense linears.""" + summaries = tiny_qwen3_moe_llm.collective_rpc(_quantize_and_summarize) + summary = summaries[0] + + assert summary["missing_quantizers"] == [], summary["missing_quantizers"] + + parallel_linear_counts = summary["parallel_linear_counts"] + assert parallel_linear_counts.get("QuantQKVParallelLinear", 0) >= 2, parallel_linear_counts + assert parallel_linear_counts.get("QuantRowParallelLinear", 0) >= 2, parallel_linear_counts + + # decoder_sparse_step=1 → every layer is MoE. With 2 layers we expect ≥2 FusedMoE. 
+ assert summary["moe_count"] >= 2, summary + assert summary["attention_count"] >= 2, summary + + _assert_quantizer_amax_is_static(summary) + + +def test_tiny_deepseek_mla_quantize(tiny_deepseek_llm): + """Tiny DeepSeek-V3 covers MLAAttention (and again FusedMoE).""" + summaries = tiny_deepseek_llm.collective_rpc(_quantize_and_summarize) + summary = summaries[0] + + assert summary["missing_quantizers"] == [], summary["missing_quantizers"] + assert summary["mla_count"] >= 2, summary + # ``first_k_dense_replace=0`` → every layer is MoE. + assert summary["moe_count"] >= 2, summary + + _assert_quantizer_amax_is_static(summary) From 824de51500d96664ae3ec5058b7efd38571e0c78 Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Fri, 22 May 2026 01:10:32 +0000 Subject: [PATCH 02/10] minor Signed-off-by: Kinjal Patel --- examples/vllm_serve/Dockerfile | 4 ++-- .../gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/vllm_serve/Dockerfile b/examples/vllm_serve/Dockerfile index 352896ca2cc..177a33f6dca 100644 --- a/examples/vllm_serve/Dockerfile +++ b/examples/vllm_serve/Dockerfile @@ -1,4 +1,4 @@ -FROM vllm/vllm-openai:v0.10.2 +FROM vllm/vllm-openai:v0.20.0 # Set environment variables ENV PIP_NO_CACHE_DIR=off \ @@ -23,7 +23,7 @@ RUN cd Model-Optimizer && \ pip install -e ".[all,dev-test]" # Llama4 requires this -RUN pip install flash-attn==2.7.4.post1 +RUN pip install flash-attn==2.7.4.post1 --no-build-isolation # Pre-compile CUDA extensions to avoid compilation time during runtime RUN python3 -c "import modelopt.torch.quantization.extensions as ext; ext.precompile()" || true diff --git a/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py b/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py index 2dd354f32f7..2136f959754 100644 --- a/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py +++ b/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py @@ -163,7 
+163,9 @@ def _shutdown_llm(llm): @pytest.fixture(scope="module") def tiny_llama_llm(tmp_path_factory): tmp = tmp_path_factory.mktemp("tiny_llama") - model_dir = create_tiny_llama_dir(tmp) + # Helper default ``max_position_embeddings=32`` would clash with vLLM's + # ``max_model_len=64`` set in ``_boot_llm``. + model_dir = create_tiny_llama_dir(tmp, max_position_embeddings=64) llm = _boot_llm(model_dir) try: yield llm From 7234ae82062a0ee21dc953ac8ac369ce59cd1750 Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Fri, 22 May 2026 23:12:37 +0000 Subject: [PATCH 03/10] addressed comments Signed-off-by: Kinjal Patel --- .github/workflows/gpu_tests.yml | 3 +++ tests/_test_utils/torch/transformers_models.py | 8 +++++++- tests/gpu_vllm/conftest.py | 14 ++++---------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index fef5de9e955..47fb07dd6c2 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -78,6 +78,9 @@ jobs: run: | python3 -m pip install nox && nox -s ${{ matrix.example }} - name: Upload GPU coverage to Codecov + # vLLM container has no ``git``, which codecov-action needs; gpu_vllm + # also runs without ``--cov`` so there's no coverage.xml to upload. 
+ if: matrix.example != 'gpu_vllm' uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} diff --git a/tests/_test_utils/torch/transformers_models.py b/tests/_test_utils/torch/transformers_models.py index 611bc31b1b4..0365f07ffa8 100644 --- a/tests/_test_utils/torch/transformers_models.py +++ b/tests/_test_utils/torch/transformers_models.py @@ -153,8 +153,14 @@ def get_tiny_deepseek_v3(**config_kwargs) -> PreTrainedModel: return AutoModelForCausalLM.from_config(cfg) -def create_tiny_deepseek_v3_dir(tmp_path: Path | str, **config_kwargs) -> Path: +def create_tiny_deepseek_v3_dir( + tmp_path: Path | str, with_tokenizer: bool = False, **config_kwargs +) -> Path: deepseek_dir = Path(tmp_path) / "tiny_deepseek_v3" + if with_tokenizer: + tokenizer = get_tiny_tokenizer() + tokenizer.save_pretrained(deepseek_dir) + config_kwargs["vocab_size"] = tokenizer.vocab_size get_tiny_deepseek_v3(**config_kwargs).save_pretrained(deepseek_dir) return deepseek_dir diff --git a/tests/gpu_vllm/conftest.py b/tests/gpu_vllm/conftest.py index 7da0432129e..8b4e966e987 100644 --- a/tests/gpu_vllm/conftest.py +++ b/tests/gpu_vllm/conftest.py @@ -13,18 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Shared setup for vLLM tests. - -vLLM handles its own distributed init, current-vllm-config context, and -parallel-state setup when ``LLM(...)`` is constructed, so this conftest only -opts into ``VLLM_ALLOW_INSECURE_SERIALIZATION=1`` *before* importing vLLM so -``LLM.collective_rpc(callable)`` can ship our worker callables over the engine -IPC channel via pickle. Without this, the default msgpack encoder rejects raw -functions and the call raises ``TypeError``. Only safe in a controlled test -environment. +"""Set ``VLLM_ALLOW_INSECURE_SERIALIZATION=1`` before vLLM is imported so +``LLM.collective_rpc(callable)`` can pickle worker callables. 
pytest loads +conftests before sibling test modules, so this beats the top-level +``from vllm import LLM`` in ``test_*.py``. """ import os -# Must precede any ``import vllm``: the env is read at module-import time. os.environ.setdefault("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") From b9f5b8dd5f203e322aa4e18e0b497ceb8b5f2534 Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Fri, 22 May 2026 23:15:10 +0000 Subject: [PATCH 04/10] removed requirements.txt install Signed-off-by: Kinjal Patel --- examples/vllm_serve/Dockerfile | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/vllm_serve/Dockerfile b/examples/vllm_serve/Dockerfile index 177a33f6dca..7213c6fc430 100644 --- a/examples/vllm_serve/Dockerfile +++ b/examples/vllm_serve/Dockerfile @@ -28,12 +28,6 @@ RUN pip install flash-attn==2.7.4.post1 --no-build-isolation # Pre-compile CUDA extensions to avoid compilation time during runtime RUN python3 -c "import modelopt.torch.quantization.extensions as ext; ext.precompile()" || true -# Install requirements from examples (excluding windows examples) -RUN find Model-Optimizer/examples -name "requirements.txt" | grep -v "windows" | while read req_file; do \ - echo "Installing from $req_file"; \ - pip install -r "$req_file" || echo "Warning: Failed to install from $req_file"; \ - done - # Allow users to run without root RUN chmod -R 777 /workspace From 7fc5ebfc25b6ef210d7db2d9c7442813b5f430c6 Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Thu, 28 May 2026 19:50:24 +0000 Subject: [PATCH 05/10] fixing docker permission for vllm Signed-off-by: Kinjal Patel --- .github/workflows/gpu_tests.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index 47fb07dd6c2..bd493614d6b 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -57,9 +57,11 @@ jobs: timeout-minutes: ${{ matrix.timeout }} container: image: ${{ matrix.container_image }} + # NGC creds 
only for ``nvcr.io/*`` images; ``docker.io/*`` is anonymous-pull. + # Empty username/password short-circuits the runner's ``docker login`` step. credentials: - username: $oauthtoken - password: ${{ secrets.NGC_API_KEY }} + username: ${{ startsWith(matrix.container_image, 'nvcr.io/') && '$oauthtoken' || '' }} + password: ${{ startsWith(matrix.container_image, 'nvcr.io/') && secrets.NGC_API_KEY || '' }} env: GIT_DEPTH: 1000 # For correct version PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages From 351b369402adbeab57f033b2b938f9d0c9877fa3 Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Thu, 28 May 2026 23:14:19 +0000 Subject: [PATCH 06/10] updated gpu_tests.yml Signed-off-by: Kinjal Patel --- .github/workflows/gpu_tests.yml | 59 ++++++++++++++++----------------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index bd493614d6b..3e2681d8854 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -37,8 +37,6 @@ jobs: strategy: fail-fast: false matrix: - # ``container_image`` is the full image path so non-nvcr.io registries - # (e.g. docker.io/vllm) can be used alongside nvcr.io/nvidia images. include: - example: gpu timeout: 75 @@ -51,52 +49,51 @@ jobs: container_image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc16 - example: gpu_vllm timeout: 30 - # Keep in sync with examples/vllm_serve/Dockerfile. container_image: docker.io/vllm/vllm-openai:v0.20.0 runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }} timeout-minutes: ${{ matrix.timeout }} container: image: ${{ matrix.container_image }} - # NGC creds only for ``nvcr.io/*`` images; ``docker.io/*`` is anonymous-pull. - # Empty username/password short-circuits the runner's ``docker login`` step. 
credentials: - username: ${{ startsWith(matrix.container_image, 'nvcr.io/') && '$oauthtoken' || '' }} - password: ${{ startsWith(matrix.container_image, 'nvcr.io/') && secrets.NGC_API_KEY || '' }} + username: $oauthtoken + password: ${{ secrets.NGC_API_KEY }} env: GIT_DEPTH: 1000 # For correct version PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages HF_TOKEN: ${{ secrets.HF_TOKEN }} steps: - uses: actions/checkout@v6 - - uses: nv-gha-runners/setup-proxy-cache@main - - name: Setup environment variables - run: | - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV - - name: Run gpu tests - env: - # Skip subprocess coverage for gpu_vllm — the hook deadlocks vLLM's engine-core IPC. - COVERAGE_PROCESS_START: ${{ matrix.example == 'gpu_vllm' && '' || format('{0}/pyproject.toml', github.workspace) }} - COVERAGE_FILE: ${{ github.workspace }}/.coverage - run: | - python3 -m pip install nox && nox -s ${{ matrix.example }} - - name: Upload GPU coverage to Codecov - # vLLM container has no ``git``, which codecov-action needs; gpu_vllm - # also runs without ``--cov`` so there's no coverage.xml to upload. - if: matrix.example != 'gpu_vllm' - uses: codecov/codecov-action@v5 + - uses: ./.github/actions/gpu-test-run with: - token: ${{ secrets.CODECOV_TOKEN }} - files: coverage.xml - flags: gpu - fail_ci_if_error: false # test may be skipped if relevant file changes are not detected - verbose: true + example: ${{ matrix.example }} + codecov_token: ${{ secrets.CODECOV_TOKEN }} + + # Docker Hub image: anonymous pull (no ``credentials:``) and no coverage + gpu-tests-vllm: + needs: [pr-gate] + if: needs.pr-gate.outputs.any_changed == 'true' + runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }} + timeout-minutes: 30 + container: + # Keep in sync with examples/vllm_serve/Dockerfile. 
+ image: docker.io/vllm/vllm-openai:v0.20.0 + env: + GIT_DEPTH: 1000 # For correct version + PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages + HF_TOKEN: ${{ secrets.HF_TOKEN }} + steps: + - uses: actions/checkout@v6 + - uses: ./.github/actions/gpu-test-run + with: + example: gpu_vllm + with_coverage: "false" gpu-pr-required-check: - # Run even if gpu-tests is skipped + # Run even if any of the gpu jobs is skipped if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }} - needs: [pr-gate, gpu-tests] + needs: [pr-gate, gpu-tests, gpu-tests-vllm] runs-on: ubuntu-latest steps: - name: Required GPU tests did not succeed - if: ${{ needs.pr-gate.result != 'success' || (needs.pr-gate.outputs.any_changed == 'true' && needs.gpu-tests.result != 'success') }} + if: ${{ needs.pr-gate.result != 'success' || (needs.pr-gate.outputs.any_changed == 'true' && (needs.gpu-tests.result != 'success' || needs.gpu-tests-vllm.result != 'success')) }} run: exit 1 From 76ba5855b24c6b77a000317f9627923d2bef4aa3 Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Fri, 29 May 2026 18:17:12 +0000 Subject: [PATCH 07/10] minor Signed-off-by: Kinjal Patel --- .github/workflows/gpu_tests.yml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index 3e2681d8854..1e1901a7780 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -40,20 +40,17 @@ jobs: include: - example: gpu timeout: 75 - container_image: nvcr.io/nvidia/pytorch:26.04-py3 + container_image: pytorch:26.04-py3 - example: gpu_megatron timeout: 45 - container_image: nvcr.io/nvidia/nemo:26.04 + container_image: nemo:26.04 - example: gpu_trtllm timeout: 30 - container_image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc16 - - example: gpu_vllm - timeout: 30 - container_image: docker.io/vllm/vllm-openai:v0.20.0 + container_image: tensorrt-llm/release:1.3.0rc16 runs-on: ${{ 
startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }} timeout-minutes: ${{ matrix.timeout }} container: - image: ${{ matrix.container_image }} + image: nvcr.io/nvidia/${{ matrix.container_image }} credentials: username: $oauthtoken password: ${{ secrets.NGC_API_KEY }} @@ -75,7 +72,6 @@ jobs: runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }} timeout-minutes: 30 container: - # Keep in sync with examples/vllm_serve/Dockerfile. image: docker.io/vllm/vllm-openai:v0.20.0 env: GIT_DEPTH: 1000 # For correct version From caeef809709fc109c361f03818c59b421a81bb9e Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Fri, 29 May 2026 11:51:19 -0700 Subject: [PATCH 08/10] simplify gpu_tests.yml Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .github/workflows/gpu_tests.yml | 67 ++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index 1e1901a7780..451e8b6c632 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -40,56 +40,61 @@ jobs: include: - example: gpu timeout: 75 - container_image: pytorch:26.04-py3 + container_image: nvcr.io/nvidia/pytorch:26.04-py3 - example: gpu_megatron timeout: 45 - container_image: nemo:26.04 + container_image: nvcr.io/nvidia/nemo:26.04 - example: gpu_trtllm timeout: 30 - container_image: tensorrt-llm/release:1.3.0rc16 + container_image: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc16 + - example: gpu_vllm + timeout: 30 + container_image: docker.io/vllm/vllm-openai:v0.20.0 runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }} timeout-minutes: ${{ 
matrix.timeout }} container: - image: nvcr.io/nvidia/${{ matrix.container_image }} + image: ${{ matrix.container_image }} + # nvcr.io images require NGC auth; public docker.io images (e.g. vllm) are pulled + # anonymously (the runner skips docker login when username/password are empty). credentials: - username: $oauthtoken - password: ${{ secrets.NGC_API_KEY }} - env: - GIT_DEPTH: 1000 # For correct version - PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages - HF_TOKEN: ${{ secrets.HF_TOKEN }} - steps: - - uses: actions/checkout@v6 - - uses: ./.github/actions/gpu-test-run - with: - example: ${{ matrix.example }} - codecov_token: ${{ secrets.CODECOV_TOKEN }} - - # Docker Hub image: anonymous pull (no ``credentials:``) and no coverage - gpu-tests-vllm: - needs: [pr-gate] - if: needs.pr-gate.outputs.any_changed == 'true' - runs-on: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && 'linux-amd64-gpu-rtxpro6000-latest-1' || 'linux-amd64-gpu-rtxpro6000-latest-2' }} - timeout-minutes: 30 - container: - image: docker.io/vllm/vllm-openai:v0.20.0 + username: ${{ startsWith(matrix.container_image, 'nvcr.io') && '$oauthtoken' || '' }} + password: ${{ startsWith(matrix.container_image, 'nvcr.io') && secrets.NGC_API_KEY || '' }} env: GIT_DEPTH: 1000 # For correct version PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages HF_TOKEN: ${{ secrets.HF_TOKEN }} steps: + - name: Install git + # The vllm container ships without git; needed for a real checkout (correct + # setuptools-scm version) and for the Codecov upload below. 
+ if: matrix.example == 'gpu_vllm' + run: apt-get update && apt-get install -y git - uses: actions/checkout@v6 - - uses: ./.github/actions/gpu-test-run + - uses: nv-gha-runners/setup-proxy-cache@main + - name: Setup environment variables + run: | + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV + - name: Run gpu tests + env: + COVERAGE_PROCESS_START: ${{ github.workspace }}/pyproject.toml + COVERAGE_FILE: ${{ github.workspace }}/.coverage + run: | + python -m pip install nox && nox -s ${{ matrix.example }} + - name: Upload GPU coverage to Codecov + uses: codecov/codecov-action@v5 with: - example: gpu_vllm - with_coverage: "false" + token: ${{ secrets.CODECOV_TOKEN }} + files: coverage.xml + flags: gpu + fail_ci_if_error: false # test may be skipped if relevant file changes are not detected + verbose: true gpu-pr-required-check: - # Run even if any of the gpu jobs is skipped + # Run even if gpu-tests is skipped if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }} - needs: [pr-gate, gpu-tests, gpu-tests-vllm] + needs: [pr-gate, gpu-tests] runs-on: ubuntu-latest steps: - name: Required GPU tests did not succeed - if: ${{ needs.pr-gate.result != 'success' || (needs.pr-gate.outputs.any_changed == 'true' && (needs.gpu-tests.result != 'success' || needs.gpu-tests-vllm.result != 'success')) }} + if: ${{ needs.pr-gate.result != 'success' || (needs.pr-gate.outputs.any_changed == 'true' && needs.gpu-tests.result != 'success') }} run: exit 1 From 45d25ab82b8b06ae8dd4a9c86fedcaf3f494297c Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Fri, 29 May 2026 12:28:13 -0700 Subject: [PATCH 09/10] Remove credentials as they are not needed for nvcr.io either Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .github/workflows/_example_tests_runner.yml | 3 --- .github/workflows/gpu_tests.yml | 5 ----- 2 files changed, 8 deletions(-) diff
--git a/.github/workflows/_example_tests_runner.yml b/.github/workflows/_example_tests_runner.yml index ea0fc0c19f8..8adadbac7af 100644 --- a/.github/workflows/_example_tests_runner.yml +++ b/.github/workflows/_example_tests_runner.yml @@ -34,9 +34,6 @@ jobs: timeout-minutes: ${{ inputs.timeout_minutes }} container: image: ${{ inputs.docker_image }} - credentials: - username: $oauthtoken - password: ${{ secrets.NGC_API_KEY }} options: --shm-size=2gb # TRT-LLM tests on 2-GPU runner needs more shared memory env: PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index 451e8b6c632..11ef88290ee 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -54,11 +54,6 @@ jobs: timeout-minutes: ${{ matrix.timeout }} container: image: ${{ matrix.container_image }} - # nvcr.io images require NGC auth; public docker.io images (e.g. vllm) are pulled - # anonymously (the runner skips docker login when username/password are empty). 
- credentials: - username: ${{ startsWith(matrix.container_image, 'nvcr.io') && '$oauthtoken' || '' }} - password: ${{ startsWith(matrix.container_image, 'nvcr.io') && secrets.NGC_API_KEY || '' }} env: GIT_DEPTH: 1000 # For correct version PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages From 8a6bcee29e6da1cac3a72be041dc36636805dbea Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Fri, 29 May 2026 12:52:04 -0700 Subject: [PATCH 10/10] use python3 instead of python for vllm Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- .github/workflows/gpu_tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index 11ef88290ee..9d290cae04f 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -74,7 +74,8 @@ jobs: COVERAGE_PROCESS_START: ${{ github.workspace }}/pyproject.toml COVERAGE_FILE: ${{ github.workspace }}/.coverage run: | - python -m pip install nox && nox -s ${{ matrix.example }} + # Use `python3` (the vllm image has no `python` on PATH) + python3 -m pip install nox && nox -s ${{ matrix.example }} - name: Upload GPU coverage to Codecov uses: codecov/codecov-action@v5 with: