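Minor: bump vLLM serve image to v0.20.0, install flash-attn with --no-build-isolation, and set max_position_embeddings=64 in the tiny-llama vLLM test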

kinjalpatel27 · kinjalpatel27 · commit 323eb2e5c1a3 · 2026-05-29T00:02:07.000Z
Signed-off-by: Kinjal Patel <kinjalpravin@nvidia.com>
diff --git a/examples/vllm_serve/Dockerfile b/examples/vllm_serve/Dockerfile
@@ -1,4 +1,4 @@
-FROM vllm/vllm-openai:v0.10.2
+FROM vllm/vllm-openai:v0.20.0
 
 # Set environment variables
 ENV PIP_NO_CACHE_DIR=off \
@@ -23,7 +23,7 @@ RUN cd Model-Optimizer && \
     pip install -e ".[all,dev-test]"
 
 # Llama4 requires this
-RUN pip install flash-attn==2.7.4.post1
+RUN pip install flash-attn==2.7.4.post1 --no-build-isolation
 
 # Pre-compile CUDA extensions to avoid compilation time during runtime
 RUN python3 -c "import modelopt.torch.quantization.extensions as ext; ext.precompile()" || true
diff --git a/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py b/tests/gpu_vllm/torch/quantization/test_vllm_dynamic_modules.py
@@ -163,7 +163,9 @@ def _shutdown_llm(llm):
 @pytest.fixture(scope="module")
 def tiny_llama_llm(tmp_path_factory):
     tmp = tmp_path_factory.mktemp("tiny_llama")
-    model_dir = create_tiny_llama_dir(tmp)
+    # Helper default ``max_position_embeddings=32`` would clash with vLLM's
+    # ``max_model_len=64`` set in ``_boot_llm``.
+    model_dir = create_tiny_llama_dir(tmp, max_position_embeddings=64)
     llm = _boot_llm(model_dir)
     try:
         yield llm