File tree Expand file tree Collapse file tree
tests/gpu_vllm/torch/quantization Expand file tree Collapse file tree Original file line number Diff line number Diff line change 1- FROM vllm/vllm-openai:v0.10.2
1+ FROM vllm/vllm-openai:v0.20.0
22
33# Set environment variables
44ENV PIP_NO_CACHE_DIR=off \
@@ -23,7 +23,7 @@ RUN cd Model-Optimizer && \
2323 pip install -e ".[all,dev-test]"
2424
2525# Llama4 requires this
26- RUN pip install flash-attn==2.7.4.post1
26+ RUN pip install flash-attn==2.7.4.post1 --no-build-isolation
2727
2828# Pre-compile CUDA extensions to avoid compilation time during runtime
2929RUN python3 -c "import modelopt.torch.quantization.extensions as ext; ext.precompile()" || true
Original file line number Diff line number Diff line change @@ -163,7 +163,9 @@ def _shutdown_llm(llm):
163163@pytest .fixture (scope = "module" )
164164def tiny_llama_llm (tmp_path_factory ):
165165 tmp = tmp_path_factory .mktemp ("tiny_llama" )
166- model_dir = create_tiny_llama_dir (tmp )
166+ # The helper's default ``max_position_embeddings=32`` is below the
167+ # ``max_model_len=64`` that ``_boot_llm`` sets for vLLM, so raise it to match.
168+ model_dir = create_tiny_llama_dir (tmp , max_position_embeddings = 64 )
167169 llm = _boot_llm (model_dir )
168170 try :
169171 yield llm
You can’t perform that action at this time.
0 commit comments