Skip to content

Commit 19fd5c8

Browse files
avinash2692, claude, avinash.bala@us.ibm.com;0J8455897;AVINASH BALAKRISHNAN, ajbozarth
authored
ci: memory management in tests (#721)
* fix(vllm): implement shared backend to prevent GPU OOM errors - Add session-scoped shared_vllm_backend fixture using Granite 4 Micro - Update test_vllm.py and test_vllm_tools.py to use shared backend - Fall back to module-scoped backends when --isolate-heavy flag is set - Both modules now use consistent Granite 4 Micro model - Enhance CUDA OOM error message with actionable solutions - Maintains backward compatibility with existing isolation mechanism Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com> * reduce vllm GPU allocation for tests * implement backend test grouping via reordering * add gpu cleanup between backend groups * delay vllm backend creation until after openai vllm group * adding explicit served model name for vllm openai test * fix: rag intrinsics are not for the hybrid model (I think) * testing a fix in tests for all the gpu issues * more gpu cleaning * adding docs tooling to mypy exclude * removing kv cache also from GPU in cleanup for tests * moving test order around and also fixing a fixture bug * rolling back some changes from exclusive process * some changes to the error message in vllm and also conftest cleaning * adding an end-to-end script for tests with ollama * adding a port finder (just in case) * adding direct download of ollama binary from github * warm starting ollama * warm starting ollama * adding cuda paths for ollama * some extra checks for vllm and teardown * making group by backend default * making the script executable * test: remove heavy ram pytest marks added in #623 Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com> * ruff formatting * small changes to script and adding cleanup to guardian and core * making log dir easier to set * increasing ollama startup to 2 mins * adding pytest-json-report --------- Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com> Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com> Co-authored-by: avinash.bala@us.ibm.com;0J8455897;AVINASH BALAKRISHNAN 
<avinashbala@p5-r03-n2.bluevela.rmf.ibm.com> Co-authored-by: Alex Bozarth <ajbozart@us.ibm.com>
1 parent 17f1f57 commit 19fd5c8

16 files changed

Lines changed: 648 additions & 111 deletions

mellea/backends/vllm.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -184,11 +184,15 @@ def __init__(
184184
or engine_args["gpu_memory_utilization"] < 0.1
185185
):
186186
raise RuntimeError(
187-
"no matter how I reduced max_model_len and max_num_seqs, there is not enough memory! \n"
188-
"final values:\n"
189-
f"gpu_memory_utilization: {engine_args['gpu_memory_utilization']}\n"
190-
f"max_model_len: {engine_args['max_model_len']}\n"
191-
f"max_num_seqs: {engine_args['max_num_seqs']}\n"
187+
f"Failed to initialize vLLM engine (last error: {e}).\n"
188+
"\n"
189+
"Exhausted all retry attempts:\n"
190+
f" • gpu_memory_utilization: {engine_args['gpu_memory_utilization']:.2f}\n"
191+
f" • max_model_len: {engine_args['max_model_len']}\n"
192+
f" • max_num_seqs: {engine_args['max_num_seqs']}\n"
193+
"\n"
194+
"Please check the full stack trace above for details.\n"
195+
"Common issues: OOM, CUDA fork errors, GPU exclusive_process mode.\n"
192196
)
193197
logger.info(
194198
f"Reducing vllm model parameters to make it fit in the GPU memory.\n"

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ test = [
128128
"pytest-timeout", # For test suite timeout configuration
129129
"pytest-recording", # For blocking network access and using canned server responses
130130
"pytest-xdist>=3.8.0", # For concurrent test execution (uv run pytest -n auto test/)
131+
"pytest-json-report", # For JSON test reports in nightly CI
131132
"psutil", # For test infrastructure: RAM detection in conftest.py
132133
"nbmake>=1.5.5", # Notebook testing
133134
"sentencepiece==0.2.1", # Necessary for test_huggingface_tools test because of Mistral model
@@ -313,7 +314,7 @@ disable_error_code = [
313314
# -----------------------------
314315

315316
[tool.codespell]
316-
ignore-words-list = 'mellea,hashi,noo,Asai,asai,nd,mot,rouge,Rouge,Strat,Wight,Aer,aer'
317+
ignore-words-list = 'mellea,hashi,noo,Asai,asai,nd,mot,rouge,Rouge,Strat,Wight,Aer,aer,preemptable'
317318
check-filenames = true
318319
check-hidden = false
319320
regex = "(?<![a-z])[a-z'`]+|[A-Z][a-z'`]*|[a-z]+'[a-z]*|[a-z]+(?=[_-])|[a-z]+(?=[A-Z])|\\d+"

test/backends/test_huggingface.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,11 @@ def backend():
6464
backend.add_adapter(
6565
IntrinsicAdapter("answerability", base_model_name=backend.base_model_name)
6666
)
67-
return backend
67+
yield backend
68+
69+
from test.conftest import cleanup_gpu_backend
70+
71+
cleanup_gpu_backend(backend, "huggingface")
6872

6973

7074
@pytest.fixture(scope="function")

test/backends/test_huggingface_tools.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,11 @@ def backend():
3232
model_id=model_ids.MISTRALAI_MISTRAL_0_3_7B, cache=SimpleLRUCache(5)
3333
)
3434
# add_granite_aloras(backend)
35-
return backend
35+
yield backend
36+
37+
from test.conftest import cleanup_gpu_backend
38+
39+
cleanup_gpu_backend(backend, "huggingface-tools")
3640

3741

3842
@pytest.fixture(scope="function")

test/backends/test_openai_vllm.py

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ def vllm_process():
4545
"vllm",
4646
"serve",
4747
IBM_GRANITE_4_MICRO_3B.hf_model_name,
48+
"--served-model-name",
49+
IBM_GRANITE_4_MICRO_3B.hf_model_name,
4850
"--enable-lora",
4951
"--dtype",
5052
"bfloat16",
@@ -55,6 +57,9 @@ def vllm_process():
5557
# the process will have a new session id, so
5658
# entire process tree is killable at once
5759
start_new_session=True,
60+
stdout=subprocess.PIPE,
61+
stderr=subprocess.STDOUT, # merge stderr into stdout
62+
text=True,
5863
)
5964
url = "http://127.0.0.1:8000/ping"
6065
timeout = 600 # vllm initialization takes quite a while
@@ -63,7 +68,11 @@ def vllm_process():
6368
# Wait for readiness message
6469
while True:
6570
if process.poll() is not None:
66-
raise RuntimeError("vLLM server exited before startup.")
71+
output = process.stdout.read() if process.stdout else ""
72+
raise RuntimeError(
73+
f"vLLM server exited before startup (code {process.returncode}).\n"
74+
f"--- vLLM output ---\n{output}\n--- end ---"
75+
)
6776

6877
try:
6978
response = requests.get(url, timeout=2)
@@ -73,17 +82,36 @@ def vllm_process():
7382
pass
7483

7584
if time.time() - start_time > timeout:
85+
output = ""
86+
if process.stdout:
87+
try:
88+
# Read whatever is available without blocking
89+
import select
90+
91+
if select.select([process.stdout], [], [], 0)[0]:
92+
output = process.stdout.read()
93+
except Exception:
94+
pass
7695
raise TimeoutError(
77-
f"Timed out waiting for server health check at {url}"
96+
f"Timed out waiting for server health check at {url}\n"
97+
f"--- vLLM output (last lines) ---\n{output[-2000:]}\n--- end ---"
7898
)
7999

80100
yield process
81101

82102
except Exception as e:
83-
pytest.skip(
84-
f"vLLM process not available: {e}. May need to install with: pip install mellea[vllm]",
85-
allow_module_level=True,
103+
output = ""
104+
if process is not None and process.stdout:
105+
try:
106+
output = process.stdout.read()
107+
except Exception:
108+
pass
109+
skip_msg = (
110+
f"vLLM process not available: {e}\n"
111+
f"--- vLLM output ---\n{output}\n--- end ---"
86112
)
113+
print(skip_msg) # visible with -s flag
114+
pytest.skip(skip_msg, allow_module_level=True)
87115

88116
# --- Teardown (always runs) ---
89117
finally:

test/backends/test_vllm.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -34,26 +34,34 @@
3434
)
3535

3636

37+
# vLLM tests use hybrid backend strategy (see conftest.py):
38+
# - Default: Shared session-scoped backend (fast, no fragmentation)
39+
# - --isolate-heavy: Module-scoped backends in separate processes
3740
@pytest.fixture(scope="module")
38-
def backend():
39-
"""Shared vllm backend for all tests in this module."""
41+
def backend(shared_vllm_backend):
42+
"""Use shared session-scoped backend, or create module-scoped if isolated.
43+
44+
Without --isolate-heavy: Uses shared backend (fast, no fragmentation)
45+
With --isolate-heavy: Creates module-scoped backend (process isolation)
46+
"""
47+
if shared_vllm_backend is not None:
48+
yield shared_vllm_backend
49+
return # skip cleanup — shared backend cleaned up by conftest
50+
51+
# Isolation mode - create module-scoped backend
4052
backend = LocalVLLMBackend(
41-
model_id=model_ids.QWEN3_0_6B,
42-
# formatter=TemplateFormatter(model_id="ibm-granite/granite-4.0-tiny-preview"),
53+
model_id=model_ids.IBM_GRANITE_4_MICRO_3B,
4354
model_options={
44-
# made smaller for a testing environment with smaller gpus.
45-
# such an environment could possibly be running other gpu applications, including slack
46-
"gpu_memory_utilization": 0.8,
47-
"max_model_len": 8192,
48-
"max_num_seqs": 8,
55+
"gpu_memory_utilization": 0.6,
56+
"max_model_len": 4096,
57+
"max_num_seqs": 4,
4958
},
5059
)
5160
yield backend
5261

53-
# Cleanup using shared function (best-effort within module)
54-
from test.conftest import cleanup_vllm_backend
62+
from test.conftest import cleanup_gpu_backend
5563

56-
cleanup_vllm_backend(backend)
64+
cleanup_gpu_backend(backend, "vllm")
5765

5866

5967
@pytest.fixture(scope="function")

test/backends/test_vllm_tools.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,26 +31,36 @@
3131
)
3232

3333

34+
# vLLM tests use hybrid backend strategy (see conftest.py):
35+
# - Default: Shared session-scoped backend (fast, no fragmentation)
36+
# - --isolate-heavy: Module-scoped backends in separate processes
37+
# Note: Originally used Mistral-7B, now uses Granite 4 Micro for consistency.
38+
# Granite 4 Micro supports tool calling and is sufficient for testing.
3439
@pytest.fixture(scope="module")
35-
def backend():
36-
"""Shared vllm backend for all tests in this module."""
40+
def backend(shared_vllm_backend):
41+
"""Use shared session-scoped backend, or create module-scoped if isolated.
3742
43+
Without --isolate-heavy: Uses shared backend (fast, no fragmentation)
44+
With --isolate-heavy: Creates module-scoped backend (process isolation)
45+
"""
46+
if shared_vllm_backend is not None:
47+
yield shared_vllm_backend
48+
return
49+
50+
# Isolation mode - create module-scoped backend
3851
backend = LocalVLLMBackend(
39-
model_id=model_ids.MISTRALAI_MISTRAL_0_3_7B,
52+
model_id=model_ids.IBM_GRANITE_4_MICRO_3B,
4053
model_options={
41-
# made smaller for a testing environment with smaller gpus.
42-
# such an environment could possibly be running other gpu applications, including slack
43-
"gpu_memory_utilization": 0.8,
44-
"max_model_len": 8192,
45-
"max_num_seqs": 8,
54+
"gpu_memory_utilization": 0.6,
55+
"max_model_len": 4096,
56+
"max_num_seqs": 4,
4657
},
4758
)
4859
yield backend
4960

50-
# Cleanup using shared function (best-effort within module)
51-
from test.conftest import cleanup_vllm_backend
61+
from test.conftest import cleanup_gpu_backend
5262

53-
cleanup_vllm_backend(backend)
63+
cleanup_gpu_backend(backend, "vllm-tools")
5464

5565

5666
@pytest.fixture(scope="function")

test/cli/test_alora_train_integration.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,16 @@ def test_alora_training_integration():
287287
"✅ Verified adapter activation: both with/without invocation tokens generate successfully"
288288
)
289289

290+
# Cleanup GPU memory
291+
base_model.cpu()
292+
del model_with_adapter
293+
del base_model
294+
import gc
295+
296+
gc.collect()
297+
if torch.cuda.is_available():
298+
torch.cuda.empty_cache()
299+
290300

291301
def test_lora_training_integration():
292302
"""Integration test: Train a tiny standard LoRA adapter and verify it works.

0 commit comments

Comments
 (0)