generative-computing
diff --git a/mellea/backends/vllm.py b/mellea/backends/vllm.py
Lines changed: 9 additions & 5 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 2 additions & 1 deletion
diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py
Lines changed: 5 additions & 1 deletion
diff --git a/test/backends/test_huggingface_tools.py b/test/backends/test_huggingface_tools.py
Lines changed: 5 additions & 1 deletion
diff --git a/test/backends/test_openai_vllm.py b/test/backends/test_openai_vllm.py
Lines changed: 33 additions & 5 deletions
diff --git a/test/backends/test_vllm.py b/test/backends/test_vllm.py
Lines changed: 20 additions & 12 deletions
diff --git a/test/backends/test_vllm_tools.py b/test/backends/test_vllm_tools.py
Lines changed: 21 additions & 11 deletions
diff --git a/test/cli/test_alora_train_integration.py b/test/cli/test_alora_train_integration.py
Lines changed: 10 additions & 0 deletions
@@ -184,11 +184,15 @@ def __init__(
                     or engine_args["gpu_memory_utilization"] < 0.1
                 ):
                     raise RuntimeError(
-                        "no matter how I reduced max_model_len and max_num_seqs, there is not enough memory! \n"
-                        "final values:\n"
-                        f"gpu_memory_utilization: {engine_args['gpu_memory_utilization']}\n"
-                        f"max_model_len: {engine_args['max_model_len']}\n"
-                        f"max_num_seqs: {engine_args['max_num_seqs']}\n"
+                        f"Failed to initialize vLLM engine (last error: {e}).\n"
+                        "\n"
+                        "Exhausted all retry attempts:\n"
+                        f"  • gpu_memory_utilization: {engine_args['gpu_memory_utilization']:.2f}\n"
+                        f"  • max_model_len: {engine_args['max_model_len']}\n"
+                        f"  • max_num_seqs: {engine_args['max_num_seqs']}\n"
+                        "\n"
+                        "Please check the full stack trace above for details.\n"
+                        "Common issues: OOM, CUDA fork errors, GPU exclusive_process mode.\n"
                     )
                 logger.info(
                     f"Reducing vllm model parameters to make it fit in the GPU memory.\n"
 
@@ -128,6 +128,7 @@ test = [
     "pytest-timeout",  # For test suite timeout configuration
     "pytest-recording",  # For blocking network access and using canned server responses
     "pytest-xdist>=3.8.0", # For concurrent test execution (uv run pytest -n auto test/)
+    "pytest-json-report",  # For JSON test reports in nightly CI
     "psutil",  # For test infrastructure: RAM detection in conftest.py
     "nbmake>=1.5.5",  # Notebook testing
     "sentencepiece==0.2.1",  # Necessary for test_huggingface_tools test because of Mistral model
@@ -313,7 +314,7 @@ disable_error_code = [
 # -----------------------------
 
 [tool.codespell]
-ignore-words-list = 'mellea,hashi,noo,Asai,asai,nd,mot,rouge,Rouge,Strat,Wight,Aer,aer'
+ignore-words-list = 'mellea,hashi,noo,Asai,asai,nd,mot,rouge,Rouge,Strat,Wight,Aer,aer,preemptable'
 check-filenames = true
 check-hidden = false
 regex = "(?<![a-z])[a-z'`]+|[A-Z][a-z'`]*|[a-z]+'[a-z]*|[a-z]+(?=[_-])|[a-z]+(?=[A-Z])|\\d+"
 
@@ -64,7 +64,11 @@ def backend():
     backend.add_adapter(
         IntrinsicAdapter("answerability", base_model_name=backend.base_model_name)
     )
-    return backend
+    yield backend
+
+    from test.conftest import cleanup_gpu_backend
+
+    cleanup_gpu_backend(backend, "huggingface")
 
 
 @pytest.fixture(scope="function")
 
@@ -32,7 +32,11 @@ def backend():
         model_id=model_ids.MISTRALAI_MISTRAL_0_3_7B, cache=SimpleLRUCache(5)
     )
     # add_granite_aloras(backend)
-    return backend
+    yield backend
+
+    from test.conftest import cleanup_gpu_backend
+
+    cleanup_gpu_backend(backend, "huggingface-tools")
 
 
 @pytest.fixture(scope="function")
 
@@ -45,6 +45,8 @@ def vllm_process():
                 "vllm",
                 "serve",
                 IBM_GRANITE_4_MICRO_3B.hf_model_name,
+                "--served-model-name",
+                IBM_GRANITE_4_MICRO_3B.hf_model_name,
                 "--enable-lora",
                 "--dtype",
                 "bfloat16",
@@ -55,6 +57,9 @@ def vllm_process():
             # the process will have a new session id, so
             # entire process tree is killable at once
             start_new_session=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,  # merge stderr into stdout
+            text=True,
         )
         url = "http://127.0.0.1:8000/ping"
         timeout = 600  # vllm initialization takes quite a while
@@ -63,7 +68,11 @@ def vllm_process():
         # Wait for readiness message
         while True:
             if process.poll() is not None:
-                raise RuntimeError("vLLM server exited before startup.")
+                output = process.stdout.read() if process.stdout else ""
+                raise RuntimeError(
+                    f"vLLM server exited before startup (code {process.returncode}).\n"
+                    f"--- vLLM output ---\n{output}\n--- end ---"
+                )
 
             try:
                 response = requests.get(url, timeout=2)
@@ -73,17 +82,36 @@ def vllm_process():
                 pass
 
             if time.time() - start_time > timeout:
+                output = ""
+                if process.stdout:
+                    try:
+                        # Only read if select() reports data ready. NOTE(review): read() with no size still waits for EOF — confirm it cannot hang here
+                        import select
+
+                        if select.select([process.stdout], [], [], 0)[0]:
+                            output = process.stdout.read()
+                    except Exception:
+                        pass
                 raise TimeoutError(
-                    f"Timed out waiting for server health check at {url}"
+                    f"Timed out waiting for server health check at {url}\n"
+                    f"--- vLLM output (last lines) ---\n{output[-2000:]}\n--- end ---"
                 )
 
         yield process
 
     except Exception as e:
-        pytest.skip(
-            f"vLLM process not available: {e}. May need to install with: pip install mellea[vllm]",
-            allow_module_level=True,
+        output = ""
+        if process is not None and process.stdout:
+            try:
+                output = process.stdout.read()
+            except Exception:
+                pass
+        skip_msg = (
+            f"vLLM process not available: {e}\n"
+            f"--- vLLM output ---\n{output}\n--- end ---"
         )
+        print(skip_msg)  # visible with -s flag
+        pytest.skip(skip_msg, allow_module_level=True)
 
     # --- Teardown (always runs) ---
     finally:
 
@@ -34,26 +34,34 @@
     )
 
 
+# vLLM tests use a hybrid backend strategy (see conftest.py):
+# - Default: Shared session-scoped backend (fast, no fragmentation)
+# - --isolate-heavy: Module-scoped backends in separate processes
 @pytest.fixture(scope="module")
-def backend():
-    """Shared vllm backend for all tests in this module."""
+def backend(shared_vllm_backend):
+    """Use shared session-scoped backend, or create module-scoped if isolated.
+
+    Without --isolate-heavy: Uses shared backend (fast, no fragmentation)
+    With --isolate-heavy: Creates module-scoped backend (process isolation)
+    """
+    if shared_vllm_backend is not None:
+        yield shared_vllm_backend
+        return  # skip cleanup — shared backend cleaned up by conftest
+
+    # Isolation mode - create module-scoped backend
     backend = LocalVLLMBackend(
-        model_id=model_ids.QWEN3_0_6B,
-        # formatter=TemplateFormatter(model_id="ibm-granite/granite-4.0-tiny-preview"),
+        model_id=model_ids.IBM_GRANITE_4_MICRO_3B,
         model_options={
-            # made smaller for a testing environment with smaller gpus.
-            # such an environment could possibly be running other gpu applications, including slack
-            "gpu_memory_utilization": 0.8,
-            "max_model_len": 8192,
-            "max_num_seqs": 8,
+            "gpu_memory_utilization": 0.6,
+            "max_model_len": 4096,
+            "max_num_seqs": 4,
         },
     )
     yield backend
 
-    # Cleanup using shared function (best-effort within module)
-    from test.conftest import cleanup_vllm_backend
+    from test.conftest import cleanup_gpu_backend
 
-    cleanup_vllm_backend(backend)
+    cleanup_gpu_backend(backend, "vllm")
 
 
 @pytest.fixture(scope="function")
 
@@ -31,26 +31,36 @@
     )
 
 
+# vLLM tests use a hybrid backend strategy (see conftest.py):
+# - Default: Shared session-scoped backend (fast, no fragmentation)
+# - --isolate-heavy: Module-scoped backends in separate processes
+# Note: Originally used Mistral-7B, now uses Granite 4 Micro for consistency.
+# Granite 4 Micro supports tool calling and is sufficient for testing.
 @pytest.fixture(scope="module")
-def backend():
-    """Shared vllm backend for all tests in this module."""
+def backend(shared_vllm_backend):
+    """Use shared session-scoped backend, or create module-scoped if isolated.
 
+    Without --isolate-heavy: Uses shared backend (fast, no fragmentation)
+    With --isolate-heavy: Creates module-scoped backend (process isolation)
+    """
+    if shared_vllm_backend is not None:
+        yield shared_vllm_backend
+        return
+
+    # Isolation mode - create module-scoped backend
     backend = LocalVLLMBackend(
-        model_id=model_ids.MISTRALAI_MISTRAL_0_3_7B,
+        model_id=model_ids.IBM_GRANITE_4_MICRO_3B,
         model_options={
-            # made smaller for a testing environment with smaller gpus.
-            # such an environment could possibly be running other gpu applications, including slack
-            "gpu_memory_utilization": 0.8,
-            "max_model_len": 8192,
-            "max_num_seqs": 8,
+            "gpu_memory_utilization": 0.6,
+            "max_model_len": 4096,
+            "max_num_seqs": 4,
         },
     )
     yield backend
 
-    # Cleanup using shared function (best-effort within module)
-    from test.conftest import cleanup_vllm_backend
+    from test.conftest import cleanup_gpu_backend
 
-    cleanup_vllm_backend(backend)
+    cleanup_gpu_backend(backend, "vllm-tools")
 
 
 @pytest.fixture(scope="function")
 
@@ -287,6 +287,16 @@ def test_alora_training_integration():
             "✅ Verified adapter activation: both with/without invocation tokens generate successfully"
         )
 
+        # Cleanup GPU memory
+        base_model.cpu()
+        del model_with_adapter
+        del base_model
+        import gc
+
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
 
 def test_lora_training_integration():
     """Integration test: Train a tiny standard LoRA adapter and verify it works.