Unload LM Studio models after runs

Tooru · Tooru · commit 317986a1ac11 · 2026-01-12T20:22:29.000+09:00
diff --git a/harness/expert_questions/run_benchmark.py b/harness/expert_questions/run_benchmark.py
@@ -54,6 +54,7 @@ def get_logger(name=None):
     expand_models_with_thinking_variants,
     fetch_model_pricing,
     store_text,
+    unload_lmstudio_models,
 )
 from harness.expert_questions.dataset import Question, load_questions
 
@@ -1040,6 +1041,9 @@ def retry_qa_api_error_attempts(
         status_counts=status_counts,
     )
 
+    if any(_is_lmstudio_model(model) for model in models):
+        unload_lmstudio_models()
+
     return summary
 
 
@@ -1331,6 +1335,10 @@ def _attempt_key(a: dict) -> tuple:
         accuracy=overall_accuracy,
     )
 
+    models_in_summary = [m for m in original_summary.get("models", []) if isinstance(m, str)]
+    if any(_is_lmstudio_model(model) for model in models_in_summary):
+        unload_lmstudio_models()
+
     return original_summary
 
 
@@ -1844,6 +1852,9 @@ def _bucket_level_for_metrics(attempt: dict[str, Any], default_level: str | None
     latest_summary = QA_RUNS_ROOT / "latest_summary.json"
     store_text(latest_summary, json.dumps(summary, indent=2))
 
+    if any(_is_lmstudio_model(model) for model in requested_models):
+        unload_lmstudio_models()
+
     return summary
 
 
diff --git a/harness/run_harness.py b/harness/run_harness.py
@@ -18,6 +18,7 @@
 import textwrap
 import time
 import uuid
+from urllib.parse import urlparse
 from collections import Counter, defaultdict
 from pathlib import Path
 from typing import Any
@@ -330,6 +331,74 @@ def _normalize_lmstudio_model_id(model: str) -> str:
     return model
 
 
+def _resolve_lms_path() -> str | None:
+    """Return the path of the ``lms`` executable, or None when it cannot be found."""
+    # Whatever PATH resolves to wins; the per-user install directory is only a fallback.
+    on_path = shutil.which("lms")
+    if on_path:
+        return on_path
+    home_copy = Path.home() / ".lmstudio" / "bin" / "lms"
+    return str(home_copy) if home_copy.exists() else None
+
+
+def _lmstudio_cli_instance_args(base_url: str) -> list[str]:
+    """Translate *base_url* into ``--host``/``--port`` CLI arguments ([] when it is blank)."""
+    candidate = (base_url or "").strip()
+    if not candidate:
+        return []
+    # A bare "host:port" has no scheme; add one so urlparse treats it as a network location.
+    with_scheme = candidate if "://" in candidate else f"http://{candidate}"
+    parts = urlparse(with_scheme)
+    # Missing pieces fall back to 127.0.0.1 and port 1234.
+    return ["--host", parts.hostname or "127.0.0.1", "--port", str(parts.port or 1234)]
+
+
+def _truncate_cli_output(value: str, *, limit: int = 2000) -> str:
+    """Strip *value* (None/empty becomes "") and cap it at *limit* characters plus "..."."""
+    text = (value or "").strip()
+    overflow = len(text) > limit
+    return (text[:limit] + "...") if overflow else text
+
+
+def unload_lmstudio_models(*, base_url: str | None = None, timeout: int = 30) -> bool:
+    """Best-effort cleanup of loaded LM Studio models via ``lms unload --all``.
+
+    Returns True when the unload command succeeds; otherwise (missing base URL or CLI,
+    malformed URL, timeout, OS error, non-zero exit) logs a warning and returns False.
+    """
+    resolved_base_url = (base_url or SETTINGS.lmstudio_base_url or "").strip()
+    if not resolved_base_url:
+        logger.warning("LM Studio base URL is not configured; skipping model unload")
+        return False
+
+    lms_path = _resolve_lms_path()
+    if not lms_path:
+        logger.warning("LM Studio CLI 'lms' not found; skipping model unload")
+        return False
+
+    try:
+        # Arguments are built inside the try: a malformed base URL raises ValueError there.
+        unload = subprocess.run(
+            [lms_path, "unload", "--all", *_lmstudio_cli_instance_args(resolved_base_url)],
+            capture_output=True,
+            text=True,
+            timeout=timeout,
+        )
+    except subprocess.TimeoutExpired:
+        logger.warning("Timed out unloading LM Studio models")
+        return False
+    except (OSError, ValueError) as exc:
+        logger.warning("Unable to unload LM Studio models: %s", exc)
+        return False
+
+    if unload.returncode != 0:
+        detail = _truncate_cli_output(unload.stderr or unload.stdout)
+        logger.warning("Unable to unload LM Studio models: %s", detail or "unknown error")
+        return False
+
+    return True
+
+
 def _store_model_metadata(
     registry: dict[str, dict[str, Any]],
     model_id: str,
@@ -2262,6 +2331,9 @@ def retry_api_error_attempts(
     logger.info("Retried: %d attempts", len(retried_attempts))
     logger.info("Status breakdown: %s", dict(status_counts))
 
+    if any(_is_lmstudio_model(model) for model in models):
+        unload_lmstudio_models()
+
     return summary
 
 
@@ -2402,6 +2474,9 @@ def retry_failed_attempts(
     logger.info("Retried: %d attempts", len(retried_attempts))
     logger.info("Status breakdown: %s", dict(status_counts))
 
+    if any(_is_lmstudio_model(model) for model in models):
+        unload_lmstudio_models()
+
     return summary
 
 
@@ -2642,6 +2717,9 @@ def resume_incomplete_run(
     logger.info("Total: %d attempts", len(all_attempts))
     logger.info("Status breakdown: %s", dict(status_counts))
 
+    if any(_is_lmstudio_model(model) for model in all_models):
+        unload_lmstudio_models()
+
     return summary
 
 
@@ -2876,6 +2954,9 @@ def _suggest_levels_for_model(model_id: str) -> list[str]:
     latest_summary_path = RUN_ARTIFACTS / "latest_summary.json"
     store_text(latest_summary_path, json.dumps(summary, indent=2))
 
+    if any(_is_lmstudio_model(model) for model in original_models):
+        unload_lmstudio_models()
+
     return summary
 
 
diff --git a/tests/test_external_endpoint_config.py b/tests/test_external_endpoint_config.py
@@ -430,6 +430,96 @@ def test_switch_lmstudio_model_rejects_invalid_model_id(monkeypatch) -> None:
     assert "Invalid" in response.json()["detail"]
 
 
+def test_run_tasks_unloads_lmstudio_models_after_completion(monkeypatch, tmp_path) -> None:
+    """run_tasks on an ``lmstudio/`` model should trigger exactly one unload call."""
+    from harness import run_harness
+
+    unload_calls: list[int] = []
+
+    def fake_unload(*, base_url: str | None = None, timeout: int = 30) -> bool:
+        assert base_url is None or isinstance(base_url, str)
+        assert timeout > 0
+        unload_calls.append(timeout)
+        return True
+
+    def fake_evaluate_attempt(*args: Any, **kwargs: Any) -> dict:
+        assert args is not None
+        return {
+            "task_id": kwargs["task_id"],
+            "model": kwargs["model"],
+            "provider": kwargs["preferred_provider"],
+            "sample_index": kwargs["sample_index"],
+            "status": "passed",
+            "duration_seconds": 0.0,
+            "api_latency_seconds": 0.0,
+            "usage": {"prompt_tokens": 1, "completion_tokens": 1},
+            "attempt_dir": "stub",
+        }
+
+    def fake_compute_metrics(*args: Any, **kwargs: Any) -> dict:  # noqa: ARG001
+        assert args is not None
+        assert kwargs is not None
+        return {}
+
+    # Replace the unload hook, the attempt evaluator and both metric aggregators with stubs.
+    monkeypatch.setattr(run_harness, "unload_lmstudio_models", fake_unload)
+    monkeypatch.setattr(run_harness, "evaluate_attempt", fake_evaluate_attempt)
+    monkeypatch.setattr(run_harness, "compute_metrics", fake_compute_metrics)
+    monkeypatch.setattr(run_harness, "compute_metrics_by_thinking_level", fake_compute_metrics)
+
+    summary = run_harness.run_tasks(
+        tasks=["python_bugfix_prime_checker"],
+        models=["lmstudio/test-model"],
+        samples=1,
+        temperature=0.0,
+        max_tokens=16,
+        output_dir=tmp_path,
+        run_id="run_test_lmstudio_unload",
+    )
+
+    assert summary["run_id"] == "run_test_lmstudio_unload"
+    assert len(unload_calls) == 1
+
+
+def test_run_question_benchmark_unloads_lmstudio_models_after_completion(monkeypatch, tmp_path) -> None:
+    """run_question_benchmark on an ``lmstudio/`` model should trigger exactly one unload call."""
+    from harness.expert_questions import run_benchmark
+
+    unload_calls: list[int] = []
+
+    def fake_unload(*, base_url: str | None = None, timeout: int = 30) -> bool:
+        assert base_url is None or isinstance(base_url, str)
+        assert timeout > 0
+        unload_calls.append(timeout)
+        return True
+
+    def fake_call_completion(*args: Any, **kwargs: Any) -> tuple[str, dict[str, Any], float]:
+        assert args is not None
+        assert kwargs is not None
+        # Canned 3-tuple; NOTE(review): presumably (text, response payload, latency) - confirm.
+        payload = {"usage": {"prompt_tokens": 1, "completion_tokens": 1}}
+        return "stub", payload, 0.0
+
+    # JUDGE_MODEL is cleared for the test; NOTE(review): presumably this turns off
+    # judge-model grading so only the stubbed completion call runs - confirm.
+    monkeypatch.setattr(run_benchmark, "JUDGE_MODEL", None)
+    monkeypatch.setattr(run_benchmark, "unload_lmstudio_models", fake_unload)
+    monkeypatch.setattr(run_benchmark, "_call_completion", fake_call_completion)
+
+    summary = run_benchmark.run_question_benchmark(
+        models=["lmstudio/test-model"],
+        samples=1,
+        temperature=0.0,
+        max_tokens=16,
+        run_id="qa_test_lmstudio_unload",
+        output_dir=tmp_path,
+        question_limit=1,
+    )
+
+    assert summary["run_id"] == "qa_test_lmstudio_unload"
+    assert len(unload_calls) == 1
+
+
 # =============================================================================
 # Expert Questions LM Studio URL Configuration Tests
 # =============================================================================