|
9 | 9 | from specsmith.eval import EvalCase, EvalReport, EvalResult, EvalSuite |
10 | 10 |
|
11 | 11 |
|
| 12 | +def _provider_available() -> str | None: |
| 13 | + """Return the first available LLM provider name, or None.""" |
| 14 | + # Check Ollama first (local, always free) |
| 15 | + try: |
| 16 | + import urllib.request |
| 17 | + |
| 18 | + req = urllib.request.Request( |
| 19 | + "http://127.0.0.1:11434/api/tags", method="GET" |
| 20 | + ) |
| 21 | + with urllib.request.urlopen(req, timeout=1): |
| 22 | + pass |
| 23 | + return "ollama" |
| 24 | + except Exception: # noqa: BLE001 |
| 25 | + pass |
| 26 | + # Cloud providers via env keys |
| 27 | + import os |
| 28 | + |
| 29 | + for key, name in ( |
| 30 | + ("ANTHROPIC_API_KEY", "anthropic"), |
| 31 | + ("OPENAI_API_KEY", "openai"), |
| 32 | + ("GOOGLE_API_KEY", "gemini"), |
| 33 | + ): |
| 34 | + if os.environ.get(key, "").strip(): |
| 35 | + return name |
| 36 | + return None |
| 37 | + |
| 38 | + |
def run_case_real(case: EvalCase, *, provider: str = "ollama") -> EvalResult:
    """Execute one eval case through ``run_chat`` and score its summary.

    The case input is sent to ``specsmith.agent.chat_runner.run_chat`` using
    the ``"standard"`` profile.  The project directory is taken from the
    ``SPECSMITH_EVAL_PROJECT`` environment variable (default: the current
    directory).  The returned summary is scored with ``score_output``; a
    score of at least 0.5 counts as a pass.

    This function does not raise for provider problems: any exception inside
    the run (including a failed import of the agent modules) is converted
    into a failed result so the surrounding suite can continue.

    Args:
        case: The eval case to run.
        provider: Label recorded as both ``model`` and ``provider`` on the
            result.  NOTE(review): it is not forwarded to ``run_chat``, so
            it does not by itself select which backend is used.

    Returns:
        An ``EvalResult`` with the measured latency in milliseconds.  On
        success it carries the first 200 characters of the summary; on
        failure it carries an ``error`` message (truncated to 200 chars).
    """
    import os
    from pathlib import Path

    started = time.monotonic()

    def _failed(elapsed_ms: float, reason: str) -> EvalResult:
        # Shared shape for every unsuccessful outcome.
        return EvalResult(
            case_id=case.id,
            passed=False,
            score=0.0,
            latency_ms=elapsed_ms,
            model=provider,
            provider=provider,
            output_preview="",
            error=reason,
        )

    try:
        # Imported lazily so a missing/broken agent stack becomes a failed
        # result instead of an import-time crash.
        from specsmith.agent.chat_runner import run_chat
        from specsmith.agent.events import EventEmitter

        root = Path(os.environ.get("SPECSMITH_EVAL_PROJECT", ".")).resolve()
        sink = EventEmitter(stream=None)  # type: ignore[arg-type]
        chat = run_chat(
            case.input,
            project_dir=root,
            profile="standard",
            session_id=f"eval-{case.id}",
            emitter=sink,
            msg_block="eval-blk-001",
        )
        # Latency covers the chat call only, not the scoring below.
        elapsed_ms = (time.monotonic() - started) * 1000

        if chat is None:
            return _failed(elapsed_ms, "provider returned None")

        keyword_score = score_output(chat.summary, case.expected_keywords)
        return EvalResult(
            case_id=case.id,
            passed=keyword_score >= 0.5,
            score=keyword_score,
            latency_ms=elapsed_ms,
            model=provider,
            provider=provider,
            output_preview=chat.summary[:200],
        )
    except Exception as exc:  # noqa: BLE001
        return _failed((time.monotonic() - started) * 1000, str(exc)[:200])
| 98 | + |
| 99 | + |
12 | 100 | def score_output(output: str, expected_keywords: list[str]) -> float: |
13 | 101 | """Score an output against expected keywords (0.0–1.0).""" |
14 | 102 | if not expected_keywords: |
@@ -50,15 +138,26 @@ def run_suite( |
50 | 138 |
|
51 | 139 | Args: |
52 | 140 | suite: The eval suite to run. |
53 | | - model: Model identifier. |
54 | | - provider: Provider name. |
55 | | - stub: If True, use stub runner (no real LLM calls). |
| 141 | + model: Model identifier (ignored when stub=False; provider chosen |
| 142 | + automatically via :func:`_provider_available`). |
| 143 | + provider: Provider name (overridden by auto-detection when stub=False). |
| 144 | + stub: If True, use stub runner. If False, auto-detect a live LLM |
| 145 | + and run each case for real; silently falls back to stub when |
| 146 | + no provider is reachable. |
56 | 147 | """ |
| 148 | + # Auto-detect provider when real mode is requested |
| 149 | + live_provider: str | None = None |
| 150 | + if not stub: |
| 151 | + live_provider = _provider_available() |
| 152 | + if live_provider is None: |
| 153 | + stub = True # No provider found; degrade gracefully |
| 154 | + |
57 | 155 | results: list[EvalResult] = [] |
58 | 156 | for case in suite.cases: |
59 | | - # Real LLM execution path will replace run_case_stub when |
60 | | - # provider integration is wired. For now, always use stub. |
61 | | - result = run_case_stub(case) |
| 157 | + if stub: |
| 158 | + result = run_case_stub(case) |
| 159 | + else: |
| 160 | + result = run_case_real(case, provider=live_provider or provider) |
62 | 161 | results.append(result) |
63 | 162 |
|
64 | 163 | total = len(results) |
|
0 commit comments