Skip to content

Commit f28456c

Browse files
tbitcs and oz-agent committed
feat: eval real LLM, /api/audit endpoint, ag2 fallback warning, voice model hint
- eval/runner.py: add _provider_available() and run_case_real(), wired to chat_runner.run_chat(); run_suite() auto-detects a provider when stub=False and degrades silently to the stub runner when no provider is reachable
- serve.py: add GET /api/audit endpoint that calls run_audit() and serialises the AuditReport as JSON (healthy/passed/failed/fixable/results[])
- cli.py: replace the silent 'except ImportError: pass' in dispatch_run_cmd with an actionable ag2 install hint (pip install ag2[ollama] or ag2[anthropic])
- agent/voice.py: expand VoiceUnavailableError messages to include model download commands and the HuggingFace URL for ggml-tiny.en.bin

REQ-321, REQ-328, REQ-331

Co-Authored-By: Oz <oz-agent@warp.dev>
1 parent cbe9916 commit f28456c

4 files changed

Lines changed: 154 additions & 12 deletions

File tree

src/specsmith/agent/voice.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -102,16 +102,24 @@ def transcribe(path: Path) -> TranscribeResult:
102102
import whisper_cpp_python
103103
except Exception as exc: # noqa: BLE001
104104
raise VoiceUnavailableError(
105-
"whisper-cpp-python is not installed. Run "
106-
"`pipx inject specsmith whisper-cpp-python` "
107-
"(or `pip install specsmith[voice]`)."
105+
"Voice transcription requires whisper-cpp-python.\n"
106+
"Install it:\n"
107+
" pipx inject specsmith whisper-cpp-python (recommended)\n"
108+
" pip install specsmith[voice]\n"
109+
"Then download a model:\n"
110+
f" mkdir -p {default_model_dir()}\n"
111+
" curl -L https://huggingface.co/ggerganov/whisper.cpp/resolve/main/"
112+
"ggml-tiny.en.bin -o ~/.specsmith/voice/ggml-tiny.en.bin"
108113
) from exc
109114

110115
model_path = _resolve_model_path()
111116
if model_path is None:
112117
raise VoiceUnavailableError(
113-
"No whisper model found. Set SPECSMITH_VOICE_MODEL or place a "
114-
f".bin model under {default_model_dir()}."
118+
f"No whisper model found under {default_model_dir()}.\n"
119+
"Download a model (replace tiny.en with base/small/medium as needed):\n"
120+
" curl -L https://huggingface.co/ggerganov/whisper.cpp/resolve/main/"
121+
"ggml-tiny.en.bin -o ~/.specsmith/voice/ggml-tiny.en.bin\n"
122+
"Or set SPECSMITH_VOICE_MODEL=/path/to/model.bin"
115123
)
116124

117125
start = _time.perf_counter()

src/specsmith/cli.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9002,7 +9002,13 @@ def _run_orch_json() -> None:
90029002
return
90039003
except ImportError:
90049004
# AG2 not installed — fall through to manual path
9005-
pass
9005+
console.print(
9006+
"[yellow]\u26a0[/yellow] ag2 not found \u2014 running single-node fallback DAG.\n"
9007+
" Install full multi-agent support:\n"
9008+
" [bold]pip install ag2\\[ollama][/bold] (local Ollama)\n"
9009+
" [bold]pip install ag2\\[anthropic][/bold] (Anthropic Claude)\n"
9010+
" Then re-run for parallel multi-agent dispatch."
9011+
)
90069012
except Exception as exc: # noqa: BLE001
90079013
console.print(f"[yellow]Orchestrator unavailable ({exc}), using manual dispatch.[/yellow]")
90089014

src/specsmith/eval/runner.py

Lines changed: 105 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,94 @@
99
from specsmith.eval import EvalCase, EvalReport, EvalResult, EvalSuite
1010

1111

12+
def _provider_available() -> str | None:
13+
"""Return the first available LLM provider name, or None."""
14+
# Check Ollama first (local, always free)
15+
try:
16+
import urllib.request
17+
18+
req = urllib.request.Request(
19+
"http://127.0.0.1:11434/api/tags", method="GET"
20+
)
21+
with urllib.request.urlopen(req, timeout=1):
22+
pass
23+
return "ollama"
24+
except Exception: # noqa: BLE001
25+
pass
26+
# Cloud providers via env keys
27+
import os
28+
29+
for key, name in (
30+
("ANTHROPIC_API_KEY", "anthropic"),
31+
("OPENAI_API_KEY", "openai"),
32+
("GOOGLE_API_KEY", "gemini"),
33+
):
34+
if os.environ.get(key, "").strip():
35+
return name
36+
return None
37+
38+
39+
def run_case_real(case: EvalCase, *, provider: str = "ollama") -> EvalResult:
    """Execute one eval case through ``chat_runner.run_chat`` and score it.

    The case input is sent to ``run_chat`` with the ``"standard"`` profile,
    using the directory named by ``SPECSMITH_EVAL_PROJECT`` (default: the
    current directory) as the project root.  The returned summary is scored
    with ``score_output`` against the case's expected keywords; a score of
    0.5 or more counts as a pass.

    Args:
        case: The eval case to run.
        provider: Label stored in both the ``model`` and ``provider`` fields
            of the result.  It is not forwarded to ``run_chat``.

    Returns:
        An ``EvalResult``.  This function does not let provider errors
        propagate: a ``None`` reply or any exception raised while running
        or scoring yields a failed result (score 0.0) whose ``error`` field
        carries the reason, truncated to 200 characters for exceptions.
    """
    import os
    from pathlib import Path

    started = time.monotonic()

    def _elapsed_ms() -> float:
        # Wall-clock milliseconds since the case started.
        return (time.monotonic() - started) * 1000

    def _failure(elapsed: float, reason: str) -> EvalResult:
        # Shared shape for every non-scoring outcome.
        return EvalResult(
            case_id=case.id,
            passed=False,
            score=0.0,
            latency_ms=elapsed,
            model=provider,
            provider=provider,
            output_preview="",
            error=reason,
        )

    try:
        # Imported lazily so an import failure becomes a failed result too.
        from specsmith.agent.chat_runner import run_chat
        from specsmith.agent.events import EventEmitter

        eval_root = Path(os.environ.get("SPECSMITH_EVAL_PROJECT", ".")).resolve()
        sink = EventEmitter(stream=None)  # type: ignore[arg-type]
        reply = run_chat(
            case.input,
            project_dir=eval_root,
            profile="standard",
            session_id=f"eval-{case.id}",
            emitter=sink,
            msg_block="eval-blk-001",
        )
        elapsed = _elapsed_ms()
        if reply is None:
            return _failure(elapsed, "provider returned None")

        keyword_score = score_output(reply.summary, case.expected_keywords)
        return EvalResult(
            case_id=case.id,
            passed=keyword_score >= 0.5,
            score=keyword_score,
            latency_ms=elapsed,
            model=provider,
            provider=provider,
            output_preview=reply.summary[:200],
        )
    except Exception as exc:  # noqa: BLE001
        # Keep the suite going: report the failure instead of raising.
        return _failure(_elapsed_ms(), str(exc)[:200])
98+
99+
12100
def score_output(output: str, expected_keywords: list[str]) -> float:
13101
"""Score an output against expected keywords (0.0–1.0)."""
14102
if not expected_keywords:
@@ -50,15 +138,26 @@ def run_suite(
50138
51139
Args:
52140
suite: The eval suite to run.
53-
model: Model identifier.
54-
provider: Provider name.
55-
stub: If True, use stub runner (no real LLM calls).
141+
model: Model identifier (ignored when stub=False; provider chosen
142+
automatically via :func:`_provider_available`).
143+
provider: Provider name (overridden by auto-detection when stub=False).
144+
stub: If True, use stub runner. If False, auto-detect a live LLM
145+
and run each case for real; silently falls back to stub when
146+
no provider is reachable.
56147
"""
148+
# Auto-detect provider when real mode is requested
149+
live_provider: str | None = None
150+
if not stub:
151+
live_provider = _provider_available()
152+
if live_provider is None:
153+
stub = True # No provider found; degrade gracefully
154+
57155
results: list[EvalResult] = []
58156
for case in suite.cases:
59-
# Real LLM execution path will replace run_case_stub when
60-
# provider integration is wired. For now, always use stub.
61-
result = run_case_stub(case)
157+
if stub:
158+
result = run_case_stub(case)
159+
else:
160+
result = run_case_real(case, provider=live_provider or provider)
62161
results.append(result)
63162

64163
total = len(results)

src/specsmith/serve.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,8 @@ def do_GET(self) -> None: # noqa: N802
196196
self._session_history()
197197
elif self.path.startswith("/api/session/context-seed"):
198198
self._session_context_seed()
199+
elif self.path.startswith("/api/audit"):
200+
self._audit_status()
199201
# ── Dispatch endpoints (REQ-332) ──────────────────────────────
200202
elif self.path.startswith("/api/dispatch/events"):
201203
self._dispatch_sse()
@@ -625,6 +627,33 @@ def _session_history(self) -> None:
625627
except Exception as exc: # noqa: BLE001
626628
self._json_response({"ok": False, "error": str(exc), "history": []})
627629

630+
def _audit_status(self) -> None:
    """GET /api/audit — report the governance audit result as JSON.

    Runs ``run_audit`` on the agent's project directory and responds with
    ``ok``, ``healthy``, ``passed``, ``failed``, ``fixable`` and a
    ``results`` list holding ``name``/``passed``/``message`` for each check.

    Any exception (including a failure while sending the success payload)
    is answered with ``{"ok": False, "error": <message>}`` instead of being
    raised.
    """
    try:
        from specsmith.auditor import run_audit

        project_root = Path(self.agent._project_dir)  # noqa: SLF001
        audit = run_audit(project_root)

        payload = {"ok": True}
        # Copy the summary fields in the order the response exposes them.
        for field in ("healthy", "passed", "failed", "fixable"):
            payload[field] = getattr(audit, field)
        payload["results"] = [
            {"name": check.name, "passed": check.passed, "message": check.message}
            for check in audit.results
        ]
        self._json_response(payload)
    except Exception as exc:  # noqa: BLE001
        self._json_response({"ok": False, "error": str(exc)})
656+
628657
def _session_context_seed(self) -> None:
629658
"""GET /api/session/context-seed — return the epistemic continuity seed.
630659

0 commit comments

Comments
 (0)